Whamcloud - gitweb
LU-1419 lnet: Add support for Cray's Gemini interconnect
author James Simmons <uja.ornl@gmail.com>
Wed, 5 Dec 2012 18:54:39 +0000 (13:54 -0500)
committer Oleg Drokin <green@whamcloud.com>
Wed, 19 Dec 2012 22:31:17 +0000 (17:31 -0500)
This patch adds LNET support for Cray's Gemini
interconnect on their newer systems. The gnilnd was
originally based on the ralnd.
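
The LND is disabled by default; on a system with the Cray GNI
kernel headers installed it can be built via the new configure
switch added below, with GNICPPFLAGS (normally supplied by the
spec file) pointing at those headers, for example:

    ./configure --enable-gni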

Signed-off-by: James Simmons <uja.ornl@gmail.com>
Signed-off-by: Chris Horn <hornc@cray.com>
Signed-off-by: Cory Spitz <spitzcor@cray.com>
Change-Id: Ia98a44f4f3d68773438d820c49fe554a3d551dc5
Reviewed-on: http://review.whamcloud.com/3381
Tested-by: Hudson
Tested-by: Maloo <whamcloud.maloo@gmail.com>
Reviewed-by: Isaac Huang <he.huang@intel.com>
Reviewed-by: Doug Oucharek <doug.s.oucharek@intel.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
19 files changed:
lnet/autoconf/lustre-lnet.m4
lnet/klnds/Makefile.in
lnet/klnds/autoMakefile.am
lnet/klnds/gnilnd/Makefile.in [new file with mode: 0644]
lnet/klnds/gnilnd/autoMakefile.am [new file with mode: 0644]
lnet/klnds/gnilnd/gnilnd.c [new file with mode: 0644]
lnet/klnds/gnilnd/gnilnd.h [new file with mode: 0644]
lnet/klnds/gnilnd/gnilnd_api_wrap.h [new file with mode: 0644]
lnet/klnds/gnilnd/gnilnd_cb.c [new file with mode: 0644]
lnet/klnds/gnilnd/gnilnd_conn.c [new file with mode: 0644]
lnet/klnds/gnilnd/gnilnd_debug.c [new file with mode: 0644]
lnet/klnds/gnilnd/gnilnd_hss_ops.h [new file with mode: 0644]
lnet/klnds/gnilnd/gnilnd_modparams.c [new file with mode: 0644]
lnet/klnds/gnilnd/gnilnd_proc.c [new file with mode: 0644]
lnet/klnds/gnilnd/gnilnd_stack.c [new file with mode: 0644]
lnet/klnds/gnilnd/gnilnd_sysctl.c [new file with mode: 0644]
lnet/klnds/gnilnd/gnilnd_version.h [new file with mode: 0644]
lnet/utils/debug.c
lnet/utils/portals.c

index 4cd30eb..9d9ca11 100644
@@ -526,13 +526,88 @@ AC_SUBST(RACPPFLAGS)
 AC_SUBST(RALND)
 ])
 
+#
+# LN_CONFIG_GNILND
+#
+# check whether to use the Gemini Network Interface lnd
+#
+AC_DEFUN([LN_CONFIG_GNILND],
+[#### Gemini Network Interface
+AC_MSG_CHECKING([whether to enable GNI lnd])
+AC_ARG_ENABLE([gni],
+       AC_HELP_STRING([--enable-gni],
+                       [enable GNI lnd]),
+       [],[enable_gni='no'])
+AC_MSG_RESULT([$enable_gni])
+
+if test x$enable_gni = xyes ; then
+       AC_MSG_CHECKING([if GNI kernel headers are present])
+       # placeholder
+       # GNICPPFLAGS was set in spec file
+       EXTRA_KCFLAGS_save="$EXTRA_KCFLAGS"
+       EXTRA_KCFLAGS="$EXTRA_KCFLAGS $GNICPPFLAGS"
+       LB_LINUX_TRY_COMPILE([
+               #include <linux/types.h>
+               #include <gni_pub.h>
+       ],[
+               gni_cdm_handle_t        kgni_domain;
+               gni_return_t            rc;
+               int                     rrc;
+
+               rc = gni_cdm_create(0, 1, 1, 0, &kgni_domain);
+
+               rrc = (rc == GNI_RC_SUCCESS) ? 0 : 1;
+
+               return rrc;
+       ],[
+               AC_MSG_RESULT([yes])
+               GNILND="gnilnd"
+       ],[
+               AC_MSG_RESULT([no])
+               AC_MSG_ERROR([can't compile gnilnd with given GNICPPFLAGS: $GNICPPFLAGS])
+       ])
+       # at this point, we have gnilnd basic support, now check for extra features
+       AC_MSG_CHECKING([whether to use RCA in gnilnd])
+       LB_LINUX_TRY_COMPILE([
+               #include <linux/types.h>
+               #include <gni_pub.h>
+               #include <krca_lib.h>
+       ],[
+               gni_cdm_handle_t        kgni_domain;
+               gni_return_t            rc;
+               krca_ticket_t           ticket = KRCA_NULL_TICKET;
+               int                     rrc;
+               __u32                   nid = 0, nic_addr;
+
+               rc = gni_cdm_create(0, 1, 1, 0, &kgni_domain);
+
+               rrc = (rc == GNI_RC_SUCCESS) ? 0 : 1;
+
+               rrc += krca_nid_to_nicaddrs(nid, 1, &nic_addr);
+
+               rrc += krca_register(&ticket, RCA_MAKE_SERVICE_INDEX(RCA_IO_CLASS, 9), 99, 0);
+
+               return rrc;
+       ],[
+               AC_MSG_RESULT([yes])
+               GNICPPFLAGS="$GNICPPFLAGS -DGNILND_USE_RCA=1"
+               GNILNDRCA="gnilndrca"
+       ],[
+               AC_MSG_RESULT([no])
+       ])
+       EXTRA_KCFLAGS="$EXTRA_KCFLAGS_save"
+fi
+AC_SUBST(GNICPPFLAGS)
+AC_SUBST(GNILNDRCA)
+AC_SUBST(GNILND)
+])
 
 
 #
 #
 # LN_CONFIG_USERSPACE
 #
-# This is defined but empty because it is called from 
+# This is defined but empty because it is called from
 # build/autconf/lustre-build.m4 which is shared by all branches.
 #
 AC_DEFUN([LN_CONFIG_USERSPACE],
@@ -598,6 +673,7 @@ LN_CONFIG_BACKOFF
 LN_CONFIG_QUADRICS
 LN_CONFIG_O2IB
 LN_CONFIG_RALND
+LN_CONFIG_GNILND
 LN_CONFIG_PTLLND
 LN_CONFIG_MX
 # 2.6.32
@@ -740,6 +816,8 @@ AC_DEFUN([LN_CONDITIONALS],
 AM_CONDITIONAL(BUILD_MXLND, test x$MXLND = "xmxlnd")
 AM_CONDITIONAL(BUILD_O2IBLND, test x$O2IBLND = "xo2iblnd")
 AM_CONDITIONAL(BUILD_RALND, test x$RALND = "xralnd")
+AM_CONDITIONAL(BUILD_GNILND, test x$GNILND = "xgnilnd")
+AM_CONDITIONAL(BUILD_GNILND_RCA, test x$GNILNDRCA = "xgnilndrca")
 AM_CONDITIONAL(BUILD_PTLLND, test x$PTLLND = "xptllnd")
 AM_CONDITIONAL(BUILD_USOCKLND, test x$USOCKLND = "xusocklnd")
 ])
@@ -769,6 +847,8 @@ lnet/klnds/qswlnd/Makefile
 lnet/klnds/qswlnd/autoMakefile
 lnet/klnds/ralnd/Makefile
 lnet/klnds/ralnd/autoMakefile
+lnet/klnds/gnilnd/Makefile
+lnet/klnds/gnilnd/autoMakefile
 lnet/klnds/socklnd/Makefile
 lnet/klnds/socklnd/autoMakefile
 lnet/klnds/ptllnd/Makefile
index f0586ae..0d99a87 100644
@@ -1,5 +1,6 @@
 @BUILD_MXLND_TRUE@subdir-m += mxlnd
 @BUILD_RALND_TRUE@subdir-m += ralnd
+@BUILD_GNILND_TRUE@subdir-m += gnilnd
 @BUILD_O2IBLND_TRUE@subdir-m += o2iblnd
 @BUILD_QSWLND_TRUE@subdir-m += qswlnd
 @BUILD_PTLLND_TRUE@subdir-m += ptllnd
index 57d709c..1591d87 100644
@@ -34,4 +34,4 @@
 # Lustre is a trademark of Sun Microsystems, Inc.
 #
 
-SUBDIRS = socklnd qswlnd mxlnd ralnd ptllnd o2iblnd
+SUBDIRS = socklnd qswlnd mxlnd ralnd gnilnd ptllnd o2iblnd
diff --git a/lnet/klnds/gnilnd/Makefile.in b/lnet/klnds/gnilnd/Makefile.in
new file mode 100644
index 0000000..14e8c30
--- /dev/null
@@ -0,0 +1,9 @@
+MODULES := kgnilnd
+kgnilnd-objs := gnilnd.o gnilnd_cb.o gnilnd_modparams.o gnilnd_debug.o gnilnd_proc.o \
+               gnilnd_sysctl.o gnilnd_stack.o gnilnd_conn.o
+
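+# pass the GNI include flags from configure and this build's SVN revision into the module cflags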
+EXTRA_POST_CFLAGS := -D"SVN_CODE_REV=KBUILD_STR(${SVN_CODE_REV})" @GNICPPFLAGS@
+
+EXTRA_DIST = $(kgnilnd-objs:%.o=%.c) gnilnd.h gnilnd_api_wrap.h
+
+@INCLUDE_RULES@
diff --git a/lnet/klnds/gnilnd/autoMakefile.am b/lnet/klnds/gnilnd/autoMakefile.am
new file mode 100644
index 0000000..888b68e
--- /dev/null
@@ -0,0 +1,12 @@
+# Copyright (C) 2009  Cray, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+if MODULES
+if BUILD_GNILND
+modulenet_DATA = kgnilnd$(KMODEXT)
+endif
+endif
+
+MOSTLYCLEANFILES = @MOSTLYCLEANFILES@
diff --git a/lnet/klnds/gnilnd/gnilnd.c b/lnet/klnds/gnilnd/gnilnd.c
new file mode 100644
index 0000000..fcc05fa
--- /dev/null
@@ -0,0 +1,2698 @@
+/*
+ * Copyright (C) 2012 Cray, Inc.
+ *
+ *   Author: Igor Gorodetsky <iogordet@cray.com>
+ *   Author: Nic Henke <nic@cray.com>
+ *   Author: James Shimek <jshimek@cray.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+#include "gnilnd.h"
+
+/* Primary entry points from LNET.  There are no guarantees against reentrance. */
+lnd_t the_kgnilnd = {
+       .lnd_type       = GNILND,
+       .lnd_startup    = kgnilnd_startup,
+       .lnd_shutdown   = kgnilnd_shutdown,
+       .lnd_ctl        = kgnilnd_ctl,
+       .lnd_send       = kgnilnd_send,
+       .lnd_recv       = kgnilnd_recv,
+       .lnd_eager_recv = kgnilnd_eager_recv,
+       .lnd_query      = kgnilnd_query,
+};
+
+kgn_data_t      kgnilnd_data;
+kgn_hssops_t   kgnilnd_hssops;
+
+/* needs write_lock on kgn_peer_conn_lock */
+int
+kgnilnd_close_stale_conns_locked(kgn_peer_t *peer, kgn_conn_t *newconn)
+{
+       kgn_conn_t         *conn;
+       struct list_head   *ctmp, *cnxt;
+       int                 loopback;
+       int                 count = 0;
+
+       loopback = peer->gnp_nid == peer->gnp_net->gnn_ni->ni_nid;
+
+       list_for_each_safe(ctmp, cnxt, &peer->gnp_conns) {
+               conn = list_entry(ctmp, kgn_conn_t, gnc_list);
+
+               if (conn->gnc_state != GNILND_CONN_ESTABLISHED)
+                       continue;
+
+               if (conn == newconn)
+                       continue;
+
+               if (conn->gnc_device != newconn->gnc_device)
+                       continue;
+
+               /* This is a two connection loopback - one talking to the other */
+               if (loopback &&
+                   newconn->gnc_my_connstamp == conn->gnc_peer_connstamp &&
+                   newconn->gnc_peer_connstamp == conn->gnc_my_connstamp) {
+                       CDEBUG(D_NET, "skipping prune of %p, "
+                               "loopback and matching stamps"
+                               " connstamp "LPU64"("LPU64")"
+                               " peerstamp "LPU64"("LPU64")\n",
+                               conn, newconn->gnc_my_connstamp,
+                               conn->gnc_peer_connstamp,
+                               newconn->gnc_peer_connstamp,
+                               conn->gnc_my_connstamp);
+                       continue;
+               }
+
+               if (conn->gnc_peerstamp != newconn->gnc_peerstamp) {
+                       LASSERTF(conn->gnc_peerstamp < newconn->gnc_peerstamp,
+                               "conn 0x%p peerstamp "LPU64" >= "
+                               "newconn 0x%p peerstamp "LPU64"\n",
+                               conn, conn->gnc_peerstamp,
+                               newconn, newconn->gnc_peerstamp);
+
+                       CDEBUG(D_NET, "Closing stale conn nid: %s "
+                              " peerstamp:"LPX64"("LPX64")\n",
+                              libcfs_nid2str(peer->gnp_nid),
+                              conn->gnc_peerstamp, newconn->gnc_peerstamp);
+               } else {
+
+                       LASSERTF(conn->gnc_peer_connstamp < newconn->gnc_peer_connstamp,
+                               "conn 0x%p peer_connstamp "LPU64" >= "
+                               "newconn 0x%p peer_connstamp "LPU64"\n",
+                               conn, conn->gnc_peer_connstamp,
+                               newconn, newconn->gnc_peer_connstamp);
+
+                       CDEBUG(D_NET, "Closing stale conn nid: %s"
+                              " connstamp:"LPU64"("LPU64")\n",
+                              libcfs_nid2str(peer->gnp_nid),
+                              conn->gnc_peer_connstamp, newconn->gnc_peer_connstamp);
+               }
+
+               count++;
+               kgnilnd_close_conn_locked(conn, -ESTALE);
+       }
+
+       if (count != 0) {
+               CWARN("Closed %d stale conns to %s\n", count, libcfs_nid2str(peer->gnp_nid));
+       }
+
+       RETURN(count);
+}
+
+int
+kgnilnd_conn_isdup_locked(kgn_peer_t *peer, kgn_conn_t *newconn)
+{
+       kgn_conn_t       *conn;
+       struct list_head *tmp;
+       int               loopback;
+       ENTRY;
+
+       loopback = peer->gnp_nid == peer->gnp_net->gnn_ni->ni_nid;
+
+       list_for_each(tmp, &peer->gnp_conns) {
+               conn = list_entry(tmp, kgn_conn_t, gnc_list);
+               CDEBUG(D_NET, "checking conn 0x%p for peer %s"
+                       " lo %d new "LPU64" existing "LPU64
+                       " new peer "LPU64" existing peer "LPU64
+                       " new dev %p existing dev %p\n",
+                       conn, libcfs_nid2str(peer->gnp_nid),
+                       loopback,
+                       newconn->gnc_peerstamp, conn->gnc_peerstamp,
+                       newconn->gnc_peer_connstamp, conn->gnc_peer_connstamp,
+                       newconn->gnc_device, conn->gnc_device);
+
+               /* conn is in the process of closing */
+               if (conn->gnc_state != GNILND_CONN_ESTABLISHED)
+                       continue;
+
+               /* 'newconn' is from an earlier version of 'peer'!!! */
+               if (newconn->gnc_peerstamp < conn->gnc_peerstamp)
+                       RETURN(1);
+
+               /* 'conn' is from an earlier version of 'peer': it will be
+                * removed when we cull stale conns later on... */
+               if (newconn->gnc_peerstamp > conn->gnc_peerstamp)
+                       continue;
+
+               /* Different devices are OK */
+               if (conn->gnc_device != newconn->gnc_device)
+                       continue;
+
+               /* It's me connecting to myself */
+               if (loopback &&
+                   newconn->gnc_my_connstamp == conn->gnc_peer_connstamp &&
+                   newconn->gnc_peer_connstamp == conn->gnc_my_connstamp)
+                       continue;
+
+               /* 'newconn' is an earlier connection from 'peer'!!! */
+               if (newconn->gnc_peer_connstamp < conn->gnc_peer_connstamp)
+                       RETURN(2);
+
+               /* 'conn' is an earlier connection from 'peer': it will be
+                * removed when we cull stale conns later on... */
+               if (newconn->gnc_peer_connstamp > conn->gnc_peer_connstamp)
+                       continue;
+
+               /* 'newconn' has the SAME connection stamp; 'peer' isn't
+                * playing the game... */
+               RETURN(3);
+       }
+
+       RETURN(0);
+}
+
+int
+kgnilnd_create_conn(kgn_conn_t **connp, kgn_device_t *dev)
+{
+       kgn_conn_t    *conn;
+       gni_return_t   rrc;
+       int            rc = 0;
+
+       LASSERT (!in_interrupt());
+       atomic_inc(&kgnilnd_data.kgn_nconns);
+
+       /* divide by 2 to allow for complete reset and immediate reconnect */
+       if (atomic_read(&kgnilnd_data.kgn_nconns) >= GNILND_MAX_CQID/2) {
+               CERROR("Too many conns are live: %d > %d\n",
+                       atomic_read(&kgnilnd_data.kgn_nconns), GNILND_MAX_CQID/2);
+               atomic_dec(&kgnilnd_data.kgn_nconns);
+               return -E2BIG;
+       }
+
+       LIBCFS_ALLOC(conn, sizeof(*conn));
+       if (conn == NULL) {
+               atomic_dec(&kgnilnd_data.kgn_nconns);
+               return -ENOMEM;
+       }
+
+       LIBCFS_ALLOC(conn->gnc_tx_ref_table, GNILND_MAX_MSG_ID * sizeof(void *));
+       if (conn->gnc_tx_ref_table == NULL) {
+               CERROR("Can't allocate conn tx_ref_table\n");
+               rc = -ENOMEM;
+               GOTO(failed, rc);
+       }
+
+       atomic_set(&conn->gnc_refcount, 1);
+       atomic_set(&conn->gnc_reaper_noop, 0);
+       atomic_set(&conn->gnc_sched_noop, 0);
+       INIT_LIST_HEAD(&conn->gnc_list);
+       INIT_LIST_HEAD(&conn->gnc_hashlist);
+       INIT_LIST_HEAD(&conn->gnc_schedlist);
+       INIT_LIST_HEAD(&conn->gnc_fmaq);
+       INIT_LIST_HEAD(&conn->gnc_mdd_list);
+       spin_lock_init(&conn->gnc_list_lock);
+       spin_lock_init(&conn->gnc_tx_lock);
+
+       /* set tx id to nearly the end to make sure we find wrapping
+        * issues soon */
+       conn->gnc_next_tx = (int) GNILND_MAX_MSG_ID - 10;
+
+       /* if this fails, we have conflicts and MAX_TX is too large */
+       CLASSERT(GNILND_MAX_MSG_ID < GNILND_MSGID_CLOSE);
+
+       /* get a new unique CQ id for this conn */
+       write_lock(&kgnilnd_data.kgn_peer_conn_lock);
+       conn->gnc_my_connstamp = kgnilnd_data.kgn_connstamp++;
+       conn->gnc_cqid = kgnilnd_get_cqid_locked();
+       write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+
+       if (conn->gnc_cqid == 0) {
+               CERROR("Could not allocate unique CQ ID for conn 0x%p\n", conn);
+               rc = -E2BIG;
+               GOTO(failed, rc);
+       }
+
+       CDEBUG(D_NET, "alloc cqid %u for conn 0x%p\n",
+               conn->gnc_cqid, conn);
+
+       /* need to be set before gnc_ephandle to allow kgnilnd_destroy_conn_ep to
+        * check context */
+       conn->gnc_device = dev;
+
+       conn->gnc_timeout = MAX(*kgnilnd_tunables.kgn_timeout,
+                               GNILND_MIN_TIMEOUT);
+       kgnilnd_update_reaper_timeout(conn->gnc_timeout);
+
+       /* this is the ep_handle for doing SMSG & BTE */
+       mutex_lock(&dev->gnd_cq_mutex);
+       rrc = kgnilnd_ep_create(dev->gnd_handle, dev->gnd_snd_fma_cqh,
+                               &conn->gnc_ephandle);
+       mutex_unlock(&dev->gnd_cq_mutex);
+       if (rrc != GNI_RC_SUCCESS) {
+               rc = -ENETDOWN;
+               GOTO(failed, rc);
+       }
+
+       CDEBUG(D_NET, "created conn 0x%p ep_hndl 0x%p\n",
+              conn, conn->gnc_ephandle);
+
+       /* add ref for EP canceling */
+       kgnilnd_conn_addref(conn);
+       atomic_inc(&dev->gnd_neps);
+
+       *connp = conn;
+       return 0;
+
+failed:
+       atomic_dec(&kgnilnd_data.kgn_nconns);
+       LIBCFS_FREE(conn->gnc_tx_ref_table, GNILND_MAX_MSG_ID * sizeof(void *));
+       LIBCFS_FREE(conn, sizeof(*conn));
+       return rc;
+}
+
+/* needs to be called with kgn_peer_conn_lock held (read or write) */
+kgn_conn_t *
+kgnilnd_find_conn_locked(kgn_peer_t *peer)
+{
+       kgn_conn_t      *conn = NULL;
+       ENTRY;
+
+       /* if we are in reset, this conn is going to die soon */
+       if (unlikely(kgnilnd_data.kgn_in_reset)) {
+               RETURN(NULL);
+       }
+
+       /* just return the first ESTABLISHED connection */
+       list_for_each_entry(conn, &peer->gnp_conns, gnc_list) {
+               /* kgnilnd_finish_connect doesn't put connections on the
+                * peer list until they are actually established */
+               LASSERTF(conn->gnc_state >= GNILND_CONN_ESTABLISHED,
+                       "found conn %p state %s on peer %p (%s)\n",
+                       conn, kgnilnd_conn_state2str(conn), peer,
+                       libcfs_nid2str(peer->gnp_nid));
+               if (conn->gnc_state != GNILND_CONN_ESTABLISHED)
+                       continue;
+
+               RETURN(conn);
+       }
+       RETURN(NULL);
+}
+
+/* needs write_lock on kgn_peer_conn_lock held */
+kgn_conn_t *
+kgnilnd_find_or_create_conn_locked(kgn_peer_t *peer)
+{
+       kgn_device_t    *dev = peer->gnp_net->gnn_dev;
+       kgn_conn_t      *conn;
+
+       conn = kgnilnd_find_conn_locked(peer);
+
+       if (conn != NULL) {
+               return conn;
+       }
+
+       /* if the peer was previously connecting, check if we should
+        * trigger another connection attempt yet. */
+       if (time_before(jiffies, peer->gnp_reconnect_time)) {
+               return NULL;
+       }
+
+       /* This check prevents us from creating a new connection to a peer while we are
+        * still in the process of closing an existing connection to the peer.
+        */
+       list_for_each_entry(conn, &peer->gnp_conns, gnc_list) {
+               if (conn->gnc_ephandle != NULL) {
+                       CDEBUG(D_NET, "Not connecting non-null ephandle found peer 0x%p->%s\n", peer,
+                               libcfs_nid2str(peer->gnp_nid));
+                       return NULL;
+               }
+       }
+
+       if (peer->gnp_connecting != GNILND_PEER_IDLE) {
+               /* only fire up a new connection if we are IDLE - for anything
+                * but IDLE, don't start a new connection */
+               return NULL;
+       }
+
+       CDEBUG(D_NET, "starting connect to %s\n",
+               libcfs_nid2str(peer->gnp_nid));
+       peer->gnp_connecting = GNILND_PEER_CONNECT;
+       kgnilnd_peer_addref(peer); /* extra ref for connd */
+
+       spin_lock(&dev->gnd_connd_lock);
+       list_add_tail(&peer->gnp_connd_list, &dev->gnd_connd_peers);
+       spin_unlock(&dev->gnd_connd_lock);
+
+       kgnilnd_schedule_dgram(dev);
+       CDEBUG(D_NETTRACE, "scheduling new connect\n");
+
+       return NULL;
+}
+
+/* Caller is responsible for deciding if/when to call this */
+void
+kgnilnd_destroy_conn_ep(kgn_conn_t *conn)
+{
+       gni_return_t    rrc;
+       gni_ep_handle_t tmp_ep;
+
+       /* only if we actually initialized it,
+        *  then set NULL to tell kgnilnd_destroy_conn to leave it alone */
+
+       tmp_ep = xchg(&conn->gnc_ephandle, NULL);
+       if (tmp_ep != NULL) {
+               /* we never re-use the EP, so unbind is not needed */
+               mutex_lock(&conn->gnc_device->gnd_cq_mutex);
+               rrc = kgnilnd_ep_destroy(tmp_ep);
+
+               mutex_unlock(&conn->gnc_device->gnd_cq_mutex);
+
+               /* if this fails, it could hork up kgni smsg retransmit and others
+                * since we could free the SMSG mbox memory, etc. */
+               LASSERTF(rrc == GNI_RC_SUCCESS, "rrc %d conn 0x%p ep 0x%p\n",
+                        rrc, conn, conn->gnc_ephandle);
+
+               atomic_dec(&conn->gnc_device->gnd_neps);
+
+               /* clear out count added in kgnilnd_close_conn_locked
+                * conn will have a peer once it hits finish_connect, where it
+                * is the first spot we'll mark it ESTABLISHED as well */
+               if (conn->gnc_peer) {
+                       kgnilnd_admin_decref(conn->gnc_peer->gnp_dirty_eps);
+               }
+
+               /* drop ref for EP */
+               kgnilnd_conn_decref(conn);
+       }
+}
+
+void
+kgnilnd_destroy_conn(kgn_conn_t *conn)
+{
+       LASSERTF(!in_interrupt() &&
+               !conn->gnc_scheduled &&
+               !conn->gnc_in_purgatory &&
+               conn->gnc_ephandle == NULL &&
+               list_empty(&conn->gnc_list) &&
+               list_empty(&conn->gnc_hashlist) &&
+               list_empty(&conn->gnc_schedlist) &&
+               list_empty(&conn->gnc_mdd_list),
+               "conn 0x%p->%s IRQ %d sched %d purg %d ep 0x%p lists %d/%d/%d/%d\n",
+               conn, conn->gnc_peer ? libcfs_nid2str(conn->gnc_peer->gnp_nid)
+                                    : "<?>",
+               !!in_interrupt(), conn->gnc_scheduled,
+               conn->gnc_in_purgatory,
+               conn->gnc_ephandle,
+               list_empty(&conn->gnc_list),
+               list_empty(&conn->gnc_hashlist),
+               list_empty(&conn->gnc_schedlist),
+               list_empty(&conn->gnc_mdd_list));
+
+       /* Tripping these is especially bad, as it means we have items on the
+        *  lists that didn't keep their refcount on the connection - or
+        *  somebody evil released their own */
+       LASSERTF(list_empty(&conn->gnc_fmaq) &&
+                atomic_read(&conn->gnc_nlive_fma) == 0 &&
+                atomic_read(&conn->gnc_nlive_rdma) == 0,
+                "conn 0x%p fmaq %d@0x%p nfma %d nrdma %d\n",
+                conn, kgnilnd_count_list(&conn->gnc_fmaq), &conn->gnc_fmaq,
+                atomic_read(&conn->gnc_nlive_fma), atomic_read(&conn->gnc_nlive_rdma));
+
+       CDEBUG(D_NET, "destroying conn %p ephandle %p error %d\n",
+               conn, conn->gnc_ephandle, conn->gnc_error);
+
+       /* if there is an FMA blk left here, we'll tear it down */
+       if (conn->gnc_fma_blk) {
+               kgnilnd_release_mbox(conn, 0);
+       }
+
+       if (conn->gnc_peer != NULL)
+               kgnilnd_peer_decref(conn->gnc_peer);
+
+       if (conn->gnc_tx_ref_table != NULL) {
+               LIBCFS_FREE(conn->gnc_tx_ref_table,
+                           GNILND_MAX_MSG_ID * sizeof(void *));
+       }
+
+       LIBCFS_FREE(conn, sizeof(*conn));
+       atomic_dec(&kgnilnd_data.kgn_nconns);
+}
+
+/* peer_alive and peer_notify done in the style of the o2iblnd */
+void
+kgnilnd_peer_alive(kgn_peer_t *peer)
+{
+       set_mb(peer->gnp_last_alive, jiffies);
+}
+
+void
+kgnilnd_peer_notify(kgn_peer_t *peer, int error)
+{
+       int                     tell_lnet = 0;
+       int                     nnets = 0;
+       int                     rc;
+       int                     i, j;
+       kgn_conn_t             *conn;
+       kgn_net_t             **nets;
+       kgn_net_t              *net;
+
+
+       if (CFS_FAIL_CHECK(CFS_FAIL_GNI_DONT_NOTIFY))
+               return;
+
+       /* Tell LNet we are giving up on this peer - but only
+        * if it isn't already reconnected or trying to reconnect */
+       read_lock(&kgnilnd_data.kgn_peer_conn_lock);
+
+       /* use kgnilnd_find_conn_locked to avoid any conns in the process of being nuked
+        *
+        * don't tell LNet if we are in reset - we assume that everyone will be able to
+        * reconnect just fine
+        */
+       conn = kgnilnd_find_conn_locked(peer);
+
+       CDEBUG(D_NETTRACE, "peer 0x%p->%s ting %d conn 0x%p, rst %d error %d\n",
+              peer, libcfs_nid2str(peer->gnp_nid), peer->gnp_connecting, conn,
+              kgnilnd_data.kgn_in_reset, error);
+
+       if ((peer->gnp_connecting == GNILND_PEER_IDLE) &&
+           (conn == NULL) &&
+           (!kgnilnd_data.kgn_in_reset) &&
+           (!kgnilnd_conn_clean_errno(error))) {
+               tell_lnet = 1;
+       }
+
+       read_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+
+       if (!tell_lnet) {
+               /* short circuit if we don't need to notify LNet */
+               return;
+       }
+
+       rc = down_read_trylock(&kgnilnd_data.kgn_net_rw_sem);
+
+       if (rc) {
+               /* only walk the nets if we got the semaphore - if the trylock
+                * failed, LNet is in shutdown or similar, so skip the
+                * notification entirely */
+
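+               /* first pass: count the nets so we know how big an array to allocate */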
+               for (i = 0; i < *kgnilnd_tunables.kgn_net_hash_size; i++) {
+                       list_for_each_entry(net, &kgnilnd_data.kgn_nets[i], gnn_list) {
+                               /* if gnn_shutdown is set for any net, a shutdown is in progress; just return */
+                               if (net->gnn_shutdown) {
+                                       up_read(&kgnilnd_data.kgn_net_rw_sem);
+                                       return;
+                               }
+                               nnets++;
+                       }
+               }
+
+               if (nnets == 0) {
+                       /* shutdown in progress most likely */
+                       up_read(&kgnilnd_data.kgn_net_rw_sem);
+                       return;
+               }
+
+               LIBCFS_ALLOC(nets, nnets * sizeof(*nets));
+
+               if (nets == NULL) {
+                       up_read(&kgnilnd_data.kgn_net_rw_sem);
+                       CERROR("Failed to allocate nets[%d]\n", nnets);
+                       return;
+               }
+
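+               /* second pass: take a reference on each net so we can call lnet_notify outside the lock */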
+               j = 0;
+               for (i = 0; i < *kgnilnd_tunables.kgn_net_hash_size; i++) {
+                       list_for_each_entry(net, &kgnilnd_data.kgn_nets[i], gnn_list) {
+                               nets[j] = net;
+                               kgnilnd_net_addref(net);
+                               j++;
+                       }
+               }
+               up_read(&kgnilnd_data.kgn_net_rw_sem);
+
+               for (i = 0; i < nnets; i++) {
+                       lnet_nid_t peer_nid;
+
+                       net = nets[i];
+
+                       peer_nid = kgnilnd_lnd2lnetnid(net->gnn_ni->ni_nid,
+                                                                peer->gnp_nid);
+
+                       CDEBUG(D_NET, "peer 0x%p->%s last_alive %lu (%lus ago)\n",
+                               peer, libcfs_nid2str(peer_nid), peer->gnp_last_alive,
+                               cfs_duration_sec(jiffies - peer->gnp_last_alive));
+
+                       lnet_notify(net->gnn_ni, peer_nid, 0, peer->gnp_last_alive);
+
+                       kgnilnd_net_decref(net);
+               }
+
+               LIBCFS_FREE(nets, nnets * sizeof(*nets));
+       }
+}
+
+/* need write_lock on kgn_peer_conn_lock */
+void
+kgnilnd_close_conn_locked(kgn_conn_t *conn, int error)
+{
+       kgn_peer_t        *peer = conn->gnc_peer;
+       ENTRY;
+
+       LASSERT(!in_interrupt());
+
+       /* store error for tx completion */
+       conn->gnc_error = error;
+       peer->gnp_last_errno = error;
+
+       /* use real error from peer if possible */
+       if (error == -ECONNRESET) {
+               error = conn->gnc_peer_error;
+       }
+
+       /* if we NETERROR, make sure it is rate limited */
+       if (!kgnilnd_conn_clean_errno(error)) {
+               CNETERR("closing conn to %s: error %d\n",
+                      libcfs_nid2str(peer->gnp_nid), error);
+       } else {
+               CDEBUG(D_NET, "closing conn to %s: error %d\n",
+                      libcfs_nid2str(peer->gnp_nid), error);
+       }
+
+       LASSERTF(conn->gnc_state == GNILND_CONN_ESTABLISHED,
+               "conn %p to %s with bogus state %s\n", conn,
+               libcfs_nid2str(conn->gnc_peer->gnp_nid),
+               kgnilnd_conn_state2str(conn));
+       LASSERT(!list_empty(&conn->gnc_hashlist));
+       LASSERT(!list_empty(&conn->gnc_list));
+
+       /* mark peer count here so any place the EP gets destroyed will
+        * open up the peer count so that a new ESTABLISHED conn is then free
+        * to send new messages -- sending before the previous EPs are destroyed
+        * could end up with messages on the network for the old conn _after_
+        * the new conn and break the mbox safety protocol */
+       kgnilnd_admin_addref(conn->gnc_peer->gnp_dirty_eps);
+
+       /* Remove from conn hash table: no new callbacks */
+       list_del_init(&conn->gnc_hashlist);
+       kgnilnd_data.kgn_conn_version++;
+
+       /* if we are in reset, go right to CLOSED as there is no scheduler
+        * thread to move from CLOSING to CLOSED */
+       if (unlikely(kgnilnd_data.kgn_in_reset)) {
+               conn->gnc_state = GNILND_CONN_CLOSED;
+       } else {
+               conn->gnc_state = GNILND_CONN_CLOSING;
+       }
+
+       /* leave on peer->gnp_conns to make sure we don't let the reaper
+        * or others try to unlink this peer until the conn is fully
+        * processed for closing */
+
+       if (kgnilnd_check_purgatory_conn(conn)) {
+               kgnilnd_add_purgatory_locked(conn, conn->gnc_peer);
+       }
+
+       /* Reset RX timeout to ensure we wait for an incoming CLOSE
+        * for the full timeout.  If we get a CLOSE we know the
+        * peer has stopped all RDMA.  Otherwise if we wait for
+        * the full timeout we can also be sure all RDMA has stopped. */
+       conn->gnc_last_rx = conn->gnc_last_rx_cq = jiffies;
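+       /* barrier to make sure the timestamp stores are visible before we schedule the conn below */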
+       mb();
+
+       /* schedule sending CLOSE - if we are in quiesce, this adds to
+        * gnd_ready_conns and allows us to find it in quiesce processing */
+       kgnilnd_schedule_conn(conn);
+
+       /* lose peer's ref */
+       kgnilnd_conn_decref(conn);
+       /* -1 for conn table */
+       kgnilnd_conn_decref(conn);
+
+       EXIT;
+}
+
+void
+kgnilnd_close_conn(kgn_conn_t *conn, int error)
+{
+       write_lock(&kgnilnd_data.kgn_peer_conn_lock);
+       /* need to check the state here - this call is racy and we don't
+        * know the state until after the lock is grabbed */
+       if (conn->gnc_state == GNILND_CONN_ESTABLISHED) {
+               kgnilnd_close_conn_locked(conn, error);
+       }
+       write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+}
+
+void
+kgnilnd_complete_closed_conn(kgn_conn_t *conn)
+{
+       LIST_HEAD               (sinners);
+       kgn_tx_t               *tx, *txn;
+       int                     nlive = 0;
+       int                     nrdma = 0;
+       int                     nq_rdma = 0;
+       int                     logmsg;
+       ENTRY;
+
+       /* Dump log on cksum error - wait until complete phase to let
+        * RX of error happen */
+       if (*kgnilnd_tunables.kgn_checksum_dump &&
+           (conn != NULL && conn->gnc_peer_error == -ENOKEY)) {
+               libcfs_debug_dumplog();
+       }
+
+       /* _CLOSED set in kgnilnd_process_fmaq once we decide to
+        * send the CLOSE or not */
+       LASSERTF(conn->gnc_state == GNILND_CONN_CLOSED,
+                "conn 0x%p->%s with bad state %s\n",
+                conn, conn->gnc_peer ?
+                       libcfs_nid2str(conn->gnc_peer->gnp_nid) :
+                       "<?>",
+                kgnilnd_conn_state2str(conn));
+
+       LASSERT(list_empty(&conn->gnc_hashlist));
+
+       /* we've sent the close, start nuking */
+
+       /* we don't use lists to track things that we can get out of the
+        * tx_ref table... */
+
+       /* need to hold locks for tx_list_state, sampling it is too racy:
+        * - the lock actually protects tx != NULL, but we can't take the proper
+        *   lock until we check tx_list_state, which would be too late and
+        *   we could have the TX change under us.
+        * gnd_rdmaq_lock and gnd_lock are not used together, so taking both
+        * should be fine */
+       spin_lock(&conn->gnc_device->gnd_rdmaq_lock);
+       spin_lock(&conn->gnc_device->gnd_lock);
+
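+       /* walk the entire tx_ref table - nrdma is borrowed as the loop index
+        * here and is reset to the real RDMA count further down */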
+       for (nrdma = 0; nrdma < GNILND_MAX_MSG_ID; nrdma++) {
+               tx = conn->gnc_tx_ref_table[nrdma];
+
+               if (tx != NULL) {
+                       /* only print the first error and if not CLOSE, we often don't see
+                        * CQ events for that by the time we get here... and really don't care */
+                       if (nlive || tx->tx_msg.gnm_type == GNILND_MSG_CLOSE)
+                               tx->tx_state |= GNILND_TX_QUIET_ERROR;
+                       nlive++;
+                       GNIDBG_TX(D_NET, tx, "cleaning up on close, nlive %d", nlive);
+
+                       /* don't worry about gnc_lock here as nobody else should be
+                        * touching this conn */
+                       kgnilnd_tx_del_state_locked(tx, NULL, conn, GNILND_TX_ALLOCD);
+                       list_add_tail(&tx->tx_list, &sinners);
+               }
+       }
+       spin_unlock(&conn->gnc_device->gnd_lock);
+       spin_unlock(&conn->gnc_device->gnd_rdmaq_lock);
+
+       /* nobody should have marked this as needing scheduling after
+        * we called close - so only ref should be us handling it */
+       LASSERTF(conn->gnc_scheduled == GNILND_CONN_PROCESS,
+                "conn 0x%p scheduled %d\n", conn, conn->gnc_scheduled);
+
+       /* now reset a few to actual counters... */
+       nrdma = atomic_read(&conn->gnc_nlive_rdma);
+       nq_rdma = atomic_read(&conn->gnc_nq_rdma);
+
+       if (!list_empty(&sinners)) {
+               list_for_each_entry_safe(tx, txn, &sinners, tx_list) {
+                       /* clear tx_list to make tx_add_list_locked happy */
+                       list_del_init(&tx->tx_list);
+                       /* The error codes determine if we hold onto the MDD */
+                       kgnilnd_tx_done(tx, conn->gnc_error);
+               }
+       }
+
+       logmsg = (nlive + nrdma + nq_rdma);
+
+       if (logmsg) {
+               if (conn->gnc_peer_error != 0) {
+                       CNETERR("Closed conn 0x%p->%s (errno %d, peer errno %d): "
+                               "canceled %d TX, %d/%d RDMA\n",
+                               conn, libcfs_nid2str(conn->gnc_peer->gnp_nid),
+                               conn->gnc_error, conn->gnc_peer_error,
+                               nlive, nq_rdma, nrdma);
+               } else {
+                       CNETERR("Closed conn 0x%p->%s (errno %d): "
+                               "canceled %d TX, %d/%d RDMA\n",
+                               conn, libcfs_nid2str(conn->gnc_peer->gnp_nid),
+                               conn->gnc_error,
+                               nlive, nq_rdma, nrdma);
+               }
+       }
+
+       kgnilnd_destroy_conn_ep(conn);
+
+       /* Bug 765042 - race this with completing a new conn to same peer - we need
+        * finish_connect to detach purgatory before we can do it ourselves here */
+       CFS_RACE(CFS_FAIL_GNI_FINISH_PURG);
+
+       /* now it is safe to remove from peer list - anyone looking at
+        * gnp_conns now is free to unlink if not on purgatory */
+       write_lock(&kgnilnd_data.kgn_peer_conn_lock);
+
+       conn->gnc_state = GNILND_CONN_DONE;
+
+       /* Decrement counter if we are marked by del_conn_or_peers for closing
+        */
+       if (conn->gnc_needs_closing)
+               kgnilnd_admin_decref(kgnilnd_data.kgn_npending_conns);
+
+       /* Remove from peer's list of valid connections if its not in purgatory */
+       if (!conn->gnc_in_purgatory) {
+               list_del_init(&conn->gnc_list);
+       }
+
+       /* NB - only unlinking if we set pending in del_peer_locked from admin or
+        * shutdown */
+       if (kgnilnd_peer_active(conn->gnc_peer) &&
+           conn->gnc_peer->gnp_pending_unlink &&
+           kgnilnd_can_unlink_peer_locked(conn->gnc_peer)) {
+               kgnilnd_unlink_peer_locked(conn->gnc_peer);
+       }
+
+       write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+
+       /* I'm telling Mommy! - use peer_error if they initiated close */
+       kgnilnd_peer_notify(conn->gnc_peer,
+                           conn->gnc_error == -ECONNRESET ? conn->gnc_peer_error
+                                                          : conn->gnc_error);
+
+       EXIT;
+}
+
+int
+kgnilnd_set_conn_params(kgn_dgram_t *dgram)
+{
+       kgn_conn_t             *conn = dgram->gndg_conn;
+       kgn_connreq_t          *connreq = &dgram->gndg_conn_in;
+       kgn_gniparams_t        *rem_param = &connreq->gncr_gnparams;
+       gni_return_t            rrc;
+       int                     rc = 0;
+
+       /* set timeout vals in conn early so we can use them for the NAK */
+
+       /* use max of the requested and our timeout, peer will do the same */
+       conn->gnc_timeout = MAX(conn->gnc_timeout, connreq->gncr_timeout);
+
+       /* only ep_bind really mucks around with the CQ */
+       /* only ep_bind if we are not connecting to ourselves and the dstnid is not a wildcard. This check
+        * is necessary as an ep can only be bound once and we must make sure we don't bind when already bound.
+        */
+       if (connreq->gncr_dstnid != LNET_NID_ANY && dgram->gndg_conn_out.gncr_dstnid != connreq->gncr_srcnid) {
+               mutex_lock(&conn->gnc_device->gnd_cq_mutex);
+               rrc = kgnilnd_ep_bind(conn->gnc_ephandle,
+                       connreq->gncr_gnparams.gnpr_host_id,
+                       conn->gnc_cqid);
+               mutex_unlock(&conn->gnc_device->gnd_cq_mutex);
+               if (rrc != GNI_RC_SUCCESS) {
+                       rc = -ECONNABORTED;
+                       goto return_out;
+               }
+       }
+
+       rrc = kgnilnd_ep_set_eventdata(conn->gnc_ephandle, conn->gnc_cqid,
+                        connreq->gncr_gnparams.gnpr_cqid);
+       if (rrc != GNI_RC_SUCCESS) {
+               rc = -ECONNABORTED;
+               goto cleanup_out;
+       }
+
+       /* Initialize SMSG */
+       rrc = kgnilnd_smsg_init(conn->gnc_ephandle, &conn->gnpr_smsg_attr,
+                       &connreq->gncr_gnparams.gnpr_smsg_attr);
+       if (unlikely(rrc == GNI_RC_INVALID_PARAM)) {
+               gni_smsg_attr_t *local = &conn->gnpr_smsg_attr;
+               gni_smsg_attr_t *remote = &connreq->gncr_gnparams.gnpr_smsg_attr;
+               /* help folks figure out if there is a tunable off, etc. */
+               LCONSOLE_ERROR("SMSG attribute mismatch. Data from local/remote:"
+                              " type %d/%d msg_maxsize %u/%u"
+                              " mbox_maxcredit %u/%u. Please check kgni"
+                              " logs for further data\n",
+                              local->msg_type, remote->msg_type,
+                              local->msg_maxsize, remote->msg_maxsize,
+                              local->mbox_maxcredit, remote->mbox_maxcredit);
+       }
+       if (rrc != GNI_RC_SUCCESS) {
+               rc = -ECONNABORTED;
+               goto cleanup_out;
+       }
+
+       /* log this for help in debugging SMSG buffer re-use */
+       CDEBUG(D_NET, "conn %p src %s dst %s smsg %p acquired"
+               " local cqid %u SMSG %p->%u hndl "LPX64"."LPX64
+               " remote cqid %u SMSG %p->%u hndl "LPX64"."LPX64"\n",
+               conn, libcfs_nid2str(connreq->gncr_srcnid),
+               libcfs_nid2str(connreq->gncr_dstnid),
+               &conn->gnpr_smsg_attr,
+               conn->gnc_cqid,
+               conn->gnpr_smsg_attr.msg_buffer,
+               conn->gnpr_smsg_attr.mbox_offset,
+               conn->gnpr_smsg_attr.mem_hndl.qword1,
+               conn->gnpr_smsg_attr.mem_hndl.qword2,
+               rem_param->gnpr_cqid,
+               rem_param->gnpr_smsg_attr.msg_buffer,
+               rem_param->gnpr_smsg_attr.mbox_offset,
+               rem_param->gnpr_smsg_attr.mem_hndl.qword1,
+               rem_param->gnpr_smsg_attr.mem_hndl.qword2);
+
+       conn->gnc_peerstamp = connreq->gncr_peerstamp;
+       conn->gnc_peer_connstamp = connreq->gncr_connstamp;
+
+       /* We update the reaper timeout once we have a valid conn and timeout */
+       kgnilnd_update_reaper_timeout(GNILND_TO2KA(conn->gnc_timeout));
+
+       return 0;
+
+cleanup_out:
+       rrc = kgnilnd_ep_unbind(conn->gnc_ephandle);
+       /* not sure I can just let this fly */
+       LASSERTF(rrc == GNI_RC_SUCCESS,
+               "bad rc from gni_ep_unbind trying to cleanup: %d\n", rrc);
+
+return_out:
+       LASSERTF(rc != 0, "SOFTWARE BUG: rc == 0\n");
+       CERROR("Error setting connection params from %s: %d\n",
+              libcfs_nid2str(connreq->gncr_srcnid), rc);
+       return rc;
+}
+
+/* needs down_read on kgn_net_rw_sem held from before this call until
+ * after the write_lock on kgn_peer_conn_lock - this ensures we stay sane
+ * with kgnilnd_shutdown - it'll get the sem and set shutdown, then get the
+ * kgn_peer_conn_lock to start del_peer'ing. If we hold the sem until after
+ * kgn_peer_conn_lock is held, we guarantee that nobody calls
+ * kgnilnd_add_peer_locked without checking gnn_shutdown */
+int
+kgnilnd_create_peer_safe(kgn_peer_t **peerp, lnet_nid_t nid, kgn_net_t *net)
+{
+       kgn_peer_t    *peer;
+       int            rc;
+
+       LASSERT(nid != LNET_NID_ANY);
+
+       /* We don't pass the net around in the dgram anymore, so this is where we look it up.
+        * That works unless the net is in shutdown or the nid maps to an invalid net;
+        * either way an error code needs to be returned in that case.
+        *
+        * If the net passed in is not NULL then we can use it directly, which avoids the
+        * lookup when the calling function already has access to the data.
+        */
+       if (net == NULL) {
+               rc = kgnilnd_find_net(nid, &net);
+               if (rc < 0)
+                       return rc;
+       } else {
+               /* kgnilnd_find_net adds a reference on the net; since we are
+                * not using it here, take the reference manually so the net
+                * refcounts are correct when tearing down the net
+                */
+               kgnilnd_net_addref(net);
+       }
+
+       LIBCFS_ALLOC(peer, sizeof(*peer));
+       if (peer == NULL) {
+               kgnilnd_net_decref(net);
+               return -ENOMEM;
+       }
+       peer->gnp_nid = nid;
+
+       /* translate from nid to nic addr & store */
+       rc = kgnilnd_nid_to_nicaddrs(LNET_NIDADDR(nid), 1, &peer->gnp_host_id);
+       if (rc <= 0) {
+               kgnilnd_net_decref(net);
+               LIBCFS_FREE(peer, sizeof(*peer));
+               return -ESRCH;
+       }
+       CDEBUG(D_NET, "peer 0x%p->%s -> NIC 0x%x\n", peer,
+               libcfs_nid2str(nid), peer->gnp_host_id);
+
+       atomic_set(&peer->gnp_refcount, 1);     /* 1 ref for caller */
+       atomic_set(&peer->gnp_dirty_eps, 0);
+
+       INIT_LIST_HEAD(&peer->gnp_list);
+       INIT_LIST_HEAD(&peer->gnp_connd_list);
+       INIT_LIST_HEAD(&peer->gnp_conns);
+       INIT_LIST_HEAD(&peer->gnp_tx_queue);
+
+       /* the first reconnect should happen immediately, so we leave
+        * gnp_reconnect_interval set to 0 */
+
+       LASSERTF(net != NULL, "peer 0x%p->%s with NULL net\n",
+                peer, libcfs_nid2str(nid));
+
+       /* must have kgn_net_rw_sem held for this...  */
+       if (net->gnn_shutdown) {
+               /* shutdown has started already */
+               kgnilnd_net_decref(net);
+               LIBCFS_FREE(peer, sizeof(*peer));
+               return -ESHUTDOWN;
+       }
+
+       peer->gnp_net = net;
+
+       atomic_inc(&kgnilnd_data.kgn_npeers);
+
+       *peerp = peer;
+       return 0;
+}
+
+void
+kgnilnd_destroy_peer(kgn_peer_t *peer)
+{
+       CDEBUG(D_NET, "peer %s %p deleted\n",
+              libcfs_nid2str(peer->gnp_nid), peer);
+       LASSERTF(atomic_read(&peer->gnp_refcount) == 0,
+                "peer 0x%p->%s refs %d\n",
+                peer, libcfs_nid2str(peer->gnp_nid),
+                atomic_read(&peer->gnp_refcount));
+       LASSERTF(atomic_read(&peer->gnp_dirty_eps) == 0,
+                "peer 0x%p->%s dirty eps %d\n",
+                peer, libcfs_nid2str(peer->gnp_nid),
+                atomic_read(&peer->gnp_dirty_eps));
+       LASSERTF(peer->gnp_net != NULL, "peer %p (%s) with NULL net\n",
+                peer, libcfs_nid2str(peer->gnp_nid));
+       LASSERTF(!kgnilnd_peer_active(peer),
+                "peer 0x%p->%s\n",
+               peer, libcfs_nid2str(peer->gnp_nid));
+       LASSERTF(peer->gnp_connecting == GNILND_PEER_IDLE || peer->gnp_connecting == GNILND_PEER_KILL,
+                "peer 0x%p->%s, connecting %d\n",
+               peer, libcfs_nid2str(peer->gnp_nid), peer->gnp_connecting);
+       LASSERTF(list_empty(&peer->gnp_conns),
+                "peer 0x%p->%s\n",
+               peer, libcfs_nid2str(peer->gnp_nid));
+       LASSERTF(list_empty(&peer->gnp_tx_queue),
+                "peer 0x%p->%s\n",
+               peer, libcfs_nid2str(peer->gnp_nid));
+       LASSERTF(list_empty(&peer->gnp_connd_list),
+                "peer 0x%p->%s\n",
+               peer, libcfs_nid2str(peer->gnp_nid));
+
+       /* NB a peer's connections keep a reference on their peer until
+        * they are destroyed, so we can be assured that _all_ state to do
+        * with this peer has been cleaned up when its refcount drops to
+        * zero. */
+
+       atomic_dec(&kgnilnd_data.kgn_npeers);
+       kgnilnd_net_decref(peer->gnp_net);
+
+       LIBCFS_FREE(peer, sizeof(*peer));
+}
+
+/* the conn might not have made it all the way through to a connected
+ * state - but we need to purgatory any conn that a remote peer might
+ * have seen through a posted dgram as well */
+void
+kgnilnd_add_purgatory_locked(kgn_conn_t *conn, kgn_peer_t *peer)
+{
+       kgn_mbox_info_t *mbox = NULL;
+       ENTRY;
+
+       /* NB - the caller should own the conn by removing it from the
+        * scheduler thread when finishing the close */
+
+       LASSERTF(peer != NULL, "conn %p with NULL peer\n", conn);
+
+       /* If this is still true, need to add the calls to unlink back in and
+        * figure out how to close the hole on loopback conns */
+       LASSERTF(kgnilnd_peer_active(peer), "can't use inactive peer %s (%p)"
+               " we'll never recover the resources\n",
+               libcfs_nid2str(peer->gnp_nid), peer);
+
+       CDEBUG(D_NET, "conn %p peer %p dev %p\n", conn, peer,
+               conn->gnc_device);
+
+       /* add ref for mbox purgatory hold */
+       kgnilnd_peer_addref(peer);
+       kgnilnd_conn_addref(conn);
+       conn->gnc_in_purgatory = 1;
+
+       mbox = &conn->gnc_fma_blk->gnm_mbox_info[conn->gnc_mbox_id];
+       mbox->mbx_prev_nid = peer->gnp_nid;
+       mbox->mbx_add_purgatory = jiffies;
+       kgnilnd_release_mbox(conn, 1);
+
+       LASSERTF(list_empty(&conn->gnc_mdd_list),
+               "conn 0x%p->%s with active purgatory hold MDD %d\n",
+               conn, libcfs_nid2str(peer->gnp_nid),
+               kgnilnd_count_list(&conn->gnc_mdd_list));
+
+       EXIT;
+}
+
+/* Instead of detaching everything from purgatory here we just mark the conn as needing
+ * detach; the next time the reaper checks the conn it will detach it.
+ * Calling function requires write_lock held on kgn_peer_conn_lock
+ */
+void
+kgnilnd_mark_for_detach_purgatory_all_locked(kgn_peer_t *peer)
+{
+       kgn_conn_t       *conn;
+
+       list_for_each_entry(conn, &peer->gnp_conns, gnc_list) {
+               if (conn->gnc_in_purgatory && !conn->gnc_needs_detach) {
+                       conn->gnc_needs_detach = 1;
+                       kgnilnd_admin_addref(kgnilnd_data.kgn_npending_detach);
+               }
+       }
+}
+
+/* Calling function needs a write_lock held on kgn_peer_conn_lock */
+void
+kgnilnd_detach_purgatory_locked(kgn_conn_t *conn, struct list_head *conn_list)
+{
+       kgn_mbox_info_t *mbox = NULL;
+
+       /* if needed, add the conn purgatory data to the list passed in */
+       if (conn->gnc_in_purgatory) {
+               CDEBUG(D_NET, "peer %p->%s purg_conn %p@%s mdd_list #tx %d\n",
+                       conn->gnc_peer, libcfs_nid2str(conn->gnc_peer->gnp_nid),
+                       conn, kgnilnd_conn_state2str(conn),
+                       kgnilnd_count_list(&conn->gnc_mdd_list));
+
+               mbox = &conn->gnc_fma_blk->gnm_mbox_info[conn->gnc_mbox_id];
+               mbox->mbx_detach_of_purgatory = jiffies;
+
+               /* conn->gnc_list is the entry point on peer->gnp_conns, so detaching it
+                * here removes it from the list of 'valid' peer connections.
+                * We put the current conn onto the list of conns on which kgnilnd_release_purgatory_locked()
+                * will be called, and as such the caller of kgnilnd_detach_purgatory_locked() now owns that
+                * conn, since it's not on the peer's conn_list anymore.
+                */
+
+               kgnilnd_peer_decref(conn->gnc_peer);
+               list_del_init(&conn->gnc_list);
+
+               /* NB - only unlinking if we set pending in del_peer_locked from admin or
+                * shutdown */
+               if (kgnilnd_peer_active(conn->gnc_peer) &&
+                   conn->gnc_peer->gnp_pending_unlink &&
+                   kgnilnd_can_unlink_peer_locked(conn->gnc_peer)) {
+                       kgnilnd_unlink_peer_locked(conn->gnc_peer);
+               }
+               /* The reaper will not call detach unless the conn is fully through kgnilnd_complete_closed_conn.
+                * If the conn is somehow not in a DONE state, we are attempting to detach even though
+                * the conn has not been fully cleaned up. If we detach while the conn is still closing,
+                * we will end up with an orphaned connection that has a valid ep_handle and is not on
+                * any peer.
+                */
+
+               LASSERTF(conn->gnc_state == GNILND_CONN_DONE, "Conn in invalid state %p@%s\n",
+                               conn, kgnilnd_conn_state2str(conn));
+
+               /* move from peer to the delayed release list */
+               list_add_tail(&conn->gnc_list, conn_list);
+       }
+}
+
+void
+kgnilnd_release_purgatory_list(struct list_head *conn_list)
+{
+       kgn_device_t            *dev;
+       kgn_conn_t              *conn, *connN;
+       kgn_mdd_purgatory_t     *gmp, *gmpN;
+
+       list_for_each_entry_safe(conn, connN, conn_list, gnc_list) {
+               dev = conn->gnc_device;
+
+               kgnilnd_release_mbox(conn, -1);
+               conn->gnc_in_purgatory = 0;
+
+               list_del_init(&conn->gnc_list);
+
+               /* gnc_needs_detach is set in kgnilnd_del_conn_or_peer. It is used to keep track
+                * of conns that have been marked for detach by kgnilnd_del_conn_or_peer.
+                * The function uses kgn_npending_detach to verify the conn has
+                * actually been detached.
+                */
+
+               if (conn->gnc_needs_detach)
+                       kgnilnd_admin_decref(kgnilnd_data.kgn_npending_detach);
+
+               /* if this guy is really dead (we are doing release from reaper),
+                * make sure we tell LNet - if this is from another context,
+                * the checks in the function will prevent an errant
+                * notification */
+               kgnilnd_peer_notify(conn->gnc_peer, conn->gnc_error);
+
+               list_for_each_entry_safe(gmp, gmpN, &conn->gnc_mdd_list,
+                                        gmp_list) {
+                       CDEBUG(D_NET,
+                              "dev %p releasing held mdd "LPX64"."LPX64"\n",
+                              conn->gnc_device, gmp->gmp_map_key.qword1,
+                              gmp->gmp_map_key.qword2);
+
+                       atomic_dec(&dev->gnd_n_mdd_held);
+                       kgnilnd_mem_mdd_release(conn->gnc_device->gnd_handle,
+                                               &gmp->gmp_map_key);
+                       /* ignoring the return code - if kgni/ghal can't find it
+                        * it must be released already */
+
+                       list_del_init(&gmp->gmp_list);
+                       LIBCFS_FREE(gmp, sizeof(*gmp));
+               }
+               /* lose conn ref for purgatory */
+               kgnilnd_conn_decref(conn);
+       }
+}
+
+/* needs write_lock on kgnilnd_data.kgn_peer_conn_lock held */
+void
+kgnilnd_peer_increase_reconnect_locked(kgn_peer_t *peer)
+{
+       int current_to;
+
+       current_to = peer->gnp_reconnect_interval;
+
+       /* we'll try to reconnect fast the first time, then back-off */
+       if (current_to == 0) {
+               peer->gnp_reconnect_time = jiffies - 1;
+               current_to = *kgnilnd_tunables.kgn_min_reconnect_interval;
+       } else {
+               peer->gnp_reconnect_time = jiffies + cfs_time_seconds(current_to);
+               /* add 50% of min timeout & retry */
+               current_to += *kgnilnd_tunables.kgn_min_reconnect_interval / 2;
+       }
+
+       current_to = MIN(current_to,
+                               *kgnilnd_tunables.kgn_max_reconnect_interval);
+
+       peer->gnp_reconnect_interval = current_to;
+       CDEBUG(D_NET, "peer %s can reconnect at %lu interval %lu\n",
+              libcfs_nid2str(peer->gnp_nid), peer->gnp_reconnect_time,
+              peer->gnp_reconnect_interval);
+}
+
+/* needs kgnilnd_data.kgn_peer_conn_lock held */
+kgn_peer_t *
+kgnilnd_find_peer_locked(lnet_nid_t nid)
+{
+       struct list_head *peer_list = kgnilnd_nid2peerlist(nid);
+       kgn_peer_t       *peer;
+
+       /* Chopping nid down to only NIDADDR using LNET_NIDADDR so we only
+        * have a single peer per device instead of a peer per nid/net combo.
+        */
+
+       list_for_each_entry(peer, peer_list, gnp_list) {
+               if (LNET_NIDADDR(nid) != LNET_NIDADDR(peer->gnp_nid))
+                       continue;
+
+               CDEBUG(D_NET, "got peer [%p] -> %s c %d (%d)\n",
+                      peer, libcfs_nid2str(nid),
+                      peer->gnp_connecting,
+                      atomic_read(&peer->gnp_refcount));
+               return peer;
+       }
+       return NULL;
+}
+
+/* need write_lock on kgn_peer_conn_lock */
+void
+kgnilnd_unlink_peer_locked(kgn_peer_t *peer)
+{
+       LASSERTF(list_empty(&peer->gnp_conns),
+               "peer 0x%p->%s\n",
+                peer, libcfs_nid2str(peer->gnp_nid));
+       LASSERTF(list_empty(&peer->gnp_tx_queue),
+               "peer 0x%p->%s\n",
+                peer, libcfs_nid2str(peer->gnp_nid));
+       LASSERTF(kgnilnd_peer_active(peer),
+               "peer 0x%p->%s\n",
+                peer, libcfs_nid2str(peer->gnp_nid));
+       CDEBUG(D_NET, "unlinking peer 0x%p->%s\n",
+               peer, libcfs_nid2str(peer->gnp_nid));
+
+       list_del_init(&peer->gnp_list);
+       kgnilnd_data.kgn_peer_version++;
+       kgnilnd_admin_decref(kgnilnd_data.kgn_npending_unlink);
+       /* lose peerlist's ref */
+       kgnilnd_peer_decref(peer);
+}
+
+int
+kgnilnd_get_peer_info(int index,
+                     kgn_peer_t **found_peer,
+                     lnet_nid_t *id, __u32 *nic_addr,
+                     int *refcount, int *connecting)
+{
+       struct list_head  *ptmp;
+       kgn_peer_t        *peer;
+       int               i;
+       int               rc = -ENOENT;
+
+       read_lock(&kgnilnd_data.kgn_peer_conn_lock);
+
+       for (i = 0; i < *kgnilnd_tunables.kgn_peer_hash_size; i++) {
+
+               list_for_each(ptmp, &kgnilnd_data.kgn_peers[i]) {
+                       peer = list_entry(ptmp, kgn_peer_t, gnp_list);
+
+                       if (peer->gnp_nid != *id)
+                               continue;
+
+                       if (index-- > 0)
+                               continue;
+
+                       CDEBUG(D_NET, "found peer %p (%s) at index %d\n",
+                              peer, libcfs_nid2str(peer->gnp_nid), index);
+
+                       *found_peer  = peer;
+                       *id          = peer->gnp_nid;
+                       *nic_addr    = peer->gnp_host_id;
+                       *refcount    = atomic_read(&peer->gnp_refcount);
+                       *connecting  = peer->gnp_connecting;
+
+                       rc = 0;
+                       goto out;
+               }
+       }
+out:
+       read_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+       if (rc)
+               CDEBUG(D_NET, "no gni peer at index %d\n", index);
+       return rc;
+}
+
+/* requires write_lock on kgn_peer_conn_lock held */
+void
+kgnilnd_add_peer_locked(lnet_nid_t nid, kgn_peer_t *new_stub_peer, kgn_peer_t **peerp)
+{
+       kgn_peer_t        *peer, *peer2;
+
+       LASSERTF(new_stub_peer != NULL, "bad stub peer for nid %s\n",
+                libcfs_nid2str(nid));
+
+       peer2 = kgnilnd_find_peer_locked(nid);
+       if (peer2 != NULL) {
+               /* A peer was created during the lock transition, so drop
+                * the new one we created */
+               kgnilnd_peer_decref(new_stub_peer);
+               peer = peer2;
+       } else {
+               peer = new_stub_peer;
+               /* peer table takes existing ref on peer */
+
+               LASSERTF(!kgnilnd_peer_active(peer),
+                       "peer 0x%p->%s already in peer table\n",
+                       peer, libcfs_nid2str(peer->gnp_nid));
+               list_add_tail(&peer->gnp_list,
+                             kgnilnd_nid2peerlist(nid));
+               kgnilnd_data.kgn_peer_version++;
+       }
+
+       LASSERTF(peer->gnp_net != NULL, "peer 0x%p->%s with NULL net\n",
+                peer, libcfs_nid2str(peer->gnp_nid));
+       *peerp = peer;
+}
+
+int
+kgnilnd_add_peer(kgn_net_t *net, lnet_nid_t nid, kgn_peer_t **peerp)
+{
+       kgn_peer_t        *peer;
+       int                rc;
+       ENTRY;
+
+       if (nid == LNET_NID_ANY)
+               return -EINVAL;
+
+       /* NB - this will not block during normal operations -
+        * the only writer of this is in the startup/shutdown path. */
+       rc = down_read_trylock(&kgnilnd_data.kgn_net_rw_sem);
+       if (!rc) {
+               rc = -ESHUTDOWN;
+               RETURN(rc);
+       }
+       rc = kgnilnd_create_peer_safe(&peer, nid, net);
+       if (rc != 0) {
+               up_read(&kgnilnd_data.kgn_net_rw_sem);
+               RETURN(rc);
+       }
+
+       write_lock(&kgnilnd_data.kgn_peer_conn_lock);
+       up_read(&kgnilnd_data.kgn_net_rw_sem);
+
+       kgnilnd_add_peer_locked(nid, peer, peerp);
+
+       CDEBUG(D_NET, "peer 0x%p->%s connecting %d\n",
+              *peerp, libcfs_nid2str((*peerp)->gnp_nid),
+              (*peerp)->gnp_connecting);
+
+       write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+       RETURN(0);
+}
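
Taken together, kgnilnd_create_peer_safe(), kgnilnd_add_peer() and kgnilnd_add_peer_locked() implement the usual create-then-recheck pattern: the candidate peer is allocated outside the write lock, looked up again once the lock is held, and dropped if another thread won the race. A condensed, illustrative-only view (insert_or_reuse is a hypothetical name, and the version bump and assertions of the real code are omitted):

    static kgn_peer_t *insert_or_reuse(lnet_nid_t nid, kgn_peer_t *candidate)
    {
            kgn_peer_t *existing;

            write_lock(&kgnilnd_data.kgn_peer_conn_lock);
            existing = kgnilnd_find_peer_locked(nid);
            if (existing != NULL) {
                    kgnilnd_peer_decref(candidate);  /* lost the race - drop ours */
                    candidate = existing;
            } else {
                    /* the peer table takes over the candidate's reference */
                    list_add_tail(&candidate->gnp_list, kgnilnd_nid2peerlist(nid));
            }
            write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
            return candidate;
    }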
+
+/* needs write_lock on kgn_peer_conn_lock */
+void
+kgnilnd_cancel_peer_connect_locked(kgn_peer_t *peer, struct list_head *zombies)
+{
+       kgn_tx_t        *tx, *txn;
+
+       /* we do care about the state of gnp_connecting - we could be between
+        * reconnect attempts, so try to find the dgram and cancel the TX
+        * anyway. If we are in the process of posting, DON'T do anything;
+        * once the post fails or succeeds we can nuke the connect attempt.
+        * We have no idea how far into kgnilnd_post_dgram we are, so we can't
+        * attempt to cancel until that function is done.
+        */
+
+       /* make sure the peer isn't in the process of connecting or waiting to connect */
+       spin_lock(&peer->gnp_net->gnn_dev->gnd_connd_lock);
+       if (!(list_empty(&peer->gnp_connd_list))) {
+               list_del_init(&peer->gnp_connd_list);
+               /* remove connd ref */
+               kgnilnd_peer_decref(peer);
+       }
+       spin_unlock(&peer->gnp_net->gnn_dev->gnd_connd_lock);
+
+       if (peer->gnp_connecting == GNILND_PEER_POSTING || peer->gnp_connecting == GNILND_PEER_NEEDS_DEATH) {
+               peer->gnp_connecting = GNILND_PEER_NEEDS_DEATH;
+               /* We are in the process of posting right now; the state change above
+                * arranges for the connect to be cancelled once the post completes,
+                * so we are finished here for now */
+       } else {
+               /* no need for an exchange - we hold the peer lock and it's ready for us to nuke */
+               LASSERTF(peer->gnp_connecting != GNILND_PEER_POSTING,
+                       "Peer in invalid state 0x%p->%s, connecting %d\n",
+                       peer, libcfs_nid2str(peer->gnp_nid), peer->gnp_connecting);
+               peer->gnp_connecting = GNILND_PEER_IDLE;
+               set_mb(peer->gnp_last_dgram_errno, -ETIMEDOUT);
+               kgnilnd_find_and_cancel_dgram(peer->gnp_net->gnn_dev,
+                                                     peer->gnp_nid);
+       }
+
+       /* The least we can do is nuke the tx's no matter what.... */
+       list_for_each_entry_safe(tx, txn, &peer->gnp_tx_queue, tx_list) {
+               kgnilnd_tx_del_state_locked(tx, peer, NULL,
+                                          GNILND_TX_ALLOCD);
+               list_add_tail(&tx->tx_list, zombies);
+       }
+}
+
+/* needs write_lock on kgn_peer_conn_lock */
+void
+kgnilnd_del_peer_locked(kgn_peer_t *peer, int error)
+{
+       /* this peer could be passive and only held for purgatory,
+        * take a ref to ensure it doesn't disappear in this function */
+       kgnilnd_peer_addref(peer);
+
+       CFS_RACE(CFS_FAIL_GNI_FIND_TARGET);
+
+       /* if purgatory release cleared it out, don't try again */
+       if (kgnilnd_peer_active(peer)) {
+               /* always do this to allow kgnilnd_start_connect and
+                * kgnilnd_finish_connect to catch this before they
+                * wrap up their operations */
+               if (kgnilnd_can_unlink_peer_locked(peer)) {
+                       /* already released purgatory, so only active
+                        * conns hold it */
+                       kgnilnd_unlink_peer_locked(peer);
+               } else {
+                       kgnilnd_close_peer_conns_locked(peer, error);
+                       /* peer unlinks itself when last conn is closed */
+               }
+       }
+
+       /* we are done, release back to the wild */
+       kgnilnd_peer_decref(peer);
+}
+
+int
+kgnilnd_del_conn_or_peer(kgn_net_t *net, lnet_nid_t nid, int command,
+                         int error)
+{
+       LIST_HEAD               (souls);
+       LIST_HEAD               (zombies);
+       struct list_head        *ptmp, *pnxt;
+       kgn_peer_t              *peer;
+       int                     lo;
+       int                     hi;
+       int                     i;
+       int                     rc = -ENOENT;
+
+       write_lock(&kgnilnd_data.kgn_peer_conn_lock);
+
+       if (nid != LNET_NID_ANY)
+               lo = hi = kgnilnd_nid2peerlist(nid) - kgnilnd_data.kgn_peers;
+       else {
+               lo = 0;
+               hi = *kgnilnd_tunables.kgn_peer_hash_size - 1;
+               /* wildcards always succeed */
+               rc = 0;
+       }
+
+       for (i = lo; i <= hi; i++) {
+               list_for_each_safe(ptmp, pnxt, &kgnilnd_data.kgn_peers[i]) {
+                       peer = list_entry(ptmp, kgn_peer_t, gnp_list);
+
+                       LASSERTF(peer->gnp_net != NULL,
+                               "peer %p (%s) with NULL net\n",
+                                peer, libcfs_nid2str(peer->gnp_nid));
+
+                       if (net != NULL && peer->gnp_net != net)
+                               continue;
+
+                       if (!(nid == LNET_NID_ANY || LNET_NIDADDR(peer->gnp_nid) == LNET_NIDADDR(nid)))
+                               continue;
+
+                       /* In both cases, we want to stop any in-flight
+                        * connect attempts */
+                       kgnilnd_cancel_peer_connect_locked(peer, &zombies);
+
+                       switch (command) {
+                       case GNILND_DEL_CONN:
+                               kgnilnd_close_peer_conns_locked(peer, error);
+                               break;
+                       case GNILND_DEL_PEER:
+                               peer->gnp_pending_unlink = 1;
+                               kgnilnd_admin_addref(kgnilnd_data.kgn_npending_unlink);
+                               kgnilnd_mark_for_detach_purgatory_all_locked(peer);
+                               kgnilnd_del_peer_locked(peer, error);
+                               break;
+                       case GNILND_CLEAR_PURGATORY:
+                               /* Mark everything ready for detach; the reaper will
+                                * clean up once we release the kgn_peer_conn_lock
+                                */
+                               kgnilnd_mark_for_detach_purgatory_all_locked(peer);
+                               peer->gnp_last_errno = -EISCONN;
+                               /* clear reconnect so he can reconnect soon */
+                               peer->gnp_reconnect_time = 0;
+                               peer->gnp_reconnect_interval = 0;
+                               break;
+                       default:
+                               CERROR("bad command %d\n", command);
+                               LBUG();
+                       }
+                       /* we matched something */
+                       rc = 0;
+               }
+       }
+
+       write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+
+       /* release all of the souls found held in purgatory */
+       kgnilnd_release_purgatory_list(&souls);
+
+       /* nuke peer TX */
+       kgnilnd_txlist_done(&zombies, error);
+
+       /* Normally this function does not return until the commands it initiated
+        * have completed, since they have to work their way through the other
+        * threads. During shutdown, however, threads are not woken up until after
+        * this call is made, so we cannot wait and just need to return. The same
+        * applies to a stack reset: we shouldn't wait, as the reset thread handles
+        * the closing.
+        */
+
+       CFS_RACE(CFS_FAIL_GNI_RACE_RESET);
+
+       if (error == -ENOTRECOVERABLE || error == -ESHUTDOWN) {
+               return rc;
+       }
+
+       i = 4;
+       while (atomic_read(&kgnilnd_data.kgn_npending_conns)   ||
+              atomic_read(&kgnilnd_data.kgn_npending_detach)  ||
+              atomic_read(&kgnilnd_data.kgn_npending_unlink)) {
+
+               cfs_pause(cfs_time_seconds(1));
+               i++;
+
+               CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, "Waiting on %d peers %d closes %d detaches\n",
+                               atomic_read(&kgnilnd_data.kgn_npending_unlink),
+                               atomic_read(&kgnilnd_data.kgn_npending_conns),
+                               atomic_read(&kgnilnd_data.kgn_npending_detach));
+       }
+
+       return rc;
+}
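
The (i & (-i)) == i expression in the wait loop above (and in the similar loops later in this file) is a power-of-two test: it throttles the D_WARNING message so it is emitted only when the pass counter hits a power of two, while every other pass logs at D_NET. An illustrative standalone form, with a hypothetical helper name:

    static int warn_this_pass(unsigned int i)
    {
            /* true only for 0 and exact powers of two (1, 2, 4, 8, ...) */
            return (i & (-i)) == i;
    }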
+
+kgn_conn_t *
+kgnilnd_get_conn_by_idx(int index)
+{
+       kgn_peer_t        *peer;
+       struct list_head  *ptmp;
+       kgn_conn_t        *conn;
+       struct list_head  *ctmp;
+       int                i;
+
+
+       for (i = 0; i < *kgnilnd_tunables.kgn_peer_hash_size; i++) {
+               read_lock(&kgnilnd_data.kgn_peer_conn_lock);
+               list_for_each(ptmp, &kgnilnd_data.kgn_peers[i]) {
+
+                       peer = list_entry(ptmp, kgn_peer_t, gnp_list);
+
+                       list_for_each(ctmp, &peer->gnp_conns) {
+                               conn = list_entry(ctmp, kgn_conn_t, gnc_list);
+
+                               if (conn->gnc_state != GNILND_CONN_ESTABLISHED)
+                                       continue;
+
+                               if (index-- > 0)
+                                       continue;
+
+                               CDEBUG(D_NET, "++conn[%p] -> %s (%d)\n", conn,
+                                      libcfs_nid2str(conn->gnc_peer->gnp_nid),
+                                      atomic_read(&conn->gnc_refcount));
+                               kgnilnd_conn_addref(conn);
+                               read_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+                               return conn;
+                       }
+               }
+               read_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+       }
+
+       return NULL;
+}
+
+int
+kgnilnd_get_conn_info(kgn_peer_t *peer,
+                     int *device_id, __u64 *peerstamp,
+                     int *tx_seq, int *rx_seq,
+                     int *fmaq_len, int *nfma, int *nrdma)
+{
+       kgn_conn_t        *conn;
+       int               rc = 0;
+
+       read_lock(&kgnilnd_data.kgn_peer_conn_lock);
+
+       conn = kgnilnd_find_conn_locked(peer);
+       if (conn == NULL) {
+               rc = -ENOENT;
+               goto out;
+       }
+
+       *device_id = conn->gnc_device->gnd_host_id;
+       *peerstamp = conn->gnc_peerstamp;
+       *tx_seq = conn->gnc_tx_seq;
+       *rx_seq = conn->gnc_rx_seq;
+       *fmaq_len = kgnilnd_count_list(&conn->gnc_fmaq);
+       *nfma = atomic_read(&conn->gnc_nlive_fma);
+       *nrdma = atomic_read(&conn->gnc_nlive_rdma);
+out:
+       read_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+       return rc;
+}
+
+/* needs write_lock on kgn_peer_conn_lock */
+int
+kgnilnd_close_peer_conns_locked(kgn_peer_t *peer, int why)
+{
+       kgn_conn_t         *conn;
+       struct list_head   *ctmp, *cnxt;
+       int                 count = 0;
+
+       list_for_each_safe(ctmp, cnxt, &peer->gnp_conns) {
+               conn = list_entry(ctmp, kgn_conn_t, gnc_list);
+
+               if (conn->gnc_state != GNILND_CONN_ESTABLISHED)
+                       continue;
+
+               count++;
+               /* we mark gnc_needs_closing and increment kgn_npending_conns so that
+                * kgnilnd_del_conn_or_peer can wait for the other threads to close
+                * and clean up the connection.
+                */
+               if (!conn->gnc_needs_closing) {
+                       conn->gnc_needs_closing = 1;
+                       kgnilnd_admin_addref(kgnilnd_data.kgn_npending_conns);
+               }
+               kgnilnd_close_conn_locked(conn, why);
+       }
+       return count;
+}
+
+int
+kgnilnd_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg)
+{
+       struct libcfs_ioctl_data *data = arg;
+       kgn_net_t                *net = ni->ni_data;
+       int                       rc = -EINVAL;
+
+       LASSERT(ni == net->gnn_ni);
+
+       switch (cmd) {
+       case IOC_LIBCFS_GET_PEER: {
+               lnet_nid_t   nid = 0;
+               kgn_peer_t  *peer = NULL;
+               __u32 nic_addr = 0;
+               __u64 peerstamp = 0;
+               int peer_refcount = 0, peer_connecting = 0;
+               int device_id = 0;
+               int tx_seq = 0, rx_seq = 0;
+               int fmaq_len = 0, nfma = 0, nrdma = 0;
+
+               rc = kgnilnd_get_peer_info(data->ioc_count, &peer,
+                                          &nid, &nic_addr, &peer_refcount,
+                                          &peer_connecting);
+               if (rc)
+                       break;
+
+               /* Barf */
+               /* LNET_MKNID is used to hide the multiplexing/demultiplexing of
+                * connections and peers from LNet. LNet assumes a conn and peer per
+                * net; LNET_MKNID/LNET_NIDADDR let LNet see what it expects to see
+                * instead of the underlying network actually used to send the data.
+                */
+               data->ioc_nid    = LNET_MKNID(LNET_NIDNET(ni->ni_nid), LNET_NIDADDR(nid));
+               data->ioc_flags  = peer_connecting;
+               data->ioc_count  = peer_refcount;
+
+               rc = kgnilnd_get_conn_info(peer, &device_id, &peerstamp,
+                                          &tx_seq, &rx_seq, &fmaq_len,
+                                          &nfma, &nrdma);
+
+               /* This is allowable - a persistent peer might not
+                * have a connection */
+               if (rc) {
+                       /* flag to indicate we are not connected -
+                        * need to print as such */
+                       data->ioc_flags |= (1<<16);
+                       rc = 0;
+               } else {
+                       /* still barf */
+                       data->ioc_net = device_id;
+                       data->ioc_u64[0] = peerstamp;
+                       data->ioc_u32[0] = fmaq_len;
+                       data->ioc_u32[1] = nfma;
+                       data->ioc_u32[2] = tx_seq;
+                       data->ioc_u32[3] = rx_seq;
+                       data->ioc_u32[4] = nrdma;
+               }
+               break;
+       }
+       case IOC_LIBCFS_ADD_PEER: {
+               /* just dummy value to allow using common interface */
+               kgn_peer_t      *peer;
+               rc = kgnilnd_add_peer(net, data->ioc_nid, &peer);
+               break;
+       }
+       case IOC_LIBCFS_DEL_PEER: {
+               /* NULL is passed in so this affects all peers in existence, regardless
+                * of network, as the peer may not exist on the network LNET believes
+                * it to be on.
+                */
+               rc = kgnilnd_del_conn_or_peer(NULL, data->ioc_nid,
+                                             GNILND_DEL_PEER, -EUCLEAN);
+               break;
+       }
+       case IOC_LIBCFS_GET_CONN: {
+               kgn_conn_t *conn = kgnilnd_get_conn_by_idx(data->ioc_count);
+
+               if (conn == NULL)
+                       rc = -ENOENT;
+               else {
+                       rc = 0;
+                       /* LNET_MKNID is used to build the address LNET expects to see,
+                        * rather than exposing the generic connection actually used to
+                        * send the data
+                        */
+                       data->ioc_nid    = LNET_MKNID(LNET_NIDNET(ni->ni_nid), LNET_NIDADDR(conn->gnc_peer->gnp_nid));
+                       data->ioc_u32[0] = conn->gnc_device->gnd_id;
+                       kgnilnd_conn_decref(conn);
+               }
+               break;
+       }
+       case IOC_LIBCFS_CLOSE_CONNECTION: {
+               /* use error = -ENETRESET to indicate it was lctl disconnect */
+               /* NULL is passed in so this affects all the nets, as the connection is
+                * virtual and may not exist on the network LNET believes it to be on.
+                */
+               rc = kgnilnd_del_conn_or_peer(NULL, data->ioc_nid,
+                                             GNILND_DEL_CONN, -ENETRESET);
+               break;
+       }
+       case IOC_LIBCFS_PUSH_CONNECTION: {
+               /* we use this to flush purgatory */
+               rc = kgnilnd_del_conn_or_peer(NULL, data->ioc_nid,
+                                             GNILND_CLEAR_PURGATORY, -EUCLEAN);
+               break;
+       }
+       case IOC_LIBCFS_REGISTER_MYNID: {
+               /* Ignore if this is a noop */
+               if (data->ioc_nid == ni->ni_nid) {
+                       rc = 0;
+               } else {
+                       CERROR("obsolete IOC_LIBCFS_REGISTER_MYNID: %s(%s)\n",
+                              libcfs_nid2str(data->ioc_nid),
+                              libcfs_nid2str(ni->ni_nid));
+                       rc = -EINVAL;
+               }
+               break;
+       }
+       }
+
+       return rc;
+}
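
For context, the GET_PEER branch above packs connection details into the generic libcfs ioctl structure, setting bit 16 of ioc_flags when the peer has no established connection. The sketch below shows how a userspace consumer could decode that reply; it is purely hypothetical (print_gni_peer is not part of this patch or of the lnet utilities) and assumes the userspace libcfs headers for libcfs_nid2str().

    #include <stdio.h>

    static void print_gni_peer(struct libcfs_ioctl_data *data)
    {
            int connected = !(data->ioc_flags & (1 << 16));

            printf("%s: connecting %d refs %d %s\n",
                   libcfs_nid2str(data->ioc_nid),
                   data->ioc_flags & 0xffff,    /* peer_connecting state */
                   data->ioc_count,             /* peer refcount */
                   connected ? "connected" : "no conn");
    }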
+
+void
+kgnilnd_query(lnet_ni_t *ni, lnet_nid_t nid, cfs_time_t *when)
+{
+       kgn_net_t               *net = ni->ni_data;
+       kgn_tx_t                *tx;
+       kgn_peer_t              *peer = NULL;
+       kgn_conn_t              *conn = NULL;
+       lnet_process_id_t       id = {.nid = nid, .pid = LUSTRE_SRV_LNET_PID};
+       ENTRY;
+
+       /* I expect to find him, so only take a read lock */
+       read_lock(&kgnilnd_data.kgn_peer_conn_lock);
+       peer = kgnilnd_find_peer_locked(nid);
+       if (peer != NULL) {
+               /* LIE if in a quiesce - we will update the timeouts after,
+                * but we don't want sends failing during it */
+               if (kgnilnd_data.kgn_quiesce_trigger) {
+                       *when = jiffies;
+                       read_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+                       GOTO(out, 0);
+               }
+
+               /* Update to best guess, might refine on later checks */
+               *when = peer->gnp_last_alive;
+
+               /* we have a peer, how about a conn? */
+               conn = kgnilnd_find_conn_locked(peer);
+
+               if (conn == NULL)  {
+                       /* if there is no conn, check the peer's last errno to see if it
+                        * was a clean disconnect - if it was, we lie to LNet because we
+                        * believe a TX would complete on reconnect */
+                       if (kgnilnd_conn_clean_errno(peer->gnp_last_errno)) {
+                               *when = jiffies;
+                       }
+                       /* we still want to fire a TX and new conn in this case */
+               } else {
+                       /* gnp_last_alive is valid, run for the hills */
+                       read_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+                       GOTO(out, 0);
+               }
+       }
+       /* if we get here, either we have no peer or no conn for him, so fire off
+        * new TX to trigger conn setup */
+       read_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+
+       /* if we couldn't find the peer, we'll fire up a TX and get connected -
+        * if we don't, LNet will declare the peer dead after ni_peer_timeout.
+        * So we really treat kgnilnd_query as a bit of a 'connect now' type of
+        * event, because LNet only does this when it wants to send.
+        *
+        * Use a real TX for this to get the proper gnp_tx_queue behavior, etc.
+        * Normally we'd use kgnilnd_send_ctlmsg for this, but we don't really
+        * care that this goes out quickly, since we already know we need a new
+        * conn formed */
+       if (CFS_FAIL_CHECK(CFS_FAIL_GNI_NOOP_SEND))
+               return;
+
+       tx = kgnilnd_new_tx_msg(GNILND_MSG_NOOP, ni->ni_nid);
+       if (tx != NULL) {
+               kgnilnd_launch_tx(tx, net, &id);
+       }
+out:
+       CDEBUG(D_NETTRACE, "peer 0x%p->%s when %lu\n", peer,
+              libcfs_nid2str(nid), *when);
+       EXIT;
+}
+
+int
+kgnilnd_dev_init(kgn_device_t *dev)
+{
+       gni_return_t      rrc;
+       int               rc = 0;
+       unsigned int      cq_size;
+       ENTRY;
+
+       /* The size of these CQs should be able to accommodate the outgoing
+        * RDMA and SMSG transactions.  Since we don't really know what we
+        * need here, we take credits * 2 * 3 to allow plenty of headroom.
+        * We need to dig into this more with the performance work. */
+       cq_size = *kgnilnd_tunables.kgn_credits * 2 * 3;
+
+       rrc = kgnilnd_cdm_create(dev->gnd_id, *kgnilnd_tunables.kgn_ptag,
+                                GNILND_COOKIE, 0,
+                                &dev->gnd_domain);
+       if (rrc != GNI_RC_SUCCESS) {
+               CERROR("Can't create CDM %d (%d)\n", dev->gnd_id, rrc);
+               rc = -ENODEV;
+               GOTO(failed, rc);
+       }
+
+       rrc = kgnilnd_cdm_attach(dev->gnd_domain, dev->gnd_id,
+                                &dev->gnd_host_id, &dev->gnd_handle);
+       if (rrc != GNI_RC_SUCCESS) {
+               CERROR("Can't attach CDM to device %d (%d)\n",
+                       dev->gnd_id, rrc);
+               rc = -ENODEV;
+               GOTO(failed, rc);
+       }
+
+       rc = kgnilnd_setup_nic_translation(dev->gnd_host_id);
+       if (rc != 0) {
+               rc = -ENODEV;
+               GOTO(failed, rc);
+       }
+
+       /* only dev 0 gets the errors - no need to reset the stack twice
+        * - this works because we have a single PTAG; if we had more,
+        * we'd need multiple handlers */
+       if (dev->gnd_id == 0) {
+               rrc = kgnilnd_subscribe_errors(dev->gnd_handle, GNI_ERRMASK_CRITICAL,
+                                             0, NULL, kgnilnd_critical_error,
+                                             &dev->gnd_err_handle);
+               if (rrc != GNI_RC_SUCCESS) {
+                       CERROR("Can't subscribe for errors on device %d: rc %d\n",
+                               dev->gnd_id, rrc);
+                       rc = -ENODEV;
+                       GOTO(failed, rc);
+               }
+
+               rc = kgnilnd_set_quiesce_callback(dev->gnd_handle,
+                                                 kgnilnd_quiesce_end_callback);
+               if (rc != GNI_RC_SUCCESS) {
+                       CERROR("Can't subscribe for quiesce callback on device %d: rc %d\n",
+                               dev->gnd_id, rc);
+                       rc = -ENODEV;
+                       GOTO(failed, rc);
+               }
+       }
+
+       rc = kgnilnd_nicaddr_to_nid(dev->gnd_host_id, &dev->gnd_nid);
+       if (rc < 0) {
+               /* log messages during startup */
+               if (kgnilnd_data.kgn_init < GNILND_INIT_ALL) {
+                       CERROR("couldn't translate host_id 0x%x to nid. rc %d\n",
+                               dev->gnd_host_id, rc);
+               }
+               rc = -ESRCH;
+               GOTO(failed, rc);
+       }
+       CDEBUG(D_NET, "NIC %x -> NID %d\n", dev->gnd_host_id, dev->gnd_nid);
+
+       rrc = kgnilnd_cq_create(dev->gnd_handle, cq_size,
+                               0, kgnilnd_device_callback,
+                               dev->gnd_id, &dev->gnd_snd_rdma_cqh);
+       if (rrc != GNI_RC_SUCCESS) {
+               CERROR("Can't create rdma send cq size %u for device "
+                      "%d (%d)\n", cq_size, dev->gnd_id, rrc);
+               rc = -EINVAL;
+               GOTO(failed, rc);
+       }
+
+       rrc = kgnilnd_cq_create(dev->gnd_handle, cq_size,
+                       0, kgnilnd_device_callback, dev->gnd_id,
+                       &dev->gnd_snd_fma_cqh);
+       if (rrc != GNI_RC_SUCCESS) {
+               CERROR("Can't create fma send cq size %u for device %d (%d)\n",
+                      cq_size, dev->gnd_id, rrc);
+               rc = -EINVAL;
+               GOTO(failed, rc);
+       }
+
+       /* This one we size differently - overflows are possible and it needs to be
+        * sized based on machine size */
+       rrc = kgnilnd_cq_create(dev->gnd_handle,
+                       *kgnilnd_tunables.kgn_fma_cq_size,
+                       0, kgnilnd_device_callback, dev->gnd_id,
+                       &dev->gnd_rcv_fma_cqh);
+       if (rrc != GNI_RC_SUCCESS) {
+               CERROR("Can't create fma cq size %d for device %d (%d)\n",
+                      *kgnilnd_tunables.kgn_fma_cq_size, dev->gnd_id, rrc);
+               rc = -EINVAL;
+               GOTO(failed, rc);
+       }
+
+       RETURN(0);
+
+failed:
+       kgnilnd_dev_fini(dev);
+       RETURN(rc);
+}
+
+void
+kgnilnd_dev_fini(kgn_device_t *dev)
+{
+       gni_return_t rrc;
+       ENTRY;
+
+       /* At quiesce or reset time, do we need to loop through and clear gnd_ready_conns? */
+       LASSERTF(list_empty(&dev->gnd_ready_conns) &&
+                list_empty(&dev->gnd_map_tx) &&
+                list_empty(&dev->gnd_rdmaq),
+                "dev 0x%p ready_conns %d@0x%p map_tx %d@0x%p rdmaq %d@0x%p\n",
+                dev, kgnilnd_count_list(&dev->gnd_ready_conns), &dev->gnd_ready_conns,
+                kgnilnd_count_list(&dev->gnd_map_tx), &dev->gnd_map_tx,
+                kgnilnd_count_list(&dev->gnd_rdmaq), &dev->gnd_rdmaq);
+
+       /* These should follow from tearing down all connections */
+       LASSERTF(dev->gnd_map_nphys == 0 && dev->gnd_map_physnop == 0,
+               "%d physical mappings of %d pages still mapped\n",
+                dev->gnd_map_nphys, dev->gnd_map_physnop);
+
+       LASSERTF(dev->gnd_map_nvirt == 0 && dev->gnd_map_virtnob == 0,
+               "%d virtual mappings of "LPU64" bytes still mapped\n",
+                dev->gnd_map_nvirt, dev->gnd_map_virtnob);
+
+       LASSERTF(atomic_read(&dev->gnd_n_mdd) == 0 &&
+                atomic_read(&dev->gnd_n_mdd_held) == 0 &&
+                atomic64_read(&dev->gnd_nbytes_map) == 0,
+               "%d SMSG mappings of %ld bytes still mapped or held %d\n",
+                atomic_read(&dev->gnd_n_mdd),
+                atomic64_read(&dev->gnd_nbytes_map), atomic_read(&dev->gnd_n_mdd_held));
+
+       LASSERT(list_empty(&dev->gnd_map_list));
+
+       /* What other assertions needed to ensure all connections torn down ? */
+
+       /* check all counters == 0 (EP, MDD, etc) */
+
+       /* if we are resetting due to quiesce (stack reset), don't check
+        * thread states */
+       LASSERTF(kgnilnd_data.kgn_quiesce_trigger ||
+               atomic_read(&kgnilnd_data.kgn_nthreads) == 0,
+               "tried to shutdown with threads active\n");
+
+       if (dev->gnd_rcv_fma_cqh) {
+               rrc = kgnilnd_cq_destroy(dev->gnd_rcv_fma_cqh);
+               LASSERTF(rrc == GNI_RC_SUCCESS,
+                       "bad rc from gni_cq_destroy on rcv_fma_cqh: %d\n", rrc);
+               dev->gnd_rcv_fma_cqh = NULL;
+       }
+
+       if (dev->gnd_snd_rdma_cqh) {
+               rrc = kgnilnd_cq_destroy(dev->gnd_snd_rdma_cqh);
+               LASSERTF(rrc == GNI_RC_SUCCESS,
+                       "bad rc from gni_cq_destroy on send_rdma_cqh: %d\n", rrc);
+               dev->gnd_snd_rdma_cqh = NULL;
+       }
+
+       if (dev->gnd_snd_fma_cqh) {
+               rrc = kgnilnd_cq_destroy(dev->gnd_snd_fma_cqh);
+               LASSERTF(rrc == GNI_RC_SUCCESS,
+                       "bad rc from gni_cq_destroy on snd_fma_cqh: %d\n", rrc);
+               dev->gnd_snd_fma_cqh = NULL;
+       }
+
+       if (dev->gnd_err_handle) {
+               rrc = kgnilnd_release_errors(dev->gnd_err_handle);
+               LASSERTF(rrc == GNI_RC_SUCCESS,
+                       "bad rc from gni_release_errors: %d\n", rrc);
+               dev->gnd_err_handle = NULL;
+       }
+
+       if (dev->gnd_domain) {
+               rrc = kgnilnd_cdm_destroy(dev->gnd_domain);
+               LASSERTF(rrc == GNI_RC_SUCCESS,
+                       "bad rc from gni_cdm_destroy: %d\n", rrc);
+               dev->gnd_domain = NULL;
+       }
+
+       EXIT;
+}
+
+
+int kgnilnd_base_startup(void)
+{
+       struct timeval       tv;
+       int                  pkmem = atomic_read(&libcfs_kmemory);
+       int                  rc;
+       int                  i;
+       kgn_device_t        *dev;
+       struct task_struct  *thrd;
+       ENTRY;
+
+       LASSERTF(kgnilnd_data.kgn_init == GNILND_INIT_NOTHING,
+               "init %d\n", kgnilnd_data.kgn_init);
+
+       /* zero pointers, flags etc */
+       memset(&kgnilnd_data, 0, sizeof(kgnilnd_data));
+       memset(&kgnilnd_hssops, 0, sizeof(kgnilnd_hssops));
+
+       /* CAVEAT EMPTOR: Every 'Fma' message includes the sender's NID and
+        * a unique (for all time) connstamp so we can uniquely identify
+        * the sender.  The connstamp is an incrementing counter
+        * initialised with seconds + microseconds at startup time.  So we
+        * rely on NOT creating connections more frequently on average than
+        * 1MHz to ensure we don't use old connstamps when we reboot. */
+       do_gettimeofday(&tv);
+       kgnilnd_data.kgn_connstamp =
+                kgnilnd_data.kgn_peerstamp =
+                       (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
+
+       init_rwsem(&kgnilnd_data.kgn_net_rw_sem);
+
+       for (i = 0; i < GNILND_MAXDEVS; i++) {
+               kgn_device_t  *dev = &kgnilnd_data.kgn_devices[i];
+
+               dev->gnd_id = i;
+               INIT_LIST_HEAD(&dev->gnd_ready_conns);
+               INIT_LIST_HEAD(&dev->gnd_map_tx);
+               INIT_LIST_HEAD(&dev->gnd_fma_buffs);
+               mutex_init(&dev->gnd_cq_mutex);
+               sema_init(&dev->gnd_fmablk_sem, 1);
+               spin_lock_init(&dev->gnd_fmablk_lock);
+               init_waitqueue_head(&dev->gnd_waitq);
+               init_waitqueue_head(&dev->gnd_dgram_waitq);
+               init_waitqueue_head(&dev->gnd_dgping_waitq);
+               spin_lock_init(&dev->gnd_lock);
+               INIT_LIST_HEAD(&dev->gnd_map_list);
+               spin_lock_init(&dev->gnd_map_lock);
+               atomic_set(&dev->gnd_nfmablk, 0);
+               atomic_set(&dev->gnd_fmablk_vers, 1);
+               atomic_set(&dev->gnd_neps, 0);
+               atomic_set(&dev->gnd_canceled_dgrams, 0);
+               INIT_LIST_HEAD(&dev->gnd_connd_peers);
+               spin_lock_init(&dev->gnd_connd_lock);
+               spin_lock_init(&dev->gnd_dgram_lock);
+               spin_lock_init(&dev->gnd_rdmaq_lock);
+               INIT_LIST_HEAD(&dev->gnd_rdmaq);
+
+               /* alloc & setup nid based dgram table */
+               LIBCFS_ALLOC(dev->gnd_dgrams,
+                           sizeof(struct list_head) * *kgnilnd_tunables.kgn_peer_hash_size);
+
+               if (dev->gnd_dgrams == NULL) {
+                       rc = -ENOMEM;
+                       GOTO(failed, rc);
+               }
+
+               for (i = 0; i < *kgnilnd_tunables.kgn_peer_hash_size; i++) {
+                       INIT_LIST_HEAD(&dev->gnd_dgrams[i]);
+               }
+               atomic_set(&dev->gnd_ndgrams, 0);
+
+               /* setup timer for RDMAQ processing */
+               setup_timer(&dev->gnd_rdmaq_timer, kgnilnd_schedule_device_timer,
+                           (unsigned long)dev);
+       }
+
+       /* CQID 0 isn't allowed, set to MAX_MSG_ID - 1 to check for conflicts early */
+       kgnilnd_data.kgn_next_cqid = GNILND_MAX_MSG_ID - 1;
+       kgnilnd_data.kgn_new_min_timeout = *kgnilnd_tunables.kgn_timeout;
+       init_waitqueue_head(&kgnilnd_data.kgn_reaper_waitq);
+       init_waitqueue_head(&kgnilnd_data.kgn_ruhroh_waitq);
+       spin_lock_init(&kgnilnd_data.kgn_reaper_lock);
+
+       sema_init(&kgnilnd_data.kgn_quiesce_sem, 1);
+       atomic_set(&kgnilnd_data.kgn_nquiesce, 0);
+       atomic_set(&kgnilnd_data.kgn_npending_conns, 0);
+       atomic_set(&kgnilnd_data.kgn_npending_unlink, 0);
+       atomic_set(&kgnilnd_data.kgn_npending_detach, 0);
+       /* OK to call kgnilnd_api_shutdown() to cleanup now */
+       kgnilnd_data.kgn_init = GNILND_INIT_DATA;
+       PORTAL_MODULE_USE;
+
+       rwlock_init(&kgnilnd_data.kgn_peer_conn_lock);
+
+       LIBCFS_ALLOC(kgnilnd_data.kgn_peers,
+                   sizeof(struct list_head) * *kgnilnd_tunables.kgn_peer_hash_size);
+
+       if (kgnilnd_data.kgn_peers == NULL) {
+               rc = -ENOMEM;
+               GOTO(failed, rc);
+       }
+
+       for (i = 0; i < *kgnilnd_tunables.kgn_peer_hash_size; i++) {
+               INIT_LIST_HEAD(&kgnilnd_data.kgn_peers[i]);
+       }
+
+       LIBCFS_ALLOC(kgnilnd_data.kgn_conns,
+                   sizeof(struct list_head) * *kgnilnd_tunables.kgn_peer_hash_size);
+
+       if (kgnilnd_data.kgn_conns == NULL) {
+               rc = -ENOMEM;
+               GOTO(failed, rc);
+       }
+
+       for (i = 0; i < *kgnilnd_tunables.kgn_peer_hash_size; i++) {
+               INIT_LIST_HEAD(&kgnilnd_data.kgn_conns[i]);
+       }
+
+       LIBCFS_ALLOC(kgnilnd_data.kgn_nets,
+                   sizeof(struct list_head) * *kgnilnd_tunables.kgn_net_hash_size);
+
+       if (kgnilnd_data.kgn_nets == NULL) {
+               rc = -ENOMEM;
+               GOTO(failed, rc);
+       }
+
+       for (i = 0; i < *kgnilnd_tunables.kgn_net_hash_size; i++) {
+               INIT_LIST_HEAD(&kgnilnd_data.kgn_nets[i]);
+       }
+
+       kgnilnd_data.kgn_mbox_cache =
+               cfs_mem_cache_create("kgn_mbox_block",
+                                    KMALLOC_MAX_SIZE,
+                                    0,    /* offset */
+                                    SLAB_HWCACHE_ALIGN);   /* flags */
+       if (kgnilnd_data.kgn_mbox_cache == NULL) {
+               CERROR("Can't create slab for physical mbox blocks\n");
+               rc = -ENOMEM;
+               GOTO(failed, rc);
+       }
+
+       kgnilnd_data.kgn_rx_cache =
+               cfs_mem_cache_create("kgn_rx_t",
+                                    sizeof(kgn_rx_t),
+                                    0,    /* offset */
+                                    0);   /* flags */
+       if (kgnilnd_data.kgn_rx_cache == NULL) {
+               CERROR("Can't create slab for kgn_rx_t descriptors\n");
+               rc = -ENOMEM;
+               GOTO(failed, rc);
+       }
+
+       kgnilnd_data.kgn_tx_cache =
+               cfs_mem_cache_create("kgn_tx_t",
+                                    sizeof(kgn_tx_t),
+                                    0,    /* offset */
+                                    0);   /* flags */
+       if (kgnilnd_data.kgn_tx_cache == NULL) {
+               CERROR("Can't create slab for kgn_tx_t\n");
+               rc = -ENOMEM;
+               GOTO(failed, rc);
+       }
+
+       kgnilnd_data.kgn_tx_phys_cache =
+               cfs_mem_cache_create("kgn_tx_phys",
+                                    LNET_MAX_IOV * sizeof(gni_mem_segment_t),
+                                    0,    /* offset */
+                                    0);   /* flags */
+       if (kgnilnd_data.kgn_tx_phys_cache == NULL) {
+               CERROR("Can't create slab for kgn_tx_phys\n");
+               rc = -ENOMEM;
+               GOTO(failed, rc);
+       }
+
+       kgnilnd_data.kgn_dgram_cache =
+               cfs_mem_cache_create("kgn_dgram_t",
+                                    sizeof(kgn_dgram_t),
+                                    0,    /* offset */
+                                    0);   /* flags */
+       if (kgnilnd_data.kgn_dgram_cache == NULL) {
+               CERROR("Can't create slab for outgoing datagrams\n");
+               rc = -ENOMEM;
+               GOTO(failed, rc);
+       }
+
+       /* allocate a MAX_IOV array of page pointers for each cpu */
+       kgnilnd_data.kgn_cksum_map_pages = kmalloc(num_possible_cpus() * sizeof (struct page *),
+                                                  GFP_KERNEL);
+       if (kgnilnd_data.kgn_cksum_map_pages == NULL) {
+               CERROR("Can't allocate vmap cksum pages\n");
+               rc = -ENOMEM;
+               GOTO(failed, rc);
+       }
+       kgnilnd_data.kgn_cksum_npages = num_possible_cpus();
+       memset(kgnilnd_data.kgn_cksum_map_pages, 0,
+               kgnilnd_data.kgn_cksum_npages * sizeof (struct page *));
+
+       for (i = 0; i < kgnilnd_data.kgn_cksum_npages; i++) {
+               kgnilnd_data.kgn_cksum_map_pages[i] = kmalloc(LNET_MAX_IOV * sizeof (struct page *),
+                                                             GFP_KERNEL);
+               if (kgnilnd_data.kgn_cksum_map_pages[i] == NULL) {
+                       CERROR("Can't allocate vmap cksum pages for cpu %d\n", i);
+                       rc = -ENOMEM;
+                       GOTO(failed, rc);
+               }
+       }
+
+       LASSERT(kgnilnd_data.kgn_ndevs == 0);
+
+       /* Use all available GNI devices */
+       for (i = 0; i < GNILND_MAXDEVS; i++) {
+               dev = &kgnilnd_data.kgn_devices[kgnilnd_data.kgn_ndevs];
+
+               rc = kgnilnd_dev_init(dev);
+               if (rc == 0) {
+                       /* Increment here so base_shutdown cleans it up */
+                       kgnilnd_data.kgn_ndevs++;
+
+                       rc = kgnilnd_allocate_phys_fmablk(dev);
+                       if (rc) {
+                               GOTO(failed, rc);
+                       }
+               }
+       }
+
+       if (kgnilnd_data.kgn_ndevs == 0) {
+               CERROR("Can't initialise any GNI devices\n");
+               rc = -ENODEV;
+               GOTO(failed, rc);
+       }
+
+       rc = kgnilnd_thread_start(kgnilnd_reaper, NULL, "kgnilnd_rpr", 0);
+       if (rc != 0) {
+               CERROR("Can't spawn gnilnd reaper: %d\n", rc);
+               GOTO(failed, rc);
+       }
+
+       /*
+        * Start ruhroh thread.  We can't use kgnilnd_thread_start() because
+        * we don't want this thread included in kgnilnd_data.kgn_nthreads
+        * count.  This thread controls quiesce, so it mustn't
+        * quiesce itself.
+        */
+       thrd = kthread_run(kgnilnd_ruhroh_thread, NULL, "%s_%02d", "kgnilnd_rr", 0);
+       if (IS_ERR(thrd)) {
+               rc = PTR_ERR(thrd);
+               CERROR("Can't spawn gnilnd ruhroh thread: %d\n", rc);
+               GOTO(failed, rc);
+       }
+
+       /* threads will load balance across devs as they are available */
+       for (i = 0; i < *kgnilnd_tunables.kgn_sched_threads; i++) {
+               rc = kgnilnd_thread_start(kgnilnd_scheduler, (void *)((long)i),
+                                         "kgnilnd_sd", i);
+               if (rc != 0) {
+                       CERROR("Can't spawn gnilnd scheduler[%d]: %d\n",
+                              i, rc);
+                       GOTO(failed, rc);
+               }
+       }
+
+       for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) {
+               dev = &kgnilnd_data.kgn_devices[i];
+               rc = kgnilnd_thread_start(kgnilnd_dgram_mover, dev,
+                                         "kgnilnd_dg", dev->gnd_id);
+               if (rc != 0) {
+                       CERROR("Can't spawn gnilnd dgram_mover[%d]: %d\n",
+                              dev->gnd_id, rc);
+                       GOTO(failed, rc);
+               }
+
+               rc = kgnilnd_thread_start(kgnilnd_dgram_waitq, dev,
+                                         "kgnilnd_dgn", dev->gnd_id);
+               if (rc != 0) {
+                       CERROR("Can't spawn gnilnd dgram_waitq[%d]: %d\n",
+                               dev->gnd_id, rc);
+                       GOTO(failed, rc);
+               }
+
+               rc = kgnilnd_setup_wildcard_dgram(dev);
+
+               if (rc != 0) {
+                       CERROR("Can't create wildcard dgrams[%d]: %d\n",
+                               dev->gnd_id, rc);
+                       GOTO(failed, rc);
+               }
+       }
+
+       /* flag everything initialised */
+       kgnilnd_data.kgn_init = GNILND_INIT_ALL;
+       /*****************************************************/
+
+       CDEBUG(D_MALLOC, "initial kmem %d\n", pkmem);
+       RETURN(0);
+
+failed:
+       kgnilnd_base_shutdown();
+       kgnilnd_data.kgn_init = GNILND_INIT_NOTHING;
+       RETURN(rc);
+}
+
+void
+kgnilnd_base_shutdown(void)
+{
+       int           i;
+       ENTRY;
+
+       while (CFS_FAIL_TIMEOUT(CFS_FAIL_GNI_PAUSE_SHUTDOWN, 1)) {};
+
+       kgnilnd_data.kgn_wc_kill = 1;
+
+       for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) {
+               kgn_device_t *dev = &kgnilnd_data.kgn_devices[i];
+               kgnilnd_cancel_wc_dgrams(dev);
+               kgnilnd_del_conn_or_peer(NULL, LNET_NID_ANY, GNILND_DEL_PEER, -ESHUTDOWN);
+               kgnilnd_wait_for_canceled_dgrams(dev);
+       }
+
+       /* Peer state all cleaned up BEFORE setting shutdown, so threads don't
+        * have to worry about shutdown races.  NB connections may be created
+        * while there are still active connds, but these will be temporary
+        * since peer creation always fails after the listener has started to
+        * shut down.
+        * All peers should have been cleared out on the nets. */
+       LASSERTF(atomic_read(&kgnilnd_data.kgn_npeers) == 0,
+               "peers left %d\n", atomic_read(&kgnilnd_data.kgn_npeers));
+
+       /* Wait for the ruhroh thread to shut down. */
+       kgnilnd_data.kgn_ruhroh_shutdown = 1;
+       wake_up(&kgnilnd_data.kgn_ruhroh_waitq);
+       i = 2;
+       while (kgnilnd_data.kgn_ruhroh_running != 0) {
+               i++;
+               CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET,
+                      "Waiting for ruhroh thread to terminate\n");
+               cfs_pause(cfs_time_seconds(1));
+       }
+
+       /* Flag threads to terminate */
+       kgnilnd_data.kgn_shutdown = 1;
+
+       for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) {
+               kgn_device_t *dev = &kgnilnd_data.kgn_devices[i];
+
+               /* should clear all the MDDs */
+               kgnilnd_unmap_phys_fmablk(dev);
+
+               kgnilnd_schedule_device(dev);
+               wake_up_all(&dev->gnd_dgram_waitq);
+               wake_up_all(&dev->gnd_dgping_waitq);
+               LASSERT(list_empty(&dev->gnd_connd_peers));
+       }
+
+       spin_lock(&kgnilnd_data.kgn_reaper_lock);
+       wake_up_all(&kgnilnd_data.kgn_reaper_waitq);
+       spin_unlock(&kgnilnd_data.kgn_reaper_lock);
+
+       /* Wait for threads to exit */
+       i = 2;
+       while (atomic_read(&kgnilnd_data.kgn_nthreads) != 0) {
+               i++;
+               CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
+                      "Waiting for %d threads to terminate\n",
+                      atomic_read(&kgnilnd_data.kgn_nthreads));
+               cfs_pause(cfs_time_seconds(1));
+       }
+
+       LASSERTF(atomic_read(&kgnilnd_data.kgn_npeers) == 0,
+               "peers left %d\n", atomic_read(&kgnilnd_data.kgn_npeers));
+
+       if (kgnilnd_data.kgn_peers != NULL) {
+               for (i = 0; i < *kgnilnd_tunables.kgn_peer_hash_size; i++)
+                       LASSERT(list_empty(&kgnilnd_data.kgn_peers[i]));
+
+               LIBCFS_FREE(kgnilnd_data.kgn_peers,
+                           sizeof (struct list_head) *
+                           *kgnilnd_tunables.kgn_peer_hash_size);
+       }
+
+       down_write(&kgnilnd_data.kgn_net_rw_sem);
+       if (kgnilnd_data.kgn_nets != NULL) {
+               for (i = 0; i < *kgnilnd_tunables.kgn_net_hash_size; i++)
+                       LASSERT(list_empty(&kgnilnd_data.kgn_nets[i]));
+
+               LIBCFS_FREE(kgnilnd_data.kgn_nets,
+                           sizeof (struct list_head) *
+                           *kgnilnd_tunables.kgn_net_hash_size);
+       }
+       up_write(&kgnilnd_data.kgn_net_rw_sem);
+
+       LASSERTF(atomic_read(&kgnilnd_data.kgn_nconns) == 0,
+               "conns left %d\n", atomic_read(&kgnilnd_data.kgn_nconns));
+
+       if (kgnilnd_data.kgn_conns != NULL) {
+               for (i = 0; i < *kgnilnd_tunables.kgn_peer_hash_size; i++)
+                       LASSERT(list_empty(&kgnilnd_data.kgn_conns[i]));
+
+               LIBCFS_FREE(kgnilnd_data.kgn_conns,
+                           sizeof (struct list_head) *
+                           *kgnilnd_tunables.kgn_peer_hash_size);
+       }
+
+       for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) {
+               kgn_device_t *dev = &kgnilnd_data.kgn_devices[i];
+               kgnilnd_dev_fini(dev);
+
+               LASSERTF(atomic_read(&dev->gnd_ndgrams) == 0,
+                       "dgrams left %d\n", atomic_read(&dev->gnd_ndgrams));
+
+               if (dev->gnd_dgrams != NULL) {
+                       for (i = 0; i < *kgnilnd_tunables.kgn_peer_hash_size; i++)
+                               LASSERT(list_empty(&dev->gnd_dgrams[i]));
+
+                       LIBCFS_FREE(dev->gnd_dgrams,
+                                   sizeof (struct list_head) *
+                                   *kgnilnd_tunables.kgn_peer_hash_size);
+               }
+
+               kgnilnd_free_phys_fmablk(dev);
+       }
+
+       if (kgnilnd_data.kgn_mbox_cache != NULL) {
+               i = cfs_mem_cache_destroy(kgnilnd_data.kgn_mbox_cache);
+               LASSERTF(i == 0, "rc %d destroying kgn_mbox_cache\n", i);
+       }
+
+       if (kgnilnd_data.kgn_rx_cache != NULL) {
+               i = cfs_mem_cache_destroy(kgnilnd_data.kgn_rx_cache);
+               LASSERTF(i == 0, "rc %d destroying kgn_rx_cache\n", i);
+       }
+
+       if (kgnilnd_data.kgn_tx_cache != NULL) {
+               i = cfs_mem_cache_destroy(kgnilnd_data.kgn_tx_cache);
+               LASSERTF(i == 0, "rc %d destroying kgn_tx_cache\n", i);
+       }
+
+       if (kgnilnd_data.kgn_tx_phys_cache != NULL) {
+               i = cfs_mem_cache_destroy(kgnilnd_data.kgn_tx_phys_cache);
+               LASSERTF(i == 0, "rc %d destroying kgn_tx_phys_cache\n", i);
+       }
+
+       if (kgnilnd_data.kgn_dgram_cache != NULL) {
+               i = cfs_mem_cache_destroy(kgnilnd_data.kgn_dgram_cache);
+               LASSERTF(i == 0, "rc %d destroying kgn_dgram_cache\n", i);
+       }
+
+       if (kgnilnd_data.kgn_cksum_map_pages != NULL) {
+               for (i = 0; i < kgnilnd_data.kgn_cksum_npages; i++) {
+                       if (kgnilnd_data.kgn_cksum_map_pages[i] != NULL) {
+                               kfree(kgnilnd_data.kgn_cksum_map_pages[i]);
+                       }
+               }
+               kfree(kgnilnd_data.kgn_cksum_map_pages);
+       }
+
+       CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n",
+              atomic_read(&libcfs_kmemory));
+
+       kgnilnd_data.kgn_init = GNILND_INIT_NOTHING;
+       PORTAL_MODULE_UNUSE;
+
+       EXIT;
+}
+
+int
+kgnilnd_startup(lnet_ni_t *ni)
+{
+       int               rc, devno;
+       kgn_net_t        *net;
+       ENTRY;
+
+       LASSERTF(ni->ni_lnd == &the_kgnilnd,
+               "bad LND 0x%p != the_kgnilnd @ 0x%p\n",
+               ni->ni_lnd, &the_kgnilnd);
+
+       if (kgnilnd_data.kgn_init == GNILND_INIT_NOTHING) {
+               rc = kgnilnd_base_startup();
+               if (rc != 0)
+                       RETURN(rc);
+       }
+
+       /* Serialize with shutdown. */
+       down(&kgnilnd_data.kgn_quiesce_sem);
+
+       LIBCFS_ALLOC(net, sizeof(*net));
+       if (net == NULL) {
+               CERROR("could not allocate net for new interface instance\n");
+               rc = -ENOMEM;
+               /* no need to cleanup the CDM... */
+               GOTO(failed, rc);
+       }
+       INIT_LIST_HEAD(&net->gnn_list);
+       ni->ni_data = net;
+       net->gnn_ni = ni;
+       ni->ni_maxtxcredits = *kgnilnd_tunables.kgn_credits;
+       ni->ni_peertxcredits = *kgnilnd_tunables.kgn_peer_credits;
+
+       if (*kgnilnd_tunables.kgn_peer_health) {
+               int     fudge;
+
+               /* give this a bit of leeway - we don't have a hard timeout
+                * as we only check timeouts periodically - see comment in kgnilnd_reaper */
+               fudge = (GNILND_TO2KA(*kgnilnd_tunables.kgn_timeout) / GNILND_REAPER_NCHECKS);
+
+               ni->ni_peertimeout = *kgnilnd_tunables.kgn_timeout + fudge;
+
+               LCONSOLE_INFO("Enabling LNet peer health for gnilnd, timeout %ds\n",
+                             ni->ni_peertimeout);
+       }
+
+       atomic_set(&net->gnn_refcount, 1);
+
+       /* if we have multiple devices, spread the nets around */
+       net->gnn_netnum = LNET_NETNUM(LNET_NIDNET(ni->ni_nid));
+
+       devno = LNET_NIDNET(ni->ni_nid) % GNILND_MAXDEVS;
+       net->gnn_dev = &kgnilnd_data.kgn_devices[devno];
+
+       /* allocate a 'dummy' cdm for datagram use. We can only have a single
+        * datagram between a given nid:inst_id and nid2:inst_id pair. The fake
+        * cdm gives us an additional inst_id to use, allowing the datagrams to
+        * flow like rivers of honey and beer */
+
+       /* the instance id for the cdm is the NETNUM offset by MAXDEVS -
+        * ensuring we'll have a unique id */
+
+
+       ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), net->gnn_dev->gnd_nid);
+       CDEBUG(D_NET, "adding net %p nid=%s on dev %d \n",
+               net, libcfs_nid2str(ni->ni_nid), net->gnn_dev->gnd_id);
+       /* until the gnn_list is set, we need to clean up after ourselves, as
+        * kgnilnd_shutdown would just get confused */
+
+       down_write(&kgnilnd_data.kgn_net_rw_sem);
+       list_add_tail(&net->gnn_list, kgnilnd_netnum2netlist(net->gnn_netnum));
+       up_write(&kgnilnd_data.kgn_net_rw_sem);
+
+       /* we need a separate thread to call probe_wait_by_id until
+        * we get a function callback notifier from kgni */
+       up(&kgnilnd_data.kgn_quiesce_sem);
+       RETURN(0);
+ failed:
+       up(&kgnilnd_data.kgn_quiesce_sem);
+       kgnilnd_shutdown(ni);
+       RETURN(rc);
+}
+
+void
+kgnilnd_shutdown(lnet_ni_t *ni)
+{
+       kgn_net_t     *net = ni->ni_data;
+       int           i;
+       int           rc;
+       ENTRY;
+
+       CFS_RACE(CFS_FAIL_GNI_SR_DOWN_RACE);
+
+       LASSERTF(kgnilnd_data.kgn_init == GNILND_INIT_ALL,
+               "init %d\n", kgnilnd_data.kgn_init);
+
+       /* Serialize with startup. */
+       down(&kgnilnd_data.kgn_quiesce_sem);
+       CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n",
+              atomic_read(&libcfs_kmemory));
+
+       if (net == NULL) {
+               CERROR("got NULL net for ni %p\n", ni);
+               rc = -EINVAL;
+               GOTO(out, rc);
+       }
+
+       LASSERTF(ni == net->gnn_ni,
+               "ni %p gnn_ni %p\n", net, net->gnn_ni);
+
+       ni->ni_data = NULL;
+
+       LASSERT(!net->gnn_shutdown);
+       LASSERTF(atomic_read(&net->gnn_refcount) != 0,
+               "net %p refcount %d\n",
+                net, atomic_read(&net->gnn_refcount));
+
+       if (!list_empty(&net->gnn_list)) {
+               /* serialize with peer creation */
+               down_write(&kgnilnd_data.kgn_net_rw_sem);
+               net->gnn_shutdown = 1;
+               up_write(&kgnilnd_data.kgn_net_rw_sem);
+
+               kgnilnd_cancel_net_dgrams(net);
+
+               kgnilnd_del_conn_or_peer(net, LNET_NID_ANY, GNILND_DEL_PEER, -ESHUTDOWN);
+
+               /* if we are quiesced, need to wake up - we need those threads
+                * alive to release peers, etc */
+               if (GNILND_IS_QUIESCED) {
+                       set_mb(kgnilnd_data.kgn_quiesce_trigger, GNILND_QUIESCE_IDLE);
+                       kgnilnd_quiesce_wait("shutdown");
+               }
+
+               kgnilnd_wait_for_canceled_dgrams(net->gnn_dev);
+
+               /* Wait until the net's refcount drops to 1 - the final ref is ours and
+                * we release it below. This makes sure everything else is done before
+                * we free the net.
+                */
+               i = 4;
+               while (atomic_read(&net->gnn_refcount) != 1) {
+                       i++;
+                       CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET,
+                               "Waiting for %d references to clear on net %d\n",
+                               atomic_read(&net->gnn_refcount),
+                               net->gnn_netnum);
+                       cfs_pause(cfs_time_seconds(1));
+               }
+
+               /* release ref from kgnilnd_startup */
+               kgnilnd_net_decref(net);
+               /* serialize with reaper and conn_task looping */
+               down_write(&kgnilnd_data.kgn_net_rw_sem);
+               list_del_init(&net->gnn_list);
+               up_write(&kgnilnd_data.kgn_net_rw_sem);
+
+       }
+
+       /* not locking, this can't race with writers */
+       LASSERTF(atomic_read(&net->gnn_refcount) == 0,
+               "net %p refcount %d\n",
+                net, atomic_read(&net->gnn_refcount));
+       LIBCFS_FREE(net, sizeof(*net));
+
+out:
+       down_read(&kgnilnd_data.kgn_net_rw_sem);
+       for (i = 0; i < *kgnilnd_tunables.kgn_net_hash_size; i++) {
+               if (!list_empty(&kgnilnd_data.kgn_nets[i])) {
+                       up_read(&kgnilnd_data.kgn_net_rw_sem);
+                       break;
+               }
+
+               if (i == *kgnilnd_tunables.kgn_net_hash_size - 1) {
+                       up_read(&kgnilnd_data.kgn_net_rw_sem);
+                       kgnilnd_base_shutdown();
+               }
+       }
+       CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n",
+              atomic_read(&libcfs_kmemory));
+
+       up(&kgnilnd_data.kgn_quiesce_sem);
+       EXIT;
+       return;
+}
+
+void __exit
+kgnilnd_module_fini(void)
+{
+       lnet_unregister_lnd(&the_kgnilnd);
+       kgnilnd_proc_fini();
+       kgnilnd_remove_sysctl();
+       kgnilnd_tunables_fini();
+}
+
+int __init
+kgnilnd_module_init(void)
+{
+       int    rc;
+
+       rc = kgnilnd_tunables_init();
+       if (rc != 0)
+               return rc;
+
+       printk(KERN_INFO "Lustre: kgnilnd build version: "KGNILND_BUILD_REV"\n");
+
+       kgnilnd_insert_sysctl();
+       kgnilnd_proc_init();
+
+       lnet_register_lnd(&the_kgnilnd);
+
+       return 0;
+}
+
+MODULE_AUTHOR("Cray, Inc. <nic@cray.com>");
+MODULE_DESCRIPTION("Kernel Gemini LND v"KGNILND_BUILD_REV);
+MODULE_LICENSE("GPL");
+
+module_init(kgnilnd_module_init);
+module_exit(kgnilnd_module_fini);
diff --git a/lnet/klnds/gnilnd/gnilnd.h b/lnet/klnds/gnilnd/gnilnd.h
new file mode 100644 (file)
index 0000000..de43728
--- /dev/null
@@ -0,0 +1,1790 @@
+/*
+ * Copyright (C) 2004 Cluster File Systems, Inc.
+ *
+ * Copyright (C) 2009-2012 Cray, Inc.
+ *
+ *   Derived from work by: Eric Barton <eric@bartonsoftware.com>
+ *   Author: Nic Henke <nic@cray.com>
+ *   Author: James Shimek <jshimek@cray.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+#ifndef _GNILND_GNILND_H_
+#define _GNILND_GNILND_H_
+
+#define DEBUG_SUBSYSTEM S_LND
+
+#include <libcfs/libcfs.h>
+#include <lnet/lnet.h>
+#include <lnet/lib-lnet.h>
+#include <lnet/lnet-sysctl.h>
+
+#include <gni_pub.h>
+#include "gnilnd_version.h"
+#include "gnilnd_hss_ops.h"
+
+/* tunables determined at compile time */
+#define GNILND_MIN_TIMEOUT     5               /* minimum timeout interval (seconds) */
+#define GNILND_BASE_TIMEOUT    60              /* default sane timeout */
+#define GNILND_TO2KA(t)                (((t)-1)/2)     /* timeout -> keepalive interval */
+#define GNILND_MIN_RECONNECT_TO        (GNILND_BASE_TIMEOUT/4)
+#define GNILND_MAX_RECONNECT_TO        GNILND_BASE_TIMEOUT
+#define GNILND_HARDWARE_TIMEOUT        15              /* maximum time for data to travel between nodes */
+#define GNILND_MDD_TIMEOUT     15              /* MDD hold timeout in minutes */
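For reference, the derived intervals work out as follows (straight arithmetic from the defines above):

/* GNILND_TO2KA(GNILND_BASE_TIMEOUT) = (60 - 1) / 2 = 29s keepalive interval
 * GNILND_MIN_RECONNECT_TO           = 60 / 4       = 15s
 * GNILND_MAX_RECONNECT_TO           = 60s
 */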
+
+/* reaper thread wakeup interval */
+#define GNILND_REAPER_THREAD_WAKE  1
+/* reaper thread checks each conn NCHECKS times every kgnilnd_data.kgn_new_min_timeout */
+#define GNILND_REAPER_NCHECKS      4
+
+/* fixed constants */
+#define GNILND_MAXDEVS         1               /* max # of GNI devices currently supported */
+#define GNILND_MBOX_CREDITS    256             /* number of credits per mailbox */
+#define GNILND_COOKIE          0xa3579         /* cookie used along with the ptag by GNI */
+
+/* checksum values */
+#define GNILND_CHECKSUM_OFF            0       /* checksum turned off */
+#define GNILND_CHECKSUM_SMSG_HEADER    1       /* Only checksum SMSG header */
+#define GNILND_CHECKSUM_SMSG           2       /* checksum entire SMSG packet */
+#define GNILND_CHECKSUM_SMSG_BTE       3       /* Full checksum support */
+
+/* tune down some COMPUTE options as they won't see the same number of connections and
+ * don't need the throughput of multiple threads by default */
+#if defined(CONFIG_CRAY_COMPUTE)
+#define GNILND_SCHED_THREADS      1             /* default # of kgnilnd_scheduler threads */
+#define GNILND_FMABLK             64            /* default number of mboxes per fmablk */
+#else
+#define GNILND_SCHED_THREADS      3             /* default # of kgnilnd_scheduler threads */
+#define GNILND_FMABLK             1024          /* default number of mboxes per fmablk */
+#endif
+
+/* EXTRA_BITS are there to allow us to hide NOOP/CLOSE and anything else out of band */
+#define GNILND_EXTRA_BITS         1
+/* maximum number of conns & bits for cqid in the SMSG event data */
+#define GNILND_CQID_NBITS         (21 - GNILND_EXTRA_BITS)
+#define GNILND_MSGID_TX_NBITS     (32 - GNILND_CQID_NBITS)
+#define GNILND_MAX_CQID           (1 << GNILND_CQID_NBITS)
+#define GNILND_MAX_MSG_ID         (1 << GNILND_MSGID_TX_NBITS)
+#define GNILND_MAX_MSG_SIZE       (*kgnilnd_tunables.kgn_max_immediate + sizeof(kgn_msg_t))
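The bit split above gives the following limits (arithmetic straight from these defines):

/* With GNILND_EXTRA_BITS == 1:
 *   GNILND_CQID_NBITS     = 21 - 1  = 20  ->  GNILND_MAX_CQID   = 1 << 20 = 1048576
 *   GNILND_MSGID_TX_NBITS = 32 - 20 = 12  ->  GNILND_MAX_MSG_ID = 1 << 12 = 4096
 * i.e. roughly 1M connection CQ ids, each with up to 4096 in-flight TX ids.
 */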
+
+/* need sane upper bound to limit copy overhead */
+#define GNILND_MAX_IMMEDIATE      (64<<10)
+
+/* payload size to add to the base mailbox size.
+ * Two are subtracted from concurrent_sends because four messages are already
+ * accounted for in the size gni_smsg_buff_size_needed calculates; this payload
+ * is added to the value returned by that function. */
+#define GNILND_MBOX_PAYLOAD     \
+         (GNILND_MAX_MSG_SIZE * \
+         ((*kgnilnd_tunables.kgn_concurrent_sends - 2) * 2));
+
+/* timeout -> deadman timer for kgni mdd holds */
+#define GNILND_TIMEOUT2DEADMAN   ((*kgnilnd_tunables.kgn_mdd_timeout) * 1000 * 60)
+
+/* timeout for failing sends; t is in jiffies */
+#define GNILND_TIMEOUTRX(t)     (t + cfs_time_seconds(*kgnilnd_tunables.kgn_hardware_timeout))
+
+/* time (in jiffies) at which the reaper thread releases a conn from purgatory */
+#define GNILND_PURG_RELEASE(t)   (GNILND_TIMEOUTRX(t) * 3)
+
+/* Macro for finding last_rx: the two datapoints are compared
+ * and the most recent one (in jiffies) is returned.
+ */
+#define GNILND_LASTRX(conn) (time_after(conn->gnc_last_rx, conn->gnc_last_rx_cq) \
+                               ? conn->gnc_last_rx : conn->gnc_last_rx_cq)
+
+/************************************************************************
+ * Enum, flag and tag data
+ */
+#define GNILND_INIT_NOTHING         0
+#define GNILND_INIT_DATA            1
+#define GNILND_INIT_ALL             2
+
+/* If you change the ordering away from MAPPED = UNMAPPED + 1, things break */
+#define GNILND_BUF_NONE           0              /* buffer type not set */
+#define GNILND_BUF_IMMEDIATE      1              /* immediate data */
+#define GNILND_BUF_IMMEDIATE_KIOV 2              /* immediate data */
+#define GNILND_BUF_PHYS_UNMAPPED  3              /* physical: not mapped yet */
+#define GNILND_BUF_PHYS_MAPPED    4              /* physical: mapped already */
+#define GNILND_BUF_VIRT_UNMAPPED  5              /* virtual: not mapped yet */
+#define GNILND_BUF_VIRT_MAPPED    6              /* virtual: mapped already */
+
+#define GNILND_TX_WAITING_REPLY      (1<<1)     /* expecting to receive reply */
+#define GNILND_TX_WAITING_COMPLETION (1<<2)     /* waiting for smsg_send to complete */
+#define GNILND_TX_PENDING_RDMA       (1<<3)     /* RDMA transaction pending until we get prev. completion */
+#define GNILND_TX_QUIET_ERROR        (1<<4)     /* don't print error on tx_done */
+#define GNILND_TX_FAIL_SMSG          (1<<5)     /* pass down error injection for SMSG fail */
+
+/* stash above max CQID to avoid any collision */
+#define GNILND_MSGID_NOOP           (GNILND_MAX_CQID + 128)
+#define GNILND_MSGID_CLOSE          (GNILND_MSGID_NOOP + 1)
+
+/* kgn_msg_t::gnm_type */
+#define GNILND_MSG_NONE              0x00        /* illegal message */
+#define GNILND_MSG_NOOP              0x01        /* empty gnm_u (keepalive) */
+#define GNILND_MSG_IMMEDIATE         0x02        /* gnm_u.immediate */
+#define GNILND_MSG_PUT_REQ           0x03        /* gnm_u.putreq (src->sink) */
+#define GNILND_MSG_PUT_NAK           0x04        /* gnm_u.completion (no PUT match: sink->src) */
+#define GNILND_MSG_PUT_ACK           0x05        /* gnm_u.putack (PUT matched: sink->src) */
+#define GNILND_MSG_PUT_DONE          0x06        /* gnm_u.completion (src->sink) */
+#define GNILND_MSG_GET_REQ           0x07        /* gnm_u.get (sink->src) */
+#define GNILND_MSG_GET_NAK           0x08        /* gnm_u.completion (no GET match: src->sink) */
+#define GNILND_MSG_GET_DONE          0x09        /* gnm_u.completion (src->sink) */
+#define GNILND_MSG_CLOSE             0x0a        /* empty gnm_u */
+
+/* defines for gnc_*scheduled states */
+#define GNILND_CONN_IDLE             0
+#define GNILND_CONN_SCHED            1
+#define GNILND_CONN_WANTS_SCHED      2
+#define GNILND_CONN_PROCESS          3
+
+#define GNILND_DEV_IDLE              0
+#define GNILND_DEV_IRQ               1
+#define GNILND_DEV_LOOP              2
+
+#define GNILND_DGRAM_IDLE            0
+#define GNILND_DGRAM_SCHED           1
+#define GNILND_DGRAM_PROCESS         2
+
+#define GNILND_PEER_IDLE             0
+#define GNILND_PEER_CONNECT          1
+#define GNILND_PEER_POSTING          2
+#define GNILND_PEER_POSTED           3
+#define GNILND_PEER_NEEDS_DEATH      4
+#define GNILND_PEER_KILL             5
+
+/* for gnc_close_recvd */
+#define GNILND_CLOSE_RX              1
+#define GNILND_CLOSE_INJECT1         2
+#define GNILND_CLOSE_INJECT2         3
+#define GNILND_CLOSE_EARLY           4
+
+/* defines for why quiesce trigger set */
+#define GNILND_QUIESCE_IDLE          0
+#define GNILND_QUIESCE_ADMIN         1
+#define GNILND_QUIESCE_RESET         2
+#define GNILND_QUIESCE_HW_QUIESCE    3
+
+#define GNILND_PEER_CLEAN            0
+#define GNILND_PEER_PERSISTING       1
+
+#define GNILND_DEL_CONN              0
+#define GNILND_DEL_PEER              1
+#define GNILND_CLEAR_PURGATORY       2
+
+typedef enum kgn_fmablk_state {
+       GNILND_FMABLK_IDLE = 0, /* is allocated or ready to be freed */
+       GNILND_FMABLK_PHYS,     /* allocated out of slab of physical memory */
+       GNILND_FMABLK_VIRT,     /* 'standard' vmalloc hunk */
+       GNILND_FMABLK_FREED,    /* after free */
+} kgn_fmablk_state_t;
+
+typedef enum kgn_tx_list_state {
+       GNILND_TX_IDLE = 0,     /* TX is on the idle list, kgn_idle_txs */
+       GNILND_TX_ALLOCD,       /* TX has been alloced (off of idle), could be in any state transition */
+       GNILND_TX_PEERQ,        /* TX on peer->gnp_tx_queue (no live conn) */
+       GNILND_TX_MAPQ,         /* TX on dev:gnd_map_tx for buffer mapping */
+       GNILND_TX_FMAQ,         /* TX waiting to be sent on conn FMA */
+       GNILND_TX_LIVE_FMAQ,    /* TX live on the FMA wire, waiting for completion or reply */
+       GNILND_TX_RDMAQ,        /* TX waiting to send FMA confirmation to auth RDMA PUT */
+       GNILND_TX_LIVE_RDMAQ,   /* TX live on the RDMA wire, waiting for completion */
+       GNILND_TX_DYING,        /* TX got caught on MAPQ or RDMAQ while conn was closing, needs someone to call tx_done */
+       GNILND_TX_FREED         /* TX is free! */
+} kgn_tx_list_state_t;
+
+typedef enum kgn_conn_state {
+       /* don't start @ 0 - prevent memset(0) badness */
+       GNILND_CONN_DUMMY = 0,
+       GNILND_CONN_LISTEN,
+       GNILND_CONN_CONNECTING,
+       GNILND_CONN_ESTABLISHED,
+       GNILND_CONN_CLOSING,
+       GNILND_CONN_CLOSED,
+       GNILND_CONN_DONE,
+       GNILND_CONN_DESTROY_EP
+} kgn_conn_state_t;
+
+/* changing these requires a change to GNILND_CONNREQ_VERSION and
+ * will result in dropped packets instead of NAKs. Adding to this is
+ * acceptable without changing the CONNREQ_VERSION, but code should
+ * be ready to handle NAKs on version mismatch  */
+typedef enum kgn_connreq_type {
+       GNILND_CONNREQ_REQ = 1,         /* how YOU doin' ? */
+       GNILND_CONNREQ_NAK,             /* NO soup for you! */
+       GNILND_CONNREQ_CLOSE,           /* we should see other people */
+} kgn_connreq_type_t;
+
+typedef enum kgn_dgram_state {
+       /* don't use 0 to avoid thinking a memset of zero is valid data */
+       GNILND_DGRAM_USED = 1,
+       GNILND_DGRAM_POSTING,
+       GNILND_DGRAM_POSTED,
+       GNILND_DGRAM_PROCESSING,
+       GNILND_DGRAM_CANCELED,
+       GNILND_DGRAM_DONE,
+} kgn_dgram_state_t;
+
+typedef enum kgn_dgram_type {
+       GNILND_DGRAM_REQ = 1,         /* how YOU doin' ? */
+       GNILND_DGRAM_WC_REQ,          /* you talkin' to ME? */
+       GNILND_DGRAM_NAK,             /* NO soup for you! */
+       GNILND_DGRAM_CLOSE,           /* we should see other people */
+} kgn_dgram_type_t;
+
+/************************************************************************
+ * Wire message structs.  These are sent in sender's byte order
+ * (i.e. receiver checks magic and flips if required).
+ */
+
+#define GNILND_MSG_MAGIC     LNET_PROTO_GNI_MAGIC /* unique magic */
+#define GNILND_DGRAM_MAGIC   0x0DDBA11
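As an illustration of the "check magic and flip" rule above (not part of this patch: the helper name is made up, kgn_msg_t is the wire struct defined further down, and __swab32() is the standard kernel byte-swap):

#include <linux/swab.h>
#include <linux/errno.h>

/* Sketch only: decide whether a received wire message needs byte-swapping
 * by looking at gnm_magic before trusting any other field. */
static int kgnilnd_msg_is_swabbed(const kgn_msg_t *msg)
{
	if (msg->gnm_magic == GNILND_MSG_MAGIC)
		return 0;                               /* sender has our byte order */
	if (msg->gnm_magic == __swab32(GNILND_MSG_MAGIC))
		return 1;                               /* opposite endian: flip each field */
	return -EPROTO;                                 /* not a gnilnd message */
}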
+
+/*  kgn_msg_t - FMA/SMSG wire struct
+  v2:
+   * - added checksum to FMA
+   * moved seq before payload
+   * WIRE_ATTR added for alignment
+  v3:
+   * added gnm_payload_len for FMA payload size
+  v4:
+   * added gncm_retval to completion, allowing return code transmission
+     on RDMA NAKs
+  v5:
+   * changed how CQID and TX ids are assigned
+  v6:
+   * added retval on CLOSE
+  v7:
+   * added payload checksumming
+  v8:
+   * reworked checksumming a bit, changed payload checksums
+*/
+#define GNILND_MSG_VERSION              8
+/* kgn_connreq_t connection request datagram wire struct
+  v2:
+   * added NAKs
+*/
+
+#define GNILND_CONNREQ_VERSION          2
+
+typedef struct kgn_gniparams {
+       __u32            gnpr_host_id;          /* ph. host ID of the NIC */
+       __u32            gnpr_cqid;             /* cqid I want peer to use when sending events to me */
+       gni_smsg_attr_t  gnpr_smsg_attr;        /* my short msg. attributes */
+} WIRE_ATTR kgn_gniparams_t;
+
+typedef struct kgn_nak_data {
+       __s32            gnnd_errno;            /* errno reason for NAK */
+
+} WIRE_ATTR kgn_nak_data_t;
+
+/* the first bits of the connreq struct CANNOT CHANGE FORM EVER
+ * without breaking the ability for us to properly NAK someone */
+typedef struct kgn_connreq {                    /* connection request/response */
+       __u32             gncr_magic;           /* I'm a gnilnd connreq */
+       __u32             gncr_cksum;           /* checksum (0 == disabled) */
+       __u16             gncr_type;            /* REQ, NAK, etc */
+       __u16             gncr_version;         /* this is my version number */
+       __u32             gncr_timeout;         /* sender's timeout */
+       __u64             gncr_srcnid;          /* sender's NID */
+       __u64             gncr_dstnid;          /* who sender expects to listen */
+       __u64             gncr_peerstamp;       /* sender's instance stamp */
+       __u64             gncr_connstamp;       /* sender's connection stamp */
+
+       /* everything before this needs to stay static, adding after should
+        * result in a change to GNILND_CONNREQ_VERSION */
+
+       union {
+               kgn_gniparams_t   gncr_gnparams;        /* sender's endpoint info */
+               kgn_nak_data_t    gncr_nakdata;         /* data (rc, etc) for NAK */
+       };
+} WIRE_ATTR kgn_connreq_t;
+
+typedef struct {
+       gni_mem_handle_t  gnrd_key;
+       __u64             gnrd_addr;
+       __u32             gnrd_nob;
+} WIRE_ATTR kgn_rdma_desc_t;
+
+typedef struct {
+       lnet_hdr_t        gnim_hdr;             /* LNet header */
+       /* LNet payload is in FMA "Message Data" */
+} WIRE_ATTR kgn_immediate_msg_t;
+
+typedef struct {
+       lnet_hdr_t        gnprm_hdr;            /* LNet header */
+       __u64             gnprm_cookie;         /* opaque completion cookie */
+} WIRE_ATTR kgn_putreq_msg_t;
+
+typedef struct {
+       __u64             gnpam_src_cookie;     /* reflected completion cookie */
+       __u64             gnpam_dst_cookie;     /* opaque completion cookie */
+       kgn_rdma_desc_t   gnpam_desc;           /* sender's sink buffer */
+} WIRE_ATTR kgn_putack_msg_t;
+
+typedef struct {
+       lnet_hdr_t        gngm_hdr;             /* LNet header */
+       __u64             gngm_cookie;          /* opaque completion cookie */
+       kgn_rdma_desc_t   gngm_desc;            /* sender's sink buffer */
+} WIRE_ATTR kgn_get_msg_t;
+
+typedef struct {
+       int               gncm_retval;          /* error on NAK, size on REQ */
+       __u64             gncm_cookie;          /* reflected completion cookie */
+} WIRE_ATTR kgn_completion_msg_t;
+
+typedef struct {                                /* NB must fit in FMA "Prefix" */
+       __u32             gnm_magic;            /* I'm a gni message */
+       __u16             gnm_version;          /* this is my version number */
+       __u16             gnm_type;             /* msg type */
+       __u64             gnm_srcnid;           /* sender's NID */
+       __u64             gnm_connstamp;        /* sender's connection stamp */
+       __u32             gnm_seq;              /* incrementing sequence number */
+       __u16             gnm_cksum;            /* checksum (0 == no checksum ) */
+       __u16             gnm_payload_cksum;    /* payload checksum (0 == no checksum ) */
+       __u32             gnm_payload_len;      /* size of the FMA payload sent */
+       union {
+               kgn_immediate_msg_t   immediate;
+               kgn_putreq_msg_t      putreq;
+               kgn_putack_msg_t      putack;
+               kgn_get_msg_t         get;
+               kgn_completion_msg_t  completion;
+       } gnm_u;
+} WIRE_ATTR kgn_msg_t;
+
+/************************************************************************
+ * runtime tunable data
+ */
+
+typedef struct kgn_tunables {
+       int              *kgn_min_reconnect_interval; /* connreq starting timeout & retransmit interval */
+       int              *kgn_max_reconnect_interval; /* ...exponentially increasing to this */
+       int              *kgn_credits;          /* # concurrent sends */
+       int              *kgn_fma_cq_size;      /* # entries in receive CQ */
+       int              *kgn_peer_credits;     /* # LNet peer credits */
+       int              *kgn_concurrent_sends; /* max # of max_immediate in mbox */
+       int              *kgn_timeout;          /* comms timeout (seconds) */
+       int              *kgn_max_immediate;    /* immediate payload breakpoint */
+       int              *kgn_checksum;         /* checksum data */
+       int              *kgn_checksum_dump;    /* dump raw data to D_INFO log when checksumming */
+       int              *kgn_bte_hash;         /* hashing on BTE transfers */
+       int              *kgn_bte_adapt;        /* adaptive routing on BTE transfers */
+       int              *kgn_bte_relaxed_ordering; /* relaxed ordering (PASSPW) on BTE transfers */
+       int              *kgn_ptag;             /* PTAG for cdm_create */
+       int              *kgn_max_retransmits;  /* max number of FMA retransmits */
+       int              *kgn_nwildcard;        /* # wildcard per net to post */
+       int              *kgn_nice;             /* nice value for kgnilnd threads */
+       int              *kgn_rdmaq_intervals;  /* # intervals per second for rdmaq throttle */
+       int              *kgn_loops;            /* # of loops sched does before flush/heartbeat tickle */
+       int              *kgn_peer_hash_size;   /* size of kgn_peers */
+       int              *kgn_peer_health;      /* enable/disable peer health */
+       int              *kgn_vmap_cksum;       /* enable/disable vmap of kiov checksums */
+       int              *kgn_mbox_per_block;   /* mailboxes per fmablk */
+       int              *kgn_nphys_mbox;       /* # mailboxes to preallocate with physical memory */
+       int              *kgn_mbox_credits;     /* max credits per fma */
+       int              *kgn_sched_threads;    /* number of kgnilnd_scheduler threads */
+       int              *kgn_net_hash_size;    /* size of kgn_net_ht */
+       int              *kgn_hardware_timeout; /* max time for a message to get across the network */
+       int              *kgn_mdd_timeout;      /* max time for ghal to hold an mdd in minutes */
+#if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM
+       cfs_sysctl_table_header_t *kgn_sysctl;  /* sysctl interface */
+#endif
+} kgn_tunables_t;
+
+typedef struct kgn_mbox_info {
+       lnet_nid_t mbx_prev_nid;
+       unsigned long mbx_create_conn_memset;
+       unsigned long mbx_add_purgatory;
+       unsigned long mbx_detach_of_purgatory;
+       unsigned long mbx_release_from_purgatory;
+       unsigned long mbx_release_purg_active_dgram;
+} kgn_mbox_info_t;
+
+typedef struct kgn_fma_memblock {
+       struct list_head    gnm_bufflist;                          /* memblock is part of device's  gnd_fma_buffs */
+       kgn_fmablk_state_t  gnm_state;                             /* how this memory allocated & state of it */
+       int                 gnm_hold_timeout;                      /* hold_timeout if used at unmap time */
+       int                 gnm_num_mboxs;                         /* total mboxes allocated */
+       int                 gnm_avail_mboxs;                       /* number of available mailboxes in the block */
+       int                 gnm_held_mboxs;                        /* number of purgatory held  mailboxes */
+       int                 gnm_mbox_size;                         /* size of the single mailbox */
+       int                 gnm_next_avail_mbox;                   /* next available mailbox */
+       long                gnm_max_timeout;                       /* max timeout for possible purgatory hold */
+       unsigned int        gnm_blk_size;                          /* how big is our hunk o memory ? */
+       void               *gnm_block;                             /* pointer to mem. block */
+       gni_mem_handle_t    gnm_hndl;                              /* mem. handle of the block */
+       unsigned long      *gnm_bit_array;                         /* bit array tracking allocation of mailboxes */
+       kgn_mbox_info_t    *gnm_mbox_info;                         /* array of mbox_information about each mbox */
+} kgn_fma_memblock_t;
+
+typedef struct kgn_device {
+       gni_nic_handle_t        gnd_handle;       /* device handle */
+       gni_cdm_handle_t        gnd_domain;       /* GNI communication domain */
+       gni_err_handle_t        gnd_err_handle;   /* device error handle */
+       unsigned long           gnd_sched_alive;  /* scheduler thread alive stamp */
+       gni_cq_handle_t         gnd_rcv_fma_cqh;  /* FMA rcv. completion queue handle */
+       gni_cq_handle_t         gnd_snd_rdma_cqh; /* rdma send completion queue handle */
+       gni_cq_handle_t         gnd_snd_fma_cqh;  /* fma send completion queue handle */
+       struct mutex            gnd_cq_mutex;     /* CQ access serialization */
+       __u32                   gnd_host_id;      /* ph. host ID of the NIC */
+       int                     gnd_id;           /* device id, also index in kgn_devices */
+       __u32                   gnd_nid;          /* ph host ID translated to NID */
+       struct list_head        gnd_fma_buffs;    /* list of FMA memory blocks */
+       struct semaphore        gnd_fmablk_sem;   /* semaphore for FMA block memory alloc/free */
+       spinlock_t              gnd_fmablk_lock;  /* lock for mbox alloc/release */
+       atomic_t                gnd_nfmablk;      /* # of fmablk live */
+       atomic_t                gnd_fmablk_vers;  /* gnd_fma_bufs stamp */
+       atomic_t                gnd_neps;         /* # EP allocated to conns */
+       short                   gnd_ready;        /* stuff to do in scheduler thread */
+       struct list_head        gnd_ready_conns;  /* connections ready to tx/rx */
+       struct list_head        gnd_map_tx;       /* TX: needing buffer mapping */
+       wait_queue_head_t       gnd_waitq;        /* scheduler wakeup */
+       spinlock_t              gnd_lock;         /* serialise gnd_ready_conns */
+       struct list_head        gnd_connd_peers;  /* peers waiting for a connection */
+       spinlock_t              gnd_connd_lock;   /* serialise connd_peers */
+       wait_queue_head_t       gnd_dgram_waitq;  /* dgram_mover thread wakeup */
+       wait_queue_head_t       gnd_dgping_waitq; /* dgram thread ping-pong */
+       int                     gnd_dgram_ready;  /* dgrams need movin' */
+       struct list_head       *gnd_dgrams;       /* nid hash to dgrams */
+       atomic_t                gnd_ndgrams;      /* # dgrams extant */
+       spinlock_t              gnd_dgram_lock;   /* serialize gnd_dgrams */
+       struct list_head        gnd_map_list;     /* list of all mapped regions */
+       int                     gnd_map_version;  /* version flag for map list */
+       atomic_t                gnd_n_mdd;        /* number of total MDD - fma, tx, etc */
+       atomic_t                gnd_n_mdd_held;   /* number of total MDD held - fma, tx, etc */
+       atomic_t                gnd_nq_map;       /* # queued waiting for mapping (MDD/GART) */
+       atomic64_t              gnd_nbytes_map;   /* bytes of total GART maps - fma, tx, etc */
+       __u32                   gnd_map_nphys;    /* # TX phys mappings */
+       __u32                   gnd_map_physnop;  /* # TX phys pages mapped */
+       __u32                   gnd_map_nvirt;    /* # TX virt mappings */
+       __u64                   gnd_map_virtnob;  /* # TX virt bytes mapped */
+       spinlock_t              gnd_map_lock;     /* serialize gnd_map_XXX */
+       struct list_head        gnd_rdmaq;        /* RDMA to be sent */
+       spinlock_t              gnd_rdmaq_lock;   /* play nice with others */
+       atomic64_t              gnd_rdmaq_bytes_out; /* # bytes authorized */
+       atomic64_t              gnd_rdmaq_bytes_ok;  /* # bytes allowed until deadline */
+       atomic_t                gnd_rdmaq_nstalls;   /* # stalls due to throttle */
+       unsigned long           gnd_rdmaq_deadline;  /* when does bucket roll over ? */
+       struct timer_list       gnd_rdmaq_timer;     /* wakey-wakey */
+       atomic_t                gnd_short_ntx;      /* TX stats: short messages */
+       atomic64_t              gnd_short_txbytes;  /* TX stats: short message  payload*/
+       atomic_t                gnd_rdma_ntx;       /* TX stats: rdma messages */
+       atomic64_t              gnd_rdma_txbytes;   /* TX stats: rdma message payload*/
+       atomic_t                gnd_short_nrx;      /* RX stats: short messages */
+       atomic64_t              gnd_short_rxbytes;  /* RX stats: short message  payload*/
+       atomic_t                gnd_rdma_nrx;       /* RX stats: rdma messages */
+       atomic64_t              gnd_rdma_rxbytes;   /* RX stats: rdma message payload*/
+       atomic_t                gnd_fast_try;       /* # of times fast send tried */
+       atomic_t                gnd_fast_ok;        /* # of times fast send ok */
+       atomic_t                gnd_fast_block;     /* # of times fast send blocked */
+       unsigned long           gnd_mutex_delay;
+       atomic_t                gnd_n_yield;
+       atomic_t                gnd_n_schedule;
+       atomic_t                gnd_canceled_dgrams; /* # of outstanding cancels */
+} kgn_device_t;
+
+typedef struct kgn_net {
+       struct list_head    gnn_list;           /* chain on kgni_data::kgn_nets */
+       kgn_device_t       *gnn_dev;            /* device for this net */
+       lnet_ni_t          *gnn_ni;             /* network interface instance */
+       atomic_t            gnn_refcount;       /* # current references */
+       int                 gnn_shutdown;       /* lnd_shutdown set */
+       __u16               gnn_netnum;         /* stash netnum for quicker lookup */
+} kgn_net_t;
+
+static inline lnet_nid_t
+kgnilnd_lnd2lnetnid(lnet_nid_t ni_nid, lnet_nid_t kgnilnd_nid)
+{
+       return LNET_MKNID(LNET_NIDNET(ni_nid), LNET_NIDADDR(kgnilnd_nid));
+}
+
+static inline lnet_nid_t
+kgnilnd_lnet2lndnid(lnet_nid_t lnet_nid, lnet_nid_t kgnilnd_nid)
+{
+       return LNET_MKNID(LNET_NIDNET(kgnilnd_nid), LNET_NIDADDR(lnet_nid));
+}
+
+/* The code for this is a bit ugly - but really this just boils down to a __u64
+ * that can have various parts accessed separately.
+ *
+ * The lower 32 bits are the ID we give to SMSG for our completion event - it
+ * needs to be globally unique across
+ * all TX currently in flight. We separate that out into the CQID so that we can
+ * reference the connection (kgnilnd_cqid2conn_locked) and then the msg_id to pull
+ * the actual TX out of the per-connection gnc_tx_ref_table.
+ *
+ * The upper 32 bits are just extra stuff we put into the cookie to ensure this TX
+ * has a unique value we can send with RDMA setup messages to ensure the completion for
+ * those is unique across the wire. The extra 32 bits are there to ensure that TX id
+ * reuse is separated.
+ */
+
+typedef struct kgn_tx_ev_id {
+       union {
+               __u64             txe_cookie;    /* are you my mommy ? */
+               struct {
+                       __u32     txe_chips;     /* extra bits to ensure ID unique across reuse */
+                       union {
+                               __u32     txe_smsg_id;      /* ID for SMSG CQ event */
+                               /* N.B: Never ever ever ever use the bit shifts directly,
+                                * you are just asking for a world of pain and are at the
+                                * mercy of the compiler layouts */
+                               struct {
+                                       __u32     txe_cqid :GNILND_CQID_NBITS;
+                                       __u32     txe_idx :GNILND_MSGID_TX_NBITS;
+                               };
+                       };
+               };
+       };
+} kgn_tx_ev_id_t;
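A minimal sketch of how a completion cookie decomposes through the named fields (per the comment above, never via raw shifts); cookie_from_cq_event is a placeholder for the 64 bits handed back by the CQ:

kgn_tx_ev_id_t ev;

ev.txe_cookie = cookie_from_cq_event;   /* the full 64-bit cookie                   */
/* ev.txe_chips : upper 32 bits, keeps reused TX ids distinct on the wire           */
/* ev.txe_cqid  : locates the conn, e.g. via kgnilnd_cqid2conn_locked()             */
/* ev.txe_idx   : indexes the TX in that conn's gnc_tx_ref_table                    */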
+
+typedef struct kgn_dgram {
+       struct list_head     gndg_list;          /* on hash dev::gnd_dgrams */
+       kgn_dgram_state_t    gndg_state;         /* state of this dgram */
+       kgn_dgram_type_t     gndg_type;          /* REQ, NAK, etc */
+       __u32                gndg_magic;         /* safety word */
+       unsigned long        gndg_post_time;     /* time when we posted */
+       struct kgn_conn     *gndg_conn;          /* unbound conn with ep & smsg */
+       kgn_connreq_t        gndg_conn_out;      /* connreq from local node */
+       kgn_connreq_t        gndg_conn_in;       /* connreq from remote node */
+} kgn_dgram_t;
+
+typedef struct kgn_tx {                         /* message descriptor */
+       struct list_head          tx_list;      /* TX queues - peer, conn, rdma */
+       kgn_tx_list_state_t       tx_list_state;/* where in state machine is this TX ? */
+       struct list_head         *tx_list_p;    /* pointer to current list */
+       struct kgn_conn          *tx_conn;      /* owning conn */
+       lnet_msg_t               *tx_lntmsg[2]; /* ptl msgs to finalize on completion */
+       unsigned long             tx_qtime;     /* when tx started to wait for something (jiffies) */
+       unsigned long             tx_cred_wait; /* time spent waiting for smsg creds */
+       struct list_head          tx_map_list;  /* list entry on device map list */
+       unsigned int              tx_nob;       /* # bytes of payload */
+       int                       tx_buftype;   /* payload buffer type */
+       int                       tx_phys_npages; /* # physical pages */
+       gni_mem_handle_t          tx_map_key;   /* mapping key */
+       gni_mem_segment_t        *tx_phys;      /* page descriptors */
+       kgn_msg_t                 tx_msg;       /* FMA message buffer */
+       kgn_tx_ev_id_t            tx_id;        /* who are you, who ? who ? */
+       __u8                      tx_state;     /* state of the descriptor */
+       int                       tx_retrans;   /* retrans count of RDMA */
+       int                       tx_rc;        /* if we need to stash the ret code until we see completion */
+       void                     *tx_buffer;    /* source/sink buffer */
+       union {
+               gni_post_descriptor_t     tx_rdma_desc; /* rdma descriptor */
+               struct page              *tx_imm_pages[GNILND_MAX_IMMEDIATE/PAGE_SIZE];  /* page array to map kiov for immediate send */
+       };
+
+       /* we only use one or the other */
+       union {
+               kgn_putack_msg_t  tx_putinfo;   /* data for deferred rdma & retry */
+               kgn_get_msg_t     tx_getinfo;   /* data for rdma retry */
+       };
+} kgn_tx_t;
+
+typedef struct kgn_conn {
+       kgn_device_t       *gnc_device;         /* which device */
+       struct kgn_peer    *gnc_peer;           /* owning peer */
+       struct list_head    gnc_list;           /* stash on peer's conn list - or pending purgatory lists as we clear them */
+       struct list_head    gnc_hashlist;       /* stash in connection hash table */
+       struct list_head    gnc_schedlist;      /* schedule (on gnd_?_conns) for attention */
+       struct list_head    gnc_fmaq;           /* txs queued for FMA */
+       struct list_head    gnc_mdd_list;       /* hold list for MDD on hard conn reset */
+       __u64               gnc_peerstamp;      /* peer's unique stamp */
+       __u64               gnc_peer_connstamp; /* peer's unique connection stamp */
+       __u64               gnc_my_connstamp;   /* my unique connection stamp */
+       unsigned long       gnc_first_rx;       /* when I first received an FMA message (jiffies) */
+       unsigned long       gnc_last_tx;        /* when I last sent an FMA message (jiffies) */
+       unsigned long       gnc_last_rx;        /* when I last received an FMA message (jiffies) */
+       unsigned long       gnc_last_tx_cq;     /* when I last received an FMA CQ (jiffies) */
+       unsigned long       gnc_last_rx_cq;     /* when I last received an FMA CQ (jiffies) */
+       unsigned long       gnc_last_noop_want; /* time I wanted to send NOOP */
+       unsigned long       gnc_last_noop_sent; /* time I did gni_smsg_send on NOOP */
+       unsigned long       gnc_last_noop_cq;   /* time when NOOP completed */
+       unsigned long       gnc_last_sched_ask; /* time when conn added to ready_conns */
+       unsigned long       gnc_last_sched_do;  /* time when conn processed from ready_conns */
+       atomic_t            gnc_reaper_noop;    /* # reaper triggered NOOP */
+       atomic_t            gnc_sched_noop;     /* # sched triggered NOOP */
+       unsigned int        gnc_timeout;        /* infer peer death if no rx for this many seconds */
+       __u32               gnc_cqid;           /* my completion callback id (non-unique) */
+       __u32               gnc_tx_seq;         /* tx msg sequence number */
+       __u32               gnc_rx_seq;         /* rx msg sequence number */
+       __u64               gnc_tx_retrans;     /* # retrans on SMSG */
+       atomic_t            gnc_nlive_fma;      /* # live FMA */
+       atomic_t            gnc_nq_rdma;        /* # queued (on device) RDMA */
+       atomic_t            gnc_nlive_rdma;     /* # live RDMA */
+       short               gnc_close_sent;     /* I've sent CLOSE */
+       short               gnc_close_recvd;    /* I've received CLOSE */
+       short               gnc_in_purgatory;   /* in the sin bin */
+       int                 gnc_error;          /* errno when conn being closed due to error */
+       int                 gnc_peer_error;     /* errno peer sent us on CLOSE */
+       kgn_conn_state_t    gnc_state;          /* connection state */
+       int                 gnc_scheduled;      /* being attended to */
+       atomic_t            gnc_refcount;       /* # users */
+       spinlock_t          gnc_list_lock;      /* serialise tx lists, max_rx_age */
+       gni_ep_handle_t     gnc_ephandle;       /* GNI endpoint */
+       kgn_fma_memblock_t *gnc_fma_blk;        /* pointer to fma block for our mailbox */
+       gni_smsg_attr_t     gnpr_smsg_attr;     /* my short msg. attributes */
+       spinlock_t          gnc_tx_lock;        /* protect tx alloc/free */
+       __u8                gnc_tx_bits[GNILND_MAX_MSG_ID/8]; /* bit table for tx id */
+       int                 gnc_next_tx;        /* next tx to use in tx_ref_table */
+       kgn_tx_t          **gnc_tx_ref_table;   /* table of TX descriptors for this conn */
+       int                 gnc_mbox_id;        /* id of mbox in fma_blk                 */
+       short               gnc_needs_detach;   /* flag set in detach_purgatory_all_locked so reaper will clear out purgatory */
+       short               gnc_needs_closing;  /* flag set in del_conns when called from kgnilnd_del_peer_or_conn */
+} kgn_conn_t;
+
+typedef struct kgn_mdd_purgatory {
+       gni_mem_handle_t    gmp_map_key;        /* mapping key */
+       struct list_head    gmp_list;           /* entry point for purgatory list */
+} kgn_mdd_purgatory_t;
+
+typedef struct kgn_peer {
+       struct list_head    gnp_list;                   /* stash on global peer list */
+       struct list_head    gnp_connd_list;             /* schedule on kgn_connd_peers */
+       struct list_head    gnp_conns;                  /* all active connections and all conns in purgatory for the peer */
+       struct list_head    gnp_tx_queue;               /* msgs waiting for a conn */
+       kgn_net_t          *gnp_net;                    /* net instance for this peer */
+       lnet_nid_t          gnp_nid;                    /* who's on the other end(s) */
+       atomic_t            gnp_refcount;               /* # users */
+       __u32               gnp_host_id;                /* ph. host ID of the peer */
+       short               gnp_connecting;             /* connection forming */
+       short               gnp_pending_unlink;         /* need last conn close to trigger unlink */
+       int                 gnp_last_errno;             /* last error conn saw */
+       unsigned long       gnp_last_alive;             /* last time I had valid comms */
+       int                 gnp_last_dgram_errno;       /* last error dgrams saw */
+       unsigned long       gnp_last_dgram_time;        /* last time I tried to connect */
+       unsigned long       gnp_reconnect_time;         /* CURRENT_SECONDS when reconnect OK */
+       unsigned long       gnp_reconnect_interval;     /* exponential backoff */
+       atomic_t            gnp_dirty_eps;              /* # of old but yet to be destroyed EPs from conns */
+} kgn_peer_t;
+
+/* the kgn_rx_t is a struct handed to LNET as the private pointer for things
+ * like lnet_parse. It lets a single pointer carry enough
+ * information for _recv and friends */
+typedef struct kgn_rx {
+       kgn_conn_t              *grx_conn;      /* connection */
+       kgn_msg_t               *grx_msg;       /* message */
+       lnet_msg_t              *grx_lntmsg;    /* lnet msg for this rx (eager only) */
+       int                      grx_eager;     /* if eager, we copied msg to somewhere */
+       struct timespec          grx_received;  /* time this msg received */
+} kgn_rx_t;
+
+typedef struct kgn_data {
+       int                     kgn_init;             /* initialisation state */
+       int                     kgn_shutdown;         /* shut down? */
+       int                     kgn_wc_kill;          /* Should I repost the WC */
+       atomic_t                kgn_nthreads;         /* # live threads */
+       int                     kgn_nresets;          /* number of stack resets */
+       int                     kgn_in_reset;         /* are we in stack reset ? */
+
+       kgn_device_t            kgn_devices[GNILND_MAXDEVS]; /* device/ptag/cq etc */
+       int                     kgn_ndevs;            /* # devices */
+
+       int                     kgn_ruhroh_running;   /* ruhroh thread is running */
+       int                     kgn_ruhroh_shutdown;  /* ruhroh thread should or is shut down */
+       wait_queue_head_t       kgn_ruhroh_waitq;     /* ruhroh thread wakeup */
+       int                     kgn_quiesce_trigger;  /* should we quiesce ? */
+       atomic_t                kgn_nquiesce;         /* how many quiesced ? */
+       struct semaphore        kgn_quiesce_sem;      /* serialize ruhroh task, startup and shutdown */
+       int                     kgn_needs_reset;      /* we need stack reset */
+
+       /* These next three members implement communication from gnilnd into
+        * the ruhroh task.  To ensure correct operation of the task, code that
+        * writes into them must use memory barriers to ensure that the changes
+        * are visible to other cores in the order the members appear below.  */
+       __u32                   kgn_quiesce_secs;     /* seconds to bump timeouts */
+       int                     kgn_bump_info_rdy;    /* we have info needed to bump */
+       int                     kgn_needs_pause;      /* we need to pause for network quiesce */
+
+       struct list_head       *kgn_nets;             /* hashtable of kgn_net instances */
+       struct rw_semaphore     kgn_net_rw_sem;       /* serialise gnn_shutdown, kgn_nets */
+
+       rwlock_t                kgn_peer_conn_lock;   /* stabilize peer/conn ops */
+       struct list_head       *kgn_peers;            /* hash table of all my known peers */
+       atomic_t                kgn_npeers;           /* # peers extant */
+       int                     kgn_peer_version;     /* version flag for peer tables */
+
+       struct list_head       *kgn_conns;            /* conns hashed by cqid */
+       atomic_t                kgn_nconns;           /* # connections extant */
+       __u64                   kgn_peerstamp;        /* when I started up */
+       __u64                   kgn_connstamp;        /* conn stamp generator */
+       int                     kgn_conn_version;     /* version flag for conn tables */
+       int                     kgn_next_cqid;        /* cqid generator */
+
+       long                    kgn_new_min_timeout;  /* minimum timeout on any new conn */
+       wait_queue_head_t       kgn_reaper_waitq;     /* reaper sleeps here */
+       spinlock_t              kgn_reaper_lock;      /* serialise */
+
+       cfs_mem_cache_t        *kgn_rx_cache;         /* rx descriptor space */
+       cfs_mem_cache_t        *kgn_tx_cache;         /* tx descriptor memory */
+       cfs_mem_cache_t        *kgn_tx_phys_cache;    /* tx phys descriptor memory */
+       atomic_t                kgn_ntx;              /* # tx in use */
+       cfs_mem_cache_t        *kgn_dgram_cache;      /* outgoing datagrams */
+
+       struct page          ***kgn_cksum_map_pages;  /* page arrays for mapping pages on checksum */
+       __u64                   kgn_cksum_npages;     /* Number of pages allocated for checksumming */
+       atomic_t                kgn_nvmap_cksum;      /* # times we vmapped for checksums */
+       atomic_t                kgn_nvmap_short;      /* # times we vmapped for short kiov */
+
+       atomic_t                kgn_nkmap_short;      /* # time we kmapped for a short kiov */
+       long                    kgn_rdmaq_override;   /* bytes per second override */
+
+       struct kmem_cache      *kgn_mbox_cache;       /* mailboxes from not-GART */
+
+       atomic_t                kgn_npending_unlink;  /* # of peers pending unlink */
+       atomic_t                kgn_npending_conns;   /* # of conns with pending closes */
+       atomic_t                kgn_npending_detach;  /* # of conns with a pending detach */
+
+} kgn_data_t;
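The barrier requirement described in the comment above (for kgn_quiesce_secs, kgn_bump_info_rdy and kgn_needs_pause) implies a writer-side pattern along these lines; this is a sketch only, with 'secs' standing in for whatever value the caller computed:

kgnilnd_data.kgn_quiesce_secs  = secs;  /* publish the data first                 */
smp_wmb();
kgnilnd_data.kgn_bump_info_rdy = 1;     /* then mark the bump info ready          */
smp_wmb();
kgnilnd_data.kgn_needs_pause   = 1;     /* finally request the pause              */
wake_up(&kgnilnd_data.kgn_ruhroh_waitq);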
+
+extern kgn_data_t         kgnilnd_data;
+extern kgn_tunables_t     kgnilnd_tunables;
+
+extern void kgnilnd_destroy_peer(kgn_peer_t *peer);
+extern void kgnilnd_destroy_conn(kgn_conn_t *conn);
+extern void kgnilnd_schedule_conn(kgn_conn_t *conn);
+
+static inline int
+kgnilnd_thread_start(int(*fn)(void *arg), void *arg, char *name, int id)
+{
+       struct task_struct *thrd = kthread_run(fn, arg, "%s_%02d", name, id);
+       if (IS_ERR(thrd))
+               return PTR_ERR(thrd);
+
+       atomic_inc(&kgnilnd_data.kgn_nthreads);
+       return 0;
+}
+
+static inline void
+kgnilnd_thread_fini(void)
+{
+       atomic_dec(&kgnilnd_data.kgn_nthreads);
+}
+
+/* like mutex_trylock but with a jiffies spinner. This is to allow certain
+ * parts of the code to avoid a scheduler trip when the mutex is held
+ *
+ * Try to acquire the mutex atomically for 1 jiffy. Returns 1 if the mutex
+ * has been acquired successfully, and 0 on contention.
+ *
+ * NOTE: this function follows the spin_trylock() convention, so
+ * it is negated to the down_trylock() return values! Be careful
+ * about this when converting semaphore users to mutexes.
+ *
+ * This function must not be used in interrupt context. The
+ * mutex must be released by the same task that acquired it.
+ */
+static inline int kgnilnd_mutex_trylock(struct mutex *lock)
+{
+       int             ret;
+       unsigned long   timeout;
+
+       LASSERT(!in_interrupt());
+
+       for (timeout = jiffies + 1; time_before(jiffies, timeout);) {
+
+               ret = mutex_trylock(lock);
+               if (ret)
+                       return ret;
+       }
+       return 0;
+}
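A hypothetical caller, showing the intent: spin for up to a jiffy to dodge a scheduler trip, then fall back to the normal sleeping lock on contention:

if (!kgnilnd_mutex_trylock(&dev->gnd_cq_mutex))
	mutex_lock(&dev->gnd_cq_mutex);         /* contended: sleep instead       */
/* ... work on the CQ while holding gnd_cq_mutex ... */
mutex_unlock(&dev->gnd_cq_mutex);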
+
+/* Copied from DEBUG_REQ in Lustre - the dance is needed to save stack space */
+
+extern void
+_kgnilnd_debug_msg(kgn_msg_t *msg,
+               struct libcfs_debug_msg_data *data, const char *fmt, ... );
+
+#define kgnilnd_debug_msg(msgdata, mask, cdls, msg, fmt, a...)                \
+do {                                                                          \
+       CFS_CHECK_STACK(msgdata, mask, cdls);                                 \
+                                                                             \
+       if (((mask) & D_CANTMASK) != 0 ||                                     \
+           ((libcfs_debug & (mask)) != 0 &&                                  \
+            (libcfs_subsystem_debug & DEBUG_SUBSYSTEM) != 0))                \
+               _kgnilnd_debug_msg((msg), msgdata, fmt, ##a);                 \
+} while(0)
+
+/* for most callers (level is a constant) this is resolved at compile time */
+#define GNIDBG_MSG(level, msg, fmt, args...)                                  \
+do {                                                                          \
+       if ((level) & (D_ERROR | D_WARNING | D_NETERROR)) {                   \
+           static cfs_debug_limit_state_t cdls;                              \
+           LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, level, &cdls);                \
+           kgnilnd_debug_msg(&msgdata, level, &cdls, msg,                    \
+                             "$$ "fmt" from %s ", ## args,                   \
+                             libcfs_nid2str((msg)->gnm_srcnid));             \
+       } else {                                                              \
+           LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, level, NULL);                 \
+           kgnilnd_debug_msg(&msgdata, level, NULL, msg,                     \
+                             "$$ "fmt" from %s ", ## args,                   \
+                             libcfs_nid2str((msg)->gnm_srcnid));             \
+       }                                                                     \
+} while (0)
+
+/* user puts 'to nid' in msg for us */
+#define GNIDBG_TOMSG(level, msg, fmt, args...)                                \
+do {                                                                          \
+       if ((level) & (D_ERROR | D_WARNING | D_NETERROR)) {                   \
+           static cfs_debug_limit_state_t cdls;                              \
+           LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, level, &cdls);                \
+           kgnilnd_debug_msg(&msgdata, level, &cdls, msg,                    \
+                             "$$ "fmt" ", ## args);                          \
+       } else {                                                              \
+           LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, level, NULL);                 \
+           kgnilnd_debug_msg(&msgdata, level, NULL, msg,                     \
+                             "$$ "fmt" ", ## args);                          \
+       }                                                                     \
+} while (0)
+
+extern void
+_kgnilnd_debug_conn(kgn_conn_t *conn,
+               struct libcfs_debug_msg_data *data, const char *fmt, ... );
+
+#define kgnilnd_debug_conn(msgdata, mask, cdls, conn, fmt, a...)               \
+do {                                                                           \
+       CFS_CHECK_STACK(msgdata, mask, cdls);                                  \
+                                                                              \
+       if (((mask) & D_CANTMASK) != 0 ||                                      \
+           ((libcfs_debug & (mask)) != 0 &&                                   \
+            (libcfs_subsystem_debug & DEBUG_SUBSYSTEM) != 0))                 \
+               _kgnilnd_debug_conn((conn), msgdata, fmt, ##a);                \
+} while(0)
+
+/* for most callers (level is a constant) this is resolved at compile time */
+#define GNIDBG_CONN(level, conn, fmt, args...)                                  \
+do {                                                                            \
+       if ((level) & (D_ERROR | D_WARNING | D_NETERROR)) {                     \
+           static cfs_debug_limit_state_t cdls;                                \
+           LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, level, &cdls);                  \
+           kgnilnd_debug_conn(&msgdata, level, &cdls, conn,                    \
+                              "$$ "fmt" ", ## args);                           \
+       } else {                                                                \
+           LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, level, NULL);                   \
+           kgnilnd_debug_conn(&msgdata, level, NULL, conn,                     \
+                              "$$ "fmt" ", ## args);                           \
+       }                                                                       \
+} while (0)
+
+extern void
+_kgnilnd_debug_tx(kgn_tx_t *tx,
+               struct libcfs_debug_msg_data *data, const char *fmt, ... );
+
+#define kgnilnd_debug_tx(msgdata, mask, cdls, tx, fmt, a...)                   \
+do {                                                                           \
+       CFS_CHECK_STACK(msgdata, mask, cdls);                                  \
+                                                                              \
+       if (((mask) & D_CANTMASK) != 0 ||                                      \
+           ((libcfs_debug & (mask)) != 0 &&                                   \
+            (libcfs_subsystem_debug & DEBUG_SUBSYSTEM) != 0))                 \
+               _kgnilnd_debug_tx((tx), msgdata, fmt, ##a);                    \
+} while(0)
+
+/* for most callers (level is a constant) this is resolved at compile time */
+#define GNIDBG_TX(level, tx, fmt, args...)                                      \
+do {                                                                            \
+       if ((level) & (D_ERROR | D_WARNING | D_NETERROR)) {                     \
+           static cfs_debug_limit_state_t cdls;                                \
+           LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, level, &cdls);                  \
+           kgnilnd_debug_tx(&msgdata, level, &cdls, tx,                        \
+                             "$$ "fmt" ", ## args);                            \
+       } else {                                                                \
+           LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, level, NULL);                   \
+           kgnilnd_debug_tx(&msgdata, level, NULL, tx,                         \
+                             "$$ "fmt" ", ## args);                            \
+       }                                                                       \
+} while (0)
+
+#define GNITX_ASSERTF(tx, cond, fmt, a...)                                      \
+({                                                                              \
+       if (unlikely(!(cond))) {                                                \
+               GNIDBG_TX(D_EMERG, tx, "ASSERTION(" #cond ") failed:" fmt, a);  \
+               LBUG();                                                         \
+       }                                                                       \
+})
+
+#define GNILND_IS_QUIESCED                                                      \
+       (atomic_read(&kgnilnd_data.kgn_nquiesce) ==                             \
+               atomic_read(&kgnilnd_data.kgn_nthreads))
+
+#define KGNILND_SPIN_QUIESCE                                                 \
+do {                                                                         \
+       /* E.T phone home */                                                 \
+       atomic_inc(&kgnilnd_data.kgn_nquiesce);                              \
+       CDEBUG(D_NET, "Waiting for thread pause to be over...\n");           \
+       while (kgnilnd_data.kgn_quiesce_trigger) {                           \
+               set_current_state(TASK_INTERRUPTIBLE);                       \
+               cfs_schedule_timeout_and_set_state(TASK_INTERRUPTIBLE,       \
+                       cfs_time_seconds(1));                                \
+       }                                                                    \
+       /* Mom, my homework is done */                                       \
+       CDEBUG(D_NET, "Waking up from thread pause\n");                      \
+       atomic_dec(&kgnilnd_data.kgn_nquiesce);                              \
+} while(0)
+
+/* use macros for addref/decref to get the calling function name in the CDEBUG */
+#ifndef LIBCFS_DEBUG
+#error "this code uses actions inside LASSERT for ref counting"
+#endif
+
+#define kgnilnd_admin_addref(atomic)                                     \
+do {                                                                            \
+       int     val = atomic_inc_return(&atomic);                               \
+       LASSERTF(val > 0,  #atomic " refcount %d\n", val);                       \
+       CDEBUG(D_NETTRACE, #atomic " refcount %d\n", val);                       \
+} while (0)
+
+#define kgnilnd_admin_decref(atomic)                                     \
+do {                                                                            \
+       int     val = atomic_dec_return(&atomic);                               \
+       LASSERTF(val >= 0, #atomic " refcount %d\n", val);                      \
+       CDEBUG(D_NETTRACE, #atomic " refcount %d\n", val);                      \
+} while (0)
+
+#define kgnilnd_net_addref(net)                                                 \
+do {                                                                            \
+       int     val = atomic_inc_return(&net->gnn_refcount);                    \
+       LASSERTF(val > 1, "net %p refcount %d\n", net, val);                    \
+       CDEBUG(D_NETTRACE, "net %p->%s++ (%d)\n", net,                          \
+               libcfs_nid2str(net->gnn_ni->ni_nid), val);                      \
+} while (0)
+
+#define kgnilnd_net_decref(net)                                                 \
+do {                                                                            \
+       int     val = atomic_dec_return(&net->gnn_refcount);                    \
+       LASSERTF(val >= 0, "net %p refcount %d\n", net, val);                   \
+       CDEBUG(D_NETTRACE, "net %p->%s-- (%d)\n", net,                          \
+              libcfs_nid2str(net->gnn_ni->ni_nid), val);                       \
+} while (0)
+
+#define kgnilnd_peer_addref(peer)                                               \
+do {                                                                            \
+       int     val = atomic_inc_return(&peer->gnp_refcount);                   \
+       LASSERTF(val > 1, "peer %p refcount %d\n", peer, val);                  \
+       CDEBUG(D_NETTRACE, "peer %p->%s++ (%d)\n", peer,                        \
+              libcfs_nid2str(peer->gnp_nid), val);                             \
+} while (0)
+
+#define kgnilnd_peer_decref(peer)                                               \
+do {                                                                            \
+       int     val = atomic_dec_return(&peer->gnp_refcount);                   \
+       LASSERTF(val >= 0, "peer %p refcount %d\n", peer, val);                 \
+       CDEBUG(D_NETTRACE, "peer %p->%s--(%d)\n", peer,                         \
+              libcfs_nid2str(peer->gnp_nid), val);                             \
+       if (atomic_read(&peer->gnp_refcount) == 0)                              \
+               kgnilnd_destroy_peer(peer);                                     \
+} while (0)
+
+#define kgnilnd_conn_addref(conn)                                       \
+do {                                                                    \
+       int     val;                                                    \
+                                                                       \
+       smp_wmb();                                                      \
+       val = atomic_inc_return(&conn->gnc_refcount);                   \
+       LASSERTF(val >= 0, "conn %p refc %d to %s\n",                   \
+               conn, val,                                              \
+               conn->gnc_peer                                          \
+                       ? libcfs_nid2str(conn->gnc_peer->gnp_nid)       \
+                       : "<?>");                                       \
+       CDEBUG(D_NETTRACE, "conn %p->%s++ (%d)\n", conn,                \
+               conn->gnc_peer                                          \
+                       ? libcfs_nid2str(conn->gnc_peer->gnp_nid)       \
+                       : "<?>",                                        \
+               val);                                                   \
+} while (0)
+
+/* we hijack conn_decref && gnc_refcount = 1 to allow us to push the conn
+ * through the scheduler thread to get the EP destroyed. This avoids some
+ * messy semaphore business and allows us to reuse the connd_list and existing
+ * linkage and avoid creating extra lists just for destroying EPs */
+
+/* Safety Disclaimer:
+ * Q: If we decrement the refcount and then check it again, is it possible that
+ *    another caller could have passed through this macro concurrently? If so,
+ *    then it is possible that both will attempt to call kgnilnd_destroy_conn().
+ *
+ * A: Yes, entirely possible in most cases, but we can't get concurrent users
+ * once we are refcount <= 2. It hinges on gnc_state and membership in
+ * gnc_hashlist. There are two ways to find a connection - either ask for
+ * it from the peer, kgnilnd_find_conn_locked(peer) or from the CQ id,
+ * kgnilnd_cqid2conn_locked(id). While a conn is live, we'll have at least
+ * 4 refcounts
+ *
+ * - #1 from create (kgnilnd_create_conn)
+ * - #2 for EP (kgnilnd_create_conn)
+ * - #3 - living on peer (gnc_list, kgnilnd_finish_connect)
+ * - #4 living in global hash (gnc_hashlist, kgnilnd_finish_connect).
+ *
+ * Actually, only 3 live, as at the end of kgnilnd_finish_connect, we drop:
+ * - #1 - the ref the dgram inherited from kgnilnd_create_conn.
+ *
+ * There could be more from TX descriptors during the lifetime of a live
+ * conn.
+ *
+ * If we nuke the conn before finish_connect, we won't have parallel paths
+ * because nobody besides the dgram handler for the single outstanding
+ * dgram can find the connection as it isn't in any searchable tables yet.
+ *
+ * This leaves connection close, we'll drop 2 refs (#4 and #3) but only
+ * after calling kgnilnd_schedule_conn, which would add a new ref (#5). At
+ * this point gnc_refcount=2 (#2, #5). We have a 'maybe' send of the CLOSE
+ * now on the next scheduler loop, this could be #6 (schedule_conn again)
+ * and #7 (TX on gnc_fmaq). Both would be cleared quickly as that TX is
+ * sent. Now the gnc_state == CLOSED, so we hit
+ * kgnilnd_complete_closed_conn. At this point, nobody can 'find' this conn
+ * - we've nuked them from the peer and CQ id tables, so we own them and
+ * are guaranteed serial access - hence the complete lack of conn list
+ * locking in kgnilnd_complete_closed_conn. We are free then to mark the
+ * conn DESTROY_EP (add #6 for schedule_conn), then lose #5 in
+ * kgnilnd_process_conns. Then the next scheduler loop would call
+ * kgnilnd_destroy_conn_ep (drop #2 for EP) and lose #6 (refcount=0) in
+ * kgnilnd_process_conns.
+ *
+ * Clearly, we are totally safe. Clearly.
+ */
+
+#define kgnilnd_conn_decref(conn)                                       \
+do {                                                                    \
+       int     val;                                                    \
+                                                                       \
+       smp_wmb();                                                      \
+       val = atomic_dec_return(&conn->gnc_refcount);                   \
+       LASSERTF(val >= 0, "conn %p refc %d to %s\n",                   \
+               conn, val,                                              \
+               conn->gnc_peer                                          \
+                       ? libcfs_nid2str(conn->gnc_peer->gnp_nid)       \
+                       : "<?>");                                       \
+       CDEBUG(D_NETTRACE, "conn %p->%s-- (%d)\n", conn,                \
+               conn->gnc_peer                                          \
+                       ? libcfs_nid2str(conn->gnc_peer->gnp_nid)       \
+                       : "<?>",                                        \
+               val);                                                   \
+       smp_rmb();                                                      \
+       if ((atomic_read(&conn->gnc_refcount) == 1) &&                  \
+           (conn->gnc_ephandle != NULL) &&                             \
+           (conn->gnc_state != GNILND_CONN_DESTROY_EP)) {              \
+               set_mb(conn->gnc_state, GNILND_CONN_DESTROY_EP);        \
+               kgnilnd_schedule_conn(conn);                            \
+       } else if (atomic_read(&conn->gnc_refcount) == 0) {             \
+               kgnilnd_destroy_conn(conn);                             \
+       }                                                               \
+} while (0)
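+
+/* Illustrative, compiled-out sketch: callers normally pin a conn with
+ * kgnilnd_conn_addref() while still holding kgn_peer_conn_lock, so the
+ * refcount cannot hit zero underneath them, then drop that ref with
+ * kgnilnd_conn_decref() once done - kgnilnd_validate_tx_ev_id() below
+ * follows the same shape. The helper name here is hypothetical. */
+#if 0
+static inline void
+kgnilnd_conn_ref_pattern_sketch(kgn_peer_t *peer)
+{
+       kgn_conn_t *conn;
+
+       read_lock(&kgnilnd_data.kgn_peer_conn_lock);
+       conn = kgnilnd_find_conn_locked(peer);
+       if (conn != NULL)
+               kgnilnd_conn_addref(conn);      /* pin before dropping lock */
+       read_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+
+       if (conn == NULL)
+               return;
+
+       /* ... use conn ... */
+
+       kgnilnd_conn_decref(conn);              /* may schedule EP destroy */
+}
+#endif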
+
+static inline struct list_head *
+kgnilnd_nid2peerlist(lnet_nid_t nid)
+{
+       unsigned int hash = ((unsigned int)LNET_NIDADDR(nid)) % *kgnilnd_tunables.kgn_peer_hash_size;
+
+       RETURN(&kgnilnd_data.kgn_peers[hash]);
+}
+
+static inline struct list_head *
+kgnilnd_netnum2netlist(__u16 netnum)
+{
+       unsigned int hash = ((unsigned int) netnum) % *kgnilnd_tunables.kgn_net_hash_size;
+
+       RETURN(&kgnilnd_data.kgn_nets[hash]);
+}
+
+static inline int
+kgnilnd_peer_active(kgn_peer_t *peer)
+{
+       /* Am I in the peer hash table? */
+       return (!list_empty(&peer->gnp_list));
+}
+
+/* need write_lock on kgn_peer_conn_lock */
+static inline int
+kgnilnd_can_unlink_peer_locked(kgn_peer_t *peer)
+{
+       CDEBUG(D_NET, "peer 0x%p->%s conns? %d tx? %d\n",
+               peer, libcfs_nid2str(peer->gnp_nid),
+               !list_empty(&peer->gnp_conns),
+               !list_empty(&peer->gnp_tx_queue));
+
+       /* kgn_peer_conn_lock protects us from conflict with
+        * kgnilnd_peer_notify and gnp_persistent */
+       RETURN ((list_empty(&peer->gnp_conns)) &&
+               (list_empty(&peer->gnp_tx_queue)));
+}
+
+/* returns positive if error was for a clean shutdown of conn */
+static inline int
+kgnilnd_conn_clean_errno(int errno)
+{
+       /*  - ESHUTDOWN - LND is unloading
+        *  - EUCLEAN - admin requested via "lctl del_peer"
+        *  - ENETRESET - admin requested via "lctl disconnect"
+        *  - ENOTRECOVERABLE - stack reset
+        *  - EISCONN - cleared via "lctl push"
+        *  not doing ESTALE - that isn't clean */
+       RETURN ((errno == 0) ||
+               (errno == -ESHUTDOWN) ||
+               (errno == -EUCLEAN) ||
+               (errno == -ENETRESET) ||
+               (errno == -EISCONN) ||
+               (errno == -ENOTRECOVERABLE));
+}
+
+/* returns positive if error results in purgatory hold */
+static inline int
+kgnilnd_check_purgatory_errno(int errno)
+{
+       /* We don't want a purgatory hold in these cases:
+        *  - EUCLEAN - admin requested via "lctl del_peer"
+        *  - ESHUTDOWN - LND is unloading
+        */
+       RETURN ((errno != -ESHUTDOWN) &&
+               (errno != -EUCLEAN));
+}
+
+/* returns positive if a purgatory hold is needed */
+static inline int
+kgnilnd_check_purgatory_conn(kgn_conn_t *conn)
+{
+       int loopback = 0;
+
+       if (conn->gnc_peer) {
+               loopback = conn->gnc_peer->gnp_nid ==
+                      conn->gnc_peer->gnp_net->gnn_ni->ni_nid;
+       } else {
+               /* short circuit - a conn that didn't complete
+                * setup never needs a purgatory hold */
+               RETURN(0);
+       }
+       CDEBUG(D_NETTRACE, "conn 0x%p->%s loopback %d close_recvd %d\n",
+               conn, conn->gnc_peer ?
+                               libcfs_nid2str(conn->gnc_peer->gnp_nid) :
+                               "<?>",
+               loopback, conn->gnc_close_recvd);
+
+       /* we only use a purgatory hold if we've not received the CLOSE msg
+        * from our peer - without that message, we can't know the state of
+        * the other end of this connection and must put it into purgatory
+        * to prevent reuse and corruption.
+        * The theory is that a TX error can be communicated in all other cases
+        */
+       RETURN(likely(!loopback) && !conn->gnc_close_recvd &&
+               kgnilnd_check_purgatory_errno(conn->gnc_error));
+}
+
+static inline const char *
+kgnilnd_tx_state2str(kgn_tx_list_state_t state);
+
+static inline struct list_head *
+kgnilnd_tx_state2list(kgn_peer_t *peer, kgn_conn_t *conn,
+                       kgn_tx_list_state_t to_state)
+{
+       switch (to_state) {
+       case GNILND_TX_PEERQ:
+               return &peer->gnp_tx_queue;
+       case GNILND_TX_FMAQ:
+               return &conn->gnc_fmaq;
+       case GNILND_TX_LIVE_FMAQ:
+       case GNILND_TX_LIVE_RDMAQ:
+       case GNILND_TX_DYING:
+               return NULL;
+       case GNILND_TX_MAPQ:
+               return &conn->gnc_device->gnd_map_tx;
+       case GNILND_TX_RDMAQ:
+               return &conn->gnc_device->gnd_rdmaq;
+       default:
+               /* IDLE, FREED or ALLOCD is not valid "on list" state */
+               CERROR("invalid state requested: %s\n",
+                       kgnilnd_tx_state2str(to_state));
+               LBUG();
+               break;
+       }
+}
+
+/* should hold tx, conn or peer lock when calling */
+static inline void
+kgnilnd_tx_add_state_locked(kgn_tx_t *tx, kgn_peer_t *peer,
+                       kgn_conn_t *conn, kgn_tx_list_state_t state,
+                       int add_tail)
+{
+       struct list_head        *list = NULL;
+
+       /* make sure we have a sane TX state to start */
+       GNITX_ASSERTF(tx, (tx->tx_list_p == NULL &&
+                 tx->tx_list_state == GNILND_TX_ALLOCD) &&
+               list_empty(&tx->tx_list),
+               "bad state with tx_list %s",
+               list_empty(&tx->tx_list) ? "empty" : "not empty");
+
+       /* WTF - you are already on that state buttmunch */
+       GNITX_ASSERTF(tx, state != tx->tx_list_state,
+                     "already at %s", kgnilnd_tx_state2str(state));
+
+       /* get proper list from the state requested */
+       list = kgnilnd_tx_state2list(peer, conn, state);
+
+       /* add refcount */
+       switch (state) {
+       case GNILND_TX_PEERQ:
+               kgnilnd_peer_addref(peer);
+               break;
+       case GNILND_TX_ALLOCD:
+               /* no refs needed */
+               break;
+       case GNILND_TX_FMAQ:
+               kgnilnd_conn_addref(conn);
+               break;
+       case GNILND_TX_MAPQ:
+               atomic_inc(&conn->gnc_device->gnd_nq_map);
+               kgnilnd_conn_addref(conn);
+               break;
+       case GNILND_TX_LIVE_FMAQ:
+               atomic_inc(&conn->gnc_nlive_fma);
+               kgnilnd_conn_addref(conn);
+               break;
+       case GNILND_TX_LIVE_RDMAQ:
+               atomic_inc(&conn->gnc_nlive_rdma);
+               kgnilnd_conn_addref(conn);
+               break;
+       case GNILND_TX_RDMAQ:
+               atomic_inc(&conn->gnc_nq_rdma);
+               kgnilnd_conn_addref(conn);
+               break;
+       case GNILND_TX_DYING:
+               kgnilnd_conn_addref(conn);
+               break;
+       default:
+               CERROR("invalid state requested: %s\n",
+                       kgnilnd_tx_state2str(state));
+               LBUG();
+               break;
+       }
+
+       /* if this changes, change kgnilnd_alloc_tx */
+       tx->tx_list_state = state;
+
+       /* some states don't have lists - we track them in the per conn
+        * TX table instead. Waste not, want not! */
+       if (list != NULL) {
+               tx->tx_list_p = list;
+               if (add_tail)
+                       list_add_tail(&tx->tx_list, list);
+               else
+                       list_add(&tx->tx_list, list);
+       } else {
+               /* set dummy list_p to make bookkeeping happy and let debugging
+                * be a hair easier */
+               tx->tx_list_p = (void *)state;
+       }
+
+       GNIDBG_TX(D_NET, tx, "onto %s->0x%p",
+                 kgnilnd_tx_state2str(state), list);
+}
+
+static inline void
+kgnilnd_tx_del_state_locked(kgn_tx_t *tx, kgn_peer_t *peer,
+                       kgn_conn_t *conn, kgn_tx_list_state_t new_state)
+{
+       /* There is only one "off-list" state */
+       GNITX_ASSERTF(tx, new_state == GNILND_TX_ALLOCD,
+                     "invalid new_state %s", kgnilnd_tx_state2str(new_state));
+
+       /* new_state == ALLOCD means we are deallocating this tx,
+        * so make sure it was on a valid list to start with */
+       GNITX_ASSERTF(tx, (tx->tx_list_p != NULL) &&
+                     (((tx->tx_list_state == GNILND_TX_LIVE_FMAQ) ||
+                       (tx->tx_list_state == GNILND_TX_LIVE_RDMAQ) ||
+                       (tx->tx_list_state == GNILND_TX_DYING)) == list_empty(&tx->tx_list)),
+                     "bad state", NULL);
+
+       GNIDBG_TX(D_NET, tx, "off %p", tx->tx_list_p);
+
+       /* drop refcount */
+       switch (tx->tx_list_state) {
+       case GNILND_TX_PEERQ:
+               kgnilnd_peer_decref(peer);
+               break;
+       case GNILND_TX_FREED:
+       case GNILND_TX_IDLE:
+       case GNILND_TX_ALLOCD:
+               /* no refs needed */
+               break;
+       case GNILND_TX_DYING:
+               kgnilnd_conn_decref(conn);
+               break;
+       case GNILND_TX_FMAQ:
+               kgnilnd_conn_decref(conn);
+               break;
+       case GNILND_TX_MAPQ:
+               atomic_dec(&conn->gnc_device->gnd_nq_map);
+               kgnilnd_conn_decref(conn);
+               break;
+       case GNILND_TX_LIVE_FMAQ:
+               atomic_dec(&conn->gnc_nlive_fma);
+               kgnilnd_conn_decref(conn);
+               break;
+       case GNILND_TX_LIVE_RDMAQ:
+               atomic_dec(&conn->gnc_nlive_rdma);
+               kgnilnd_conn_decref(conn);
+               break;
+       case GNILND_TX_RDMAQ:
+               atomic_dec(&conn->gnc_nq_rdma);
+               kgnilnd_conn_decref(conn);
+               break;
+       /* don't need to assert on default, already did in set */
+       }
+
+       /* for ALLOCD, this might already be true, but no harm doing it again */
+       list_del_init(&tx->tx_list);
+       tx->tx_list_p = NULL;
+       tx->tx_list_state = new_state;
+}
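+
+/* Illustrative, compiled-out sketch: a TX is moved between lists by first
+ * pulling it back to the off-list ALLOCD state and then adding it to the
+ * destination state, all under the lock protecting both lists. The helper
+ * name is hypothetical. */
+#if 0
+static inline void
+kgnilnd_tx_requeue_sketch(kgn_tx_t *tx, kgn_peer_t *peer, kgn_conn_t *conn)
+{
+       kgnilnd_tx_del_state_locked(tx, peer, conn, GNILND_TX_ALLOCD);
+       kgnilnd_tx_add_state_locked(tx, peer, conn, GNILND_TX_FMAQ, 1);
+}
+#endif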
+
+static inline int
+kgnilnd_tx_mapped(kgn_tx_t *tx)
+{
+       return (tx->tx_buftype == GNILND_BUF_VIRT_MAPPED ||
+               tx->tx_buftype == GNILND_BUF_PHYS_MAPPED);
+}
+
+static inline struct list_head *
+kgnilnd_cqid2connlist(__u32 cqid)
+{
+       unsigned int hash = cqid % *kgnilnd_tunables.kgn_peer_hash_size;
+
+       return (&kgnilnd_data.kgn_conns[hash]);
+}
+
+static inline kgn_conn_t *
+kgnilnd_cqid2conn_locked(__u32 cqid)
+{
+       struct list_head *conns = kgnilnd_cqid2connlist(cqid);
+       struct list_head *tmp;
+       kgn_conn_t       *conn;
+
+       list_for_each(tmp, conns) {
+               conn = list_entry(tmp, kgn_conn_t, gnc_hashlist);
+
+               if (conn->gnc_cqid == cqid)
+                       return conn;
+       }
+
+       return NULL;
+}
+
+/* returns 1..GNILND_MAX_CQID on success, 0 on failure */
+static inline __u32
+kgnilnd_get_cqid_locked(void)
+{
+       int     looped = 0;
+       __u32   cqid;
+
+       do {
+               cqid = kgnilnd_data.kgn_next_cqid++;
+               if (kgnilnd_data.kgn_next_cqid >= GNILND_MAX_CQID) {
+                       if (looped) {
+                               return 0;
+                       }
+                       kgnilnd_data.kgn_next_cqid = 1;
+                       looped = 1;
+               }
+       } while (kgnilnd_cqid2conn_locked(cqid) != NULL);
+
+       return cqid;
+}
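+
+/* Illustrative, compiled-out sketch: a new conn would be assigned a CQ id
+ * and hashed under the write lock so kgnilnd_cqid2conn_locked() can find it
+ * afterwards; the helper name and the -E2BIG return are hypothetical. */
+#if 0
+static inline int
+kgnilnd_assign_cqid_sketch(kgn_conn_t *conn)
+{
+       write_lock(&kgnilnd_data.kgn_peer_conn_lock);
+       conn->gnc_cqid = kgnilnd_get_cqid_locked();
+       if (conn->gnc_cqid == 0) {
+               write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+               return -E2BIG;          /* no free CQ ids */
+       }
+       list_add_tail(&conn->gnc_hashlist,
+                     kgnilnd_cqid2connlist(conn->gnc_cqid));
+       write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+       return 0;
+}
+#endif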
+
+static inline void
+kgnilnd_validate_tx_ev_id(kgn_tx_ev_id_t *ev_id, kgn_tx_t **txp, kgn_conn_t **connp)
+{
+       kgn_tx_t        *tx = NULL;
+       kgn_conn_t      *conn = NULL;
+
+       /* set to NULL so any early return is an error */
+       *txp = NULL;
+       *connp = NULL;
+
+       LASSERTF((ev_id->txe_idx > 0) &&
+                (ev_id->txe_idx < GNILND_MAX_MSG_ID),
+               "bogus txe_idx %d >= %d\n",
+               ev_id->txe_idx, GNILND_MAX_MSG_ID);
+
+       LASSERTF((ev_id->txe_cqid > 0) &&
+                (ev_id->txe_cqid < GNILND_MAX_CQID),
+               "bogus txe_cqid %d >= %d\n",
+               ev_id->txe_cqid, GNILND_MAX_CQID);
+
+       read_lock(&kgnilnd_data.kgn_peer_conn_lock);
+       conn = kgnilnd_cqid2conn_locked(ev_id->txe_cqid);
+
+       if (conn == NULL) {
+               /* Conn was destroyed? */
+               read_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+               CDEBUG(D_NET, "CQID %d lookup failed\n", ev_id->txe_cqid);
+               return;
+       }
+       /* just insurance */
+       kgnilnd_conn_addref(conn);
+       read_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+
+       /* we know this is safe - as the TX won't be reused until AFTER
+        * the conn is unlinked from the cqid hash, so we can use the TX
+        * (serializing to avoid any cache oddness) freely from the conn tx ref table */
+
+       spin_lock(&conn->gnc_tx_lock);
+       tx = conn->gnc_tx_ref_table[ev_id->txe_idx];
+       spin_unlock(&conn->gnc_tx_lock);
+
+       /* We could have a tx that was cleared out by other forces
+        * lctl disconnect or del_peer. */
+       if (tx == NULL) {
+               CNETERR("txe_idx %d is gone, ignoring event\n", ev_id->txe_idx);
+               kgnilnd_conn_decref(conn);
+               return;
+       }
+
+       /* check tx->tx_msg magic to make sure kgni didn't eat it */
+       GNITX_ASSERTF(tx, tx->tx_msg.gnm_magic == GNILND_MSG_MAGIC,
+                     "came back from kgni with bad magic %x", tx->tx_msg.gnm_magic);
+
+       GNITX_ASSERTF(tx, tx->tx_id.txe_idx == ev_id->txe_idx,
+                     "conn 0x%p->%s tx_ref_table hosed: wanted txe_idx %d "
+                     "found tx %p txe_idx %d",
+                     conn, libcfs_nid2str(conn->gnc_peer->gnp_nid),
+                     ev_id->txe_idx, tx, tx->tx_id.txe_idx);
+
+       GNITX_ASSERTF(tx, tx->tx_conn != NULL, "tx with NULL connection", NULL);
+
+       GNITX_ASSERTF(tx, tx->tx_conn == conn, "tx conn does not equal conn", NULL);
+
+       *txp = tx;
+       *connp = conn;
+
+       GNIDBG_TX(D_NET, tx, "validated to 0x%p", conn);
+}
+
+/* set_normalized_timespec isn't exported from the kernel, so
+ * we need to do the same thing inline */
+static inline struct timespec
+kgnilnd_ts_sub(struct timespec lhs, struct timespec rhs)
+{
+       time_t                  sec;
+       long                    nsec;
+       struct timespec         ts;
+
+       sec = lhs.tv_sec - rhs.tv_sec;
+       nsec = lhs.tv_nsec - rhs.tv_nsec;
+
+       while (nsec >= NSEC_PER_SEC) {
+               nsec -= NSEC_PER_SEC;
+               ++sec;
+       }
+       while (nsec < 0) {
+               nsec += NSEC_PER_SEC;
+               --sec;
+       }
+       ts.tv_sec = sec;
+       ts.tv_nsec = nsec;
+       return ts;
+}
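+
+/* Illustrative, compiled-out sketch: typical use is computing the elapsed
+ * time between two timespec samples. The helper name is hypothetical. */
+#if 0
+static inline long
+kgnilnd_elapsed_ns_sketch(struct timespec start, struct timespec stop)
+{
+       struct timespec delta = kgnilnd_ts_sub(stop, start);
+
+       return delta.tv_sec * NSEC_PER_SEC + delta.tv_nsec;
+}
+#endif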
+
+static inline int
+kgnilnd_count_list(struct list_head *q)
+{
+       struct list_head *e;
+       int               n = 0;
+
+       list_for_each(e, q) {
+               n++;
+       }
+
+       return n;
+}
+
+/* kgnilnd_find_net adds a reference to the net it finds.
+ * This is so the net will not be removed before the calling function
+ * has time to use the data returned. This reference needs to be released
+ * by the calling function once it has finished using the returned net.
+ */
+
+static inline int
+kgnilnd_find_net(lnet_nid_t nid, kgn_net_t **netp)
+{
+       kgn_net_t *net;
+       int rc;
+
+       rc = down_read_trylock(&kgnilnd_data.kgn_net_rw_sem);
+
+       if (!rc) {
+               return -ESHUTDOWN;
+       }
+
+       list_for_each_entry(net, kgnilnd_netnum2netlist(LNET_NETNUM(LNET_NIDNET(nid))), gnn_list) {
+               if (!net->gnn_shutdown && LNET_NIDNET(net->gnn_ni->ni_nid) == LNET_NIDNET(nid)) {
+                       kgnilnd_net_addref(net);
+                       up_read(&kgnilnd_data.kgn_net_rw_sem);
+                       *netp = net;
+                       return 0;
+               }
+       }
+
+       up_read(&kgnilnd_data.kgn_net_rw_sem);
+
+       return -ENONET;
+}
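+
+/* Illustrative, compiled-out sketch: every successful kgnilnd_find_net()
+ * must be paired with a kgnilnd_net_decref() once the caller is done with
+ * the returned net. The helper name is hypothetical. */
+#if 0
+static inline int
+kgnilnd_use_net_sketch(lnet_nid_t nid)
+{
+       kgn_net_t *net;
+       int        rc;
+
+       rc = kgnilnd_find_net(nid, &net);
+       if (rc != 0)
+               return rc;
+
+       /* ... use net ... */
+
+       kgnilnd_net_decref(net);
+       return 0;
+}
+#endif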
+
+#ifdef CONFIG_DEBUG_SLAB
+#define KGNILND_POISON(ptr, c, s) do {} while(0)
+#else
+#define KGNILND_POISON(ptr, c, s) memset(ptr, c, s)
+#endif
+
+int kgnilnd_dev_init(kgn_device_t *dev);
+void kgnilnd_dev_fini(kgn_device_t *dev);
+int kgnilnd_startup(lnet_ni_t *ni);
+void kgnilnd_shutdown(lnet_ni_t *ni);
+int kgnilnd_base_startup(void);
+void kgnilnd_base_shutdown(void);
+
+int kgnilnd_allocate_phys_fmablk(kgn_device_t *device);
+int kgnilnd_map_phys_fmablk(kgn_device_t *device);
+void kgnilnd_unmap_phys_fmablk(kgn_device_t *device);
+void kgnilnd_free_phys_fmablk(kgn_device_t *device);
+
+int kgnilnd_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg);
+void kgnilnd_query(lnet_ni_t *ni, lnet_nid_t nid, cfs_time_t *when);
+int kgnilnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg);
+int kgnilnd_eager_recv(lnet_ni_t *ni, void *private,
+                       lnet_msg_t *lntmsg, void **new_private);
+int kgnilnd_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg,
+               int delayed, unsigned int niov,
+               struct iovec *iov, lnet_kiov_t *kiov,
+               unsigned int offset, unsigned int mlen, unsigned int rlen);
+
+__u16 kgnilnd_cksum_kiov(unsigned int nkiov, lnet_kiov_t *kiov, unsigned int offset, unsigned int nob, int dump_blob);
+
+/* purgatory functions */
+void kgnilnd_add_purgatory_locked(kgn_conn_t *conn, kgn_peer_t *peer);
+void kgnilnd_mark_for_detach_purgatory_all_locked(kgn_peer_t *peer);
+void kgnilnd_detach_purgatory_locked(kgn_conn_t *conn, struct list_head *conn_list);
+void kgnilnd_release_purgatory_list(struct list_head *conn_list);
+
+void kgnilnd_update_reaper_timeout(long timeout);
+void kgnilnd_unmap_buffer(kgn_tx_t *tx, int error);
+kgn_tx_t *kgnilnd_new_tx_msg(int type, lnet_nid_t source);
+void kgnilnd_tx_done(kgn_tx_t *tx, int completion);
+void kgnilnd_txlist_done(struct list_head *txlist, int error);
+void kgnilnd_unlink_peer_locked(kgn_peer_t *peer);
+void kgnilnd_schedule_conn(kgn_conn_t *conn);
+void kgnilnd_schedule_process_conn(kgn_conn_t *conn, int sched_intent);
+
+void kgnilnd_schedule_dgram(kgn_device_t *dev);
+int kgnilnd_create_peer_safe(kgn_peer_t **peerp, lnet_nid_t nid, kgn_net_t *net);
+void kgnilnd_add_peer_locked(lnet_nid_t nid, kgn_peer_t *new_stub_peer, kgn_peer_t **peerp);
+int kgnilnd_add_peer(kgn_net_t *net, lnet_nid_t nid, kgn_peer_t **peerp);
+
+kgn_peer_t *kgnilnd_find_peer_locked(lnet_nid_t nid);
+int kgnilnd_del_conn_or_peer(kgn_net_t *net, lnet_nid_t nid, int command, int error);
+void kgnilnd_peer_increase_reconnect_locked(kgn_peer_t *peer);
+void kgnilnd_queue_reply(kgn_conn_t *conn, kgn_tx_t *tx);
+void kgnilnd_queue_tx(kgn_conn_t *conn, kgn_tx_t *tx);
+void kgnilnd_launch_tx(kgn_tx_t *tx, kgn_net_t *net, lnet_process_id_t *target);
+int kgnilnd_send_mapped_tx(kgn_tx_t *tx, int try_map_if_full);
+void kgnilnd_consume_rx(kgn_rx_t *rx);
+
+void kgnilnd_schedule_device(kgn_device_t *dev);
+void kgnilnd_device_callback(__u32 devid, __u64 arg);
+void kgnilnd_schedule_device_timer(unsigned long arg);
+
+int kgnilnd_reaper(void *arg);
+int kgnilnd_scheduler(void *arg);
+int kgnilnd_dgram_mover(void *arg);
+
+int kgnilnd_create_conn(kgn_conn_t **connp, kgn_device_t *dev);
+int kgnilnd_conn_isdup_locked(kgn_peer_t *peer, kgn_conn_t *newconn);
+kgn_conn_t *kgnilnd_find_conn_locked(kgn_peer_t *peer);
+int kgnilnd_get_conn(kgn_conn_t **connp, kgn_peer_t *peer);
+kgn_conn_t *kgnilnd_find_or_create_conn_locked(kgn_peer_t *peer);
+void kgnilnd_peer_cancel_tx_queue(kgn_peer_t *peer);
+void kgnilnd_cancel_peer_connect_locked(kgn_peer_t *peer, struct list_head *zombies);
+int kgnilnd_close_stale_conns_locked(kgn_peer_t *peer, kgn_conn_t *newconn);
+void kgnilnd_peer_alive(kgn_peer_t *peer);
+void kgnilnd_peer_notify(kgn_peer_t *peer, int error);
+void kgnilnd_close_conn_locked(kgn_conn_t *conn, int error);
+void kgnilnd_close_conn(kgn_conn_t *conn, int error);
+void kgnilnd_complete_closed_conn(kgn_conn_t *conn);
+void kgnilnd_destroy_conn_ep(kgn_conn_t *conn);
+
+int kgnilnd_close_peer_conns_locked(kgn_peer_t *peer, int why);
+
+int kgnilnd_tunables_init(void);
+void kgnilnd_tunables_fini(void);
+void kgnilnd_init_msg(kgn_msg_t *msg, int type, lnet_nid_t source);
+
+void kgnilnd_bump_timeouts(__u32 nap_time, char *reason);
+void kgnilnd_pause_threads(void);
+int kgnilnd_hw_in_quiesce(void);
+int kgnilnd_check_hw_quiesce(void);
+void kgnilnd_quiesce_wait(char *reason);
+void kgnilnd_quiesce_end_callback(gni_nic_handle_t nic_handle, uint64_t msecs);
+int kgnilnd_ruhroh_thread(void *arg);
+void kgnilnd_reset_stack(void);
+void kgnilnd_critical_error(gni_err_handle_t err_handle);
+
+void kgnilnd_insert_sysctl(void);
+void kgnilnd_remove_sysctl(void);
+void kgnilnd_proc_init(void);
+void kgnilnd_proc_fini(void);
+
+/* gnilnd_conn.c */
+void kgnilnd_release_mbox(kgn_conn_t *conn, int purgatory_hold);
+
+int kgnilnd_find_and_cancel_dgram(kgn_device_t *dev, lnet_nid_t dst_nid);
+void kgnilnd_cancel_dgram_locked(kgn_dgram_t *dgram);
+void kgnilnd_release_dgram(kgn_device_t *dev, kgn_dgram_t *dgram);
+
+int kgnilnd_setup_wildcard_dgram(kgn_device_t *dev);
+int kgnilnd_cancel_net_dgrams(kgn_net_t *net);
+int kgnilnd_cancel_wc_dgrams(kgn_device_t *dev);
+void kgnilnd_wait_for_canceled_dgrams(kgn_device_t *dev);
+
+int kgnilnd_dgram_waitq(void *arg);
+
+int kgnilnd_set_conn_params(kgn_dgram_t *dgram);
+
+/* struct2str functions - we don't use a default: case, so that the compiler
+ * will complain if there is a missing case. This allows us to hide these
+ * down here out of the way but ensure we'll catch any updates to the
+ * enum/types above */
+
+#define DO_TYPE(x) case x: return #x;
+static inline const char *
+kgnilnd_fmablk_state2str(kgn_fmablk_state_t state)
+{
+       /* Only want single char string for this */
+       switch (state) {
+       case GNILND_FMABLK_IDLE:
+               return "I";
+       case GNILND_FMABLK_PHYS:
+               return "P";
+       case GNILND_FMABLK_VIRT:
+               return "V";
+       case GNILND_FMABLK_FREED:
+               return "F";
+       }
+       return "<unknown state>";
+}
+
+static inline const char *
+kgnilnd_msgtype2str(int type)
+{
+       switch (type) {
+               DO_TYPE(GNILND_MSG_NONE);
+               DO_TYPE(GNILND_MSG_NOOP);
+               DO_TYPE(GNILND_MSG_IMMEDIATE);
+               DO_TYPE(GNILND_MSG_PUT_REQ);
+               DO_TYPE(GNILND_MSG_PUT_NAK);
+               DO_TYPE(GNILND_MSG_PUT_ACK);
+               DO_TYPE(GNILND_MSG_PUT_DONE);
+               DO_TYPE(GNILND_MSG_GET_REQ);
+               DO_TYPE(GNILND_MSG_GET_NAK);
+               DO_TYPE(GNILND_MSG_GET_DONE);
+               DO_TYPE(GNILND_MSG_CLOSE);
+       }
+       return "<unknown msg type>";
+}
+
+static inline const char *
+kgnilnd_tx_state2str(kgn_tx_list_state_t state)
+{
+       switch (state) {
+               DO_TYPE(GNILND_TX_IDLE);
+               DO_TYPE(GNILND_TX_ALLOCD);
+               DO_TYPE(GNILND_TX_PEERQ);
+               DO_TYPE(GNILND_TX_MAPQ);
+               DO_TYPE(GNILND_TX_FMAQ);
+               DO_TYPE(GNILND_TX_LIVE_FMAQ);
+               DO_TYPE(GNILND_TX_RDMAQ);
+               DO_TYPE(GNILND_TX_LIVE_RDMAQ);
+               DO_TYPE(GNILND_TX_DYING);
+               DO_TYPE(GNILND_TX_FREED);
+       }
+       return "<unknown state>";
+}
+
+static inline const char *
+kgnilnd_conn_state2str(kgn_conn_t *conn)
+{
+       kgn_conn_state_t state = conn->gnc_state;
+       switch (state) {
+               DO_TYPE(GNILND_CONN_DUMMY);
+               DO_TYPE(GNILND_CONN_LISTEN);
+               DO_TYPE(GNILND_CONN_CONNECTING);
+               DO_TYPE(GNILND_CONN_ESTABLISHED);
+               DO_TYPE(GNILND_CONN_CLOSING);
+               DO_TYPE(GNILND_CONN_CLOSED);
+               DO_TYPE(GNILND_CONN_DONE);
+               DO_TYPE(GNILND_CONN_DESTROY_EP);
+       }
+       return "<?state?>";
+}
+
+static inline const char *
+kgnilnd_connreq_type2str(kgn_connreq_t *connreq)
+{
+       kgn_connreq_type_t type = connreq->gncr_type;
+
+       switch (type) {
+               DO_TYPE(GNILND_CONNREQ_REQ);
+               DO_TYPE(GNILND_CONNREQ_NAK);
+               DO_TYPE(GNILND_CONNREQ_CLOSE);
+       }
+       return "<?type?>";
+}
+
+static inline const char *
+kgnilnd_dgram_state2str(kgn_dgram_t *dgram)
+{
+       kgn_dgram_state_t state = dgram->gndg_state;
+
+       switch (state) {
+               DO_TYPE(GNILND_DGRAM_USED);
+               DO_TYPE(GNILND_DGRAM_POSTING);
+               DO_TYPE(GNILND_DGRAM_POSTED);
+               DO_TYPE(GNILND_DGRAM_PROCESSING);
+               DO_TYPE(GNILND_DGRAM_DONE);
+               DO_TYPE(GNILND_DGRAM_CANCELED);
+       }
+       return "<?state?>";
+}
+
+static inline const char *
+kgnilnd_dgram_type2str(kgn_dgram_t *dgram)
+{
+       kgn_dgram_type_t type = dgram->gndg_type;
+
+       switch (type) {
+               DO_TYPE(GNILND_DGRAM_REQ);
+               DO_TYPE(GNILND_DGRAM_WC_REQ);
+               DO_TYPE(GNILND_DGRAM_NAK);
+               DO_TYPE(GNILND_DGRAM_CLOSE);
+       }
+       return "<?type?>";
+}
+
+
+#undef DO_TYPE
+
+/* API wrapper functions - include late to pick up all of the other defines */
+#include "gnilnd_api_wrap.h"
+
+#endif /* _GNILND_GNILND_H_ */
diff --git a/lnet/klnds/gnilnd/gnilnd_api_wrap.h b/lnet/klnds/gnilnd/gnilnd_api_wrap.h
new file mode 100644 (file)
index 0000000..e7ba9ab
--- /dev/null
@@ -0,0 +1,1505 @@
+/*
+ * Copyright (C) 2009-2012 Cray, Inc.
+ *
+ *   Author: Nic Henke <nic@cray.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+#ifndef _GNILND_API_WRAP_H
+#define _GNILND_API_WRAP_H
+
+/* LNet is allocated the failure location range 0xe000 to 0xffff */
+
+/* GNILND has 0xf0XX */
+#define CFS_FAIL_GNI                   0xf000
+#define CFS_FAIL_GNI_PHYS_MAP          0xf001
+#define CFS_FAIL_GNI_VIRT_MAP          0xf002
+#define CFS_FAIL_GNI_GET_UNMAP         0xf003
+#define CFS_FAIL_GNI_PUT_UNMAP         0xf004
+#define CFS_FAIL_GNI_MAP_TX            0xf005
+#define CFS_FAIL_GNI_SMSG_SEND         0xf006
+#define CFS_FAIL_GNI_CLOSE_SEND                0xf007
+#define CFS_FAIL_GNI_CDM_CREATE                0xf008
+#define CFS_FAIL_GNI_CDM_DESTROY       0xf009
+#define CFS_FAIL_GNI_CDM_ATTACH                0xf00a
+#define CFS_FAIL_GNI_CQ_CREATE         0xf00b
+#define CFS_FAIL_GNI_CQ_DESTROY                0xf00c
+#define CFS_FAIL_GNI_EP_BIND           0xf00d
+#define CFS_FAIL_GNI_EP_UNBIND         0xf00e
+#define CFS_FAIL_GNI_EP_SET_EVDATA     0xf00f
+#define CFS_FAIL_GNI_SMSG_INIT         0xf010
+#define CFS_FAIL_GNI_SMSG_RELEASE      0xf011
+#define CFS_FAIL_GNI_POST_RDMA         0xf012
+#define CFS_FAIL_GNI_GET_COMPLETED     0xf013
+#define CFS_FAIL_GNI_EP_DESTROY                0xf015
+#define CFS_FAIL_GNI_VIRT_UNMAP                0xf016
+#define CFS_FAIL_GNI_MDD_RELEASE       0xf017
+#define CFS_FAIL_GNI_NOOP_SEND         0xf018
+#define CFS_FAIL_GNI_ERR_SUBSCRIBE     0xf01a
+#define CFS_FAIL_GNI_QUIESCE_RACE      0xf01b
+#define CFS_FAIL_GNI_DG_TERMINATE      0xf01c
+#define CFS_FAIL_GNI_REG_QUIESCE       0xf01d
+#define CFS_FAIL_GNI_IN_QUIESCE                0xf01e
+#define CFS_FAIL_GNI_DELAY_RDMA                0xf01f
+#define CFS_FAIL_GNI_SR_DOWN_RACE      0xf020
+#define CFS_FAIL_GNI_ALLOC_TX          0xf021
+#define CFS_FAIL_GNI_FMABLK_AVAIL      0xf022
+#define CFS_FAIL_GNI_EP_CREATE         0xf023
+#define CFS_FAIL_GNI_CQ_GET_EVENT      0xf024
+#define CFS_FAIL_GNI_PROBE             0xf025
+#define CFS_FAIL_GNI_EP_TEST           0xf026
+#define CFS_FAIL_GNI_CONNREQ_DROP      0xf027
+#define CFS_FAIL_GNI_CONNREQ_PROTO     0xf028
+#define CFS_FAIL_GNI_CONND_PILEUP      0xf029
+#define CFS_FAIL_GNI_PHYS_SETUP                0xf02a
+#define CFS_FAIL_GNI_FIND_TARGET       0xf02b
+#define CFS_FAIL_GNI_WC_DGRAM_FREE     0xf02c
+#define CFS_FAIL_GNI_DROP_CLOSING      0xf02d
+#define CFS_FAIL_GNI_RX_CLOSE_CLOSING  0xf02e
+#define CFS_FAIL_GNI_RX_CLOSE_CLOSED   0xf02f
+#define CFS_FAIL_GNI_EP_POST           0xf030
+#define CFS_FAIL_GNI_PACK_SRCNID       0xf031
+#define CFS_FAIL_GNI_PACK_DSTNID       0xf032
+#define CFS_FAIL_GNI_PROBE_WAIT                0xf033
+#define CFS_FAIL_GNI_SMSG_CKSUM1       0xf034
+#define CFS_FAIL_GNI_SMSG_CKSUM2       0xf035
+#define CFS_FAIL_GNI_SMSG_CKSUM3       0xf036
+#define CFS_FAIL_GNI_DROP_DESTROY_EP   0xf037
+#define CFS_FAIL_GNI_SMSG_GETNEXT      0xf038
+#define CFS_FAIL_GNI_FINISH_PURG       0xf039
+#define CFS_FAIL_GNI_PURG_REL_DELAY    0xf03a
+#define CFS_FAIL_GNI_DONT_NOTIFY       0xf03b
+#define CFS_FAIL_GNI_VIRT_SMALL_MAP    0xf03c
+#define CFS_FAIL_GNI_DELAY_RDMAQ       0xf03d
+#define CFS_FAIL_GNI_PAUSE_SHUTDOWN    0xf03e
+#define CFS_FAIL_GNI_PAUSE_DGRAM_COMP  0xf03f
+#define CFS_FAIL_GNI_NET_LOOKUP                0xf040
+#define CFS_FAIL_GNI_RECV_TIMEOUT      0xf041
+#define CFS_FAIL_GNI_SEND_TIMEOUT      0xf042
+#define CFS_FAIL_GNI_ONLY_NOOP         0xf043
+#define CFS_FAIL_GNI_FINISH_PURG2      0xf044
+#define CFS_FAIL_GNI_RACE_RESET                0xf045
+#define CFS_FAIL_GNI_GNP_CONNECTING1   0xf046
+#define CFS_FAIL_GNI_GNP_CONNECTING2   0xf047
+#define CFS_FAIL_GNI_GNP_CONNECTING3   0xf048
+#define CFS_FAIL_GNI_PUT_ACK_AGAIN     0xf050
+#define CFS_FAIL_GNI_GET_REQ_AGAIN     0xf051
+
+/* helper macros */
+extern void
+_kgnilnd_api_rc_lbug(const char *rcstr, int rc, struct libcfs_debug_msg_data *data,
+                       const char *fmt, ...)
+       __attribute__ ((format (printf, 4, 5)));
+
+#define kgnilnd_api_rc_lbug(msgdata, rc, fmt, a...)                            \
+do {                                                                           \
+       CFS_CHECK_STACK(msgdata, D_ERROR, NULL);                                \
+       /* we don't mask this - it is always at D_ERROR */                      \
+       _kgnilnd_api_rc_lbug(kgnilnd_api_rc2str(rc), (rc), msgdata, fmt, ##a);  \
+} while (0)
+
+#define DO_RETCODE(x) case x: return #x;
+static inline const char *
+kgnilnd_api_rc2str(gni_return_t rrc)
+{
+
+       switch (rrc) {
+               DO_RETCODE(GNI_RC_SUCCESS)
+               DO_RETCODE(GNI_RC_NOT_DONE);
+               DO_RETCODE(GNI_RC_INVALID_PARAM);
+               DO_RETCODE(GNI_RC_ERROR_RESOURCE);
+               DO_RETCODE(GNI_RC_TIMEOUT);
+               DO_RETCODE(GNI_RC_PERMISSION_ERROR);
+               DO_RETCODE(GNI_RC_DESCRIPTOR_ERROR);
+               DO_RETCODE(GNI_RC_ALIGNMENT_ERROR);
+               DO_RETCODE(GNI_RC_INVALID_STATE);
+               DO_RETCODE(GNI_RC_NO_MATCH);
+               DO_RETCODE(GNI_RC_SIZE_ERROR);
+               DO_RETCODE(GNI_RC_TRANSACTION_ERROR);
+               DO_RETCODE(GNI_RC_ILLEGAL_OP);
+               DO_RETCODE(GNI_RC_ERROR_NOMEM);
+       }
+       LBUG();
+}
+#undef DO_RETCODE
+
+/* log an error and LBUG for unhandled rc from gni api function
+ * the fmt should be something like:
+ *  gni_api_call(arg1, arg2, arg3)
+ */
+
+/* apick_fn and apick_fmt should be defined for each site */
+#undef apick_fn
+#undef apick_fmt
+
+#define GNILND_API_RC_LBUG(args...)                                            \
+do {                                                                           \
+       LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, D_ERROR, NULL);                     \
+       kgnilnd_api_rc_lbug(&msgdata, rrc, apick_fn"("apick_fmt")", ##args);    \
+} while (0)
+
+#define GNILND_API_SWBUG(args...)                                               \
+do {                                                                            \
+       CERROR("likely SOFTWARE BUG "apick_fn"("apick_fmt") rc %s\n",           \
+                ##args, kgnilnd_api_rc2str(rrc));                              \
+} while (0)
+
+#define GNILND_API_EINVAL(args...)                                              \
+do {                                                                            \
+       CERROR("invalid parameter to "apick_fn"("apick_fmt") rc %s\n",          \
+                ##args, kgnilnd_api_rc2str(rrc));                              \
+} while (0)
+
+#define GNILND_API_RESOURCE(args...)                                            \
+do {                                                                            \
+       CERROR("no resources for "apick_fn"("apick_fmt") rc %s\n",              \
+               ##args, kgnilnd_api_rc2str(rrc));                               \
+} while (0)
+
+#define GNILND_API_BUSY(args...)                                                \
+do {                                                                            \
+       CERROR("resources busy for "apick_fn"("apick_fmt") rc %s\n",            \
+               ##args, kgnilnd_api_rc2str(rrc));                               \
+} while (0)
+
+#undef DEBUG_SMSG_CREDITS
+#ifdef DEBUG_SMSG_CREDITS
+#define CRAY_CONFIG_GHAL_GEMINI
+#include <gni_priv.h>
+#define GNIDBG_SMSG_CREDS(level, conn)                                        \
+do {                                                                          \
+       gni_ep_smsg_mbox_t *smsg = conn->gnc_ephandle->smsg;                  \
+       CDEBUG(level, "SMSGDBG: conn %p mcred %d/%d bcred %d/%d "             \
+               "s_seq %d/%d/%d r_seq %d/%d/%d retr %d\n",                    \
+               conn, smsg->mbox_credits, smsg->back_mbox_credits,            \
+               smsg->buffer_credits, smsg->back_buffer_credits,              \
+               smsg->s_seqno, smsg->s_seqno_back_mbox_credits,               \
+               smsg->s_seqno_back_buffer_credits, smsg->r_seqno,             \
+               smsg->r_seqno_back_mbox_credits,                              \
+               smsg->r_seqno_back_buffer_credits, smsg->retransmit_count);   \
+} while (0)
+#else
+#define GNIDBG_SMSG_CREDS(level, conn) do {} while(0)
+#endif
+
+/* these are all wrappers around gni_XXX functions.
+ * This allows us to handle all the return codes and api checks without
+ * dirtying up the logic code */
+
+/* TODO: RETURN wrapper that translates integer to GNI API RC string */
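+
+/* Illustrative, compiled-out sketch of the caller side: a caller only deals
+ * with the return codes a wrapper lets through - anything unexpected has
+ * already been logged or LBUGed inside the wrapper. The helper name, the
+ * placeholder arguments and the -ENODEV mapping are hypothetical. */
+#if 0
+static inline int
+kgnilnd_wrapper_caller_sketch(void)
+{
+       gni_cdm_handle_t        domain;
+       gni_return_t            rrc;
+
+       rrc = kgnilnd_cdm_create(0, 1, 1, 0, &domain);
+       if (rrc != GNI_RC_SUCCESS) {
+               CNETERR("cdm_create failed: %s\n", kgnilnd_api_rc2str(rrc));
+               return -ENODEV;
+       }
+       return 0;
+}
+#endif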
+
+#define apick_fn "kgnilnd_cdm_create"
+#define apick_fmt "%u, %u, %u, %u, 0x%p"
+static inline gni_return_t kgnilnd_cdm_create(
+               IN uint32_t             inst_id,
+               IN uint8_t              ptag,
+               IN uint32_t             cookie,
+               IN uint32_t             modes,
+               OUT gni_cdm_handle_t    *cdm_hndl
+               )
+{
+       gni_return_t rrc;
+
+       /* Error injection */
+       if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CDM_CREATE)) {
+               rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_INVALID_PARAM;
+       } else {
+               rrc = gni_cdm_create(inst_id, ptag, cookie, modes, cdm_hndl);
+       }
+
+       switch (rrc)  {
+       case GNI_RC_SUCCESS:
+               break;
+       case GNI_RC_ERROR_RESOURCE:
+       case GNI_RC_INVALID_PARAM:
+               /* Try to bail gracefully */
+               GNILND_API_SWBUG(
+                       inst_id, ptag, cookie, modes, cdm_hndl);
+               break;
+       default:
+               GNILND_API_RC_LBUG(
+                       inst_id, ptag, cookie, modes, cdm_hndl);
+
+               /* LBUG never returns, but just for style and consistency */
+               break;
+       }
+       RETURN(rrc);
+}
+
+#undef apick_fn
+#undef apick_fmt
+
+#define apick_fn "kgnilnd_cdm_attach"
+#define apick_fmt "0x%p, %u, 0x%p, 0x%p"
+static inline gni_return_t kgnilnd_cdm_attach(
+               IN gni_cdm_handle_t     cdm_hndl,
+               IN uint32_t             device_id,
+               OUT uint32_t            *local_addr,
+               OUT gni_nic_handle_t    *nic_hndl
+               )
+{
+       gni_return_t rrc;
+
+       /* Error injection */
+       if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CDM_ATTACH)) {
+               rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_INVALID_PARAM;
+       } else {
+               rrc = gni_cdm_attach(cdm_hndl, device_id, local_addr, nic_hndl);
+       }
+
+       switch (rrc)  {
+       case GNI_RC_SUCCESS:
+               break;
+       case GNI_RC_NO_MATCH:
+       case GNI_RC_INVALID_PARAM:
+               GNILND_API_SWBUG(
+                       cdm_hndl, device_id, local_addr, nic_hndl);
+               break;
+       case GNI_RC_ERROR_RESOURCE:
+       case GNI_RC_INVALID_STATE:
+               GNILND_API_RESOURCE(
+                       cdm_hndl, device_id, local_addr, nic_hndl);
+               break;
+       default:
+               GNILND_API_RC_LBUG(
+                       cdm_hndl, device_id, local_addr, nic_hndl);
+
+               /* LBUG never returns, but just for style and consistency */
+               break;
+       }
+       RETURN(rrc);
+}
+#undef apick_fmt
+#undef apick_fn
+
+#define apick_fn "kgnilnd_cdm_destroy"
+#define apick_fmt "0x%p"
+static inline gni_return_t kgnilnd_cdm_destroy(
+               IN gni_cdm_handle_t     cdm_hndl
+               )
+{
+       gni_return_t rrc;
+
+       /* Error injection */
+       if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CDM_DESTROY)) {
+               rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_INVALID_PARAM;
+       } else {
+               rrc = gni_cdm_destroy(
+                       cdm_hndl);
+       }
+
+       switch (rrc)  {
+       case GNI_RC_SUCCESS:
+               break;
+       case GNI_RC_INVALID_PARAM:
+               GNILND_API_SWBUG(
+                       cdm_hndl);
+               break;
+       default:
+               GNILND_API_RC_LBUG(
+                       cdm_hndl);
+
+               /* LBUG never returns, but just for style and consistency */
+               break;
+       }
+       RETURN(rrc);
+}
+#undef apick_fn
+#undef apick_fmt
+
+#define apick_fn "kgnilnd_subscribe_errors"
+#define apick_fmt "0x%p,%x,%u,0x%p,0x%p,0x%p"
+static inline gni_return_t kgnilnd_subscribe_errors(
+               IN gni_nic_handle_t  nic_handle,
+               IN gni_error_mask_t  mask,
+               IN uint32_t          EEQ_size,
+               IN void              (*EQ_new_event)(gni_err_handle_t),
+               IN void              (*app_crit_err)(gni_err_handle_t),
+               OUT gni_err_handle_t *err_handle
+               )
+{
+       gni_return_t rrc;
+
+       /* Error injection */
+       if (CFS_FAIL_CHECK(CFS_FAIL_GNI_ERR_SUBSCRIBE)) {
+               rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_INVALID_PARAM;
+       } else {
+               rrc = gni_subscribe_errors(
+                       nic_handle, mask, EEQ_size, EQ_new_event, app_crit_err,
+                       err_handle);
+       }
+
+       switch (rrc)  {
+       case GNI_RC_SUCCESS:
+               break;
+       case GNI_RC_INVALID_PARAM:
+               GNILND_API_SWBUG(
+                       nic_handle, mask, EEQ_size, EQ_new_event, app_crit_err,
+                       err_handle);
+               break;
+       case GNI_RC_ERROR_RESOURCE:
+               GNILND_API_RESOURCE(
+                       nic_handle, mask, EEQ_size, EQ_new_event, app_crit_err,
+                       err_handle);
+               break;
+       default:
+               GNILND_API_RC_LBUG(
+                       nic_handle, mask, EEQ_size, EQ_new_event, app_crit_err,
+                       err_handle);
+               /* LBUG never returns, but just for style and consistency */
+               break;
+       }
+       RETURN(rrc);
+}
+#undef apick_fn
+#undef apick_fmt
+
+#define apick_fn "kgnilnd_release_errors"
+#define apick_fmt "0x%p"
+static inline gni_return_t kgnilnd_release_errors(
+               IN gni_err_handle_t err_handle
+               )
+{
+       gni_return_t rrc;
+
+       rrc = gni_release_errors(
+                       err_handle);
+
+       switch (rrc)  {
+       case GNI_RC_SUCCESS:
+               break;
+       case GNI_RC_INVALID_PARAM:
+       case GNI_RC_NOT_DONE:
+               GNILND_API_SWBUG(
+                       err_handle);
+               break;
+       default:
+               GNILND_API_RC_LBUG(
+                       err_handle);
+               /* LBUG never returns, but just for style and consistency */
+               break;
+       }
+       RETURN(rrc);
+}
+#undef apick_fn
+#undef apick_fmt
+
+#define apick_fn "kgnilnd_set_quiesce_callback"
+#define apick_fmt "0x%p,0x%p"
+static inline gni_return_t kgnilnd_set_quiesce_callback(
+               IN gni_nic_handle_t  nic_handle,
+               IN void              (*qsce_func)(gni_nic_handle_t, uint64_t msecs)
+               )
+{
+       gni_return_t rrc;
+
+       /* Error injection */
+       if (CFS_FAIL_CHECK(CFS_FAIL_GNI_REG_QUIESCE)) {
+               rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_INVALID_PARAM;
+       } else {
+               rrc = gni_set_quiesce_callback(
+                       nic_handle, qsce_func);
+       }
+
+       switch (rrc)  {
+       case GNI_RC_SUCCESS:
+               break;
+       case GNI_RC_INVALID_STATE:
+       case GNI_RC_INVALID_PARAM:
+               GNILND_API_SWBUG(
+                       nic_handle, qsce_func);
+               break;
+       default:
+               GNILND_API_RC_LBUG(
+                       nic_handle, qsce_func);
+               /* LBUG never returns, but just for style and consistency */
+               break;
+       }
+       RETURN(rrc);
+}
+#undef apick_fn
+#undef apick_fmt
+
+#define apick_fn "kgnilnd_get_quiesce_status"
+#define apick_fmt "0x%p"
+static inline gni_return_t kgnilnd_get_quiesce_status(
+               IN gni_nic_handle_t  nic_handle
+               )
+{
+       uint32_t rrc;
+
+       /* this has weird RC -
+        * 0 - quiesce not in progress
+        * 1 - quiesce is turned on
+       */
+
+       /* Error injection */
+       if (CFS_FAIL_CHECK(CFS_FAIL_GNI_IN_QUIESCE)) {
+               rrc = 1;
+       } else {
+               rrc = gni_get_quiesce_status(
+                       nic_handle);
+       }
+
+       switch (rrc)  {
+       case 1:
+       case 0:
+               break;
+       default:
+               GNILND_API_RC_LBUG(
+                       nic_handle);
+               /* LBUG never returns, but just for style and consistency */
+               break;
+       }
+       RETURN(rrc);
+}
+#undef apick_fn
+#undef apick_fmt
+
+#define apick_fn "kgnilnd_cq_create"
+#define apick_fmt "0x%p, %u, %u, 0x%p, "LPX64", 0x%p"
+static inline gni_return_t kgnilnd_cq_create(
+               IN gni_nic_handle_t     nic_hndl,
+               IN uint32_t             entry_count,
+               IN uint32_t             delay_index,
+               IN gni_cq_event_hndlr_f *event_handler,
+               IN uint64_t             usr_event_data,
+               OUT gni_cq_handle_t     *cq_hndl
+               )
+{
+       gni_return_t rrc;
+
+       /* Error injection */
+       if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CQ_CREATE)) {
+               rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_INVALID_PARAM;
+       } else {
+               rrc = gni_cq_create(
+                      nic_hndl, entry_count, delay_index, event_handler,
+                       usr_event_data, cq_hndl);
+       }
+
+       switch (rrc)  {
+       case GNI_RC_SUCCESS:
+               break;
+       case GNI_RC_INVALID_PARAM:
+               GNILND_API_SWBUG(
+                       nic_hndl, entry_count, delay_index, event_handler,
+                       usr_event_data, cq_hndl);
+               break;
+       case GNI_RC_ERROR_RESOURCE:
+               GNILND_API_RESOURCE(
+                       nic_hndl, entry_count, delay_index, event_handler,
+                       usr_event_data, cq_hndl);
+               break;
+       default:
+               GNILND_API_RC_LBUG(
+                       nic_hndl, entry_count, delay_index, event_handler,
+                       usr_event_data, cq_hndl);
+
+               /* LBUG never returns, but just for style and consistency */
+               break;
+       }
+       RETURN(rrc);
+}
+#undef apick_fn
+#undef apick_fmt
+
+#define apick_fn "kgnilnd_cq_destroy"
+#define apick_fmt "0x%p"
+static inline gni_return_t kgnilnd_cq_destroy(
+               IN gni_cq_handle_t cq_hndl
+               )
+{
+       gni_return_t rrc;
+
+       /* Error injection */
+       if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CQ_DESTROY)) {
+               rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_INVALID_PARAM;
+       } else {
+
+               rrc = gni_cq_destroy(
+                       cq_hndl);
+       }
+
+       switch (rrc)  {
+       case GNI_RC_SUCCESS:
+               break;
+       case GNI_RC_INVALID_PARAM:
+               GNILND_API_SWBUG(
+                       cq_hndl);
+               break;
+       case GNI_RC_ERROR_RESOURCE:
+               GNILND_API_BUSY(
+                       cq_hndl);
+               break;
+       default:
+               GNILND_API_RC_LBUG(
+                       cq_hndl);
+
+               /* LBUG never returns, but just for style and consistency */
+               break;
+       }
+       RETURN(rrc);
+}
+#undef apick_fn
+#undef apick_fmt
+
+#define apick_fn "kgnilnd_cq_get_event"
+#define apick_fmt "0x%p, 0x%p"
+static inline gni_return_t kgnilnd_cq_get_event(
+               IN gni_cq_handle_t cq_hndl,
+               OUT gni_cq_entry_t *event_data
+               )
+{
+       gni_return_t rrc;
+
+       /* no error injection - CQs are touchy about the data.
+        * where appropriate, we'll do this on the CQs that should be able to
+        * handle the various errors */
+       rrc = gni_cq_get_event(
+                       cq_hndl, event_data);
+
+       switch (rrc)  {
+       case GNI_RC_SUCCESS:
+       case GNI_RC_NOT_DONE:
+       case GNI_RC_TRANSACTION_ERROR:
+               break;
+       case GNI_RC_ERROR_RESOURCE:
+               LASSERTF(GNI_CQ_OVERRUN(*event_data),
+                        "kgni returned ERROR_RESOURCE but cq_hndl 0x%p is not "
+                        "overrun\n", cq_hndl);
+               break;
+       case GNI_RC_INVALID_PARAM:
+               GNILND_API_SWBUG(
+                       cq_hndl, event_data);
+               break;
+       default:
+               GNILND_API_RC_LBUG(
+                       cq_hndl, event_data);
+
+               /* LBUG never returns, but just for style and consistency */
+               break;
+       }
+       return rrc;
+}
+#undef apick_fn
+#undef apick_fmt
+
+#define apick_fn "kgnilnd_smsg_init"
+#define apick_fmt "0x%p, 0x%p, 0x%p"
+static inline gni_return_t kgnilnd_smsg_init(
+               IN gni_ep_handle_t      ep_hndl,
+               IN gni_smsg_attr_t      *local_smsg_attr,
+               IN gni_smsg_attr_t      *remote_smsg_attr
+               )
+{
+       gni_return_t rrc;
+
+       /* Error injection */
+       if (CFS_FAIL_CHECK(CFS_FAIL_GNI_SMSG_INIT)) {
+               rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_ERROR_RESOURCE;
+       } else {
+               rrc = gni_smsg_init(
+                       ep_hndl, local_smsg_attr, remote_smsg_attr);
+       }
+
+       switch (rrc)  {
+       /* both of these are OK, upper SW needs to handle */
+       case GNI_RC_SUCCESS:
+       case GNI_RC_NOT_DONE:
+               break;
+       case GNI_RC_INVALID_PARAM:
+       case GNI_RC_INVALID_STATE:
+               GNILND_API_SWBUG(
+                       ep_hndl, local_smsg_attr, remote_smsg_attr);
+               break;
+       case GNI_RC_ERROR_RESOURCE:
+               GNILND_API_RESOURCE(
+                       ep_hndl, local_smsg_attr, remote_smsg_attr);
+               break;
+       default:
+               GNILND_API_RC_LBUG(
+                       ep_hndl, local_smsg_attr, remote_smsg_attr);
+
+               /* LBUG never returns, but just for style and consistency */
+               break;
+       }
+       RETURN(rrc);
+}
+#undef apick_fn
+#undef apick_fmt
+
+#define apick_fn "kgnilnd_smsg_send"
+#define apick_fmt "0x%p, 0x%p, %u, 0x%p, %u, %u"
+static inline gni_return_t kgnilnd_smsg_send(
+               IN gni_ep_handle_t      ep_hndl,
+               IN void                 *header,
+               IN uint32_t             header_length,
+               IN void                 *data,
+               IN uint32_t             data_length,
+               IN uint32_t             msg_id
+               )
+{
+       gni_return_t rrc;
+
+       /* Error injection */
+       if (CFS_FAIL_CHECK(CFS_FAIL_GNI_SMSG_SEND)) {
+               if (cfs_fail_loc & CFS_FAIL_RAND) {
+                       rrc = GNI_RC_NOT_DONE;
+               } else {
+                       rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_NOT_DONE;
+               }
+       } else {
+               rrc = gni_smsg_send(
+                       ep_hndl, header, header_length, data, data_length, msg_id);
+       }
+
+       switch (rrc)  {
+       /* both of these are OK, upper SW needs to handle */
+       case GNI_RC_SUCCESS:
+       case GNI_RC_NOT_DONE:
+               break;
+       case GNI_RC_INVALID_PARAM:
+               GNILND_API_SWBUG(
+                       ep_hndl, header, header_length, data, data_length, msg_id);
+               break;
+       case GNI_RC_ERROR_RESOURCE:
+               GNILND_API_RESOURCE(
+                       ep_hndl, header, header_length, data, data_length, msg_id);
+               break;
+       default:
+               GNILND_API_RC_LBUG(
+                       ep_hndl, header, header_length, data, data_length, msg_id);
+
+               /* LBUG never returns, but just for style and consistency */
+               break;
+       }
+       RETURN(rrc);
+}
+#undef apick_fn
+#undef apick_fmt
+
+#define apick_fn "kgnilnd_smsg_getnext"
+#define apick_fmt "0x%p,0x%p"
+static inline gni_return_t kgnilnd_smsg_getnext(
+               IN gni_ep_handle_t      ep_hndl,
+               OUT void                **header
+               )
+{
+       gni_return_t rrc;
+
+       /* Error injection */
+       if (CFS_FAIL_CHECK(CFS_FAIL_GNI_SMSG_GETNEXT)) {
+               rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_NOT_DONE;
+       } else {
+               rrc = gni_smsg_getnext(
+                       ep_hndl, header);
+       }
+
+       switch (rrc)  {
+       /* these are OK, upper SW needs to handle */
+       case GNI_RC_SUCCESS:
+       case GNI_RC_NOT_DONE:
+       case GNI_RC_INVALID_STATE:
+               break;
+       case GNI_RC_INVALID_PARAM:
+               GNILND_API_SWBUG(
+                       ep_hndl, header);
+               break;
+       default:
+               GNILND_API_RC_LBUG(
+                       ep_hndl, header);
+
+               /* LBUG never returns, but just for style and consistency */
+               break;
+       }
+       RETURN(rrc);
+}
+#undef apick_fn
+#undef apick_fmt
+
+#define apick_fn "kgnilnd_smsg_release"
+#define apick_fmt "0x%p"
+static inline gni_return_t kgnilnd_smsg_release(
+               IN gni_ep_handle_t      ep_hndl
+               )
+{
+       gni_return_t rrc;
+
+       /* Error injection */
+       if (CFS_FAIL_CHECK(CFS_FAIL_GNI_SMSG_RELEASE)) {
+               rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_INVALID_PARAM;
+       } else {
+               rrc = gni_smsg_release(
+                       ep_hndl);
+       }
+
+       switch (rrc)  {
+       /* both of these are OK, upper SW needs to handle */
+       case GNI_RC_SUCCESS:
+       case GNI_RC_NOT_DONE:
+               break;
+       case GNI_RC_INVALID_PARAM:
+               GNILND_API_SWBUG(
+                       ep_hndl);
+               break;
+       default:
+               GNILND_API_RC_LBUG(
+                       ep_hndl);
+
+               /* LBUG never returns, but just for style and consistency */
+               break;
+       }
+       RETURN(rrc);
+}
+#undef apick_fn
+#undef apick_fmt
+
+#define apick_fn "kgnilnd_ep_create"
+#define apick_fmt "0x%p, 0x%p, 0x%p"
+static inline gni_return_t kgnilnd_ep_create(
+               IN gni_nic_handle_t     nic_hndl,
+               IN gni_cq_handle_t      src_cq_hndl,
+               OUT gni_ep_handle_t     *ep_hndl
+               )
+{
+       gni_return_t rrc;
+
+       /* error injection */
+       if (CFS_FAIL_CHECK(CFS_FAIL_GNI_EP_CREATE)) {
+               rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_ERROR_NOMEM;
+       } else {
+               rrc = gni_ep_create(
+                       nic_hndl, src_cq_hndl, ep_hndl);
+       }
+
+       switch (rrc)  {
+       case GNI_RC_SUCCESS:
+               break;
+       case GNI_RC_INVALID_PARAM:
+               GNILND_API_SWBUG(
+                       nic_hndl, src_cq_hndl, ep_hndl);
+               break;
+       case GNI_RC_ERROR_NOMEM:
+               GNILND_API_RESOURCE(
+                       nic_hndl, src_cq_hndl, ep_hndl);
+               break;
+       default:
+               GNILND_API_RC_LBUG(
+                       nic_hndl, src_cq_hndl, ep_hndl);
+
+               /* LBUG never returns, but just for style and consistency */
+               break;
+       }
+       RETURN(rrc);
+}
+#undef apick_fn
+#undef apick_fmt
+
+#define apick_fn "kgnilnd_ep_bind"
+#define apick_fmt "0x%p, %x, %x"
+static inline gni_return_t kgnilnd_ep_bind(
+               IN gni_ep_handle_t      ep_hndl,
+               IN uint32_t             remote_addr,
+               IN uint32_t             remote_id
+               )
+{
+       gni_return_t rrc;
+
+       /* error injection */
+       if (CFS_FAIL_CHECK(CFS_FAIL_GNI_EP_BIND)) {
+               rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_NOT_DONE;
+       } else {
+               rrc = gni_ep_bind(
+                       ep_hndl, remote_addr, remote_id);
+       }
+
+       switch (rrc)  {
+       /* both of these are OK, upper SW needs to handle */
+       case GNI_RC_SUCCESS:
+       case GNI_RC_NOT_DONE:
+               break;
+       case GNI_RC_INVALID_PARAM:
+               GNILND_API_SWBUG(
+                       ep_hndl, remote_addr, remote_id);
+               break;
+       default:
+               GNILND_API_RC_LBUG(
+                       ep_hndl, remote_addr, remote_id);
+
+               /* LBUG never returns, but just for style and consistency */
+               break;
+       }
+       RETURN(rrc);
+}
+#undef apick_fn
+#undef apick_fmt
+
+#define apick_fn "kgnilnd_ep_set_eventdata"
+#define apick_fmt "0x%p, %x, %x"
+static inline gni_return_t kgnilnd_ep_set_eventdata(
+               IN gni_ep_handle_t      ep_hndl,
+               IN uint32_t             local_event,
+               IN uint32_t             remote_event
+               )
+{
+       gni_return_t rrc;
+
+       /* Error injection */
+       if (CFS_FAIL_CHECK(CFS_FAIL_GNI_EP_SET_EVDATA)) {
+               rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_INVALID_PARAM;
+       } else {
+               rrc = gni_ep_set_eventdata(
+                       ep_hndl, local_event, remote_event);
+       }
+
+       switch (rrc)  {
+       case GNI_RC_SUCCESS:
+               break;
+       case GNI_RC_INVALID_PARAM:
+               GNILND_API_SWBUG(
+                       ep_hndl, local_event, remote_event);
+               break;
+       default:
+               GNILND_API_RC_LBUG(
+                       ep_hndl, local_event, remote_event);
+
+               /* LBUG never returns, but just for style and consistency */
+               break;
+       }
+       RETURN(rrc);
+}
+#undef apick_fn
+#undef apick_fmt
+
+#define apick_fn "kgnilnd_ep_unbind"
+#define apick_fmt "0x%p"
+static inline gni_return_t kgnilnd_ep_unbind(
+               IN gni_ep_handle_t      ep_hndl
+               )
+{
+       gni_return_t rrc;
+
+       /* Error injection */
+       if (CFS_FAIL_CHECK(CFS_FAIL_GNI_EP_UNBIND)) {
+               rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_NOT_DONE;
+       } else {
+               rrc = gni_ep_unbind(
+                       ep_hndl);
+       }
+
+       switch (rrc)  {
+       /* both of these are OK, upper SW needs to handle */
+       case GNI_RC_NOT_DONE:
+       case GNI_RC_SUCCESS:
+               break;
+       case GNI_RC_INVALID_PARAM:
+               GNILND_API_SWBUG(
+                       ep_hndl);
+               break;
+       default:
+               GNILND_API_RC_LBUG(
+                       ep_hndl);
+
+               /* LBUG never returns, but just for style and consistency */
+               break;
+       }
+       RETURN(rrc);
+}
+#undef apick_fn
+#undef apick_fmt
+
+#define apick_fn "kgnilnd_ep_destroy"
+#define apick_fmt "0x%p"
+static inline gni_return_t kgnilnd_ep_destroy(
+               IN gni_ep_handle_t      ep_hndl
+               )
+{
+       gni_return_t rrc;
+
+       /* Error injection */
+       if (CFS_FAIL_CHECK(CFS_FAIL_GNI_EP_DESTROY)) {
+               rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_NOT_DONE;
+       } else {
+               rrc = gni_ep_destroy(
+                       ep_hndl);
+       }
+
+       switch (rrc)  {
+       case GNI_RC_SUCCESS:
+               break;
+       case GNI_RC_INVALID_PARAM:
+               GNILND_API_SWBUG(
+                       ep_hndl);
+               break;
+       default:
+               GNILND_API_RC_LBUG(
+                       ep_hndl);
+
+               /* LBUG never returns, but just for style and consistency */
+               break;
+       }
+       RETURN(rrc);
+}
+#undef apick_fn
+#undef apick_fmt
+
+#define apick_fn "kgnilnd_ep_postdata_w_id"
+#define apick_fmt "0x%p, 0x%p, %d, 0x%p, %d, "LPU64""
+static inline gni_return_t kgnilnd_ep_postdata_w_id(
+               IN gni_ep_handle_t ep_hndl,
+               IN void            *in_data,
+               IN uint16_t        data_len,
+               IN void            *out_buf,
+               IN uint16_t        buf_size,
+               IN uint64_t        datagram_id
+               )
+{
+       gni_return_t rrc;
+
+       /* Error injection */
+       if (CFS_FAIL_CHECK(CFS_FAIL_GNI_EP_POST)) {
+               rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_SIZE_ERROR;
+       } else {
+               rrc = gni_ep_postdata_w_id(
+                       ep_hndl, in_data, data_len, out_buf, buf_size,
+                       datagram_id);
+       }
+
+       switch (rrc)  {
+       case GNI_RC_SUCCESS:
+       case GNI_RC_ERROR_NOMEM:
+       case GNI_RC_ERROR_RESOURCE:
+               break;
+       case GNI_RC_INVALID_PARAM:
+       case GNI_RC_SIZE_ERROR:
+               GNILND_API_SWBUG(
+                       ep_hndl, in_data, data_len, out_buf, buf_size,
+                       datagram_id);
+               break;
+       default:
+               GNILND_API_RC_LBUG(
+                       ep_hndl, in_data, data_len, out_buf, buf_size,
+                       datagram_id);
+
+               /* LBUG never returns, but just for style and consistency */
+               break;
+       }
+       RETURN(rrc);
+}
+#undef apick_fn
+#undef apick_fmt
+
+#define apick_fn "kgnilnd_ep_postdata_test_by_id"
+#define apick_fmt "0x%p, "LPU64", 0x%p, 0x%p, 0x%p"
+static inline gni_return_t kgnilnd_ep_postdata_test_by_id(
+               IN gni_ep_handle_t      ep_hndl,
+               IN uint64_t             datagram_id,
+               OUT gni_post_state_t    *post_state,
+               OUT uint32_t            *remote_addr,
+               OUT uint32_t            *remote_id
+               )
+{
+       gni_return_t rrc;
+
+       /* Error injection */
+       if (CFS_FAIL_CHECK(CFS_FAIL_GNI_EP_TEST)) {
+               rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_ERROR_NOMEM;
+       } else {
+               rrc = gni_ep_postdata_test_by_id(
+                       ep_hndl, datagram_id, post_state, remote_addr,
+                       remote_id);
+
+               /* we want to lie, but we need to do the actual work first
+                * so we don't keep getting the event saying a dgram is ready */
+               if (rrc == GNI_RC_SUCCESS && CFS_FAIL_CHECK(CFS_FAIL_GNI_DG_TERMINATE)) {
+                       /* don't use fail_val, allows us to do FAIL_SOME */
+                       *post_state = GNI_POST_TERMINATED;
+               }
+       }
+
+       switch (rrc)  {
+       case GNI_RC_SUCCESS:
+       case GNI_RC_NO_MATCH:
+               break;
+       case GNI_RC_SIZE_ERROR:
+       case GNI_RC_INVALID_PARAM:
+               GNILND_API_SWBUG(
+                       ep_hndl, datagram_id, post_state, remote_addr,
+                       remote_id);
+               break;
+       case GNI_RC_ERROR_NOMEM:
+               GNILND_API_RESOURCE(
+                       ep_hndl, datagram_id, post_state, remote_addr,
+                       remote_id);
+               break;
+       default:
+               GNILND_API_RC_LBUG(
+                       ep_hndl, datagram_id, post_state, remote_addr,
+                       remote_id);
+
+               /* LBUG never returns, but just for style and consistency */
+               break;
+       }
+       RETURN(rrc);
+}
+#undef apick_fn
+#undef apick_fmt
+
+#define apick_fn "kgnilnd_ep_postdata_cancel_by_id"
+#define apick_fmt "0x%p, "LPU64""
+static inline gni_return_t kgnilnd_ep_postdata_cancel_by_id(
+               IN gni_ep_handle_t      ep_hndl,
+               IN uint64_t             datagram_id
+               )
+{
+       gni_return_t rrc;
+
+       /* no error injection as the only thing we'd do is LBUG */
+
+       rrc = gni_ep_postdata_cancel_by_id(
+               ep_hndl, datagram_id);
+
+       switch (rrc)  {
+       case GNI_RC_SUCCESS:
+       case GNI_RC_NO_MATCH:
+               break;
+       case GNI_RC_INVALID_PARAM:
+               GNILND_API_SWBUG(
+                       ep_hndl, datagram_id);
+               break;
+       default:
+               GNILND_API_RC_LBUG(
+                       ep_hndl, datagram_id);
+
+               /* LBUG never returns, but just for style and consistency */
+               break;
+       }
+       RETURN(rrc);
+}
+#undef apick_fn
+#undef apick_fmt
+
+#define apick_fn "kgnilnd_postdata_probe_by_id"
+#define apick_fmt "0x%p, 0x%p"
+static inline gni_return_t kgnilnd_postdata_probe_by_id(
+               IN gni_nic_handle_t    nic_hndl,
+               OUT uint64_t          *datagram_id
+               )
+{
+       gni_return_t rrc;
+
+       /* Error injection */
+       if (CFS_FAIL_CHECK(CFS_FAIL_GNI_PROBE)) {
+               rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_NO_MATCH;
+       } else {
+               rrc = gni_postdata_probe_by_id(
+                       nic_hndl, datagram_id);
+       }
+
+       switch (rrc)  {
+       case GNI_RC_SUCCESS:
+       case GNI_RC_NO_MATCH:
+               break;
+       case GNI_RC_INVALID_PARAM:
+               GNILND_API_SWBUG(
+                       nic_hndl, datagram_id);
+               break;
+       default:
+               GNILND_API_RC_LBUG(
+                       nic_hndl, datagram_id);
+
+               /* LBUG never returns, but just for style and consistency */
+               break;
+       }
+       RETURN(rrc);
+}
+#undef apick_fn
+#undef apick_fmt
+
+#define apick_fn "kgnilnd_postdata_probe_wait_by_id"
+#define apick_fmt "0x%p, %d, 0x%p"
+static inline gni_return_t kgnilnd_postdata_probe_wait_by_id(
+               IN gni_nic_handle_t nic_hndl,
+               IN uint32_t         timeout,
+               OUT uint64_t        *datagram_id
+               )
+{
+       gni_return_t rrc;
+
+       /* Error injection */
+       if (CFS_FAIL_CHECK(CFS_FAIL_GNI_PROBE_WAIT)) {
+               rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_TIMEOUT;
+       } else {
+               rrc = gni_postdata_probe_wait_by_id(
+                       nic_hndl, timeout, datagram_id);
+       }
+
+       switch (rrc)  {
+       case GNI_RC_SUCCESS:
+       case GNI_RC_TIMEOUT:
+               break;
+       case GNI_RC_INVALID_PARAM:
+               GNILND_API_SWBUG(
+                       nic_hndl, timeout, datagram_id);
+               break;
+       default:
+               GNILND_API_RC_LBUG(
+                       nic_hndl, timeout, datagram_id);
+
+               /* LBUG never returns, but just for style and consistency */
+               break;
+       }
+       RETURN(rrc);
+}
+#undef apick_fn
+#undef apick_fmt
+
+#define apick_fn "kgnilnd_post_rdma"
+#define apick_fmt "0x%p, 0x%p"
+static inline gni_return_t kgnilnd_post_rdma(
+               IN gni_ep_handle_t               ep_hndl,
+               IN gni_post_descriptor_t        *post_descr
+               )
+{
+       gni_return_t rrc;
+
+       /* Error injection */
+       if (CFS_FAIL_CHECK(CFS_FAIL_GNI_POST_RDMA)) {
+               rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_INVALID_PARAM;
+       } else {
+               rrc = gni_post_rdma(
+                       ep_hndl, post_descr);
+       }
+
+       switch (rrc)  {
+       case GNI_RC_SUCCESS:
+               break;
+       case GNI_RC_ALIGNMENT_ERROR:
+       case GNI_RC_INVALID_PARAM:
+               GNILND_API_SWBUG(
+                       ep_hndl, post_descr);
+               break;
+       case GNI_RC_ERROR_RESOURCE:
+               GNILND_API_RESOURCE(
+                       ep_hndl, post_descr);
+               break;
+       default:
+               GNILND_API_RC_LBUG(
+                       ep_hndl, post_descr);
+
+               /* LBUG never returns, but just for style and consistency */
+               break;
+       }
+       RETURN(rrc);
+}
+#undef apick_fn
+#undef apick_fmt
+
+#define apick_fn "kgnilnd_get_completed"
+#define apick_fmt "0x%p,"LPX64",0x%p"
+static inline gni_return_t kgnilnd_get_completed(
+               IN gni_cq_handle_t              cq_hndl,
+               IN gni_cq_entry_t               event_data,
+               OUT gni_post_descriptor_t       **post_descr
+               )
+{
+       gni_return_t rrc;
+
+       rrc = gni_get_completed(cq_hndl, event_data, post_descr);
+
+       switch (rrc)  {
+       case GNI_RC_TRANSACTION_ERROR:
+       case GNI_RC_SUCCESS:
+               break;
+       case GNI_RC_DESCRIPTOR_ERROR:
+       case GNI_RC_INVALID_PARAM:
+               GNILND_API_SWBUG(cq_hndl, event_data, post_descr);
+               break;
+       default:
+               GNILND_API_RC_LBUG(cq_hndl, event_data, post_descr);
+               /* LBUG never returns, but just for style and consistency */
+               break;
+       }
+
+       /* Error injection - we need a valid desc, so let kgni give us one
+        * - then we lie  */
+       if (rrc == GNI_RC_SUCCESS &&
+           (CFS_FAIL_CHECK(CFS_FAIL_GNI_GET_COMPLETED))) {
+               /* We only trigger TRANSACTION_ERROR for now */
+               gni_post_descriptor_t *desc;
+               rrc = GNI_RC_TRANSACTION_ERROR;
+               desc = *post_descr;
+               desc->status = rrc;
+               /* recoverable decision made from cfs_fail_val in
+                *  kgnilnd_cq_error_str and
+                *  kgnilnd_cq_error_recoverable */
+       }
+       RETURN(rrc);
+}
+#undef apick_fn
+#undef apick_fmt
+
+#define apick_fn "kgnilnd_cq_error_str"
+#define apick_fmt LPX64",0x%p,%d"
+static inline gni_return_t kgnilnd_cq_error_str(
+               IN gni_cq_entry_t       entry,
+               IN void                *buffer,
+               IN uint32_t             len
+               )
+{
+       gni_return_t rrc;
+
+       /* Error injection - set string if we injected a
+        *  TRANSACTION_ERROR earlier */
+       if (CFS_FAIL_CHECK(CFS_FAIL_GNI_GET_COMPLETED)) {
+               /* if we just set persistent error, we can't ever
+                * break in via ssh to clear, so use a count > 10 to indicate fatal */
+               sprintf(buffer, "INJECT:%s", cfs_fail_val > 10 ?
+                       "FATAL" : "RECOVERABLE");
+               rrc = GNI_RC_SUCCESS;
+       } else {
+               rrc = gni_cq_error_str(
+                       entry, buffer, len);
+       }
+
+       switch (rrc)  {
+       case GNI_RC_SUCCESS:
+               break;
+       case GNI_RC_SIZE_ERROR:
+       case GNI_RC_INVALID_PARAM:
+               GNILND_API_SWBUG(
+                       entry, buffer, len);
+               /* give them something to use */
+               snprintf(buffer, len, "UNDEF:UNDEF");
+               break;
+       default:
+               GNILND_API_RC_LBUG(
+                       entry, buffer, len);
+
+               /* LBUG never returns, but just for style and consistency */
+               break;
+       }
+       RETURN(rrc);
+}
+#undef apick_fn
+#undef apick_fmt
+
+#define apick_fn "kgnilnd_cq_error_recoverable"
+#define apick_fmt LPX64",0x%p"
+static inline gni_return_t kgnilnd_cq_error_recoverable(
+               IN gni_cq_entry_t       entry,
+               IN uint32_t            *recoverable
+               )
+{
+       gni_return_t rrc;
+
+       /* Error injection - set recoverable if we injected a
+        *  TRANSACTION_ERROR earlier */
+       if (CFS_FAIL_CHECK(CFS_FAIL_GNI_GET_COMPLETED)) {
+               *recoverable = cfs_fail_val > 10 ? 0 : 1;
+               rrc = GNI_RC_SUCCESS;
+       } else {
+               rrc = gni_cq_error_recoverable(
+                       entry, recoverable);
+       }
+
+       switch (rrc)  {
+       case GNI_RC_SUCCESS:
+               break;
+       case GNI_RC_INVALID_STATE:
+       case GNI_RC_INVALID_PARAM:
+               GNILND_API_SWBUG(
+                       entry, recoverable);
+               *recoverable = 0;
+               break;
+       default:
+               GNILND_API_RC_LBUG(
+                       entry, recoverable);
+
+               /* LBUG never returns, but just for style and consistency */
+               break;
+       }
+       RETURN(rrc);
+}
+#undef apick_fn
+#undef apick_fmt
+
+#define apick_fn "kgnilnd_mem_register_segments"
+#define apick_fmt "0x%p,0x%p,%u,0x%p,%x,0x%p"
+static inline gni_return_t
+kgnilnd_mem_register_segments(
+               IN gni_nic_handle_t     nic_hndl,
+               IN gni_mem_segment_t    *mem_segments,
+               IN uint32_t             segments_cnt,
+               IN gni_cq_handle_t      dst_cq_hndl,
+               IN uint32_t             flags,
+               OUT gni_mem_handle_t    *mem_hndl
+               )
+{
+       gni_return_t rrc;
+
+       /* Error injection */
+       if (CFS_FAIL_CHECK(CFS_FAIL_GNI_PHYS_MAP)) {
+               rrc = GNI_RC_ERROR_RESOURCE;
+       } else {
+               rrc = gni_mem_register_segments(
+                       nic_hndl, mem_segments, segments_cnt,
+                       dst_cq_hndl, flags, mem_hndl);
+       }
+
+       switch (rrc)  {
+       case GNI_RC_SUCCESS:
+       case GNI_RC_ERROR_RESOURCE:
+               break;
+       case GNI_RC_INVALID_PARAM:
+               GNILND_API_SWBUG(
+                       nic_hndl, mem_segments, segments_cnt,
+                       dst_cq_hndl, flags, mem_hndl);
+               break;
+       default:
+               GNILND_API_RC_LBUG(
+                       nic_hndl, mem_segments, segments_cnt,
+                       dst_cq_hndl, flags, mem_hndl);
+
+               /* LBUG never returns, but just for style and consistency */
+               break;
+       }
+       RETURN(rrc);
+}
+#undef apick_fn
+#undef apick_fmt
+
+#define apick_fn "kgnilnd_mem_register"
+#define apick_fmt "0x%p,"LPX64","LPX64"0x%p,%u,0x%p"
+static inline gni_return_t kgnilnd_mem_register(
+               IN gni_nic_handle_t     nic_hndl,
+               IN uint64_t             address,
+               IN uint64_t             length,
+               IN gni_cq_handle_t      dst_cq_hndl,
+               IN uint32_t             flags,
+               OUT gni_mem_handle_t    *mem_hndl
+               )
+{
+       gni_return_t rrc;
+
+       /* Error injection */
+       if (CFS_FAIL_CHECK(CFS_FAIL_GNI_VIRT_MAP)) {
+               rrc = GNI_RC_ERROR_RESOURCE;
+       } else if (CFS_FAIL_CHECK(CFS_FAIL_GNI_VIRT_SMALL_MAP) &&
+                  length <= *kgnilnd_tunables.kgn_max_immediate) {
+               rrc = GNI_RC_INVALID_PARAM;
+       } else {
+               rrc = gni_mem_register(
+                       nic_hndl, address, length,
+                       dst_cq_hndl, flags, mem_hndl);
+       }
+
+       switch (rrc)  {
+       case GNI_RC_SUCCESS:
+       case GNI_RC_ERROR_RESOURCE:
+               break;
+       case GNI_RC_INVALID_PARAM:
+               GNILND_API_SWBUG(
+                       nic_hndl, address, length,
+                       dst_cq_hndl, flags, mem_hndl);
+               break;
+       default:
+               GNILND_API_RC_LBUG(
+                       nic_hndl, address, length,
+                       dst_cq_hndl, flags, mem_hndl);
+
+               /* LBUG never returns, but just for style and consistency */
+               break;
+       }
+       RETURN(rrc);
+}
+#undef apick_fn
+#undef apick_fmt
+
+#define apick_fn "kgnilnd_mem_deregister"
+#define apick_fmt "0x%p,0x%p,%d"
+static inline gni_return_t kgnilnd_mem_deregister(
+               IN gni_nic_handle_t     nic_hndl,
+               IN gni_mem_handle_t     *mem_hndl,
+               IN int                  hold_timeout
+               )
+{
+       gni_return_t rrc;
+
+       /* Error injection */
+       if (CFS_FAIL_CHECK(CFS_FAIL_GNI_VIRT_UNMAP)) {
+               rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_INVALID_PARAM;
+       } else {
+               rrc = gni_mem_deregister(
+                       nic_hndl, mem_hndl, hold_timeout);
+       }
+
+       switch (rrc)  {
+       case GNI_RC_SUCCESS:
+               break;
+       case GNI_RC_INVALID_PARAM:
+               GNILND_API_SWBUG(
+                       nic_hndl, mem_hndl, hold_timeout);
+               break;
+       default:
+               GNILND_API_RC_LBUG(
+                       nic_hndl, mem_hndl, hold_timeout);
+
+               /* LBUG never returns, but just for style and consistency */
+               break;
+       }
+       RETURN(rrc);
+}
+#undef apick_fn
+#undef apick_fmt
+
+#define apick_fn "kgnilnd_mem_mdd_release"
+#define apick_fmt "0x%p,0x%p"
+static inline gni_return_t kgnilnd_mem_mdd_release(
+               IN gni_nic_handle_t     nic_hndl,
+               IN gni_mem_handle_t     *mem_hndl
+               )
+{
+       gni_return_t rrc;
+
+       /* Error injection */
+       if (CFS_FAIL_CHECK(CFS_FAIL_GNI_MDD_RELEASE)) {
+               rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_NO_MATCH;
+       } else {
+               rrc = gni_mem_mdd_release(
+                       nic_hndl, mem_hndl);
+       }
+
+       switch (rrc)  {
+       case GNI_RC_SUCCESS:
+       case GNI_RC_NO_MATCH:
+               break;
+       default:
+               GNILND_API_RC_LBUG(
+                       nic_hndl, mem_hndl);
+
+               /* LBUG never returns, but just for style and consistency */
+               break;
+       }
+       RETURN(rrc);
+}
+#undef apick_fn
+#undef apick_fmt
+
+#endif /* _GNILND_API_WRAP_H */
diff --git a/lnet/klnds/gnilnd/gnilnd_cb.c b/lnet/klnds/gnilnd/gnilnd_cb.c
new file mode 100644 (file)
index 0000000..56be88a
--- /dev/null
@@ -0,0 +1,4366 @@
+/*
+ * Copyright (C) 2004 Cluster File Systems, Inc.
+ *
+ * Copyright (C) 2009-2012 Cray, Inc.
+ *
+ *   Derived from work by Eric Barton <eric@bartonsoftware.com>
+ *   Author: Nic Henke <nic@cray.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#include <linux/nmi.h>
+#include "gnilnd.h"
+
+/* this is useful when debugging wire corruption. */
+static void
+kgnilnd_dump_blob(int level, char *prefix, void *buf, int len)
+{
+       __u64 *ptr;
+
+       ptr = (__u64 *) buf;
+
+       while (len > 0) {
+               if (len >= 32) {
+                       CDEBUG(level,
+                              "%s 0x%p: 0x%16.16llx 0x%16.16llx 0x%16.16llx 0x%16.16llx\n",
+                              prefix, ptr, *(ptr), *(ptr + 1), *(ptr + 2), *(ptr + 3));
+                       ptr += 4;
+                       len -= 32;
+               } else if (len >= 16) {
+                       CDEBUG(level,
+                              "%s 0x%p: 0x%16.16llx 0x%16.16llx\n",
+                              prefix, ptr, *(ptr), *(ptr + 1));
+                       ptr += 2;
+                       len -= 16;
+               } else {
+                       CDEBUG(level, "%s 0x%p: 0x%16.16llx\n",
+                              prefix, ptr, *(ptr));
+                       ptr++;
+                       len -= 8;
+               }
+       }
+}
+
+static void
+kgnilnd_dump_msg(int mask, kgn_msg_t *msg)
+{
+       CDEBUG(mask, "0x%8.8x 0x%4.4x 0x%4.4x 0x%16.16llx"
+               " 0x%16.16llx 0x%8.8x 0x%4.4x 0x%4.4x 0x%8.8x\n",
+               msg->gnm_magic, msg->gnm_version,
+               msg->gnm_type, msg->gnm_srcnid,
+               msg->gnm_connstamp, msg->gnm_seq,
+               msg->gnm_cksum, msg->gnm_payload_cksum,
+               msg->gnm_payload_len);
+}
+
+void
+kgnilnd_schedule_device(kgn_device_t *dev)
+{
+       short         already_live = 0;
+
+       /* we'll only want to wake if the scheduler thread
+        * has come around and set ready to zero */
+       already_live = cmpxchg(&dev->gnd_ready, GNILND_DEV_IDLE, GNILND_DEV_IRQ);
+
+       if (!already_live) {
+               wake_up_all(&dev->gnd_waitq);
+       }
+       return;
+}
+
+void kgnilnd_schedule_device_timer(unsigned long arg)
+{
+       kgn_device_t *dev = (kgn_device_t *) arg;
+
+       kgnilnd_schedule_device(dev);
+}
+
+void
+kgnilnd_device_callback(__u32 devid, __u64 arg)
+{
+       kgn_device_t *dev;
+       int           index = (int) arg;
+
+       if (index >= kgnilnd_data.kgn_ndevs) {
+               /* use _EMERG instead of an LBUG to prevent LBUG'ing in
+                * interrupt context. */
+               LCONSOLE_EMERG("callback for unknown device %d->%d\n",
+                               devid, index);
+               return;
+       }
+
+       dev = &kgnilnd_data.kgn_devices[index];
+       /* just basic sanity */
+       if (dev->gnd_id == devid) {
+               kgnilnd_schedule_device(dev);
+       } else {
+               LCONSOLE_EMERG("callback for bad device %d devid %d\n",
+                               dev->gnd_id, devid);
+       }
+}
+
+/* sched_intent values:
+ * < 0 : do not reschedule under any circumstances
+ * == 0: reschedule if someone marked him WANTS_SCHED
+ * > 0 : force a reschedule */
+
+void
+kgnilnd_schedule_process_conn(kgn_conn_t *conn, int sched_intent)
+{
+       int     conn_sched;
+
+       /* move back to IDLE but save previous state.
+        * if we see WANTS_SCHED, we'll call kgnilnd_schedule_conn and
+        * let the xchg there handle any racing callers to get it
+        * onto gnd_ready_conns */
+
+       conn_sched = xchg(&conn->gnc_scheduled, GNILND_CONN_IDLE);
+       LASSERTF(conn_sched == GNILND_CONN_WANTS_SCHED ||
+                conn_sched == GNILND_CONN_PROCESS,
+                "conn %p after process in bad state: %d\n",
+                conn, conn_sched);
+
+       if (sched_intent >= 0) {
+               if ((sched_intent > 0 || (conn_sched == GNILND_CONN_WANTS_SCHED))) {
+                       kgnilnd_schedule_conn(conn);
+               }
+       }
+}
+
+void
+kgnilnd_schedule_conn(kgn_conn_t *conn)
+{
+       kgn_device_t        *dev = conn->gnc_device;
+       int                  sched;
+
+       sched = xchg(&conn->gnc_scheduled, GNILND_CONN_WANTS_SCHED);
+
+       /* if we are IDLE, add to list - only one guy sees IDLE and "wins"
+        * the chance to put it onto gnd_ready_conns.
+        * otherwise, leave marked as WANTS_SCHED and the thread that "owns"
+        *  the conn in process_conns will take care of moving it back to
+        *  SCHED when it is done processing */
+
+       if (sched == GNILND_CONN_IDLE) {
+               /* if the conn is already scheduled, we've already requested
+                * the scheduler thread wakeup */
+               kgnilnd_conn_addref(conn);       /* +1 ref for scheduler */
+
+               LASSERTF(list_empty(&conn->gnc_schedlist), "conn %p already sched state %d\n",
+                        conn, sched);
+
+               CDEBUG(D_INFO, "scheduling conn 0x%p\n", conn);
+
+               spin_lock(&dev->gnd_lock);
+               list_add_tail(&conn->gnc_schedlist, &dev->gnd_ready_conns);
+               spin_unlock(&dev->gnd_lock);
+               set_mb(conn->gnc_last_sched_ask, jiffies);
+
+       } else {
+               CDEBUG(D_INFO, "not scheduling conn 0x%p: %d\n", conn, sched);
+       }
+
+       /* make sure thread(s) are going to process conns - but let the
+        * device make a separate decision from the conn schedule */
+       kgnilnd_schedule_device(dev);
+}
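+
+/* gnc_scheduled transitions seen above: a conn sits in GNILND_CONN_IDLE
+ * until kgnilnd_schedule_conn swaps in WANTS_SCHED - only the caller that
+ * saw IDLE takes the scheduler ref and queues the conn on gnd_ready_conns.
+ * When processing finishes, kgnilnd_schedule_process_conn swaps back to
+ * IDLE and, for sched_intent >= 0, requeues the conn if another
+ * WANTS_SCHED request raced in. */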
+
+void
+kgnilnd_schedule_dgram(kgn_device_t *dev)
+{
+       int                  wake;
+
+       wake = xchg(&dev->gnd_dgram_ready, GNILND_DGRAM_SCHED);
+       if (wake != GNILND_DGRAM_SCHED)  {
+               wake_up(&dev->gnd_dgram_waitq);
+       } else {
+               CDEBUG(D_NETTRACE, "not waking: %d\n", wake);
+       }
+}
+
+void
+kgnilnd_free_tx(kgn_tx_t *tx)
+{
+       /* taken from kgnilnd_tx_add_state_locked */
+
+       LASSERTF((tx->tx_list_p == NULL &&
+                 tx->tx_list_state == GNILND_TX_ALLOCD) &&
+               list_empty(&tx->tx_list),
+               "tx %p with bad state %s (list_p %p) tx_list %s\n",
+               tx, kgnilnd_tx_state2str(tx->tx_list_state), tx->tx_list_p,
+               list_empty(&tx->tx_list) ? "empty" : "not empty");
+
+       atomic_dec(&kgnilnd_data.kgn_ntx);
+
+       /* we only allocate this if we need to */
+       if (tx->tx_phys != NULL) {
+               cfs_mem_cache_free(kgnilnd_data.kgn_tx_phys_cache, tx->tx_phys);
+               CDEBUG(D_MALLOC, "slab-freed 'tx_phys': %lu at %p.\n",
+                      LNET_MAX_IOV * sizeof(gni_mem_segment_t), tx->tx_phys);
+       }
+#if 0
+       KGNILND_POISON(tx, 0x5a, sizeof(kgn_tx_t));
+#endif
+       cfs_mem_cache_free(kgnilnd_data.kgn_tx_cache, tx);
+       CDEBUG(D_MALLOC, "slab-freed 'tx': %lu at %p.\n",
+              sizeof(*tx), tx);
+}
+
+kgn_tx_t *
+kgnilnd_alloc_tx(void)
+{
+       kgn_tx_t      *tx = NULL;
+
+       if (CFS_FAIL_CHECK(CFS_FAIL_GNI_ALLOC_TX))
+               return tx;
+
+       tx = cfs_mem_cache_alloc(kgnilnd_data.kgn_tx_cache, CFS_ALLOC_ATOMIC);
+       if (tx == NULL) {
+               CERROR("failed to allocate tx\n");
+               return NULL;
+       }
+       CDEBUG(D_MALLOC, "slab-alloced 'tx': %lu at %p.\n",
+              sizeof(*tx), tx);
+
+       /* need this memset, cache alloc'd memory is not cleared */
+       memset(tx, 0, sizeof(*tx));
+
+       /* setup everything here to minimize time under the lock */
+       tx->tx_buftype = GNILND_BUF_NONE;
+       tx->tx_msg.gnm_type = GNILND_MSG_NONE;
+       INIT_LIST_HEAD(&tx->tx_list);
+       INIT_LIST_HEAD(&tx->tx_map_list);
+       tx->tx_list_state = GNILND_TX_ALLOCD;
+
+       atomic_inc(&kgnilnd_data.kgn_ntx);
+
+       return tx;
+}
+
+/* csum_fold needs to be run on the return value before shipping over the wire */
+#define _kgnilnd_cksum(seed, ptr, nob)  csum_partial(ptr, nob, seed)
+
+/* we don't use offset as everyone is passing a buffer reference that already
+ * includes the offset into the base address -
+ *  see kgnilnd_setup_virt_buffer and kgnilnd_setup_immediate_buffer */
+static inline __u16
+kgnilnd_cksum(void *ptr, size_t nob)
+{
+       __u16   sum;
+
+       sum = csum_fold(_kgnilnd_cksum(0, ptr, nob));
+
+       /* don't use magic 'no checksum' value */
+       if (sum == 0)
+               sum = 1;
+
+       CDEBUG(D_INFO, "cksum 0x%x for ptr 0x%p sz %zu\n",
+              sum, ptr, nob);
+
+       return sum;
+}
+
+inline __u16
+kgnilnd_cksum_kiov(unsigned int nkiov, lnet_kiov_t *kiov,
+                   unsigned int offset, unsigned int nob, int dump_blob)
+{
+       __wsum             cksum = 0;
+       __wsum             tmpck;
+       __u16              retsum;
+       void              *addr;
+       unsigned int       fraglen;
+       int                i, odd;
+
+       LASSERT(nkiov > 0);
+       LASSERT(nob > 0);
+
+       CDEBUG(D_BUFFS, "calc cksum for kiov 0x%p nkiov %u offset %u nob %u, dump %d\n",
+              kiov, nkiov, offset, nob, dump_blob);
+
+       /* if loops changes, please change kgnilnd_setup_phys_buffer */
+
+       while (offset >= kiov->kiov_len) {
+               offset -= kiov->kiov_len;
+               nkiov--;
+               kiov++;
+               LASSERT(nkiov > 0);
+       }
+
+       /* ignore nob here; if nob < (kiov_len - offset), then nkiov == 1 */
+       odd = (unsigned long) (kiov[0].kiov_len - offset) & 1;
+
+       if ((odd || *kgnilnd_tunables.kgn_vmap_cksum) && nkiov > 1) {
+               struct page **pages = kgnilnd_data.kgn_cksum_map_pages[get_cpu()];
+
+               LASSERTF(pages != NULL, "NULL pages for cpu %d map_pages 0x%p\n",
+                        get_cpu(), kgnilnd_data.kgn_cksum_map_pages);
+
+               CDEBUG(D_BUFFS, "odd %d len %u offset %u nob %u\n",
+                      odd, kiov[0].kiov_len, offset, nob);
+
+               for (i = 0; i < nkiov; i++) {
+                       pages[i] = kiov[i].kiov_page;
+               }
+
+               addr = vmap(pages, nkiov, VM_MAP, PAGE_KERNEL);
+               if (addr == NULL) {
+                       CNETERR("Couldn't vmap %d frags on %d bytes to avoid odd length fragment in cksum\n",
+                               nkiov, nob);
+                       /* return zero to avoid killing tx - we'll just get a warning
+                        * on the console when the remote end sees a zero checksum */
+                       RETURN(0);
+               }
+               atomic_inc(&kgnilnd_data.kgn_nvmap_cksum);
+
+               tmpck = _kgnilnd_cksum(0, (void *) addr + kiov[0].kiov_offset + offset, nob);
+               cksum = tmpck;
+
+               if (dump_blob) {
+                       kgnilnd_dump_blob(D_BUFFS, "flat kiov RDMA payload",
+                                         (void *)addr + kiov[0].kiov_offset + offset, nob);
+               }
+               CDEBUG(D_BUFFS, "cksum 0x%x (+0x%x) for addr 0x%p+%u len %u offset %u\n",
+                      cksum, tmpck, addr, kiov[0].kiov_offset, nob, offset);
+               vunmap(addr);
+       } else {
+               do {
+                       fraglen = min(kiov->kiov_len - offset, nob);
+
+                       /* make dang sure we don't send a bogus checksum if somehow we get
+                        * an odd length fragment on anything but the last entry in a kiov  -
+                        * we know from kgnilnd_setup_rdma_buffer that we can't have non
+                        * PAGE_SIZE pages in the middle, so if nob < PAGE_SIZE, it is the last one */
+                       LASSERTF(!(fraglen&1) || (nob < PAGE_SIZE),
+                                "odd fraglen %u on nkiov %d, nob %u kiov_len %u offset %u kiov 0x%p\n",
+                                fraglen, nkiov, nob, kiov->kiov_len, offset, kiov);
+
+                       addr = (void *)kmap(kiov->kiov_page) + kiov->kiov_offset + offset;
+                       tmpck = _kgnilnd_cksum(cksum, addr, fraglen);
+
+                       CDEBUG(D_BUFFS,
+                              "cksum 0x%x (+0x%x) for page 0x%p+%u (0x%p) len %u offset %u\n",
+                              cksum, tmpck, kiov->kiov_page, kiov->kiov_offset, addr,
+                              fraglen, offset);
+
+                       cksum = tmpck;
+
+                       if (dump_blob)
+                               kgnilnd_dump_blob(D_BUFFS, "kiov cksum", addr, fraglen);
+
+                       kunmap(kiov->kiov_page);
+
+                       kiov++;
+                       nkiov--;
+                       nob -= fraglen;
+                       offset = 0;
+
+                       /* iov must not run out before end of data */
+                       LASSERTF(nob == 0 || nkiov > 0, "nob %u nkiov %u\n", nob, nkiov);
+
+               } while (nob > 0);
+       }
+
+       retsum = csum_fold(cksum);
+
+       /* don't use magic 'no checksum' value */
+       if (retsum == 0)
+               retsum = 1;
+
+       CDEBUG(D_BUFFS, "retsum 0x%x from cksum 0x%x\n", retsum, cksum);
+
+       return retsum;
+}
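+
+/* Two strategies are used above: if the first fragment would leave an
+ * odd-length partial sum (or kgn_vmap_cksum forces it) and there is more
+ * than one fragment, the whole kiov is vmap'd and summed as one flat
+ * region; otherwise each page is kmap'd in turn and csum_partial is
+ * chained across the fragments. Either way the folded result avoids the
+ * magic 'no checksum' value of zero. */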
+
+void
+kgnilnd_init_msg(kgn_msg_t *msg, int type, lnet_nid_t source)
+{
+       msg->gnm_magic = GNILND_MSG_MAGIC;
+       msg->gnm_version = GNILND_MSG_VERSION;
+       msg->gnm_type = type;
+       msg->gnm_payload_len = 0;
+       msg->gnm_srcnid = source;
+       /* gnm_connstamp gets set when FMA is sent */
+       /* gnm_srcnid is set on creation via function argument.
+        * The right interface/net and nid are passed in when the message
+        * is created.
+        */
+}
+
+kgn_tx_t *
+kgnilnd_new_tx_msg(int type, lnet_nid_t source)
+{
+       kgn_tx_t *tx = kgnilnd_alloc_tx();
+
+       if (tx != NULL) {
+               kgnilnd_init_msg(&tx->tx_msg, type, source);
+       } else {
+               CERROR("couldn't allocate new tx type %s!\n",
+                      kgnilnd_msgtype2str(type));
+       }
+
+       return tx;
+}
+
+static void
+kgnilnd_nak_rdma(kgn_conn_t *conn, int type, int error, __u64 cookie,
+                 lnet_nid_t source)
+{
+       kgn_tx_t        *tx;
+
+       /* only allow NAK on error and truncate to zero */
+       LASSERTF(error <= 0, "error %d conn 0x%p, cookie "LPU64"\n",
+                error, conn, cookie);
+
+       tx = kgnilnd_new_tx_msg(type, source);
+       if (tx == NULL) {
+               CNETERR("can't get TX to NAK RDMA to %s\n",
+                       libcfs_nid2str(conn->gnc_peer->gnp_nid));
+               return;
+       }
+
+       tx->tx_msg.gnm_u.completion.gncm_retval = error;
+       tx->tx_msg.gnm_u.completion.gncm_cookie = cookie;
+       kgnilnd_queue_tx(conn, tx);
+}
+
+int
+kgnilnd_setup_immediate_buffer(kgn_tx_t *tx, unsigned int niov, struct iovec *iov,
+                              lnet_kiov_t *kiov, unsigned int offset, unsigned int nob)
+
+{
+       kgn_msg_t       *msg = &tx->tx_msg;
+       int              i;
+
+       /* To help save on MDDs for short messages, we'll vmap a kiov to allow
+        * gni_smsg_send to send that as the payload */
+
+       LASSERT(tx->tx_buftype == GNILND_BUF_NONE);
+       LASSERT(nob >= 0);
+
+       if (nob == 0) {
+               tx->tx_buffer = NULL;
+       } else if (kiov != NULL) {
+               LASSERTF(niov > 0 && niov < GNILND_MAX_IMMEDIATE/PAGE_SIZE,
+                        "bad niov %d\n", niov);
+
+               while (offset >= kiov->kiov_len) {
+                       offset -= kiov->kiov_len;
+                       niov--;
+                       kiov++;
+                       LASSERT(niov > 0);
+               }
+               for (i = 0; i < niov; i++) {
+                       /* We can't have a kiov_offset on anything but the first entry,
+                        * otherwise we'll have a hole at the end of the mapping as we only map
+                        * whole pages.
+                        * Also, if we have a kiov_len < PAGE_SIZE but we need to map more
+                        * than kiov_len, we will also have a hole at the end of that page
+                        * which isn't allowed */
+                       if ((kiov[i].kiov_offset != 0 && i > 0) ||
+                           (kiov[i].kiov_offset + kiov[i].kiov_len != CFS_PAGE_SIZE && i < niov - 1)) {
+                               CNETERR("Can't make payload contiguous in I/O VM:"
+                                      "page %d, offset %u, nob %u, kiov_offset %u kiov_len %u \n",
+                                      i, offset, nob, kiov->kiov_offset, kiov->kiov_len);
+                               RETURN(-EINVAL);
+                       }
+                       tx->tx_imm_pages[i] = kiov[i].kiov_page;
+               }
+
+               /* hijack tx_phys for the later unmap */
+               if (niov == 1) {
+                       /* tx->tx_phys being equal to NULL is the signal for unmap to discern between kmap and vmap */
+                       tx->tx_phys = NULL;
+                       tx->tx_buffer = (void *)kmap(tx->tx_imm_pages[0]) + kiov[0].kiov_offset + offset;
+                       atomic_inc(&kgnilnd_data.kgn_nkmap_short);
+                       GNIDBG_TX(D_NET, tx, "kmapped page for %d bytes for kiov 0x%p, buffer 0x%p",
+                               nob, kiov, tx->tx_buffer);
+               } else {
+                       tx->tx_phys = vmap(tx->tx_imm_pages, niov, VM_MAP, PAGE_KERNEL);
+                       if (tx->tx_phys == NULL) {
+                               CNETERR("Couldn't vmap %d frags on %d bytes\n", niov, nob);
+                               RETURN(-ENOMEM);
+                       }
+                       atomic_inc(&kgnilnd_data.kgn_nvmap_short);
+                       /* make sure we take into account the kiov offset as the start of the buffer */
+                       tx->tx_buffer = (void *)tx->tx_phys + kiov[0].kiov_offset + offset;
+                       GNIDBG_TX(D_NET, tx, "mapped %d pages for %d bytes from kiov 0x%p to 0x%p, buffer 0x%p",
+                               niov, nob, kiov, tx->tx_phys, tx->tx_buffer);
+               }
+               tx->tx_buftype = GNILND_BUF_IMMEDIATE_KIOV;
+               tx->tx_nob = nob;
+
+       } else {
+               /* For now this is almost identical to kgnilnd_setup_virt_buffer, but we
+                * could "flatten" the payload into a single contiguous buffer ready
+                * for sending direct over an FMA if we ever needed to. */
+
+               LASSERT(niov > 0);
+
+               while (offset >= iov->iov_len) {
+                       offset -= iov->iov_len;
+                       niov--;
+                       iov++;
+                       LASSERT(niov > 0);
+               }
+
+               if (nob > iov->iov_len - offset) {
+                       CERROR("Can't handle multiple vaddr fragments\n");
+                       return -EMSGSIZE;
+               }
+
+               tx->tx_buffer = (void *)(((unsigned long)iov->iov_base) + offset);
+
+               tx->tx_buftype = GNILND_BUF_IMMEDIATE;
+               tx->tx_nob = nob;
+       }
+
+       /* checksum payload early - it shouldn't be changing after lnd_send */
+       if (*kgnilnd_tunables.kgn_checksum >= 2) {
+               msg->gnm_payload_cksum = kgnilnd_cksum(tx->tx_buffer, nob);
+               if (CFS_FAIL_CHECK(CFS_FAIL_GNI_SMSG_CKSUM2)) {
+                       msg->gnm_payload_cksum += 0xe00e;
+               }
+               if (*kgnilnd_tunables.kgn_checksum_dump > 1) {
+                       kgnilnd_dump_blob(D_BUFFS, "payload checksum",
+                                         tx->tx_buffer, nob);
+               }
+       } else {
+               msg->gnm_payload_cksum = 0;
+       }
+
+       return 0;
+}
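+
+/* For immediate sends the payload is only made virtually contiguous, not
+ * registered with the NIC: a single kiov page is kmap'd (tx_phys left NULL
+ * as the later unmap hint), multiple pages are vmap'd with tx_phys holding
+ * the mapping. The payload checksum is computed here because the buffer
+ * shouldn't be changing after lnd_send. */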
+
+int
+kgnilnd_setup_virt_buffer(kgn_tx_t *tx,
+                         unsigned int niov, struct iovec *iov,
+                         unsigned int offset, unsigned int nob)
+
+{
+       LASSERT(nob > 0);
+       LASSERT(niov > 0);
+       LASSERT(tx->tx_buftype == GNILND_BUF_NONE);
+
+       while (offset >= iov->iov_len) {
+               offset -= iov->iov_len;
+               niov--;
+               iov++;
+               LASSERT(niov > 0);
+       }
+
+       if (nob > iov->iov_len - offset) {
+               CERROR("Can't handle multiple vaddr fragments\n");
+               return -EMSGSIZE;
+       }
+
+       tx->tx_buftype = GNILND_BUF_VIRT_UNMAPPED;
+       tx->tx_nob = nob;
+       tx->tx_buffer = (void *)(((unsigned long)iov->iov_base) + offset);
+       return 0;
+}
+
+int
+kgnilnd_setup_phys_buffer(kgn_tx_t *tx, int nkiov, lnet_kiov_t *kiov,
+                         unsigned int offset, unsigned int nob)
+{
+       gni_mem_segment_t *phys;
+       int                rc = 0;
+       unsigned int       fraglen;
+
+       GNIDBG_TX(D_NET, tx, "niov %d kiov 0x%p offset %u nob %u", nkiov, kiov, offset, nob);
+
+       LASSERT(nob > 0);
+       LASSERT(nkiov > 0);
+       LASSERT(tx->tx_buftype == GNILND_BUF_NONE);
+
+       /* only allocate this if we are going to use it */
+       tx->tx_phys = cfs_mem_cache_alloc(kgnilnd_data.kgn_tx_phys_cache,
+                                         CFS_ALLOC_ATOMIC);
+       if (tx->tx_phys == NULL) {
+               CERROR("failed to allocate tx_phys\n");
+               rc = -ENOMEM;
+               GOTO(error, rc);
+       }
+
+       CDEBUG(D_MALLOC, "slab-alloced 'tx->tx_phys': %lu at %p.\n",
+              LNET_MAX_IOV * sizeof(gni_mem_segment_t), tx->tx_phys);
+
+       /* if loops changes, please change kgnilnd_cksum_kiov
+        *   and kgnilnd_setup_immediate_buffer */
+
+       while (offset >= kiov->kiov_len) {
+               offset -= kiov->kiov_len;
+               nkiov--;
+               kiov++;
+               LASSERT(nkiov > 0);
+       }
+
+       /* at this point, kiov points to the first page that we'll actually map
+        * now that we've advanced into the kiov by offset and dropped any
+        * leading pages that fall entirely within the offset */
+       tx->tx_buftype = GNILND_BUF_PHYS_UNMAPPED;
+       tx->tx_nob = nob;
+
+       /* kiov_offset is start of 'valid' buffer, so index offset past that */
+       tx->tx_buffer = (void *)((unsigned long)(kiov->kiov_offset + offset));
+       phys = tx->tx_phys;
+
+       CDEBUG(D_NET, "tx 0x%p buffer 0x%p map start kiov 0x%p+%u niov %d offset %u\n",
+              tx, tx->tx_buffer, kiov, kiov->kiov_offset, nkiov, offset);
+
+       do {
+               fraglen = min(kiov->kiov_len - offset, nob);
+
+               /* We can't have a kiov_offset on anything but the first entry,
+                * otherwise we'll have a hole at the end of the mapping as we only map
+                * whole pages. Only the first page is allowed to have an offset -
+                * we'll add that into tx->tx_buffer and that will get used when we
+                * map in the segments (see kgnilnd_map_buffer).
+                * Also, if we have a kiov_len < PAGE_SIZE but we need to map more
+                * than kiov_len, we will also have a hole at the end of that page
+                * which isn't allowed */
+               if ((phys != tx->tx_phys) &&
+                   ((kiov->kiov_offset != 0) ||
+                    ((kiov->kiov_len < PAGE_SIZE) && (nob > kiov->kiov_len)))) {
+                       CERROR("Can't make payload contiguous in I/O VM:"
+                              "page %d, offset %u, nob %u, kiov_offset %u kiov_len %u \n",
+                              (int)(phys - tx->tx_phys),
+                              offset, nob, kiov->kiov_offset, kiov->kiov_len);
+                       rc = -EINVAL;
+                       GOTO(error, rc);
+               }
+
+               if ((phys - tx->tx_phys) == LNET_MAX_IOV) {
+                       CERROR ("payload too big (%d)\n", (int)(phys - tx->tx_phys));
+                       rc = -EMSGSIZE;
+                       GOTO(error, rc);
+               }
+
+               if (CFS_FAIL_CHECK(CFS_FAIL_GNI_PHYS_SETUP)) {
+                       rc = -EINVAL;
+                       GOTO(error, rc);
+               }
+
+               CDEBUG(D_BUFFS, "page 0x%p kiov_offset %u kiov_len %u nob %u "
+                              "nkiov %u offset %u\n",
+                     kiov->kiov_page, kiov->kiov_offset, kiov->kiov_len, nob, nkiov, offset);
+
+               phys->address = lnet_page2phys(kiov->kiov_page);
+               phys++;
+               kiov++;
+               nkiov--;
+               nob -= fraglen;
+               offset = 0;
+
+               /* iov must not run out before end of data */
+               LASSERTF(nob == 0 || nkiov > 0, "nob %u nkiov %u\n", nob, nkiov);
+
+       } while (nob > 0);
+
+       tx->tx_phys_npages = phys - tx->tx_phys;
+
+       return 0;
+
+error:
+       if (tx->tx_phys != NULL) {
+               cfs_mem_cache_free(kgnilnd_data.kgn_tx_phys_cache, tx->tx_phys);
+               CDEBUG(D_MALLOC, "slab-freed 'tx_phys': %lu at %p.\n",
+                      LNET_MAX_IOV * sizeof(gni_mem_segment_t), tx->tx_phys);
+               tx->tx_phys = NULL;
+       }
+       return rc;
+}
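+
+/* The physical path above builds one gni_mem_segment_t per kiov page in
+ * tx->tx_phys; only the first fragment may carry an offset (folded into
+ * tx_buffer) and only the last may be short, otherwise the mapping would
+ * contain a hole and the checks above reject it with -EINVAL. */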
+
+static inline int
+kgnilnd_setup_rdma_buffer(kgn_tx_t *tx, unsigned int niov,
+                         struct iovec *iov, lnet_kiov_t *kiov,
+                         unsigned int offset, unsigned int nob)
+{
+       int     rc;
+
+       LASSERT((iov == NULL) != (kiov == NULL));
+
+       if (kiov != NULL) {
+               rc = kgnilnd_setup_phys_buffer(tx, niov, kiov, offset, nob);
+       } else {
+               rc = kgnilnd_setup_virt_buffer(tx, niov, iov, offset, nob);
+       }
+       return rc;
+}
+
+static void
+kgnilnd_parse_lnet_rdma(lnet_msg_t *lntmsg, unsigned int *niov, unsigned int *offset,
+                       unsigned int *nob, lnet_kiov_t **kiov)
+{
+       /* GETs are weird, see kgnilnd_send */
+       if (lntmsg->msg_type == LNET_MSG_GET) {
+               if ((lntmsg->msg_md->md_options & LNET_MD_KIOV) == 0) {
+                       *kiov = NULL;
+               } else {
+                       *kiov = lntmsg->msg_md->md_iov.kiov;
+               }
+               *niov = lntmsg->msg_md->md_niov;
+               *nob = lntmsg->msg_md->md_length;
+               *offset = 0;
+       } else {
+               *kiov = lntmsg->msg_kiov;
+               *niov = lntmsg->msg_niov;
+               *nob = lntmsg->msg_len;
+               *offset = lntmsg->msg_offset;
+       }
+}
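+
+/* For LNET_MSG_GET the RDMA fragments come from the GET's memory
+ * descriptor (md_iov, md_niov, md_length, offset 0) rather than from the
+ * message's own kiov and offset as for other message types. */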
+
+static inline void
+kgnilnd_compute_rdma_cksum(kgn_tx_t *tx)
+{
+       unsigned int     niov, offset, nob;
+       lnet_kiov_t     *kiov;
+       lnet_msg_t      *lntmsg = tx->tx_lntmsg[0];
+       int              dump_cksum = (*kgnilnd_tunables.kgn_checksum_dump > 1);
+
+       GNITX_ASSERTF(tx, ((tx->tx_msg.gnm_type == GNILND_MSG_PUT_DONE) ||
+                          (tx->tx_msg.gnm_type == GNILND_MSG_GET_DONE)),
+                     "bad type %s", kgnilnd_msgtype2str(tx->tx_msg.gnm_type));
+
+       if (*kgnilnd_tunables.kgn_checksum < 3) {
+               tx->tx_msg.gnm_payload_cksum = 0;
+               return;
+       }
+
+       GNITX_ASSERTF(tx, lntmsg, "no LNet message!", NULL);
+
+       kgnilnd_parse_lnet_rdma(lntmsg, &niov, &offset, &nob, &kiov);
+
+       if (kiov != NULL) {
+               tx->tx_msg.gnm_payload_cksum = kgnilnd_cksum_kiov(niov, kiov, offset, nob, dump_cksum);
+       } else {
+               tx->tx_msg.gnm_payload_cksum = kgnilnd_cksum(tx->tx_buffer, nob);
+               if (dump_cksum) {
+                       kgnilnd_dump_blob(D_BUFFS, "peer RDMA payload", tx->tx_buffer, nob);
+               }
+       }
+
+       if (CFS_FAIL_CHECK(CFS_FAIL_GNI_SMSG_CKSUM3)) {
+               tx->tx_msg.gnm_payload_cksum += 0xd00d;
+       }
+}
+
+static inline int
+kgnilnd_verify_rdma_cksum(kgn_tx_t *tx, __u16 rx_cksum)
+{
+       int              rc = 0;
+       __u16            cksum;
+       unsigned int     niov, offset, nob;
+       lnet_kiov_t     *kiov;
+       lnet_msg_t      *lntmsg = tx->tx_lntmsg[0];
+       int dump_on_err = *kgnilnd_tunables.kgn_checksum_dump;
+
+       /* we can only match certain requests */
+       GNITX_ASSERTF(tx, ((tx->tx_msg.gnm_type == GNILND_MSG_GET_REQ) ||
+                          (tx->tx_msg.gnm_type == GNILND_MSG_PUT_ACK)),
+                     "bad type %s", kgnilnd_msgtype2str(tx->tx_msg.gnm_type));
+
+       if (rx_cksum == 0)  {
+               if (*kgnilnd_tunables.kgn_checksum >= 3) {
+                       GNIDBG_MSG(D_WARNING, &tx->tx_msg,
+                                  "no RDMA payload checksum when enabled");
+               }
+               return 0;
+       }
+
+       GNITX_ASSERTF(tx, lntmsg, "no LNet message!", NULL);
+
+       kgnilnd_parse_lnet_rdma(lntmsg, &niov, &offset, &nob, &kiov);
+
+       if (kiov != NULL) {
+               cksum = kgnilnd_cksum_kiov(niov, kiov, offset, nob, 0);
+       } else {
+               cksum = kgnilnd_cksum(tx->tx_buffer, nob);
+       }
+
+       if (cksum != rx_cksum) {
+               GNIDBG_MSG(D_NETERROR, &tx->tx_msg,
+                          "Bad RDMA payload checksum (%x expected %x); "
+                          "kiov 0x%p niov %d nob %u offset %u",
+                           cksum, rx_cksum, kiov, niov, nob, offset);
+               switch (dump_on_err) {
+               case 2:
+                       if (kiov != NULL) {
+                               kgnilnd_cksum_kiov(niov, kiov, offset, nob, 1);
+                       } else {
+                               kgnilnd_dump_blob(D_BUFFS, "RDMA payload",
+                                                 tx->tx_buffer, nob);
+                       }
+                       /* fall through to dump log */
+               case 1:
+                       libcfs_debug_dumplog();
+                       break;
+               default:
+                       break;
+               }
+               rc = -ENOKEY;
+               /* kgnilnd_check_fma_rx will close conn, kill tx with error */
+       }
+       return rc;
+}
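+
+/* On a checksum mismatch the dump behaviour follows kgn_checksum_dump:
+ * 2 dumps the payload (kiov or flat buffer) and falls through to 1, which
+ * dumps the debug log; the tx then fails with -ENOKEY and
+ * kgnilnd_check_fma_rx closes the conn and kills the tx with the error. */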
+
+void
+kgnilnd_mem_add_map_list(kgn_device_t *dev, kgn_tx_t *tx)
+{
+       int     bytes;
+
+       GNITX_ASSERTF(tx, list_empty(&tx->tx_map_list),
+               "already mapped!", NULL);
+
+       spin_lock(&dev->gnd_map_lock);
+       switch (tx->tx_buftype) {
+       default:
+               GNIDBG_TX(D_EMERG, tx,
+                       "SOFTWARE BUG: invalid mapping %d", tx->tx_buftype);
+               spin_unlock(&dev->gnd_map_lock);
+               LBUG();
+               break;
+
+       case GNILND_BUF_PHYS_MAPPED:
+               bytes = tx->tx_phys_npages * PAGE_SIZE;
+               dev->gnd_map_nphys++;
+               dev->gnd_map_physnop += tx->tx_phys_npages;
+               break;
+
+       case GNILND_BUF_VIRT_MAPPED:
+               bytes = tx->tx_nob;
+               dev->gnd_map_nvirt++;
+               dev->gnd_map_virtnob += tx->tx_nob;
+               break;
+       }
+
+       if (tx->tx_msg.gnm_type == GNILND_MSG_PUT_ACK ||
+           tx->tx_msg.gnm_type == GNILND_MSG_GET_REQ) {
+               atomic64_add(bytes, &dev->gnd_rdmaq_bytes_out);
+               GNIDBG_TX(D_NETTRACE, tx, "rdma ++ %d to "LPD64"",
+                         bytes, atomic64_read(&dev->gnd_rdmaq_bytes_out));
+       }
+
+       atomic_inc(&dev->gnd_n_mdd);
+       atomic64_add(bytes, &dev->gnd_nbytes_map);
+
+       /* clear retrans to prevent any SMSG goofiness as that code uses the same counter */
+       tx->tx_retrans = 0;
+
+       /* we only get here in the valid cases */
+       list_add_tail(&tx->tx_map_list, &dev->gnd_map_list);
+       dev->gnd_map_version++;
+       spin_unlock(&dev->gnd_map_lock);
+}
+
+void
+kgnilnd_mem_del_map_list(kgn_device_t *dev, kgn_tx_t *tx)
+{
+       int     bytes;
+
+       GNITX_ASSERTF(tx, !list_empty(&tx->tx_map_list),
+               "not mapped!", NULL);
+       spin_lock(&dev->gnd_map_lock);
+
+       switch (tx->tx_buftype) {
+       default:
+               GNIDBG_TX(D_EMERG, tx,
+                       "SOFTWARE BUG: invalid mapping %d", tx->tx_buftype);
+               spin_unlock(&dev->gnd_map_lock);
+               LBUG();
+               break;
+
+       case GNILND_BUF_PHYS_UNMAPPED:
+               bytes = tx->tx_phys_npages * PAGE_SIZE;
+               dev->gnd_map_nphys--;
+               dev->gnd_map_physnop -= tx->tx_phys_npages;
+               break;
+
+       case GNILND_BUF_VIRT_UNMAPPED:
+               bytes = tx->tx_nob;
+               dev->gnd_map_nvirt--;
+               dev->gnd_map_virtnob -= tx->tx_nob;
+               break;
+       }
+
+       if (tx->tx_msg.gnm_type == GNILND_MSG_PUT_ACK ||
+           tx->tx_msg.gnm_type == GNILND_MSG_GET_REQ) {
+               atomic64_sub(bytes, &dev->gnd_rdmaq_bytes_out);
+               LASSERTF(atomic64_read(&dev->gnd_rdmaq_bytes_out) >= 0,
+                        "bytes_out negative! %ld\n", atomic64_read(&dev->gnd_rdmaq_bytes_out));
+               GNIDBG_TX(D_NETTRACE, tx, "rdma -- %d to "LPD64"",
+                         bytes, atomic64_read(&dev->gnd_rdmaq_bytes_out));
+       }
+
+       atomic_dec(&dev->gnd_n_mdd);
+       atomic64_sub(bytes, &dev->gnd_nbytes_map);
+
+       /* we only get here in the valid cases */
+       list_del_init(&tx->tx_map_list);
+       dev->gnd_map_version++;
+       spin_unlock(&dev->gnd_map_lock);
+}
+
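+/* Register the tx buffer with kgni if it is not already mapped. Returns 0 when
+ * the buffer is mapped (or needs no mapping), -ENOMEM when kgni runs out of
+ * registration resources (GNI_RC_ERROR_RESOURCE) and -EINVAL on any other
+ * registration failure. */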
+int
+kgnilnd_map_buffer(kgn_tx_t *tx)
+{
+       kgn_conn_t       *conn = tx->tx_conn;
+       kgn_device_t     *dev = conn->gnc_device;
+       __u32             flags = GNI_MEM_READWRITE;
+       gni_return_t      rrc;
+
+       /* The kgnilnd_mem_register(_segments) Gemini Driver functions can
+        * be called concurrently as there are internal locks that protect
+        * any data structures or HW resources. We just need to ensure
+        * that our concurrency doesn't result in the kgn_device_t
+        * getting nuked while we are in here */
+
+       LASSERTF(conn != NULL, "tx %p with NULL conn, someone forgot"
+               " to set tx_conn before calling %s\n", tx, __FUNCTION__);
+
+       if (unlikely(CFS_FAIL_CHECK(CFS_FAIL_GNI_MAP_TX)))
+               RETURN(-ENOMEM);
+
+       if (*kgnilnd_tunables.kgn_bte_relaxed_ordering) {
+               flags |= GNI_MEM_RELAXED_PI_ORDERING;
+       }
+
+       switch (tx->tx_buftype) {
+       default:
+               LBUG();
+
+       case GNILND_BUF_NONE:
+       case GNILND_BUF_IMMEDIATE:
+       case GNILND_BUF_IMMEDIATE_KIOV:
+       case GNILND_BUF_PHYS_MAPPED:
+       case GNILND_BUF_VIRT_MAPPED:
+               return 0;
+
+       case GNILND_BUF_PHYS_UNMAPPED:
+               GNITX_ASSERTF(tx, tx->tx_phys != NULL, "physical buffer not there!", NULL);
+               rrc = kgnilnd_mem_register_segments(dev->gnd_handle,
+                       tx->tx_phys, tx->tx_phys_npages, NULL,
+                       GNI_MEM_PHYS_SEGMENTS | flags,
+                       &tx->tx_map_key);
+               /* could race with other uses of the map counts, but this is ok
+                * - this needs to turn into a non-fatal error soon to allow
+                * handling of GART resource starvation, etc. */
+               if (rrc != GNI_RC_SUCCESS) {
+                       GNIDBG_TX(D_NET, tx, "Can't map %d pages: dev %d "
+                               "phys %u pp %u, virt %u nob "LPU64"",
+                               tx->tx_phys_npages, dev->gnd_id,
+                               dev->gnd_map_nphys, dev->gnd_map_physnop,
+                               dev->gnd_map_nvirt, dev->gnd_map_virtnob);
+                       RETURN(rrc == GNI_RC_ERROR_RESOURCE ? -ENOMEM : -EINVAL);
+               }
+
+               tx->tx_buftype = GNILND_BUF_PHYS_MAPPED;
+               kgnilnd_mem_add_map_list(dev, tx);
+               return 0;
+
+       case GNILND_BUF_VIRT_UNMAPPED:
+               rrc = kgnilnd_mem_register(dev->gnd_handle,
+                       (__u64)tx->tx_buffer, tx->tx_nob,
+                       NULL, flags, &tx->tx_map_key);
+               if (rrc != GNI_RC_SUCCESS) {
+                       GNIDBG_TX(D_NET, tx, "Can't map %u bytes: dev %d "
+                               "phys %u pp %u, virt %u nob "LPU64"",
+                               tx->tx_nob, dev->gnd_id,
+                               dev->gnd_map_nphys, dev->gnd_map_physnop,
+                               dev->gnd_map_nvirt, dev->gnd_map_virtnob);
+                       RETURN(rrc == GNI_RC_ERROR_RESOURCE ? -ENOMEM : -EINVAL);
+               }
+
+               tx->tx_buftype = GNILND_BUF_VIRT_MAPPED;
+               kgnilnd_mem_add_map_list(dev, tx);
+               if (tx->tx_msg.gnm_type == GNILND_MSG_PUT_ACK ||
+                   tx->tx_msg.gnm_type == GNILND_MSG_GET_REQ) {
+                       atomic64_add(tx->tx_nob, &dev->gnd_rdmaq_bytes_out);
+                       GNIDBG_TX(D_NETTRACE, tx, "rdma ++ %d to %ld\n",
+                              tx->tx_nob, atomic64_read(&dev->gnd_rdmaq_bytes_out));
+               }
+
+               return 0;
+       }
+}
+
+void
+kgnilnd_add_purgatory_tx(kgn_tx_t *tx)
+{
+       kgn_conn_t                  *conn = tx->tx_conn;
+       kgn_mdd_purgatory_t         *gmp;
+
+       LIBCFS_ALLOC(gmp, sizeof(*gmp));
+       LASSERTF(gmp != NULL, "couldn't allocate MDD purgatory member;"
+               " asserting to avoid data corruption\n");
+
+       gmp->gmp_map_key = tx->tx_map_key;
+       atomic_inc(&conn->gnc_device->gnd_n_mdd_held);
+
+       /* ensure that we don't have a blank purgatory - indicating the
+        * conn is not already on purgatory lists - we'd never recover these
+        * MDD if that were the case */
+       GNITX_ASSERTF(tx, conn->gnc_in_purgatory,
+               "conn 0x%p->%s with NULL purgatory",
+               conn, libcfs_nid2str(conn->gnc_peer->gnp_nid));
+
+       /* link 'er up! - only place we really need to lock for
+        * concurrent access */
+       spin_lock(&conn->gnc_list_lock);
+       list_add_tail(&gmp->gmp_list, &conn->gnc_mdd_list);
+       spin_unlock(&conn->gnc_list_lock);
+}
+
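+/* Deregister the tx buffer. If the conn is being torn down without verified
+ * peer notification, the map key is stashed on the conn's MDD purgatory list
+ * and the deregistration is given a deadman hold_timeout so the MDD can be
+ * recovered later. */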
+void
+kgnilnd_unmap_buffer(kgn_tx_t *tx, int error)
+{
+       kgn_device_t     *dev;
+       gni_return_t      rrc;
+       int               hold_timeout = 0;
+
+       /* code below relies on +1 relationship ... */
+       CLASSERT(GNILND_BUF_PHYS_MAPPED == (GNILND_BUF_PHYS_UNMAPPED + 1));
+       CLASSERT(GNILND_BUF_VIRT_MAPPED == (GNILND_BUF_VIRT_UNMAPPED + 1));
+
+       switch (tx->tx_buftype) {
+       default:
+               LBUG();
+
+       case GNILND_BUF_NONE:
+       case GNILND_BUF_IMMEDIATE:
+       case GNILND_BUF_PHYS_UNMAPPED:
+       case GNILND_BUF_VIRT_UNMAPPED:
+               break;
+       case GNILND_BUF_IMMEDIATE_KIOV:
+               if (tx->tx_phys != NULL) {
+                       vunmap(tx->tx_phys);
+               } else if (tx->tx_phys == NULL && tx->tx_buffer != NULL) {
+                       kunmap(tx->tx_imm_pages[0]);
+               }
+               /* clear to prevent kgnilnd_free_tx from thinking
+                * this is a RDMA descriptor */
+               tx->tx_phys = NULL;
+               break;
+
+       case GNILND_BUF_PHYS_MAPPED:
+       case GNILND_BUF_VIRT_MAPPED:
+               LASSERT(tx->tx_conn != NULL);
+
+               dev = tx->tx_conn->gnc_device;
+
+               /* only want to hold if we are closing conn without
+                * verified peer notification  - the theory is that
+                * verified peer notification - the theory is that
+               if (tx->tx_conn->gnc_state != GNILND_CONN_ESTABLISHED &&
+                   kgnilnd_check_purgatory_conn(tx->tx_conn)) {
+                       kgnilnd_add_purgatory_tx(tx);
+
+                       /* The timeout we give to kgni is a deadman stop only.
+                        *  we are setting high to ensure we don't have the kgni timer
+                        *  fire before ours fires _and_ is handled */
+                       hold_timeout = GNILND_TIMEOUT2DEADMAN;
+
+                       GNIDBG_TX(D_NET, tx,
+                                "dev %p delaying MDD release for %dms key "LPX64"."LPX64"",
+                                tx->tx_conn->gnc_device, hold_timeout,
+                                tx->tx_map_key.qword1, tx->tx_map_key.qword2);
+               }
+
+               rrc = kgnilnd_mem_deregister(dev->gnd_handle, &tx->tx_map_key, hold_timeout);
+
+               LASSERTF(rrc == GNI_RC_SUCCESS, "rrc %d\n", rrc);
+
+               tx->tx_buftype--;
+               kgnilnd_mem_del_map_list(dev, tx);
+               break;
+       }
+}
+
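+/* Final cleanup for a tx: unmap the buffer, release the tx id back to the
+ * conn, free the tx and only then call lnet_finalize on the stashed lnet
+ * messages - lnet_finalize can re-enter the LND via kgnilnd_send, so we must
+ * not hold any locks at that point. */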
+void
+kgnilnd_tx_done(kgn_tx_t *tx, int completion)
+{
+       lnet_msg_t      *lntmsg0, *lntmsg1;
+       int             status0, status1;
+       lnet_ni_t       *ni = NULL;
+       kgn_conn_t      *conn = tx->tx_conn;
+
+       LASSERT(!in_interrupt());
+
+       lntmsg0 = tx->tx_lntmsg[0]; tx->tx_lntmsg[0] = NULL;
+       lntmsg1 = tx->tx_lntmsg[1]; tx->tx_lntmsg[1] = NULL;
+
+       if (completion &&
+           !(tx->tx_state & GNILND_TX_QUIET_ERROR) &&
+           !kgnilnd_conn_clean_errno(completion)) {
+               GNIDBG_TOMSG(D_NETERROR, &tx->tx_msg,
+                      "error %d on tx 0x%p->%s id %u/%d state %s age %ds",
+                      completion, tx, conn ?
+                      libcfs_nid2str(conn->gnc_peer->gnp_nid) : "<?>",
+                      tx->tx_id.txe_smsg_id, tx->tx_id.txe_idx,
+                      kgnilnd_tx_state2str(tx->tx_list_state),
+                      cfs_duration_sec((long)jiffies - tx->tx_qtime));
+       }
+
+       /* The error codes determine if we hold onto the MDD */
+       kgnilnd_unmap_buffer(tx, completion);
+
+       /* we have to deliver a reply on lntmsg[1] for the GET, so make sure
+        * we play nice with the error codes to avoid delivering a failed
+        * REQUEST and then a REPLY event as well */
+
+       /* return -EIO to lnet - it is the magic value for failed sends */
+       if (tx->tx_msg.gnm_type == GNILND_MSG_GET_REQ) {
+               status0 = 0;
+               status1 = completion;
+       } else {
+               status0 = status1 = completion;
+       }
+
+       tx->tx_buftype = GNILND_BUF_NONE;
+       tx->tx_msg.gnm_type = GNILND_MSG_NONE;
+
+       /* lnet_finalize doesn't do anything with the *ni, so ok for us to
+        * set NULL when we are a tx without a conn */
+       if (conn != NULL) {
+               ni = conn->gnc_peer->gnp_net->gnn_ni;
+
+               spin_lock(&conn->gnc_tx_lock);
+
+               LASSERTF(test_and_clear_bit(tx->tx_id.txe_idx,
+                       (volatile unsigned long *)&conn->gnc_tx_bits),
+                       "conn %p tx %p bit %d already cleared\n",
+                       conn, tx, tx->tx_id.txe_idx);
+
+               LASSERTF(conn->gnc_tx_ref_table[tx->tx_id.txe_idx] != NULL,
+                        "msg_id %d already NULL\n", tx->tx_id.txe_idx);
+
+               conn->gnc_tx_ref_table[tx->tx_id.txe_idx] = NULL;
+               spin_unlock(&conn->gnc_tx_lock);
+       }
+
+       kgnilnd_free_tx(tx);
+
+       /* finalize AFTER freeing lnet msgs */
+
+       /* warning - we should hold no locks here - calling lnet_finalize
+        * could free up lnet credits, resulting in a call chain back into
+        * the LND via kgnilnd_send and friends */
+       lnet_finalize(ni, lntmsg0, status0);
+
+       if (lntmsg1 != NULL) {
+               lnet_finalize(ni, lntmsg1, status1);
+       }
+}
+
+void
+kgnilnd_txlist_done(struct list_head *txlist, int error)
+{
+       kgn_tx_t        *tx, *txn;
+       int              err_printed = 0;
+
+       if (list_empty(txlist))
+               return;
+
+       list_for_each_entry_safe(tx, txn, txlist, tx_list) {
+               /* only print the first error */
+               if (err_printed)
+                       tx->tx_state |= GNILND_TX_QUIET_ERROR;
+               list_del_init(&tx->tx_list);
+               kgnilnd_tx_done(tx, error);
+               err_printed++;
+       }
+}
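+
+/* Allocate a per-conn tx id (never zero) from the gnc_tx_bits bitmap and
+ * publish the tx in gnc_tx_ref_table; returns -E2BIG when every id is in use. */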
+int
+kgnilnd_set_tx_id(kgn_tx_t *tx, kgn_conn_t *conn)
+{
+       int     id;
+
+       spin_lock(&conn->gnc_tx_lock);
+
+       /* ID zero is NOT ALLOWED!!! */
+
+search_again:
+       id = find_next_zero_bit((unsigned long *)&conn->gnc_tx_bits,
+                                GNILND_MAX_MSG_ID, conn->gnc_next_tx);
+       if (id == GNILND_MAX_MSG_ID) {
+               if (conn->gnc_next_tx != 1) {
+                       /* we only searched from next_tx to end and didn't find
+                        * one, so search again from start */
+                       conn->gnc_next_tx = 1;
+                       goto search_again;
+               }
+               /* couldn't find one! */
+               spin_unlock(&conn->gnc_tx_lock);
+               return -E2BIG;
+       }
+
+       /* bump next_tx to prevent immediate reuse */
+       conn->gnc_next_tx = id + 1;
+
+       set_bit(id, (volatile unsigned long *)&conn->gnc_tx_bits);
+       LASSERTF(conn->gnc_tx_ref_table[id] == NULL,
+                "tx 0x%p already at id %d\n",
+                conn->gnc_tx_ref_table[id], id);
+
+       /* delay these until we have a valid ID - prevents bad clear of the bit
+        * in kgnilnd_tx_done */
+       tx->tx_conn = conn;
+       tx->tx_id.txe_cqid = conn->gnc_cqid;
+
+       tx->tx_id.txe_idx = id;
+       conn->gnc_tx_ref_table[id] = tx;
+
+       /* Using jiffies to help differentiate against TX reuse - with
+        * the usual minimum of a 250HZ clock, we wrap jiffies on the same TX
+        * if we are sending to the same node faster than 256000/sec.
+        * To help guard against this, we OR in the tx_seq - that is 32 bits */
+
+       tx->tx_id.txe_chips = (__u32)(jiffies | conn->gnc_tx_seq);
+
+       GNIDBG_TX(D_NET, tx, "set cookie/id/bits", NULL);
+
+       spin_unlock(&conn->gnc_tx_lock);
+       return 0;
+}
+
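+/* Decide what to do with a NOT_DONE SMSG send: returns 1 when the caller
+ * should requeue and retry, 0 when max_retransmits has been exceeded and the
+ * conn has been closed, in which case the caller should terminate the tx. */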
+static inline int
+kgnilnd_tx_should_retry(kgn_conn_t *conn, kgn_tx_t *tx)
+{
+       int             max_retrans = *kgnilnd_tunables.kgn_max_retransmits;
+       int             log_retrans;
+       int             log_retrans_level;
+
+       /* I need kgni credits to send this.  Replace tx at the head of the
+        * fmaq and I'll get rescheduled when credits appear */
+       tx->tx_state = 0;
+       tx->tx_retrans++;
+       conn->gnc_tx_retrans++;
+       log_retrans = ((tx->tx_retrans < 25) || ((tx->tx_retrans % 25) == 0) ||
+                       (tx->tx_retrans > (max_retrans / 2)));
+       log_retrans_level = tx->tx_retrans < (max_retrans / 2) ? D_NET : D_NETERROR;
+
+       /* Decision time - either error, warn or just retransmit */
+
+       /* we don't care about TX timeout - it could be that the network is slower
+        * or throttled. We'll keep retransmitting - so if the network is so slow
+        * that we fill up our mailbox, we'll keep trying to resend that msg
+        * until we exceed max_retrans _or_ gnc_last_rx expires, indicating
+        * that the peer hasn't sent us any traffic in return */
+
+       if (tx->tx_retrans > max_retrans) {
+               /* this means we are not backing off the retransmits
+                * in a healthy manner and are likely chewing up the
+                * CPU cycles quite badly */
+               GNIDBG_TOMSG(D_ERROR, &tx->tx_msg,
+                       "SOFTWARE BUG: too many retransmits (%d) for tx id %x "
+                       "conn 0x%p->%s\n",
+                       tx->tx_retrans, tx->tx_id, conn,
+                       libcfs_nid2str(conn->gnc_peer->gnp_nid));
+
+               /* yes - double errors to help debug this condition */
+               GNIDBG_TOMSG(D_NETERROR, &tx->tx_msg, "connection dead. "
+                       "unable to send to %s for %lu secs (%d tries)",
+                       libcfs_nid2str(tx->tx_conn->gnc_peer->gnp_nid),
+                       cfs_duration_sec(jiffies - tx->tx_cred_wait),
+                       tx->tx_retrans);
+
+               kgnilnd_close_conn(conn, -ETIMEDOUT);
+
+               /* caller should terminate */
+               RETURN(0);
+       } else {
+               /* some reasonable throttling of the debug message */
+               if (log_retrans) {
+                       unsigned long now = jiffies;
+                       /* XXX Nic: Mystical TX debug here... */
+                       GNIDBG_SMSG_CREDS(log_retrans_level, conn);
+                       GNIDBG_TOMSG(log_retrans_level, &tx->tx_msg,
+                               "NOT_DONE on conn 0x%p->%s id %x retrans %d wait %dus"
+                               " last_msg %uus/%uus last_cq %uus/%uus",
+                               conn, libcfs_nid2str(conn->gnc_peer->gnp_nid),
+                               tx->tx_id, tx->tx_retrans,
+                               jiffies_to_usecs(now - tx->tx_cred_wait),
+                               jiffies_to_usecs(now - conn->gnc_last_tx),
+                               jiffies_to_usecs(now - conn->gnc_last_rx),
+                               jiffies_to_usecs(now - conn->gnc_last_tx_cq),
+                               jiffies_to_usecs(now - conn->gnc_last_rx_cq));
+               }
+               /* caller should retry */
+               RETURN(1);
+       }
+}
+
+/* caller must be holding gnd_cq_mutex and must not unlock it afterwards - we
+ * drop it in here to avoid bad ordering with state_lock */
+
+static inline int
+kgnilnd_sendmsg_nolock(kgn_tx_t *tx, void *immediate, unsigned int immediatenob,
+               spinlock_t *state_lock, kgn_tx_list_state_t state)
+{
+       kgn_conn_t      *conn = tx->tx_conn;
+       kgn_msg_t       *msg = &tx->tx_msg;
+       int              retry_send;
+       gni_return_t     rrc;
+       unsigned long    newest_last_rx, timeout;
+       unsigned long    now;
+
+       LASSERTF((msg->gnm_type == GNILND_MSG_IMMEDIATE) ?
+               immediatenob <= *kgnilnd_tunables.kgn_max_immediate :
+               immediatenob == 0,
+               "msg 0x%p type %d wrong payload size %d\n",
+               msg, msg->gnm_type, immediatenob);
+
+       /* make sure we catch all the cases where we'd send on a dirty old mbox
+        * but allow case for sending CLOSE. Since this check is within the CQ
+        * mutex barrier and the close message is only sent through
+        * kgnilnd_send_conn_close the last message out the door will be the
+        * close message.
+        */
+       if (atomic_read(&conn->gnc_peer->gnp_dirty_eps) != 0 && msg->gnm_type != GNILND_MSG_CLOSE) {
+               mutex_unlock(&conn->gnc_device->gnd_cq_mutex);
+               /* Return -ETIME, we are closing the connection already so we don't want to
+                * have this tx hit the wire. The tx will be killed by the calling function.
+                * Once the EP is marked dirty the close message will be the last
+                * thing to hit the wire */
+               return -ETIME;
+       }
+
+       now = jiffies;
+       timeout = cfs_time_seconds(conn->gnc_timeout);
+
+       newest_last_rx = GNILND_LASTRX(conn);
+
+       if (CFS_FAIL_CHECK(CFS_FAIL_GNI_SEND_TIMEOUT)) {
+               now = now + (GNILND_TIMEOUTRX(timeout) * 2);
+       }
+
+       if (time_after_eq(now, newest_last_rx + GNILND_TIMEOUTRX(timeout))) {
+               GNIDBG_CONN(D_NETERROR|D_CONSOLE, conn, "Can't send to %s after timeout lapse of %lu; TO %lu",
+                           libcfs_nid2str(conn->gnc_peer->gnp_nid),
+                           cfs_duration_sec(now - newest_last_rx),
+                           cfs_duration_sec(GNILND_TIMEOUTRX(timeout)));
+               mutex_unlock(&conn->gnc_device->gnd_cq_mutex);
+               return -ETIME;
+       }
+
+       GNITX_ASSERTF(tx, (conn != NULL) && (tx->tx_id.txe_idx != 0), "tx id unset!", NULL);
+       /* msg->gnm_srcnid is set when the message is initialized by whatever function is
+        * creating the message; this allows the message to contain the correct LNET NID/NET
+        * instead of the one that the peer/conn uses for sending the data.
+        */
+       msg->gnm_connstamp = conn->gnc_my_connstamp;
+       msg->gnm_payload_len = immediatenob;
+       msg->gnm_seq = conn->gnc_tx_seq;
+
+       /* always init here - kgn_checksum is a /sys module tunable
+        * and can be flipped at any point, even between msg init and sending */
+       msg->gnm_cksum = 0;
+       if (*kgnilnd_tunables.kgn_checksum) {
+               /* We must set here and not in kgnilnd_init_msg,
+                * we could resend this msg many times
+                * (NOT_DONE from gni_smsg_send below) and wouldn't pass
+                * through init_msg again */
+               msg->gnm_cksum = kgnilnd_cksum(msg, sizeof(kgn_msg_t));
+               if (CFS_FAIL_CHECK(CFS_FAIL_GNI_SMSG_CKSUM1)) {
+                       msg->gnm_cksum += 0xf00f;
+               }
+       }
+
+       GNIDBG_TOMSG(D_NET, msg, "tx 0x%p conn 0x%p->%s sending SMSG sz %u id %x/%d [%p for %u]",
+              tx, conn, libcfs_nid2str(conn->gnc_peer->gnp_nid),
+              sizeof(kgn_msg_t), tx->tx_id.txe_smsg_id,
+              tx->tx_id.txe_idx, immediate, immediatenob);
+
+       if (unlikely(tx->tx_state & GNILND_TX_FAIL_SMSG)) {
+               rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_NOT_DONE;
+       } else {
+               rrc = kgnilnd_smsg_send(conn->gnc_ephandle,
+                                       msg, sizeof(*msg), immediate, immediatenob,
+                                       tx->tx_id.txe_smsg_id);
+       }
+
+       switch (rrc) {
+       case GNI_RC_SUCCESS:
+               conn->gnc_tx_seq++;
+               conn->gnc_last_tx = jiffies;
+               /* no locking here as LIVE isn't a list */
+               kgnilnd_tx_add_state_locked(tx, NULL, conn, GNILND_TX_LIVE_FMAQ, 1);
+
+               /* this needs to be checked under lock as it might be freed from a completion
+                * event.
+                */
+               if (msg->gnm_type == GNILND_MSG_NOOP) {
+                       set_mb(conn->gnc_last_noop_sent, jiffies);
+               }
+
+               /* serialize with seeing CQ events for completion on this, as well as
+                * tx_seq */
+               mutex_unlock(&conn->gnc_device->gnd_cq_mutex);
+
+               atomic_inc(&conn->gnc_device->gnd_short_ntx);
+               atomic64_add(immediatenob, &conn->gnc_device->gnd_short_txbytes);
+               kgnilnd_peer_alive(conn->gnc_peer);
+               GNIDBG_SMSG_CREDS(D_NET, conn);
+               return 0;
+
+       case GNI_RC_NOT_DONE:
+               /* XXX Nic: We need to figure out how to track this
+                * - there are bound to be good reasons for it,
+                * but we want to know when it happens */
+
+               mutex_unlock(&conn->gnc_device->gnd_cq_mutex);
+               /* We'll handle this error inline - makes the calling logic much more
+                * clean */
+
+               /* If no lock, caller doesn't want us to retry */
+               if (state_lock == NULL) {
+                       return -EAGAIN;
+               }
+
+               retry_send = kgnilnd_tx_should_retry(conn, tx);
+               if (retry_send) {
+                       /* add to head of list for the state and retries */
+                       spin_lock(state_lock);
+                       kgnilnd_tx_add_state_locked(tx, conn->gnc_peer, conn, state, 0);
+                       spin_unlock(state_lock);
+
+                       /* We only reschedule for a certain number of retries, then
+                        * we will wait for the CQ events indicating a release of SMSG
+                        * credits */
+                       if (tx->tx_retrans < (*kgnilnd_tunables.kgn_max_retransmits/4)) {
+                               kgnilnd_schedule_conn(conn);
+                               return 0;
+                       } else {
+                               /* CQ event coming in signifies either TX completed or
+                                * RX receive. Either of these *could* free up credits
+                                * in the SMSG mbox and we should try sending again */
+                               GNIDBG_TX(D_NET, tx, "waiting for CQID %u event to resend",
+                                        tx->tx_conn->gnc_cqid);
+                               /* use +ve return code to let upper layers know they
+                                * should stop looping on sends */
+                               return EAGAIN;
+                       }
+               } else {
+                       return -EAGAIN;
+               }
+       default:
+               /* handle bad retcode gracefully */
+               mutex_unlock(&conn->gnc_device->gnd_cq_mutex);
+               return -EIO;
+       }
+}
+
+/* kgnilnd_sendmsg has hard wait on gnd_cq_mutex */
+static inline int
+kgnilnd_sendmsg(kgn_tx_t *tx, void *immediate, unsigned int immediatenob,
+               spinlock_t *state_lock, kgn_tx_list_state_t state)
+{
+       kgn_device_t    *dev = tx->tx_conn->gnc_device;
+       unsigned long    timestamp;
+       int              rc;
+
+       timestamp = jiffies;
+       mutex_lock(&dev->gnd_cq_mutex);
+       /* delay in jiffies - we are really concerned only with things that
+        * result in a schedule() or really holding this off for long times.
+        * NB - mutex_lock could spin for 2 jiffies before going to sleep to wait */
+       dev->gnd_mutex_delay += (long) jiffies - timestamp;
+
+       rc = kgnilnd_sendmsg_nolock(tx, immediate, immediatenob, state_lock, state);
+
+       RETURN(rc);
+}
+
+
+/* returns -EAGAIN for lock miss, anything else < 0 is hard error, >=0 for success */
+static inline int
+kgnilnd_sendmsg_trylock(kgn_tx_t *tx, void *immediate, unsigned int immediatenob,
+               spinlock_t *state_lock, kgn_tx_list_state_t state)
+{
+       kgn_conn_t      *conn = tx->tx_conn;
+       kgn_device_t    *dev = conn->gnc_device;
+       unsigned long    timestamp;
+       int              rc;
+
+       timestamp = jiffies;
+
+       /* technically we are doing bad things with the read_lock on the peer_conn
+        * table, but we shouldn't be sleeping inside here - and we don't sleep/block
+        * for the mutex. I bet lockdep is gonna flag this one though... */
+
+       /* there are a few cases where we don't want the immediate send - like
+        * when we are in the scheduler thread and it'd harm the latency of
+        * getting messages up to LNet */
+
+       /* rmb for gnd_ready */
+       smp_rmb();
+       if (conn->gnc_device->gnd_ready == GNILND_DEV_LOOP) {
+               rc = 0;
+               atomic_inc(&conn->gnc_device->gnd_fast_block);
+       } else if (unlikely(kgnilnd_data.kgn_quiesce_trigger)) {
+               /* don't hit HW during quiesce */
+               rc = 0;
+       } else if (unlikely(atomic_read(&conn->gnc_peer->gnp_dirty_eps))) {
+               /* don't hit HW if stale EPs and conns left to close */
+               rc = 0;
+       } else {
+               atomic_inc(&conn->gnc_device->gnd_fast_try);
+               rc = mutex_trylock(&conn->gnc_device->gnd_cq_mutex);
+       }
+       if (!rc) {
+               rc = -EAGAIN;
+       } else {
+               /* we got the mutex and weren't blocked */
+
+               /* delay in jiffies - we are really concerned only with things that
+                * result in a schedule() or really holding this off for long times.
+                * NB - mutex_lock could spin for 2 jiffies before going to sleep to wait */
+               dev->gnd_mutex_delay += (long) jiffies - timestamp;
+
+               atomic_inc(&conn->gnc_device->gnd_fast_ok);
+               tx->tx_qtime = jiffies;
+               tx->tx_state = GNILND_TX_WAITING_COMPLETION;
+               rc = kgnilnd_sendmsg_nolock(tx, tx->tx_buffer, tx->tx_nob, &conn->gnc_list_lock, GNILND_TX_FMAQ);
+               /* _nolock unlocks the mutex for us */
+       }
+
+       RETURN(rc);
+}
+
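+/* RDMA throttling: gnd_rdmaq_bytes_ok is how many bytes of RDMA we are
+ * currently authorized to launch. We debit tx_nob from it; if that would go
+ * negative we credit the bytes back, arm gnd_rdmaq_timer and return -EAGAIN
+ * so the tx sits on the RDMAQ until more bytes are authorized. */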
+/* lets us know if we can push this RDMA through now */
+inline int
+kgnilnd_auth_rdma_bytes(kgn_device_t *dev, kgn_tx_t *tx)
+{
+       long    bytes_left;
+
+       bytes_left = atomic64_sub_return(tx->tx_nob, &dev->gnd_rdmaq_bytes_ok);
+
+       if (bytes_left < 0) {
+               atomic64_add(tx->tx_nob, &dev->gnd_rdmaq_bytes_ok);
+               atomic_inc(&dev->gnd_rdmaq_nstalls);
+               smp_wmb();
+
+               CDEBUG(D_NET, "no bytes to send, turning on timer for %lu\n",
+                      dev->gnd_rdmaq_deadline);
+               mod_timer(&dev->gnd_rdmaq_timer, dev->gnd_rdmaq_deadline);
+               /* we never del this timer - at worst it schedules us.. */
+               return -EAGAIN;
+       } else {
+               return 0;
+       }
+}
+
+/* this adds a TX to the queue pending throttling authorization before
+ * we allow our remote peer to launch a PUT at us */
+void
+kgnilnd_queue_rdma(kgn_conn_t *conn, kgn_tx_t *tx)
+{
+       int     rc;
+
+       /* we cannot go into send_mapped_tx from here as we are holding locks
+        * and mem registration might end up allocating memory in kgni.
+        * That said, we'll push this as far as we can into the queue process */
+       rc = kgnilnd_auth_rdma_bytes(conn->gnc_device, tx);
+
+       if (rc < 0) {
+               spin_lock(&conn->gnc_device->gnd_rdmaq_lock);
+               kgnilnd_tx_add_state_locked(tx, NULL, conn, GNILND_TX_RDMAQ, 0);
+               /* lets us know how delayed RDMA is */
+               tx->tx_qtime = jiffies;
+               spin_unlock(&conn->gnc_device->gnd_rdmaq_lock);
+       } else {
+               /* we have RDMA authorized, now it just needs a MDD and to hit the wire */
+               spin_lock(&tx->tx_conn->gnc_device->gnd_lock);
+               kgnilnd_tx_add_state_locked(tx, NULL, tx->tx_conn, GNILND_TX_MAPQ, 0);
+               /* lets us know how delayed mapping is */
+               tx->tx_qtime = jiffies;
+               spin_unlock(&tx->tx_conn->gnc_device->gnd_lock);
+       }
+
+       /* make sure we wake up sched to run this */
+       kgnilnd_schedule_device(tx->tx_conn->gnc_device);
+}
+
+/* push TX through state machine */
+void
+kgnilnd_queue_tx(kgn_conn_t *conn, kgn_tx_t *tx)
+{
+       int            rc;
+       int            add_tail = 1;
+
+       /* set the tx_id here, we delay it until we have an actual conn
+        * to fiddle with
+        * in some cases, the tx_id is already set to provide for things
+        * like RDMA completion cookies, etc */
+       if (tx->tx_id.txe_idx == 0) {
+               rc = kgnilnd_set_tx_id(tx, conn);
+               if (rc != 0) {
+                       kgnilnd_tx_done(tx, rc);
+                       return;
+               }
+       }
+
+       CDEBUG(D_NET, "%s to conn %p for %s\n", kgnilnd_msgtype2str(tx->tx_msg.gnm_type),
+               conn, libcfs_nid2str(conn->gnc_peer->gnp_nid));
+
+       /* Only let NOOPs be sent while the fail loc is set, otherwise kill the tx.
+        */
+       if (CFS_FAIL_CHECK(CFS_FAIL_GNI_ONLY_NOOP) && (tx->tx_msg.gnm_type != GNILND_MSG_NOOP)) {
+               /* rc may not have been set on this path, fail the tx explicitly */
+               kgnilnd_tx_done(tx, -EIO);
+               return;
+       }
+
+       switch (tx->tx_msg.gnm_type) {
+       case GNILND_MSG_PUT_ACK:
+       case GNILND_MSG_GET_REQ:
+               /* hijacking time! If this message will authorize our peer to
+                * send his dirty little bytes in an RDMA, we need to get permission */
+               kgnilnd_queue_rdma(conn, tx);
+               break;
+       case GNILND_MSG_IMMEDIATE:
+               /* try to send right now, can help reduce latency */
+               rc = kgnilnd_sendmsg_trylock(tx, tx->tx_buffer, tx->tx_nob, &conn->gnc_list_lock, GNILND_TX_FMAQ);
+
+               if (rc >= 0) {
+                       /* it was sent, break out of switch to avoid default case of queueing */
+                       break;
+               } else if (rc == -EAGAIN) {
+                       /* needs to queue to try again, so fall through to default case */
+               } else {
+                       /* bail: it wasn't sent and we didn't get EAGAIN indicating
+                        * we should retrans. We do not close the conn due to locking -
+                        * we let the reaper thread take care of it. There are no hard
+                        * errors from send_msg that would require close to be called
+                        */
+                       kgnilnd_tx_done(tx, rc);
+                       break;
+               }
+       case GNILND_MSG_NOOP:
+               /* Just make sure this goes out first for this conn */
+               add_tail = 0;
+               /* fall through... */
+       default:
+               spin_lock(&conn->gnc_list_lock);
+               kgnilnd_tx_add_state_locked(tx, conn->gnc_peer, conn, GNILND_TX_FMAQ, add_tail);
+               tx->tx_qtime = jiffies;
+               spin_unlock(&conn->gnc_list_lock);
+               kgnilnd_schedule_conn(conn);
+       }
+}
+
+void
+kgnilnd_launch_tx(kgn_tx_t *tx, kgn_net_t *net, lnet_process_id_t *target)
+{
+       kgn_peer_t      *peer;
+       kgn_peer_t      *new_peer = NULL;
+       kgn_conn_t      *conn = NULL;
+       int              rc;
+
+       ENTRY;
+
+       /* If I get here, I've committed to send, so I complete the tx with
+        * failure on any problems */
+
+       GNITX_ASSERTF(tx, tx->tx_conn == NULL,
+                     "tx already has connection %p", tx->tx_conn);
+
+       /* do all of the peer & conn searching in one swoop - this avoids
+        * nastiness when dropping locks and needing to maintain a sane state
+        * in the face of stack reset or something else nuking peers & conns */
+
+       /* I expect to find him, so only take a read lock */
+       read_lock(&kgnilnd_data.kgn_peer_conn_lock);
+
+       peer = kgnilnd_find_peer_locked(target->nid);
+       if (peer != NULL) {
+               conn = kgnilnd_find_conn_locked(peer);
+               /* this could be NULL during quiesce */
+               if (conn != NULL) {
+                       /* Connection exists; queue message on it */
+                       kgnilnd_queue_tx(conn, tx);
+                       read_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+                       RETURN_EXIT;
+               }
+       }
+
+       /* creating peer or conn; I'll need a write lock... */
+       read_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+
+       CFS_RACE(CFS_FAIL_GNI_FIND_TARGET);
+
+       /* NB - this will not block during normal operations -
+        * the only writer of this is in the startup/shutdown path. */
+       rc = down_read_trylock(&kgnilnd_data.kgn_net_rw_sem);
+       if (!rc) {
+               rc = -ESHUTDOWN;
+               GOTO(no_peer, rc);
+       }
+
+       /* ignore previous peer entirely - we cycled the lock, so we
+        * will create new peer and at worst drop it if peer is still
+        * in the tables */
+       rc = kgnilnd_create_peer_safe(&new_peer, target->nid, net);
+       if (rc != 0) {
+               up_read(&kgnilnd_data.kgn_net_rw_sem);
+               GOTO(no_peer, rc);
+       }
+
+       write_lock(&kgnilnd_data.kgn_peer_conn_lock);
+       up_read(&kgnilnd_data.kgn_net_rw_sem);
+
+       /* search for peer again now that we have the lock
+        * if we don't find it, add our new one to the list */
+       kgnilnd_add_peer_locked(target->nid, new_peer, &peer);
+
+       conn = kgnilnd_find_or_create_conn_locked(peer);
+       if (conn != NULL) {
+               /* oh hey, found a conn now... magical */
+               kgnilnd_queue_tx(conn, tx);
+       } else {
+               /* no conn, must be trying to connect - so we queue for now */
+               tx->tx_qtime = jiffies;
+               kgnilnd_tx_add_state_locked(tx, peer, NULL, GNILND_TX_PEERQ, 1);
+       }
+       write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+       RETURN_EXIT;
+no_peer:
+       kgnilnd_tx_done(tx, rc);
+       RETURN_EXIT;
+}
+
+void
+kgnilnd_rdma(kgn_tx_t *tx, int type,
+           kgn_rdma_desc_t *sink, unsigned int nob, __u64 cookie)
+{
+       kgn_conn_t   *conn = tx->tx_conn;
+       unsigned long timestamp;
+       gni_return_t  rrc;
+
+       LASSERTF(kgnilnd_tx_mapped(tx),
+               "unmapped tx %p\n", tx);
+       LASSERTF(conn != NULL,
+               "NULL conn on tx %p, naughty, naughty\n", tx);
+       LASSERTF(nob <= sink->gnrd_nob,
+               "nob %u > sink->gnrd_nob %d (%p)\n",
+               nob, sink->gnrd_nob, sink);
+       LASSERTF(nob <= tx->tx_nob,
+               "nob %d > tx(%p)->tx_nob %d\n",
+               nob, tx, tx->tx_nob);
+
+       memset(&tx->tx_rdma_desc, 0, sizeof(tx->tx_rdma_desc));
+       tx->tx_rdma_desc.post_id = tx->tx_id.txe_cookie;
+       tx->tx_rdma_desc.type = GNI_POST_RDMA_PUT;
+       tx->tx_rdma_desc.cq_mode = GNI_CQMODE_GLOBAL_EVENT;
+       tx->tx_rdma_desc.local_addr = (__u64)((unsigned long)tx->tx_buffer);
+       tx->tx_rdma_desc.local_mem_hndl = tx->tx_map_key;
+       tx->tx_rdma_desc.remote_addr = sink->gnrd_addr;
+       tx->tx_rdma_desc.remote_mem_hndl = sink->gnrd_key;
+       tx->tx_rdma_desc.length = nob;
+       if (!*kgnilnd_tunables.kgn_bte_hash)
+               tx->tx_rdma_desc.dlvr_mode |= GNI_DLVMODE_NO_HASH;
+       if (!*kgnilnd_tunables.kgn_bte_adapt)
+               tx->tx_rdma_desc.dlvr_mode |= (GNI_DLVMODE_NO_ADAPT | GNI_DLVMODE_NO_RADAPT);
+
+       /* prep final completion message */
+       kgnilnd_init_msg(&tx->tx_msg, type, tx->tx_msg.gnm_srcnid);
+       tx->tx_msg.gnm_u.completion.gncm_cookie = cookie;
+       /* send actual size RDMA'd in retval */
+       tx->tx_msg.gnm_u.completion.gncm_retval = nob;
+
+       kgnilnd_compute_rdma_cksum(tx);
+
+       if (nob == 0) {
+               kgnilnd_queue_tx(conn, tx);
+               return;
+       }
+
+       /* Don't lie (CLOSE == RDMA idle) */
+       LASSERTF(!conn->gnc_close_sent, "tx %p on conn %p after close sent %d\n",
+                tx, conn, conn->gnc_close_sent);
+
+       GNIDBG_TX(D_NET, tx, "Post RDMA type 0x%02x dlvr_mode 0x%x",
+              type, tx->tx_rdma_desc.dlvr_mode);
+
+       /* set CQ dedicated for RDMA */
+       tx->tx_rdma_desc.src_cq_hndl = conn->gnc_device->gnd_snd_rdma_cqh;
+
+       timestamp = jiffies;
+       mutex_lock(&conn->gnc_device->gnd_cq_mutex);
+       /* delay in jiffies - we are really concerned only with things that
+        * result in a schedule() or really holding this off for long times.
+        * NB - mutex_lock could spin for 2 jiffies before going to sleep to wait */
+       conn->gnc_device->gnd_mutex_delay += (long) jiffies - timestamp;
+
+       rrc = kgnilnd_post_rdma(conn->gnc_ephandle, &tx->tx_rdma_desc);
+
+       spin_lock(&conn->gnc_list_lock);
+       kgnilnd_tx_add_state_locked(tx, conn->gnc_peer, conn, GNILND_TX_LIVE_RDMAQ, 1);
+       tx->tx_qtime = jiffies;
+       spin_unlock(&conn->gnc_list_lock);
+
+       mutex_unlock(&conn->gnc_device->gnd_cq_mutex);
+
+       /* XXX Nic: is this a place we should handle more errors for
+        * robustness sake */
+       LASSERT(rrc == GNI_RC_SUCCESS);
+}
+
+kgn_rx_t *
+kgnilnd_alloc_rx(void)
+{
+       kgn_rx_t        *rx;
+
+       rx = cfs_mem_cache_alloc(kgnilnd_data.kgn_rx_cache, CFS_ALLOC_ATOMIC);
+       if (rx == NULL) {
+               CERROR("failed to allocate rx\n");
+               return NULL;
+       }
+       CDEBUG(D_MALLOC, "slab-alloced 'rx': %lu at %p.\n",
+              sizeof(*rx), rx);
+
+       /* no memset to zero, we'll always fill all members */
+       return rx;
+}
+
+/* release just frees connection resources -
+ * we use this for the eager path after copying */
+void
+kgnilnd_release_msg(kgn_conn_t *conn)
+{
+       gni_return_t    rrc;
+       unsigned long   timestamp;
+
+       CDEBUG(D_NET, "consuming %p\n", conn);
+
+       timestamp = jiffies;
+       mutex_lock(&conn->gnc_device->gnd_cq_mutex);
+       /* delay in jiffies - we are really concerned only with things that
+        * result in a schedule() or really holding this off for long times.
+        * NB - mutex_lock could spin for 2 jiffies before going to sleep to wait */
+       conn->gnc_device->gnd_mutex_delay += (long) jiffies - timestamp;
+
+       rrc = kgnilnd_smsg_release(conn->gnc_ephandle);
+       mutex_unlock(&conn->gnc_device->gnd_cq_mutex);
+
+       LASSERTF(rrc == GNI_RC_SUCCESS, "bad rrc %d\n", rrc);
+       GNIDBG_SMSG_CREDS(D_NET, conn);
+
+       return;
+}
+
+void
+kgnilnd_consume_rx(kgn_rx_t *rx)
+{
+       kgn_conn_t      *conn = rx->grx_conn;
+       kgn_msg_t       *rxmsg = rx->grx_msg;
+
+       /* if we are eager, free the cache alloc'd msg */
+       if (unlikely(rx->grx_eager)) {
+               LIBCFS_FREE(rxmsg, sizeof(*rxmsg) + *kgnilnd_tunables.kgn_max_immediate);
+
+               /* release ref from eager_recv */
+               kgnilnd_conn_decref(conn);
+       } else {
+               GNIDBG_MSG(D_NET, rxmsg, "rx %p processed", rx);
+               kgnilnd_release_msg(conn);
+       }
+
+       cfs_mem_cache_free(kgnilnd_data.kgn_rx_cache, rx);
+       CDEBUG(D_MALLOC, "slab-freed 'rx': %lu at %p.\n",
+              sizeof(*rx), rx);
+
+       return;
+}
+
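+/* LND send handler called by LNet. Payloads at or below kgn_max_immediate go
+ * out as GNILND_MSG_IMMEDIATE; larger GETs and PUT/REPLYs set up an RDMA
+ * buffer and send a GET_REQ/PUT_REQ, with the bulk data moved later by RDMA. */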
+int
+kgnilnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg)
+{
+       lnet_hdr_t       *hdr = &lntmsg->msg_hdr;
+       int               type = lntmsg->msg_type;
+       lnet_process_id_t target = lntmsg->msg_target;
+       int               target_is_router = lntmsg->msg_target_is_router;
+       int               routing = lntmsg->msg_routing;
+       unsigned int      niov = lntmsg->msg_niov;
+       struct iovec     *iov = lntmsg->msg_iov;
+       lnet_kiov_t      *kiov = lntmsg->msg_kiov;
+       unsigned int      offset = lntmsg->msg_offset;
+       unsigned int      nob = lntmsg->msg_len;
+       unsigned int      msg_vmflush = lntmsg->msg_vmflush;
+       kgn_net_t        *net = ni->ni_data;
+       kgn_tx_t         *tx;
+       int               rc = 0;
+       int               mpflag = 0;
+
+       /* NB 'private' is different depending on what we're sending.... */
+       LASSERT(!in_interrupt());
+
+       CDEBUG(D_NET, "sending msg type %d with %d bytes in %d frags to %s\n",
+              type, nob, niov, libcfs_id2str(target));
+
+       LASSERTF(nob == 0 || niov > 0,
+               "lntmsg %p nob %d niov %d\n", lntmsg, nob, niov);
+       LASSERTF(niov <= LNET_MAX_IOV,
+               "lntmsg %p niov %d\n", lntmsg, niov);
+
+       /* payload is either all vaddrs or all pages */
+       LASSERTF(!(kiov != NULL && iov != NULL),
+               "lntmsg %p kiov %p iov %p\n", lntmsg, kiov, iov);
+
+       if (msg_vmflush)
+               mpflag = cfs_memory_pressure_get_and_set();
+
+       switch (type) {
+       default:
+               CERROR("lntmsg %p with unexpected type %d\n",
+                       lntmsg, type);
+               LBUG();
+
+       case LNET_MSG_ACK:
+               LASSERTF(nob == 0, "lntmsg %p nob %d\n",
+                       lntmsg, nob);
+               break;
+
+       case LNET_MSG_GET:
+               LASSERT(niov == 0);
+               LASSERT(nob == 0);
+
+               if (routing || target_is_router)
+                       break;                  /* send IMMEDIATE */
+
+               /* it is safe to do a direct GET without mapping a buffer for RDMA, as we
+                * check the eventual sink buffer here - if it is small enough, the remote
+                * end is perfectly capable of returning the data in a short message.
+                * The magic is that we call lnet_parse in kgnilnd_recv with rdma_req=0
+                * for IMMEDIATE messages, which will have it send a real reply instead
+                * of doing kgnilnd_recv to have the RDMA continued */
+               if (lntmsg->msg_md->md_length <= *kgnilnd_tunables.kgn_max_immediate)
+                      break;
+
+               tx = kgnilnd_new_tx_msg(GNILND_MSG_GET_REQ, ni->ni_nid);
+               if (tx == NULL) {
+                       rc = -ENOMEM;
+                       goto out;
+               }
+
+               /* slightly different options as we might actually have a GET with a
+                * MD_KIOV set but a non-NULL md_iov.iov */
+               if ((lntmsg->msg_md->md_options & LNET_MD_KIOV) == 0)
+                       rc = kgnilnd_setup_rdma_buffer(tx, lntmsg->msg_md->md_niov,
+                                                     lntmsg->msg_md->md_iov.iov, NULL,
+                                                     0, lntmsg->msg_md->md_length);
+               else
+                       rc = kgnilnd_setup_rdma_buffer(tx, lntmsg->msg_md->md_niov,
+                                                     NULL, lntmsg->msg_md->md_iov.kiov,
+                                                     0, lntmsg->msg_md->md_length);
+               if (rc != 0) {
+                       CERROR("unable to setup buffer: %d\n", rc);
+                       kgnilnd_tx_done(tx, rc);
+                       rc = -EIO;
+                       goto out;
+               }
+
+               tx->tx_lntmsg[1] = lnet_create_reply_msg(ni, lntmsg);
+               if (tx->tx_lntmsg[1] == NULL) {
+                       CERROR("Can't create reply for GET to %s\n",
+                              libcfs_nid2str(target.nid));
+                       kgnilnd_tx_done(tx, rc);
+                       rc = -EIO;
+                       goto out;
+               }
+
+               tx->tx_lntmsg[0] = lntmsg;
+               tx->tx_msg.gnm_u.get.gngm_hdr = *hdr;
+               /* rest of tx_msg is setup just before it is sent */
+               kgnilnd_launch_tx(tx, net, &target);
+               goto out;
+
+       case LNET_MSG_REPLY:
+       case LNET_MSG_PUT:
+               /* to save on MDDs, we'll handle short kiov by vmap'ing
+                * and sending via SMSG */
+               if (nob <= *kgnilnd_tunables.kgn_max_immediate)
+                      break;
+
+               tx = kgnilnd_new_tx_msg(GNILND_MSG_PUT_REQ, ni->ni_nid);
+               if (tx == NULL) {
+                       rc = -ENOMEM;
+                       goto out;
+               }
+
+               rc = kgnilnd_setup_rdma_buffer(tx, niov, iov, kiov, offset, nob);
+               if (rc != 0) {
+                       kgnilnd_tx_done(tx, rc);
+                       rc = -EIO;
+                       goto out;
+               }
+
+               tx->tx_lntmsg[0] = lntmsg;
+               tx->tx_msg.gnm_u.putreq.gnprm_hdr = *hdr;
+               /* rest of tx_msg is setup just before it is sent */
+               kgnilnd_launch_tx(tx, net, &target);
+               goto out;
+       }
+
+       /* send IMMEDIATE */
+
+       LASSERTF(nob <= *kgnilnd_tunables.kgn_max_immediate,
+               "lntmsg 0x%p too large %d\n", lntmsg, nob);
+
+       tx = kgnilnd_new_tx_msg(GNILND_MSG_IMMEDIATE, ni->ni_nid);
+       if (tx == NULL) {
+               rc = -ENOMEM;
+               goto out;
+       }
+
+       rc = kgnilnd_setup_immediate_buffer(tx, niov, iov, kiov, offset, nob);
+       if (rc != 0) {
+               kgnilnd_tx_done(tx, rc);
+               goto out;
+       }
+
+       tx->tx_msg.gnm_u.immediate.gnim_hdr = *hdr;
+       tx->tx_lntmsg[0] = lntmsg;
+       kgnilnd_launch_tx(tx, net, &target);
+
+out:
+       /* use stored value as we could have already finalized lntmsg here from a failed launch */
+       if (msg_vmflush)
+               cfs_memory_pressure_restore(mpflag);
+       return rc;
+}
+
+void
+kgnilnd_reply(lnet_ni_t *ni, kgn_rx_t *rx, lnet_msg_t *lntmsg)
+{
+       kgn_conn_t    *conn = rx->grx_conn;
+       kgn_msg_t     *rxmsg = rx->grx_msg;
+       unsigned int   niov = lntmsg->msg_niov;
+       struct iovec  *iov = lntmsg->msg_iov;
+       lnet_kiov_t   *kiov = lntmsg->msg_kiov;
+       unsigned int   offset = lntmsg->msg_offset;
+       unsigned int   nob = lntmsg->msg_len;
+       kgn_tx_t      *tx;
+       int            rc = 0;
+
+       tx = kgnilnd_new_tx_msg(GNILND_MSG_GET_DONE, ni->ni_nid);
+       if (tx == NULL)
+               goto failed_0;
+
+       rc = kgnilnd_set_tx_id(tx, conn);
+       if (rc != 0)
+               goto failed_1;
+
+       rc = kgnilnd_setup_rdma_buffer(tx, niov, iov, kiov, offset, nob);
+       if (rc != 0)
+               goto failed_1;
+
+       tx->tx_lntmsg[0] = lntmsg;
+       tx->tx_getinfo = rxmsg->gnm_u.get;
+
+       /* we only queue from kgnilnd_recv - we might get called from other contexts
+        * and we don't want to block on the mutex in those cases */
+
+       spin_lock(&tx->tx_conn->gnc_device->gnd_lock);
+       kgnilnd_tx_add_state_locked(tx, NULL, tx->tx_conn, GNILND_TX_MAPQ, 1);
+       spin_unlock(&tx->tx_conn->gnc_device->gnd_lock);
+       kgnilnd_schedule_device(tx->tx_conn->gnc_device);
+
+       return;
+
+ failed_1:
+       kgnilnd_tx_done(tx, rc);
+       kgnilnd_nak_rdma(conn, GNILND_MSG_GET_NAK, rc, rxmsg->gnm_u.get.gngm_cookie, ni->ni_nid);
+ failed_0:
+       lnet_finalize(ni, lntmsg, rc);
+}
+
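+/* LND eager_recv handler: LNet cannot consume this message yet, so copy the
+ * message and payload out of the mailbox into an allocated buffer, take a
+ * conn ref for the copy, and release the SMSG slot right away. */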
+int
+kgnilnd_eager_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg,
+                  void **new_private)
+{
+       kgn_rx_t        *rx = private;
+       kgn_conn_t      *conn = rx->grx_conn;
+       kgn_msg_t       *rxmsg = rx->grx_msg;
+       kgn_msg_t       *eagermsg = NULL;
+
+       GNIDBG_MSG(D_NET, rxmsg, "eager recv for conn %p, rxmsg %p, lntmsg %p",
+               conn, rxmsg, lntmsg);
+
+       if (rxmsg->gnm_payload_len > *kgnilnd_tunables.kgn_max_immediate) {
+               GNIDBG_MSG(D_ERROR, rxmsg, "payload too large %d",
+                       rxmsg->gnm_payload_len);
+               return -EPROTO;
+       }
+
+       /* we have no credits or buffers for this message, so copy it
+        * somewhere for a later kgnilnd_recv */
+       LIBCFS_ALLOC(eagermsg, sizeof(*eagermsg) + *kgnilnd_tunables.kgn_max_immediate);
+       if (eagermsg == NULL) {
+               CERROR("couldn't allocate eager rx message for conn %p to %s\n",
+                       conn, libcfs_nid2str(conn->gnc_peer->gnp_nid));
+               return -ENOMEM;
+       }
+
+       /* copy msg and payload */
+       memcpy(eagermsg, rxmsg, sizeof(*rxmsg) + rxmsg->gnm_payload_len);
+       rx->grx_msg = eagermsg;
+       rx->grx_eager = 1;
+
+       /* stash this for lnet_finalize on cancel-on-conn-close */
+       rx->grx_lntmsg = lntmsg;
+
+       /* add conn ref to ensure it doesn't go away until all eager messages processed */
+       kgnilnd_conn_addref(conn);
+
+       /* keep the same rx_t, it just has a new grx_msg now */
+       *new_private = private;
+
+       /* release SMSG buffer */
+       kgnilnd_release_msg(conn);
+
+       return 0;
+}
+
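+/* LND recv handler: IMMEDIATE payloads are checksummed (when enabled) and
+ * copied straight into the LNet buffers; PUT_REQ answers with a PUT_ACK
+ * carrying the sink buffer descriptor; GET_REQ either replies via
+ * kgnilnd_reply or sends a GET_NAK when nothing matched. */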
+int
+kgnilnd_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg,
+            int delayed, unsigned int niov,
+            struct iovec *iov, lnet_kiov_t *kiov,
+            unsigned int offset, unsigned int mlen, unsigned int rlen)
+{
+       kgn_rx_t    *rx = private;
+       kgn_conn_t  *conn = rx->grx_conn;
+       kgn_msg_t   *rxmsg = rx->grx_msg;
+       kgn_tx_t    *tx;
+       int          rc = 0;
+       __u32        pload_cksum;
+       ENTRY;
+
+       LASSERT(!in_interrupt());
+       LASSERTF(mlen <= rlen, "%d <= %d\n", mlen, rlen);
+       /* Either all pages or all vaddrs */
+       LASSERTF(!(kiov != NULL && iov != NULL), "kiov %p iov %p\n",
+               kiov, iov);
+
+       GNIDBG_MSG(D_NET, rxmsg, "conn %p, rxmsg %p, lntmsg %p"
+               " niov=%d kiov=%p iov=%p offset=%d mlen=%d rlen=%d",
+               conn, rxmsg, lntmsg,
+               niov, kiov, iov, offset, mlen, rlen);
+
+       /* we need to lock here as recv can be called from any context */
+       read_lock(&kgnilnd_data.kgn_peer_conn_lock);
+       if (rx->grx_eager && conn->gnc_state != GNILND_CONN_ESTABLISHED) {
+               read_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+
+               /* someone closed the conn after we copied this out, nuke it */
+               kgnilnd_consume_rx(rx);
+               lnet_finalize(ni, lntmsg, conn->gnc_error);
+               RETURN(0);
+       }
+       read_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+
+       switch (rxmsg->gnm_type) {
+       default:
+               LBUG();
+
+       case GNILND_MSG_IMMEDIATE:
+               if (mlen > rxmsg->gnm_payload_len) {
+                       GNIDBG_MSG(D_ERROR, rxmsg,
+                               "Immediate message from %s too big: %d > %d",
+                               libcfs_nid2str(conn->gnc_peer->gnp_nid), mlen,
+                               rxmsg->gnm_payload_len);
+                       rc = -EINVAL;
+                       kgnilnd_consume_rx(rx);
+                       RETURN(rc);
+               }
+
+               /* &rxmsg[1] points to the payload, sitting in the buffer
+                * right after the kgn_msg_t header - just a 'cute' way of saying
+                * rxmsg + sizeof(kgn_msg_t) */
+
+               /* check payload checksum if sent */
+
+               if (*kgnilnd_tunables.kgn_checksum >= 2 &&
+                   !rxmsg->gnm_payload_cksum &&
+                   rxmsg->gnm_payload_len != 0)
+                       GNIDBG_MSG(D_WARNING, rxmsg, "no msg payload checksum when enabled");
+
+               if (rxmsg->gnm_payload_cksum != 0) {
+                       /* gnm_payload_len set in kgnilnd_sendmsg from tx->tx_nob,
+                        * which is what is used to calculate the cksum on the TX side */
+                       pload_cksum = kgnilnd_cksum(&rxmsg[1], rxmsg->gnm_payload_len);
+
+                       if (rxmsg->gnm_payload_cksum != pload_cksum) {
+                               GNIDBG_MSG(D_NETERROR, rxmsg,
+                                          "Bad payload checksum (%x expected %x)",
+                                           pload_cksum, rxmsg->gnm_payload_cksum);
+                               switch (*kgnilnd_tunables.kgn_checksum_dump) {
+                               case 2:
+                                       kgnilnd_dump_blob(D_BUFFS, "bad payload checksum",
+                                                         &rxmsg[1], rxmsg->gnm_payload_len);
+                                       /* fall through to dump */
+                               case 1:
+                                       libcfs_debug_dumplog();
+                                       break;
+                               default:
+                                       break;
+                               }
+                               rc = -ENOKEY;
+                               /* checksum problems are fatal, kill the conn */
+                               kgnilnd_consume_rx(rx);
+                               kgnilnd_close_conn(conn, rc);
+                               RETURN(rc);
+                       }
+               }
+
+               if (kiov != NULL)
+                       lnet_copy_flat2kiov(
+                               niov, kiov, offset,
+                               *kgnilnd_tunables.kgn_max_immediate,
+                               &rxmsg[1], 0, mlen);
+               else
+                       lnet_copy_flat2iov(
+                               niov, iov, offset,
+                               *kgnilnd_tunables.kgn_max_immediate,
+                               &rxmsg[1], 0, mlen);
+
+               kgnilnd_consume_rx(rx);
+               lnet_finalize(ni, lntmsg, 0);
+               RETURN(0);
+
+       case GNILND_MSG_PUT_REQ:
+               /* LNET wants to truncate or drop transaction, sending NAK */
+               if (mlen == 0) {
+                       kgnilnd_consume_rx(rx);
+                       lnet_finalize(ni, lntmsg, 0);
+
+                       /* only error if lntmsg == NULL, otherwise we are just
+                        * short circuiting the rdma process of 0 bytes */
+                       kgnilnd_nak_rdma(conn, GNILND_MSG_PUT_NAK,
+                                       lntmsg == NULL ? -ENOENT : 0,
+                                       rxmsg->gnm_u.get.gngm_cookie,
+                                       ni->ni_nid);
+                       RETURN(0);
+               }
+               /* sending ACK with sink buff. info */
+               tx = kgnilnd_new_tx_msg(GNILND_MSG_PUT_ACK, ni->ni_nid);
+               if (tx == NULL) {
+                       kgnilnd_consume_rx(rx);
+                       RETURN(-ENOMEM);
+               }
+
+               rc = kgnilnd_set_tx_id(tx, conn);
+               if (rc != 0) {
+                       GOTO(nak_put_req, rc);
+               }
+
+               rc = kgnilnd_setup_rdma_buffer(tx, niov, iov, kiov, offset, mlen);
+               if (rc != 0) {
+                       GOTO(nak_put_req, rc);
+               }
+
+               tx->tx_msg.gnm_u.putack.gnpam_src_cookie =
+                       rxmsg->gnm_u.putreq.gnprm_cookie;
+               tx->tx_msg.gnm_u.putack.gnpam_dst_cookie = tx->tx_id.txe_cookie;
+               tx->tx_msg.gnm_u.putack.gnpam_desc.gnrd_addr =
+                       (__u64)((unsigned long)tx->tx_buffer);
+               tx->tx_msg.gnm_u.putack.gnpam_desc.gnrd_nob = mlen;
+
+               tx->tx_lntmsg[0] = lntmsg; /* finalize this on RDMA_DONE */
+
+               /* we only queue from kgnilnd_recv - we might get called from other contexts
+                * and we don't want to block the mutex in those cases */
+
+               spin_lock(&tx->tx_conn->gnc_device->gnd_lock);
+               kgnilnd_tx_add_state_locked(tx, NULL, tx->tx_conn, GNILND_TX_MAPQ, 1);
+               spin_unlock(&tx->tx_conn->gnc_device->gnd_lock);
+               kgnilnd_schedule_device(tx->tx_conn->gnc_device);
+
+               kgnilnd_consume_rx(rx);
+               RETURN(0);
+
+nak_put_req:
+               /* make sure we send an error back when the PUT fails */
+               kgnilnd_nak_rdma(conn, GNILND_MSG_PUT_NAK, rc, rxmsg->gnm_u.get.gngm_cookie, ni->ni_nid);
+               kgnilnd_tx_done(tx, rc);
+               kgnilnd_consume_rx(rx);
+
+               /* return magic LNet network error */
+               RETURN(-EIO);
+
+       case GNILND_MSG_GET_REQ:
+               if (lntmsg != NULL) {
+                       /* Matched! */
+                       kgnilnd_reply(ni, rx, lntmsg);
+               } else {
+                       /* No match */
+                       kgnilnd_nak_rdma(conn, GNILND_MSG_GET_NAK,
+                                       -ENOENT,
+                                       rxmsg->gnm_u.get.gngm_cookie,
+                                       ni->ni_nid);
+               }
+               kgnilnd_consume_rx(rx);
+               RETURN(0);
+       }
+       RETURN(0);
+}
+
+/* needs write_lock on kgn_peer_conn_lock held */
+int
+kgnilnd_check_conn_timeouts_locked(kgn_conn_t *conn)
+{
+       unsigned long      timeout, keepalive;
+       unsigned long      now = jiffies;
+       unsigned long      newest_last_rx;
+       kgn_tx_t          *tx;
+
+       /* given that we found this conn hanging off a peer, it better damned
+        * well be connected */
+       LASSERTF(conn->gnc_state == GNILND_CONN_ESTABLISHED,
+                "conn 0x%p->%s with bad state%s\n", conn,
+                conn->gnc_peer ? libcfs_nid2str(conn->gnc_peer->gnp_nid)
+                              : "<?>",
+                kgnilnd_conn_state2str(conn));
+
+       CDEBUG(D_NET, "checking conn %p->%s timeout %d keepalive %d "
+                     "rx_diff %lu tx_diff %lu\n",
+               conn, libcfs_nid2str(conn->gnc_peer->gnp_nid),
+               conn->gnc_timeout, GNILND_TO2KA(conn->gnc_timeout),
+               cfs_duration_sec(now - conn->gnc_last_rx_cq),
+               cfs_duration_sec(now - conn->gnc_last_tx));
+
+       timeout = cfs_time_seconds(conn->gnc_timeout);
+       keepalive = cfs_time_seconds(GNILND_TO2KA(conn->gnc_timeout));
+
+       /* just in case our lack of RX msg processing is gumming up the works - give the
+        * remote an extra chance */
+
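+       /* GNILND_LASTRX presumably picks the most recent of the conn's RX
+        * stamps, which is what gives the remote that extra chance */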
+       newest_last_rx = GNILND_LASTRX(conn);
+
+       if (time_after_eq(now, newest_last_rx + timeout)) {
+               GNIDBG_CONN(D_CONSOLE|D_NETERROR, conn, "No gnilnd traffic received from %s for %lu "
+                       "seconds, terminating connection. Is node down? ",
+                       libcfs_nid2str(conn->gnc_peer->gnp_nid),
+                       cfs_duration_sec(now - newest_last_rx));
+               return -ETIMEDOUT;
+       }
+
+       /* we don't timeout on last_tx stalls - we are going to trust the
+        * underlying network to let us know when sends are failing.
+        * At worst, the peer will timeout our RX stamp and drop the connection
+        * at that point. We'll then see his CLOSE or at worst his RX
+        * stamp stop and drop the connection on our end */
+
+       if (time_after_eq(now, conn->gnc_last_tx + keepalive)) {
+               CDEBUG(D_NET, "sending NOOP -> %s (%p idle %lu(%lu)) "
+                      "last %lu/%lu/%lu %lus/%lus/%lus\n",
+                      libcfs_nid2str(conn->gnc_peer->gnp_nid), conn,
+                      cfs_duration_sec(jiffies - conn->gnc_last_tx),
+                      keepalive,
+                      conn->gnc_last_noop_want, conn->gnc_last_noop_sent,
+                      conn->gnc_last_noop_cq,
+                      cfs_duration_sec(jiffies - conn->gnc_last_noop_want),
+                      cfs_duration_sec(jiffies - conn->gnc_last_noop_sent),
+                      cfs_duration_sec(jiffies - conn->gnc_last_noop_cq));
+               set_mb(conn->gnc_last_noop_want, jiffies);
+               atomic_inc(&conn->gnc_reaper_noop);
+               if (CFS_FAIL_CHECK(CFS_FAIL_GNI_NOOP_SEND))
+                       return 0;
+
+               tx = kgnilnd_new_tx_msg(GNILND_MSG_NOOP, conn->gnc_peer->gnp_net->gnn_ni->ni_nid);
+               if (tx == NULL)
+                       return 0;
+               kgnilnd_queue_tx(conn, tx);
+       }
+
+       return 0;
+}
+
+/* needs write_lock on kgn_peer_conn_lock held */
+void
+kgnilnd_check_peer_timeouts_locked(kgn_peer_t *peer, struct list_head *todie,
+                                   struct list_head *souls)
+{
+       unsigned long           timeout;
+       kgn_conn_t             *conn, *connN = NULL;
+       kgn_tx_t               *tx, *txN;
+       int                     rc = 0;
+       int                     count = 0;
+       int                     reconnect;
+       short                   releaseconn = 0;
+       unsigned long           first_rx = 0;
+
+       CDEBUG(D_NET, "checking peer 0x%p->%s for timeouts; interval %lus\n",
+               peer, libcfs_nid2str(peer->gnp_nid),
+               peer->gnp_reconnect_interval);
+
+       timeout = cfs_time_seconds(MAX(*kgnilnd_tunables.kgn_timeout,
+                                      GNILND_MIN_TIMEOUT));
+
+       conn = kgnilnd_find_conn_locked(peer);
+       if (conn) {
+               /* if there is a valid conn, check the queues for timeouts */
+               rc = kgnilnd_check_conn_timeouts_locked(conn);
+               if (rc) {
+                       if (CFS_FAIL_CHECK(CFS_FAIL_GNI_RX_CLOSE_CLOSING)) {
+                               /* simulate a RX CLOSE after the timeout but before
+                                * the scheduler thread gets it */
+                               conn->gnc_close_recvd = GNILND_CLOSE_INJECT1;
+                               conn->gnc_peer_error = -ETIMEDOUT;
+                       }
+                       /* Once we mark closed, any of the scheduler threads could
+                        * get it and move through before we hit the fail loc code */
+                       kgnilnd_close_conn_locked(conn, rc);
+               } else {
+                       /* first_rx is used to decide when to release a conn from purgatory.
+                        */
+                       first_rx = conn->gnc_first_rx;
+               }
+       }
+
+       /* now regardless of starting new conn, find tx on peer queue that
+        * are old and smell bad - do this first so we don't trigger
+        * reconnect on empty queue if we timeout all */
+       list_for_each_entry_safe(tx, txN, &peer->gnp_tx_queue, tx_list) {
+               if (time_after_eq(jiffies, tx->tx_qtime + timeout)) {
+                       if (count == 0) {
+                               LCONSOLE_INFO("could not send to %s due to connection"
+                                      " setup failure after %lu seconds\n",
+                                      libcfs_nid2str(peer->gnp_nid),
+                                      cfs_duration_sec(jiffies - tx->tx_qtime));
+                       }
+                       kgnilnd_tx_del_state_locked(tx, peer, NULL,
+                                                  GNILND_TX_ALLOCD);
+                       list_add_tail(&tx->tx_list, todie);
+                       count++;
+               }
+       }
+
+       if (count || peer->gnp_connecting == GNILND_PEER_KILL) {
+               CDEBUG(D_NET, "canceling %d tx for peer 0x%p->%s\n",
+                       count, peer, libcfs_nid2str(peer->gnp_nid));
+               /* if we nuked all the TX, stop peer connection attempt (if there is one..) */
+               if (list_empty(&peer->gnp_tx_queue) ||
+                       peer->gnp_connecting == GNILND_PEER_KILL) {
+                       /* we pass down todie to use a common function - but we know there are
+                        * no TX to add */
+                       kgnilnd_cancel_peer_connect_locked(peer, todie);
+               }
+       }
+
+       /* Don't reconnect if we are still trying to clear out old conns.
+        * This prevents us sending traffic on the new mbox before ensuring we are done
+        * with the old one */
+       reconnect = (atomic_read(&peer->gnp_dirty_eps) == 0);
+
+       /* if we are not connected and there are tx on the gnp_tx_queue waiting
+        * to be sent, we'll check the reconnect interval and fire up a new
+        * connection request */
+
+       if ((peer->gnp_connecting == GNILND_PEER_IDLE) &&
+           (time_after_eq(jiffies, peer->gnp_reconnect_time)) &&
+            !list_empty(&peer->gnp_tx_queue) && reconnect) {
+
+               CDEBUG(D_NET, "starting connect to %s\n",
+                       libcfs_nid2str(peer->gnp_nid));
+               LASSERTF(peer->gnp_connecting == GNILND_PEER_IDLE, "Peer was idle and we "
+                       "have a write_lock, state issue %d\n", peer->gnp_connecting);
+
+               peer->gnp_connecting = GNILND_PEER_CONNECT;
+               kgnilnd_peer_addref(peer); /* extra ref for connd */
+
+               spin_lock(&peer->gnp_net->gnn_dev->gnd_connd_lock);
+               list_add_tail(&peer->gnp_connd_list,
+                             &peer->gnp_net->gnn_dev->gnd_connd_peers);
+               spin_unlock(&peer->gnp_net->gnn_dev->gnd_connd_lock);
+
+               kgnilnd_schedule_dgram(peer->gnp_net->gnn_dev);
+       }
+
+       /* fail_loc to allow us to delay release of purgatory */
+       if (CFS_FAIL_CHECK(CFS_FAIL_GNI_PURG_REL_DELAY))
+               return;
+
+       /* This check allows us to verify that the new conn is actually being used. This allows us to
+        * pull the old conns out of purgatory if they have actually seen traffic.
+        * We only release a conn from purgatory during stack reset, admin command, or when a peer reconnects
+        */
+       if (first_rx &&
+               time_after(jiffies, first_rx + cfs_time_seconds(*kgnilnd_tunables.kgn_hardware_timeout))) {
+               CDEBUG(D_NET,"We can release conn %p from purgatory %lu\n",
+                      conn, first_rx + cfs_time_seconds(*kgnilnd_tunables.kgn_hardware_timeout));
+               releaseconn = 1;
+       }
+
+       list_for_each_entry_safe (conn, connN, &peer->gnp_conns, gnc_list) {
+       /* check for purgatory timeouts */
+               if (conn->gnc_in_purgatory) {
+                       /* We cannot detach this conn from purgatory if it has not been closed,
+                        * so we skip it for now; the next time we check it we can detach it
+                        * from purgatory
+                        */
+
+                       if (conn->gnc_state != GNILND_CONN_DONE) {
+                               /* Skip over conns that are currently not DONE. If they aren't already scheduled
+                                * for completion, something in the state machine is broken.
+                                */
+                               continue;
+                       }
+
+                       /* We only detach a conn that is in purgatory if we have received a close message,
+                        * we have a new valid connection that has successfully received data, or an admin
+                        * command tells us we need to detach.
+                        */
+
+                       if (conn->gnc_close_recvd || releaseconn || conn->gnc_needs_detach) {
+                               unsigned long   waiting;
+
+                               waiting = (long) jiffies - conn->gnc_last_rx_cq;
+
+                               /* C.E: The remote peer is expected to close the
+                                * connection (see kgnilnd_check_conn_timeouts)
+                                * via the reaper thread and nuke out the MDD and
+                                * FMA resources after conn->gnc_timeout has expired
+                                * without an FMA RX */
+                               CDEBUG(D_NET, "Reconnected to %s in %lds or admin forced detach, dropping "
+                                       " held resources\n",
+                                       libcfs_nid2str(conn->gnc_peer->gnp_nid),
+                                       cfs_duration_sec(waiting));
+
+                               kgnilnd_detach_purgatory_locked(conn, souls);
+                       }
+               }
+       }
+
+       return;
+}
+
+void
+kgnilnd_reaper_check(int idx)
+{
+       struct list_head  *peers = &kgnilnd_data.kgn_peers[idx];
+       struct list_head  *ctmp, *ctmpN;
+       struct list_head   geriatrics;
+       struct list_head   souls;
+
+       INIT_LIST_HEAD(&geriatrics);
+       INIT_LIST_HEAD(&souls);
+
+       write_lock(&kgnilnd_data.kgn_peer_conn_lock);
+
+       list_for_each_safe(ctmp, ctmpN, peers) {
+               kgn_peer_t        *peer = NULL;
+
+               /* don't timeout stuff if the network is mucked or shutting down */
+               if (kgnilnd_check_hw_quiesce()) {
+                       break;
+               }
+               peer = list_entry(ctmp, kgn_peer_t, gnp_list);
+
+               kgnilnd_check_peer_timeouts_locked(peer, &geriatrics, &souls);
+       }
+
+       write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+
+       kgnilnd_txlist_done(&geriatrics, -EHOSTUNREACH);
+       kgnilnd_release_purgatory_list(&souls);
+}
+
+void
+kgnilnd_update_reaper_timeout(long timeout)
+{
+       LASSERT(timeout > 0);
+
+       spin_lock(&kgnilnd_data.kgn_reaper_lock);
+
+       if (timeout < kgnilnd_data.kgn_new_min_timeout)
+               kgnilnd_data.kgn_new_min_timeout = timeout;
+
+       spin_unlock(&kgnilnd_data.kgn_reaper_lock);
+}
+
+static void
+kgnilnd_reaper_poke_with_stick(unsigned long arg)
+{
+       wake_up(&kgnilnd_data.kgn_reaper_waitq);
+}
+
+int
+kgnilnd_reaper(void *arg)
+{
+       long               timeout;
+       int                i;
+       int                hash_index = 0;
+       unsigned long      next_check_time = jiffies;
+       long               current_min_timeout = MAX_SCHEDULE_TIMEOUT;
+       struct timer_list  timer;
+       DEFINE_WAIT(wait);
+
+       cfs_daemonize("kgnilnd_rpr");
+       cfs_block_allsigs();
+
+       /* all gnilnd threads need to run fairly urgently */
+       set_user_nice(current, *kgnilnd_tunables.kgn_nice);
+       spin_lock(&kgnilnd_data.kgn_reaper_lock);
+
+       while (!kgnilnd_data.kgn_shutdown) {
+               /* I wake up every 'p' seconds to check for timeouts on some
+                * more peers.  I try to check every connection 'n' times
+                * within the global minimum of all keepalive and timeout
+                * intervals, to ensure I attend to every connection within
+                * (n+1)/n times its timeout intervals. */
+               const int     p = GNILND_REAPER_THREAD_WAKE;
+               const int     n = GNILND_REAPER_NCHECKS;
+               int           chunk;
+               /* to quiesce or to not quiesce, that is the question */
+               if (unlikely(kgnilnd_data.kgn_quiesce_trigger)) {
+                       spin_unlock(&kgnilnd_data.kgn_reaper_lock);
+                       KGNILND_SPIN_QUIESCE;
+                       spin_lock(&kgnilnd_data.kgn_reaper_lock);
+               }
+
+               /* careful with the jiffy wrap... */
+               timeout = (long)(next_check_time - jiffies);
+
+               if (timeout > 0) {
+                       prepare_to_wait(&kgnilnd_data.kgn_reaper_waitq, &wait,
+                                       TASK_INTERRUPTIBLE);
+                       spin_unlock(&kgnilnd_data.kgn_reaper_lock);
+                       setup_timer(&timer, kgnilnd_reaper_poke_with_stick,
+                                   next_check_time);
+                       mod_timer(&timer, (long) jiffies + timeout);
+
+                       /* check flag variables before committing */
+                       if (!kgnilnd_data.kgn_shutdown &&
+                           !kgnilnd_data.kgn_quiesce_trigger) {
+                               CDEBUG(D_INFO, "schedule timeout %ld (%lu sec)\n",
+                                      timeout, cfs_duration_sec(timeout));
+                               schedule();
+                               CDEBUG(D_INFO, "awake after schedule\n");
+                       }
+
+                       del_singleshot_timer_sync(&timer);
+                       spin_lock(&kgnilnd_data.kgn_reaper_lock);
+                       finish_wait(&kgnilnd_data.kgn_reaper_waitq, &wait);
+                       continue;
+               }
+
+               /* new_min_timeout is set from the conn timeouts and keepalive
+                * this should end up with a min timeout of
+                * GNILND_TIMEOUT2KEEPALIVE(t) or roughly LND_TIMEOUT/2 */
+               if (kgnilnd_data.kgn_new_min_timeout < current_min_timeout) {
+                       current_min_timeout = kgnilnd_data.kgn_new_min_timeout;
+                       CDEBUG(D_NET, "Set new min timeout %ld\n",
+                              current_min_timeout);
+               }
+
+               spin_unlock(&kgnilnd_data.kgn_reaper_lock);
+
+               /* Compute how many table entries to check now so I get round
+                * the whole table fast enough given that I do this at fixed
+                * intervals of 'p' seconds */
+               chunk = *kgnilnd_tunables.kgn_peer_hash_size;
+               if (kgnilnd_data.kgn_new_min_timeout > n * p)
+                       chunk = (chunk * n * p) /
+                               kgnilnd_data.kgn_new_min_timeout;
+               if (chunk == 0)
+                       chunk = 1;
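+               /* hypothetical example: with a 100-bucket peer hash, n * p of
+                * 10s and a min timeout of 40s, chunk = 100 * 10 / 40 = 25
+                * buckets per wakeup, so the whole table is still covered well
+                * within the timeout */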
+               for (i = 0; i < chunk; i++) {
+                       kgnilnd_reaper_check(hash_index);
+                       hash_index = (hash_index + 1) %
+                               *kgnilnd_tunables.kgn_peer_hash_size;
+               }
+               next_check_time = (long) jiffies + cfs_time_seconds(p);
+               CDEBUG(D_INFO, "next check at %lu or in %d sec\n", next_check_time, p);
+
+               spin_lock(&kgnilnd_data.kgn_reaper_lock);
+       }
+
+       spin_unlock(&kgnilnd_data.kgn_reaper_lock);
+
+       kgnilnd_thread_fini();
+       return 0;
+}
+
+int
+kgnilnd_check_rdma_cq(kgn_device_t *dev)
+{
+       gni_return_t           rrc;
+       gni_post_descriptor_t *desc;
+       __u64                  event_data;
+       kgn_tx_ev_id_t         ev_id;
+       char                   err_str[256];
+       int                    should_retry, rc;
+       long                   num_processed = 0;
+       kgn_conn_t            *conn = NULL;
+       kgn_tx_t              *tx = NULL;
+
+       for (;;) {
+               /* make sure we don't keep looping if we need to reset */
+               if (unlikely(kgnilnd_data.kgn_quiesce_trigger)) {
+                       return num_processed;
+               }
+               rc = kgnilnd_mutex_trylock(&dev->gnd_cq_mutex);
+               if (!rc) {
+                       /* we didn't get the mutex, so return that there is still work
+                        * to be done */
+                       return 1;
+               }
+               if (CFS_FAIL_CHECK(CFS_FAIL_GNI_DELAY_RDMA)) {
+                       /* a bit gross - but we need a good way to test for
+                        * delayed RDMA completions and the easiest way to do
+                        * that is to delay the RDMA CQ events */
+                       rrc = GNI_RC_NOT_DONE;
+               } else {
+                       rrc = kgnilnd_cq_get_event(dev->gnd_snd_rdma_cqh, &event_data);
+               }
+
+               if (rrc == GNI_RC_NOT_DONE) {
+                       mutex_unlock(&dev->gnd_cq_mutex);
+                       CDEBUG(D_INFO, "SEND RDMA CQ %d empty processed %ld\n",
+                              dev->gnd_id, num_processed);
+                       return num_processed;
+               }
+               dev->gnd_sched_alive = jiffies;
+               num_processed++;
+
+               LASSERTF(!GNI_CQ_OVERRUN(event_data),
+                       "this is bad, somehow our credits didn't protect us"
+                       " from CQ overrun\n");
+               LASSERTF(GNI_CQ_GET_TYPE(event_data) == GNI_CQ_EVENT_TYPE_POST,
+                       "rrc %d, GNI_CQ_GET_TYPE("LPX64") = "LPX64"\n", rrc,
+                       event_data, GNI_CQ_GET_TYPE(event_data));
+
+               rrc = kgnilnd_get_completed(dev->gnd_snd_rdma_cqh, event_data,
+                                           &desc);
+               mutex_unlock(&dev->gnd_cq_mutex);
+
+               /* XXX Nic: Need better error handling here... */
+               LASSERTF((rrc == GNI_RC_SUCCESS) ||
+                         (rrc == GNI_RC_TRANSACTION_ERROR),
+                        "rrc %d\n", rrc);
+
+               ev_id.txe_cookie = desc->post_id;
+
+               kgnilnd_validate_tx_ev_id(&ev_id, &tx, &conn);
+
+               if (conn == NULL || tx == NULL) {
+                       /* either conn or tx was already nuked and this is a "late"
+                        * completion, so drop it */
+                       continue;
+               }
+
+               GNITX_ASSERTF(tx, tx->tx_msg.gnm_type == GNILND_MSG_PUT_DONE ||
+                       tx->tx_msg.gnm_type == GNILND_MSG_GET_DONE,
+                       "tx %p with type %d\n", tx, tx->tx_msg.gnm_type);
+
+               GNIDBG_TX(D_NET, tx, "RDMA completion for %d bytes", tx->tx_nob);
+
+               /* remove from rdmaq */
+               spin_lock(&conn->gnc_list_lock);
+               kgnilnd_tx_del_state_locked(tx, NULL, conn, GNILND_TX_ALLOCD);
+               spin_unlock(&conn->gnc_list_lock);
+
+               if (likely(desc->status == GNI_RC_SUCCESS)) {
+                       atomic_inc(&dev->gnd_rdma_ntx);
+                       atomic64_add(tx->tx_nob, &dev->gnd_rdma_txbytes);
+                       /* transaction succeeded, add into fmaq */
+                       kgnilnd_queue_tx(conn, tx);
+                       kgnilnd_peer_alive(conn->gnc_peer);
+
+                       /* drop ref from kgnilnd_validate_tx_ev_id */
+                       kgnilnd_conn_decref(conn);
+                       continue;
+               }
+
+               /* fall through to the TRANSACTION_ERROR case */
+               tx->tx_retrans++;
+
+               /* get stringified version for log messages */
+               kgnilnd_cq_error_str(event_data, &err_str, 256);
+               kgnilnd_cq_error_recoverable(event_data, &should_retry);
+
+               /* make sure we are not off in the weeds with this tx */
+               if (tx->tx_retrans >
+                       *kgnilnd_tunables.kgn_max_retransmits) {
+                       GNIDBG_TX(D_NETERROR, tx,
+                              "giving up on TX, too many retries", NULL);
+                       should_retry = 0;
+               }
+
+               GNIDBG_TX(D_NETERROR, tx, "RDMA %s error (%s)",
+                       should_retry ? "transient" : "unrecoverable", err_str);
+
+               if (tx->tx_msg.gnm_type == GNILND_MSG_PUT_DONE) {
+                       if (should_retry) {
+                               kgnilnd_rdma(tx, GNILND_MSG_PUT_DONE,
+                                            &tx->tx_putinfo.gnpam_desc,
+                                            tx->tx_putinfo.gnpam_desc.gnrd_nob,
+                                            tx->tx_putinfo.gnpam_dst_cookie);
+                       } else {
+                               kgnilnd_nak_rdma(conn, GNILND_MSG_PUT_NAK,
+                                               -EFAULT,
+                                               tx->tx_putinfo.gnpam_dst_cookie,
+                                               tx->tx_msg.gnm_srcnid);
+                               kgnilnd_tx_done(tx, -EFAULT);
+                       }
+               } else {
+                       if (should_retry) {
+                               kgnilnd_rdma(tx, GNILND_MSG_GET_DONE,
+                                            &tx->tx_getinfo.gngm_desc,
+                                            tx->tx_lntmsg[0]->msg_len,
+                                            tx->tx_getinfo.gngm_cookie);
+                       } else {
+                               kgnilnd_nak_rdma(conn, GNILND_MSG_GET_NAK,
+                                               -EFAULT,
+                                               tx->tx_getinfo.gngm_cookie,
+                                               tx->tx_msg.gnm_srcnid);
+                               kgnilnd_tx_done(tx, -EFAULT);
+                       }
+               }
+
+               /* drop ref from kgnilnd_validate_tx_ev_id */
+               kgnilnd_conn_decref(conn);
+       }
+}
+
+int
+kgnilnd_check_fma_send_cq(kgn_device_t *dev)
+{
+       gni_return_t           rrc;
+       __u64                  event_data;
+       kgn_tx_ev_id_t         ev_id;
+       kgn_tx_t              *tx = NULL;
+       kgn_conn_t            *conn = NULL;
+       int                    queued_fma, saw_reply, rc;
+       long                   num_processed = 0;
+
+       for (;;) {
+               /* make sure we don't keep looping if we need to reset */
+               if (unlikely(kgnilnd_data.kgn_quiesce_trigger)) {
+                       return num_processed;
+               }
+
+               rc = kgnilnd_mutex_trylock(&dev->gnd_cq_mutex);
+               if (!rc) {
+                       /* we didn't get the mutex, so return that there is still work
+                        * to be done */
+                       return 1;
+               }
+
+               rrc = kgnilnd_cq_get_event(dev->gnd_snd_fma_cqh, &event_data);
+               mutex_unlock(&dev->gnd_cq_mutex);
+
+               if (rrc == GNI_RC_NOT_DONE) {
+                       CDEBUG(D_INFO,
+                              "SMSG send CQ %d not ready (data "LPX64") "
+                              "processed %ld\n", dev->gnd_id, event_data,
+                              num_processed);
+                       return num_processed;
+               }
+
+               dev->gnd_sched_alive = jiffies;
+               num_processed++;
+
+               LASSERTF(!GNI_CQ_OVERRUN(event_data),
+                       "this is bad, somehow our credits didn't "
+                       "protect us from CQ overrun\n");
+               LASSERTF(GNI_CQ_GET_TYPE(event_data) == GNI_CQ_EVENT_TYPE_SMSG,
+                       "rrc %d, GNI_CQ_GET_TYPE("LPX64") = "LPX64"\n", rrc,
+                       event_data, GNI_CQ_GET_TYPE(event_data));
+
+               /* if SMSG couldn't handle an error, time for conn to die */
+               if (unlikely(rrc == GNI_RC_TRANSACTION_ERROR)) {
+                       char            err_str[256];
+
+                       /* need to take the write_lock to ensure atomicity
+                        * on the conn state if we need to close it */
+                       write_lock(&kgnilnd_data.kgn_peer_conn_lock);
+                       conn = kgnilnd_cqid2conn_locked(GNI_CQ_GET_INST_ID(event_data));
+                       if (conn == NULL) {
+                               /* Conn was destroyed? */
+                               CDEBUG(D_NET,
+                                       "SMSG CQID lookup "LPX64" failed\n",
+                                       GNI_CQ_GET_INST_ID(event_data));
+                               write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+                               continue;
+                       }
+
+                       kgnilnd_cq_error_str(event_data, &err_str, 256);
+                       CNETERR("SMSG send error to %s: rc %d (%s)\n",
+                              libcfs_nid2str(conn->gnc_peer->gnp_nid),
+                              rrc, err_str);
+                       kgnilnd_close_conn_locked(conn, -ECOMM);
+
+                       write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+
+                       /* no need to process rest of this tx -
+                        * it is getting canceled */
+                       continue;
+               }
+
+               /* fall through to GNI_RC_SUCCESS case */
+               ev_id.txe_smsg_id = GNI_CQ_GET_MSG_ID(event_data);
+
+               kgnilnd_validate_tx_ev_id(&ev_id, &tx, &conn);
+               if (conn == NULL || tx == NULL) {
+                       /* either conn or tx was already nuked and this is a "late"
+                        * completion, so drop it */
+                       continue;
+               }
+
+               tx->tx_conn->gnc_last_tx_cq = jiffies;
+               if (tx->tx_msg.gnm_type == GNILND_MSG_NOOP) {
+                       set_mb(conn->gnc_last_noop_cq, jiffies);
+               }
+
+               /* lock tx_list_state and tx_state */
+               spin_lock(&tx->tx_conn->gnc_list_lock);
+
+               GNITX_ASSERTF(tx, tx->tx_list_state == GNILND_TX_LIVE_FMAQ,
+                               "state not GNILND_TX_LIVE_FMAQ", NULL);
+               GNITX_ASSERTF(tx, tx->tx_state & GNILND_TX_WAITING_COMPLETION,
+                       "not waiting for completion", NULL);
+
+               GNIDBG_TX(D_NET, tx, "SMSG complete tx_state %x rc %d",
+                       tx->tx_state, rrc);
+
+               tx->tx_state &= ~GNILND_TX_WAITING_COMPLETION;
+
+               /* This will trigger other FMA sends that were
+                * pending this completion */
+               queued_fma = !list_empty(&tx->tx_conn->gnc_fmaq);
+
+               /* we either did not expect reply or we already got it */
+               saw_reply = !(tx->tx_state & GNILND_TX_WAITING_REPLY);
+
+               spin_unlock(&tx->tx_conn->gnc_list_lock);
+
+               if (queued_fma) {
+                       CDEBUG(D_NET, "scheduling conn 0x%p->%s for fmaq\n",
+                              conn,
+                              libcfs_nid2str(conn->gnc_peer->gnp_nid));
+                       kgnilnd_schedule_conn(conn);
+               }
+
+               /* If saw_reply is false as soon as gnc_list_lock is dropped the tx could be nuked
+                * If saw_reply is true we know that the tx is safe to use as the other thread
+                * is already finished with it.
+                */
+
+               if (saw_reply) {
+                       /* no longer need to track on the live_fmaq */
+                       kgnilnd_tx_del_state_locked(tx, NULL, tx->tx_conn, GNILND_TX_ALLOCD);
+
+                       if (tx->tx_state & GNILND_TX_PENDING_RDMA) {
+                               /* we already got the reply & were waiting for
+                                * completion of the initial send before
+                                * initiating the RDMA transaction */
+                               GNIDBG_TX(D_NET, tx,
+                                        "Pending RDMA 0x%p type 0x%02x",
+                                        tx->tx_msg.gnm_type);
+                               tx->tx_state &= ~GNILND_TX_PENDING_RDMA;
+                               rc = kgnilnd_send_mapped_tx(tx, 0);
+                               GNITX_ASSERTF(tx, rc == 0, "RDMA send failed: %d\n", rc);
+                       } else {
+                               /* we are done with this tx */
+                               GNIDBG_TX(D_NET, tx,
+                                        "Done with tx type 0x%02x",
+                                        tx->tx_msg.gnm_type);
+                               kgnilnd_tx_done(tx, tx->tx_rc);
+                       }
+               }
+
+               /* drop ref from kgnilnd_validate_tx_ev_id */
+               kgnilnd_conn_decref(conn);
+
+               /* if we are waiting for a REPLY, we'll handle the tx then */
+       } /* end for loop */
+}
+
+int
+kgnilnd_check_fma_rcv_cq(kgn_device_t *dev)
+{
+       kgn_conn_t         *conn;
+       gni_return_t        rrc;
+       __u64               event_data;
+       long                num_processed = 0;
+       struct list_head   *conns;
+       struct list_head   *tmp;
+       int                 rc;
+
+       for (;;) {
+               /* make sure we don't keep looping if we need to reset */
+               if (unlikely(kgnilnd_data.kgn_quiesce_trigger)) {
+                       return num_processed;
+               }
+
+               rc = kgnilnd_mutex_trylock(&dev->gnd_cq_mutex);
+               if (!rc) {
+                       /* we didn't get the mutex, so return that there is still work
+                        * to be done */
+                       return 1;
+               }
+               rrc = kgnilnd_cq_get_event(dev->gnd_rcv_fma_cqh, &event_data);
+               mutex_unlock(&dev->gnd_cq_mutex);
+
+               if (rrc == GNI_RC_NOT_DONE) {
+                       CDEBUG(D_INFO, "SMSG RX CQ %d empty data "LPX64" "
+                               "processed %ld\n",
+                               dev->gnd_id, event_data, num_processed);
+                       return num_processed;
+               }
+               dev->gnd_sched_alive = jiffies;
+               num_processed++;
+
+               /* this is the only CQ that can really handle transient
+                * CQ errors */
+               if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CQ_GET_EVENT)) {
+                       rrc = cfs_fail_val ? cfs_fail_val
+                                          : GNI_RC_ERROR_RESOURCE;
+                       if (rrc == GNI_RC_ERROR_RESOURCE) {
+                               /* set overrun too */
+                               event_data |= (1UL << 63);
+                               LASSERTF(GNI_CQ_OVERRUN(event_data),
+                                        "(1UL << 63) is no longer the bit to "
+                                        "set to indicate CQ_OVERRUN\n");
+                       }
+               }
+               /* sender should get error event too and take care
+                * of failed transaction by re-transmitting */
+               if (rrc == GNI_RC_TRANSACTION_ERROR) {
+                       CDEBUG(D_NET, "SMSG RX CQ error "LPX64"\n", event_data);
+                       continue;
+               }
+
+               if (likely(!GNI_CQ_OVERRUN(event_data))) {
+                       read_lock(&kgnilnd_data.kgn_peer_conn_lock);
+                       conn = kgnilnd_cqid2conn_locked(
+                                                GNI_CQ_GET_INST_ID(event_data));
+                       if (conn == NULL) {
+                               CDEBUG(D_NET, "SMSG RX CQID lookup "LPU64" "
+                                       "failed, dropping event "LPX64"\n",
+                                       GNI_CQ_GET_INST_ID(event_data),
+                                       event_data);
+                       } else {
+                               CDEBUG(D_NET, "SMSG RX: CQID "LPU64" "
+                                      "conn %p->%s\n",
+                                       GNI_CQ_GET_INST_ID(event_data),
+                                       conn, conn->gnc_peer ?
+                                       libcfs_nid2str(conn->gnc_peer->gnp_nid) :
+                                       "<?>");
+
+                               conn->gnc_last_rx_cq = jiffies;
+
+                               /* stash first rx so we can clear out purgatory.
+                                */
+                               if (conn->gnc_first_rx == 0) {
+                                       conn->gnc_first_rx = jiffies;
+                               }
+                               kgnilnd_peer_alive(conn->gnc_peer);
+                               kgnilnd_schedule_conn(conn);
+                       }
+                       read_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+                       continue;
+               }
+
+               /* FMA CQ has overflowed: check ALL conns */
+               CNETERR("SMSG RX CQ overflow: scheduling ALL "
+                      "conns on device %d\n", dev->gnd_id);
+
+               for (rc = 0; rc < *kgnilnd_tunables.kgn_peer_hash_size; rc++) {
+
+                       read_lock(&kgnilnd_data.kgn_peer_conn_lock);
+                       conns = &kgnilnd_data.kgn_conns[rc];
+
+                       list_for_each(tmp, conns) {
+                               conn = list_entry(tmp, kgn_conn_t,
+                                                 gnc_hashlist);
+
+                               if (conn->gnc_device == dev) {
+                                       kgnilnd_schedule_conn(conn);
+                                       conn->gnc_last_rx_cq = jiffies;
+                               }
+                       }
+
+                       /* don't block write lockers for too long... */
+                       read_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+               }
+       }
+}
+
+/* try_map_if_full should only be used when processing TX from list of
+ * backlog TX waiting on mappings to free up
+ *
+ * Return Codes:
+ *  try_map_if_full = 0: 0 (sent or queued), (-|+)errno failure of kgnilnd_sendmsg
+ *  try_map_if_full = 1: 0 (sent), -ENOMEM for caller to requeue, (-|+)errno failure of kgnilnd_sendmsg */
+
+int
+kgnilnd_send_mapped_tx(kgn_tx_t *tx, int try_map_if_full)
+{
+       /* slight bit of race if multiple people are calling, but at worst we'll have
+        * order altered just a bit... which would not be deterministic anyway */
+       int     rc = atomic_read(&tx->tx_conn->gnc_device->gnd_nq_map);
+
+       GNIDBG_TX(D_NET, tx, "try %d nq_map %d", try_map_if_full, rc);
+
+       /* We know that we have a GART reservation that should guarantee forward progress.
+        * This means we don't need to take any extraordinary efforts if we are failing
+        * mappings here - even if we are holding a very small number of these. */
+
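+       /* only attempt the mapping right away if nothing is already queued
+        * waiting on map resources (nq_map == 0) or the caller asked us to
+        * try regardless */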
+       if (try_map_if_full || (rc == 0)) {
+               rc = kgnilnd_map_buffer(tx);
+       }
+
+       /* rc should be 0 if we mapped successfully here; if non-zero we are queueing */
+       if (rc != 0) {
+               /* if try_map_if_full set, they handle requeuing */
+               if (unlikely(try_map_if_full)) {
+                       RETURN(rc);
+               } else {
+                       spin_lock(&tx->tx_conn->gnc_device->gnd_lock);
+                       kgnilnd_tx_add_state_locked(tx, NULL, tx->tx_conn, GNILND_TX_MAPQ, 1);
+                       spin_unlock(&tx->tx_conn->gnc_device->gnd_lock);
+                       /* make sure we wake up sched to run this */
+                       kgnilnd_schedule_device(tx->tx_conn->gnc_device);
+                       /* return 0 as this is now queued for later sending */
+                       RETURN(0);
+               }
+       }
+
+       switch (tx->tx_msg.gnm_type) {
+       default:
+               LBUG();
+               break;
+       /* GET_REQ and PUT_ACK are outbound messages sending our mapping key to
+        * remote node where the RDMA will be started
+        * Special case -EAGAIN logic - this should just be queued as if the mapping couldn't
+        * be satisfied. The rest of the errors are "hard" errors that require
+        * upper layers to handle themselves */
+       case GNILND_MSG_GET_REQ:
+               tx->tx_msg.gnm_u.get.gngm_desc.gnrd_key = tx->tx_map_key;
+               tx->tx_msg.gnm_u.get.gngm_cookie = tx->tx_id.txe_cookie;
+               tx->tx_msg.gnm_u.get.gngm_desc.gnrd_addr = (__u64)((unsigned long)tx->tx_buffer);
+               tx->tx_msg.gnm_u.get.gngm_desc.gnrd_nob = tx->tx_nob;
+               tx->tx_state = GNILND_TX_WAITING_COMPLETION | GNILND_TX_WAITING_REPLY;
+               if (CFS_FAIL_CHECK(CFS_FAIL_GNI_GET_REQ_AGAIN)) {
+                       tx->tx_state |= GNILND_TX_FAIL_SMSG;
+               }
+               /* redirect to FMAQ on failure, no need to infinite loop here in MAPQ */
+               rc = kgnilnd_sendmsg(tx, NULL, 0, &tx->tx_conn->gnc_list_lock, GNILND_TX_FMAQ);
+               break;
+       case GNILND_MSG_PUT_ACK:
+               tx->tx_msg.gnm_u.putack.gnpam_desc.gnrd_key = tx->tx_map_key;
+               tx->tx_state = GNILND_TX_WAITING_COMPLETION | GNILND_TX_WAITING_REPLY;
+               if (CFS_FAIL_CHECK(CFS_FAIL_GNI_PUT_ACK_AGAIN)) {
+                       tx->tx_state |= GNILND_TX_FAIL_SMSG;
+               }
+               /* redirect to FMAQ on failure, no need to infinite loop here in MAPQ */
+               rc = kgnilnd_sendmsg(tx, NULL, 0, &tx->tx_conn->gnc_list_lock, GNILND_TX_FMAQ);
+               break;
+
+       /* PUT_REQ and GET_DONE are where we do the actual RDMA */
+       case GNILND_MSG_PUT_REQ:
+               kgnilnd_rdma(tx, GNILND_MSG_PUT_DONE,
+                            &tx->tx_putinfo.gnpam_desc,
+                            tx->tx_putinfo.gnpam_desc.gnrd_nob,
+                            tx->tx_putinfo.gnpam_dst_cookie);
+               break;
+       case GNILND_MSG_GET_DONE:
+               kgnilnd_rdma(tx, GNILND_MSG_GET_DONE,
+                            &tx->tx_getinfo.gngm_desc,
+                            tx->tx_lntmsg[0]->msg_len,
+                            tx->tx_getinfo.gngm_cookie);
+
+               break;
+       }
+
+       RETURN(rc);
+}
+
+void
+kgnilnd_process_fmaq(kgn_conn_t *conn)
+{
+       int           more_to_do = 0;
+       kgn_tx_t     *tx = NULL;
+       void         *buffer = NULL;
+       unsigned int  nob = 0;
+       int           rc;
+
+       /* NB 1. kgnilnd_sendmsg() may fail if I'm out of credits right now.
+        *       However I will be rescheduled by an FMA completion event
+        *       when I eventually get some.
+        * NB 2. Sampling gnc_state here races with setting it elsewhere.
+        *       But it doesn't matter if I try to send a "real" message just
+        *       as I start closing because I'll get scheduled to send the
+        *       close anyway. */
+
+       /* Short circuit if the ep_handle is null; we can't send anyway. */
+       if (conn->gnc_ephandle == NULL)
+               return;
+
+       LASSERTF(!conn->gnc_close_sent, "Conn %p close was sent\n", conn);
+
+       spin_lock(&conn->gnc_list_lock);
+
+       if (list_empty(&conn->gnc_fmaq)) {
+               int     keepalive = GNILND_TO2KA(conn->gnc_timeout);
+
+               spin_unlock(&conn->gnc_list_lock);
+
+               if (time_after_eq(jiffies, conn->gnc_last_tx + cfs_time_seconds(keepalive))) {
+                       CDEBUG(D_NET, "sending NOOP -> %s (%p idle %lu(%d)) "
+                              "last %lu/%lu/%lu %lus/%lus/%lus\n",
+                              libcfs_nid2str(conn->gnc_peer->gnp_nid), conn,
+                              cfs_duration_sec(jiffies - conn->gnc_last_tx),
+                              keepalive,
+                              conn->gnc_last_noop_want, conn->gnc_last_noop_sent,
+                              conn->gnc_last_noop_cq,
+                              cfs_duration_sec(jiffies - conn->gnc_last_noop_want),
+                              cfs_duration_sec(jiffies - conn->gnc_last_noop_sent),
+                              cfs_duration_sec(jiffies - conn->gnc_last_noop_cq));
+                       atomic_inc(&conn->gnc_sched_noop);
+                       set_mb(conn->gnc_last_noop_want, jiffies);
+
+                       if (CFS_FAIL_CHECK(CFS_FAIL_GNI_NOOP_SEND))
+                               return;
+
+                       tx = kgnilnd_new_tx_msg(GNILND_MSG_NOOP, conn->gnc_peer->gnp_net->gnn_ni->ni_nid);
+                       if (tx != NULL) {
+                               int     rc;
+
+                               rc = kgnilnd_set_tx_id(tx, conn);
+                               if (rc != 0) {
+                                       kgnilnd_tx_done(tx, rc);
+                                       return;
+                               }
+                       }
+               }
+       } else {
+               tx = list_first_entry(&conn->gnc_fmaq, kgn_tx_t, tx_list);
+               /* move from fmaq to allocd, kgnilnd_sendmsg will move to live_fmaq */
+               kgnilnd_tx_del_state_locked(tx, NULL, conn, GNILND_TX_ALLOCD);
+               more_to_do = !list_empty(&conn->gnc_fmaq);
+               spin_unlock(&conn->gnc_list_lock);
+       }
+
+       /* if there is no real TX or no NOOP to send, bail */
+       if (tx == NULL) {
+               return;
+       }
+
+       if (!tx->tx_retrans)
+               tx->tx_cred_wait = jiffies;
+
+       GNITX_ASSERTF(tx, tx->tx_id.txe_smsg_id != 0,
+                     "tx with zero id", NULL);
+
+       CDEBUG(D_NET, "sending regular msg: %p, type %s(0x%02x), cookie "LPX64"\n",
+              tx, kgnilnd_msgtype2str(tx->tx_msg.gnm_type),
+              tx->tx_msg.gnm_type, tx->tx_id.txe_cookie);
+
+       rc = 0;
+
+       switch (tx->tx_msg.gnm_type) {
+       default:
+               LBUG();
+
+       case GNILND_MSG_NOOP:
+       case GNILND_MSG_CLOSE:
+       case GNILND_MSG_IMMEDIATE:
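+               /* these types carry any payload inline in the SMSG, so hand the
+                * buffer and length straight to kgnilnd_sendmsg */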
+               tx->tx_state = GNILND_TX_WAITING_COMPLETION;
+               buffer = tx->tx_buffer;
+               nob = tx->tx_nob;
+               break;
+
+       case GNILND_MSG_GET_DONE:
+       case GNILND_MSG_PUT_DONE:
+       case GNILND_MSG_PUT_NAK:
+       case GNILND_MSG_GET_NAK:
+               tx->tx_state = GNILND_TX_WAITING_COMPLETION;
+               break;
+
+       case GNILND_MSG_PUT_REQ:
+               tx->tx_msg.gnm_u.putreq.gnprm_cookie = tx->tx_id.txe_cookie;
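+               /* fall through - PUT_REQ shares the state setup below */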
+
+       case GNILND_MSG_PUT_ACK:
+       case GNILND_MSG_GET_REQ:
+               /* This is really only to handle the retransmit of SMSG once these
+                * two messages are setup in send_mapped_tx */
+               tx->tx_state = GNILND_TX_WAITING_COMPLETION | GNILND_TX_WAITING_REPLY;
+               break;
+       }
+
+       if (likely(rc == 0)) {
+               rc = kgnilnd_sendmsg(tx, buffer, nob, &conn->gnc_list_lock, GNILND_TX_FMAQ);
+       }
+
+       if (rc > 0) {
+               /* don't explicitly reschedule here - we are short credits and will rely on
+                * kgnilnd_sendmsg to resched the conn if need be */
+               more_to_do = 0;
+       } else if (rc < 0) {
+               /* bail: it wasn't sent and we didn't get EAGAIN indicating we should retrans
+                * almost certainly a software bug, but lets play nice with the other kids */
+               kgnilnd_tx_done(tx, rc);
+               /* just for fun, kick peer in arse - resetting conn might help to correct
+                * this almost certainly buggy software caused return code */
+               kgnilnd_close_conn(conn, rc);
+       }
+
+       if (more_to_do) {
+               CDEBUG(D_NET, "Rescheduling %p (more to do)\n", conn);
+               kgnilnd_schedule_conn(conn);
+       }
+}
+
+int
+kgnilnd_process_rdmaq(kgn_device_t *dev)
+{
+       int               found_work = 0;
+       kgn_tx_t         *tx;
+
+       if (CFS_FAIL_CHECK(CFS_FAIL_GNI_DELAY_RDMAQ)) {
+               RETURN(found_work);
+       }
+
+       if (time_after_eq(jiffies, dev->gnd_rdmaq_deadline)) {
+               unsigned long           dead_bump;
+               long                    new_ok;
+
+               /* if we think we need to adjust, take lock to serialize and recheck */
+               spin_lock(&dev->gnd_rdmaq_lock);
+               if (time_after_eq(jiffies, dev->gnd_rdmaq_deadline)) {
+                       del_singleshot_timer_sync(&dev->gnd_rdmaq_timer);
+
+                       dead_bump = cfs_time_seconds(1) / *kgnilnd_tunables.kgn_rdmaq_intervals;
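+                       /* hypothetical example: with kgn_rdmaq_intervals = 4 the
+                        * deadline rolls forward every HZ/4 jiffies, i.e. the
+                        * byte bucket is refilled four times a second */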
+
+                       /* roll the bucket forward */
+                       dev->gnd_rdmaq_deadline = jiffies + dead_bump;
+
+                       if (kgnilnd_data.kgn_rdmaq_override &&
+                               (*kgnilnd_tunables.kgn_rdmaq_intervals != 0)) {
+                               new_ok = kgnilnd_data.kgn_rdmaq_override / *kgnilnd_tunables.kgn_rdmaq_intervals;
+                       }  else {
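+                               /* no override configured - effectively unlimited
+                                * (LONG_MAX bytes per interval) */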
+                               new_ok = ~0UL >> 1;
+                       }
+
+                       /* roll current outstanding forward to make sure we carry outstanding
+                        * commitment forward
+                        * new_ok starts out as the whole interval value
+                        *  - first subtract bytes_out from last interval, as that would push us over
+                        *    strict limits for this interval
+                        *  - second, set bytes_ok to new_ok to ensure it doesn't exceed the current auth
+                        *
+                        * there is a small race here if someone is actively processing mappings and
+                        * adding to rdmaq_bytes_out, but it should be small as the mappings are triggered
+                        * quite quickly after kgnilnd_auth_rdma_bytes gives us the go-ahead
+                        * - if this gives us problems in the future, we could use a read/write lock
+                        * to protect the resetting of these values */
+                       new_ok -= atomic64_read(&dev->gnd_rdmaq_bytes_out);
+                       atomic64_set(&dev->gnd_rdmaq_bytes_ok, new_ok);
+
+                       CDEBUG(D_NET, "resetting rdmaq bytes to %ld, deadline +%lu -> %lu, "
+                                      "current out %ld\n",
+                              atomic64_read(&dev->gnd_rdmaq_bytes_ok), dead_bump, dev->gnd_rdmaq_deadline,
+                              atomic64_read(&dev->gnd_rdmaq_bytes_out));
+               }
+               spin_unlock(&dev->gnd_rdmaq_lock);
+       }
+
+       spin_lock(&dev->gnd_rdmaq_lock);
+       while (!list_empty(&dev->gnd_rdmaq)) {
+               int     rc;
+
+               /* make sure we break out early on quiesce */
+               if (unlikely(kgnilnd_data.kgn_quiesce_trigger)) {
+                       /* always break with lock held - we unlock outside loop */
+                       break;
+               }
+
+               tx = list_first_entry(&dev->gnd_rdmaq, kgn_tx_t, tx_list);
+               kgnilnd_tx_del_state_locked(tx, NULL, tx->tx_conn, GNILND_TX_ALLOCD);
+               found_work++;
+
+               /* sample with lock held, serializing with kgnilnd_complete_closed_conn */
+               if (tx->tx_conn->gnc_state != GNILND_CONN_ESTABLISHED) {
+                       /* if conn is dying, mark tx in tx_ref_table for
+                        * kgnilnd_complete_closed_conn to finish up */
+                       kgnilnd_tx_add_state_locked(tx, NULL, tx->tx_conn, GNILND_TX_DYING, 1);
+
+                       /* tx was moved to DYING, get next */
+                       continue;
+               }
+               spin_unlock(&dev->gnd_rdmaq_lock);
+
+               rc = kgnilnd_auth_rdma_bytes(dev, tx);
+               spin_lock(&dev->gnd_rdmaq_lock);
+
+               if (rc < 0) {
+                       /* no ticket! add back to head */
+                       kgnilnd_tx_add_state_locked(tx, NULL, tx->tx_conn, GNILND_TX_RDMAQ, 0);
+                       /* clear found_work so scheduler threads wait for timer */
+                       found_work = 0;
+                       break;
+               } else {
+                       /* TX is GO for launch */
+                       tx->tx_qtime = jiffies;
+                       kgnilnd_send_mapped_tx(tx, 0);
+                       found_work++;
+               }
+       }
+       spin_unlock(&dev->gnd_rdmaq_lock);
+
+       RETURN(found_work);
+}
+
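+/* byte-swap an RDMA descriptor that arrived from a peer with the opposite
+ * endianness */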
+static inline void
+kgnilnd_swab_rdma_desc(kgn_rdma_desc_t *d)
+{
+       __swab64s(&d->gnrd_key.qword1);
+       __swab64s(&d->gnrd_key.qword2);
+       __swab64s(&d->gnrd_addr);
+       __swab32s(&d->gnrd_nob);
+}
+
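+/* look up the original TX for an incoming reply by its cookie; the "either"
+ * variant accepts two possible expected message types, the plain variant
+ * exactly one */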
+#define kgnilnd_match_reply_either(w, x, y, z) _kgnilnd_match_reply(w, x, y, z)
+#define kgnilnd_match_reply(x, y, z) _kgnilnd_match_reply(x, y, GNILND_MSG_NONE, z)
+
+kgn_tx_t *
+_kgnilnd_match_reply(kgn_conn_t *conn, int type1, int type2, __u64 cookie)
+{
+       kgn_tx_ev_id_t    ev_id;
+       kgn_tx_t         *tx;
+
+       /* we use the cookie from the original TX, so we can find the match
+        * by parsing that and using the txe_idx */
+       ev_id.txe_cookie = cookie;
+
+       tx = conn->gnc_tx_ref_table[ev_id.txe_idx];
+
+       if (tx != NULL) {
+               /* check tx to make sure kgni didn't eat it */
+               GNITX_ASSERTF(tx, tx->tx_msg.gnm_magic == GNILND_MSG_MAGIC,
+                             "came back from kgni with bad magic %x\n", tx->tx_msg.gnm_magic);
+
+               GNITX_ASSERTF(tx, ((tx->tx_id.txe_idx == ev_id.txe_idx) &&
+                                 (tx->tx_id.txe_cookie == cookie)),
+                             "conn 0x%p->%s tx_ref_table hosed: wanted "
+                             "txe_cookie "LPX64" txe_idx %d "
+                             "found tx %p cookie "LPX64" txe_idx %d\n",
+                             conn, libcfs_nid2str(conn->gnc_peer->gnp_nid),
+                             cookie, ev_id.txe_idx,
+                             tx, tx->tx_id.txe_cookie, tx->tx_id.txe_idx);
+
+               LASSERTF((((tx->tx_msg.gnm_type == type1) || (tx->tx_msg.gnm_type == type2)) &&
+                       (tx->tx_state & GNILND_TX_WAITING_REPLY)),
+                       "Unexpected TX type (%x, %x or %x) "
+                       "or state (%x, expected +%x) "
+                       "matched reply from %s\n",
+                       tx->tx_msg.gnm_type, type1, type2,
+                       tx->tx_state, GNILND_TX_WAITING_REPLY,
+                       libcfs_nid2str(conn->gnc_peer->gnp_nid));
+       } else {
+               CWARN("Unmatched reply %02x, or %02x/"LPX64" from %s\n",
+                     type1, type2, cookie, libcfs_nid2str(conn->gnc_peer->gnp_nid));
+       }
+       return tx;
+}
+
+static inline void
+kgnilnd_complete_tx(kgn_tx_t *tx, int rc)
+{
+       int             complete = 0;
+       kgn_conn_t      *conn = tx->tx_conn;
+
+       spin_lock(&conn->gnc_list_lock);
+
+       GNITX_ASSERTF(tx, tx->tx_state & GNILND_TX_WAITING_REPLY,
+               "not waiting for reply", NULL);
+
+       tx->tx_rc = rc;
+       tx->tx_state &= ~GNILND_TX_WAITING_REPLY;
+
+       if (!(tx->tx_state & GNILND_TX_WAITING_COMPLETION)) {
+               kgnilnd_tx_del_state_locked(tx, NULL, conn, GNILND_TX_ALLOCD);
+               /* sample under lock as follow on steps require gnc_list_lock
+                * - or call kgnilnd_tx_done which requires no locks held over
+                *   call to lnet_finalize */
+               complete = 1;
+       }
+       spin_unlock(&conn->gnc_list_lock);
+
+       if (complete) {
+               kgnilnd_tx_done(tx, tx->tx_rc);
+       }
+}
+
+static inline void
+kgnilnd_finalize_rx_done(kgn_tx_t *tx, kgn_msg_t *msg)
+{
+       int              rc;
+       kgn_conn_t      *conn = tx->tx_conn;
+
+       atomic_inc(&conn->gnc_device->gnd_rdma_nrx);
+       atomic64_add(tx->tx_nob, &conn->gnc_device->gnd_rdma_rxbytes);
+
+       rc = kgnilnd_verify_rdma_cksum(tx, msg->gnm_payload_cksum);
+
+       kgnilnd_complete_tx(tx, rc);
+}
+
+void
+kgnilnd_check_fma_rx(kgn_conn_t *conn)
+{
+       __u32         seq;
+       kgn_tx_t     *tx;
+       kgn_rx_t     *rx;
+       kgn_msg_t    *msg;
+       void         *prefix;
+       gni_return_t  rrc;
+       kgn_peer_t   *peer = conn->gnc_peer;
+       kgn_net_t    *net;
+       int           rc = 0;
+       __u16         tmp_cksum = 0, msg_cksum = 0;
+       int           repost = 1, saw_complete;
+       unsigned long timestamp, newest_last_rx, timeout;
+       int           last_seq;
+       void         *memory = NULL;
+       ENTRY;
+
+       /* Short circuit if the ep_handle is null.
+        * It's likely that it's about to be closed as stale.
+        */
+       if (conn->gnc_ephandle == NULL)
+               RETURN_EXIT;
+
+       timestamp = jiffies;
+       mutex_lock(&conn->gnc_device->gnd_cq_mutex);
+       /* delay in jiffies - we are really concerned only with things that
+        * result in a schedule() or really holding this off for long times.
+        * NB - mutex_lock could spin for 2 jiffies before going to sleep to wait */
+       conn->gnc_device->gnd_mutex_delay += (long) jiffies - timestamp;
+
+       /* Resample current time as we have no idea how long it took to get the mutex */
+       timestamp = jiffies;
+
+       /* Check when we last received an rx; we do this before
+        * we call getnext in case the thread has been blocked for a while. If we
+        * haven't received an rx within our timeout value, we close the connection,
+        * since we should assume the other side has already closed it. This
+        * stops us from sending replies to a mailbox that is already in purgatory.
+        */
+
+       timeout = cfs_time_seconds(conn->gnc_timeout);
+       newest_last_rx = GNILND_LASTRX(conn);
+
+       /* Error injection to validate that timestamp checking works and that the conn gets closed */
+       if (CFS_FAIL_CHECK(CFS_FAIL_GNI_RECV_TIMEOUT)) {
+               timestamp = timestamp + (GNILND_TIMEOUTRX(timeout) * 2);
+       }
+
+       if (time_after_eq(timestamp, newest_last_rx + (GNILND_TIMEOUTRX(timeout)))) {
+               GNIDBG_CONN(D_NETERROR|D_CONSOLE, conn, "Can't receive from %s after timeout lapse of %lu; TO %lu",
+               libcfs_nid2str(conn->gnc_peer->gnp_nid),
+               cfs_duration_sec(timestamp - newest_last_rx),
+               cfs_duration_sec(GNILND_TIMEOUTRX(timeout)));
+               mutex_unlock(&conn->gnc_device->gnd_cq_mutex);
+               rc = -ETIME;
+               kgnilnd_close_conn(conn, rc);
+               RETURN_EXIT;
+       }
+
+       rrc = kgnilnd_smsg_getnext(conn->gnc_ephandle, &prefix);
+
+       if (rrc == GNI_RC_NOT_DONE) {
+               mutex_unlock(&conn->gnc_device->gnd_cq_mutex);
+               CDEBUG(D_INFO, "SMSG RX empty\n");
+               RETURN_EXIT;
+       }
+
+       if (rrc == GNI_RC_INVALID_STATE) {
+               LIBCFS_ALLOC(memory, conn->gnpr_smsg_attr.buff_size);
+               if (memory == NULL) {
+                       memory = (void *)0xdeadbeef;
+               } else {
+                       memcpy(memory, conn->gnpr_smsg_attr.msg_buffer + conn->gnpr_smsg_attr.mbox_offset, conn->gnpr_smsg_attr.buff_size);
+               }
+       }
+
+       LASSERTF(rrc == GNI_RC_SUCCESS,
+               "bad rc %d on conn %p from peer %s mailbox copy %p\n",
+                rrc, conn, libcfs_nid2str(peer->gnp_nid), memory);
+
+       msg = (kgn_msg_t *)prefix;
+
+       rx = kgnilnd_alloc_rx();
+       if (rx == NULL) {
+               mutex_unlock(&conn->gnc_device->gnd_cq_mutex);
+               kgnilnd_release_msg(conn);
+               GNIDBG_MSG(D_NETERROR, msg, "Dropping SMSG RX from 0x%p->%s, no RX memory",
+                          conn, libcfs_nid2str(peer->gnp_nid));
+               RETURN_EXIT;
+       }
+
+       GNIDBG_MSG(D_INFO, msg, "SMSG RX on %p from %s",
+               conn, libcfs_nid2str(peer->gnp_nid));
+
+       timestamp = conn->gnc_last_rx;
+       last_seq = conn->gnc_rx_seq;
+
+       conn->gnc_last_rx = jiffies;
+       /* stash first rx so we can clear out purgatory */
+       if (conn->gnc_first_rx == 0)
+               conn->gnc_first_rx = jiffies;
+
+       seq = conn->gnc_rx_seq++;
+
+       /* mutex needs to linger to protect gnc_rx_seq like we do with gnc_tx_seq */
+       mutex_unlock(&conn->gnc_device->gnd_cq_mutex);
+       kgnilnd_peer_alive(conn->gnc_peer);
+
+       rx->grx_msg = msg;
+       rx->grx_conn = conn;
+       rx->grx_eager = 0;
+       rx->grx_received = current_kernel_time();
+
+       if (CFS_FAIL_CHECK(CFS_FAIL_GNI_NET_LOOKUP)) {
+               rc = -ENONET;
+       } else {
+               rc = kgnilnd_find_net(msg->gnm_srcnid, &net);
+       }
+
+       if (rc < 0) {
+               GOTO(out, rc);
+       } else {
+               kgnilnd_net_decref(net);
+       }
+
+       if (*kgnilnd_tunables.kgn_checksum && !msg->gnm_cksum)
+               GNIDBG_MSG(D_WARNING, msg, "no msg header checksum when enabled");
+
+       /* XXX Nic: Do we need to swab cksum */
+       if (msg->gnm_cksum != 0) {
+               msg_cksum = msg->gnm_cksum;
+               msg->gnm_cksum = 0;
+               tmp_cksum = kgnilnd_cksum(msg, sizeof(kgn_msg_t));
+
+               if (tmp_cksum != msg_cksum) {
+                       GNIDBG_MSG(D_NETERROR, msg, "Bad hdr checksum (%x expected %x)",
+                                       tmp_cksum, msg_cksum);
+                       kgnilnd_dump_msg(D_BUFFS, msg);
+                       rc = -ENOKEY;
+                       GOTO(out, rc);
+               }
+       }
+       /* restore checksum for future debug messages */
+       msg->gnm_cksum = tmp_cksum;
+
+       if (msg->gnm_magic != GNILND_MSG_MAGIC) {
+               if (__swab32(msg->gnm_magic) != GNILND_MSG_MAGIC) {
+                       GNIDBG_MSG(D_NETERROR, msg, "Unexpected magic %08x from %s",
+                              msg->gnm_magic, libcfs_nid2str(peer->gnp_nid));
+                       rc = -EPROTO;
+                       GOTO(out, rc);
+               }
+
+               __swab32s(&msg->gnm_magic);
+               __swab16s(&msg->gnm_version);
+               __swab16s(&msg->gnm_type);
+               __swab64s(&msg->gnm_srcnid);
+               __swab64s(&msg->gnm_connstamp);
+               __swab32s(&msg->gnm_seq);
+
+               /* NB message type checked below; NOT here... */
+               switch (msg->gnm_type) {
+               case GNILND_MSG_PUT_ACK:
+                       kgnilnd_swab_rdma_desc(&msg->gnm_u.putack.gnpam_desc);
+                       break;
+
+               case GNILND_MSG_GET_REQ:
+                       kgnilnd_swab_rdma_desc(&msg->gnm_u.get.gngm_desc);
+                       break;
+
+               default:
+                       break;
+               }
+       }
+
+       if (msg->gnm_version != GNILND_MSG_VERSION) {
+               GNIDBG_MSG(D_NETERROR, msg, "Unexpected protocol version %d from %s",
+                      msg->gnm_version, libcfs_nid2str(peer->gnp_nid));
+               rc = -EPROTO;
+               GOTO(out, rc);
+       }
+
+       if (LNET_NIDADDR(msg->gnm_srcnid) != LNET_NIDADDR(peer->gnp_nid)) {
+               GNIDBG_MSG(D_NETERROR, msg, "Unexpected peer %s from %s",
+                      libcfs_nid2str(msg->gnm_srcnid),
+                      libcfs_nid2str(peer->gnp_nid));
+               rc = -EPROTO;
+               GOTO(out, rc);
+       }
+
+       if (msg->gnm_connstamp != conn->gnc_peer_connstamp) {
+               GNIDBG_MSG(D_NETERROR, msg, "Unexpected connstamp "LPX64"("LPX64
+                      " expected) from %s",
+                      msg->gnm_connstamp, conn->gnc_peer_connstamp,
+                      libcfs_nid2str(peer->gnp_nid));
+               rc = -EPROTO;
+               GOTO(out, rc);
+       }
+
+       if (msg->gnm_seq != seq) {
+               GNIDBG_MSG(D_NETERROR, msg, "Unexpected sequence number %d(%d expected) from %s",
+                      msg->gnm_seq, seq, libcfs_nid2str(peer->gnp_nid));
+               rc = -EPROTO;
+               GOTO(out, rc);
+       }
+
+       atomic_inc(&conn->gnc_device->gnd_short_nrx);
+
+       if (msg->gnm_type == GNILND_MSG_CLOSE) {
+               CDEBUG(D_NETTRACE, "%s sent us CLOSE msg\n",
+                             libcfs_nid2str(conn->gnc_peer->gnp_nid));
+               write_lock(&kgnilnd_data.kgn_peer_conn_lock);
+               conn->gnc_close_recvd = GNILND_CLOSE_RX;
+               conn->gnc_peer_error = msg->gnm_u.completion.gncm_retval;
+               /* double check state with lock held */
+               if (conn->gnc_state == GNILND_CONN_ESTABLISHED) {
+                       /* only error if we are not already closing */
+                       if (conn->gnc_peer_error == -ETIMEDOUT) {
+                               unsigned long           now = jiffies;
+                               CNETERR("peer 0x%p->%s closed connection 0x%p due to timeout. "
+                                      "Is node down? "
+                                      "RX %d @ %lus/%lus; TX %d @ %lus/%lus; "
+                                      "NOOP %lus/%lus/%lus; sched %lus/%lus/%lus ago\n",
+                                      conn->gnc_peer, libcfs_nid2str(conn->gnc_peer->gnp_nid),
+                                      conn, last_seq,
+                                      cfs_duration_sec(now - timestamp),
+                                      cfs_duration_sec(now - conn->gnc_last_rx_cq),
+                                      conn->gnc_tx_seq,
+                                      cfs_duration_sec(now - conn->gnc_last_tx),
+                                      cfs_duration_sec(now - conn->gnc_last_tx_cq),
+                                      cfs_duration_sec(now - conn->gnc_last_noop_want),
+                                      cfs_duration_sec(now - conn->gnc_last_noop_sent),
+                                      cfs_duration_sec(now - conn->gnc_last_noop_cq),
+                                      cfs_duration_sec(now - conn->gnc_last_sched_ask),
+                                      cfs_duration_sec(now - conn->gnc_last_sched_do),
+                                      cfs_duration_sec(now - conn->gnc_device->gnd_sched_alive));
+                       }
+                       kgnilnd_close_conn_locked(conn, -ECONNRESET);
+               }
+               write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+               GOTO(out, rc);
+       }
+
+       if (conn->gnc_close_recvd) {
+               GNIDBG_MSG(D_NETERROR, msg, "Unexpected message %s(%d/%d) after CLOSE from %s",
+                      kgnilnd_msgtype2str(msg->gnm_type),
+                      msg->gnm_type, conn->gnc_close_recvd,
+                      libcfs_nid2str(conn->gnc_peer->gnp_nid));
+               rc = -EPROTO;
+               GOTO(out, rc);
+       }
+
+       if (conn->gnc_state != GNILND_CONN_ESTABLISHED) {
+               /* XXX Nic: log message received on bad connection state */
+               GOTO(out, rc);
+       }
+
+       switch (msg->gnm_type) {
+       case GNILND_MSG_NOOP:
+               /* Nothing to do; just a keepalive */
+               break;
+
+       case GNILND_MSG_IMMEDIATE:
+               /* only get SMSG payload for IMMEDIATE */
+               atomic64_add(msg->gnm_payload_len, &conn->gnc_device->gnd_short_rxbytes);
+               rc = lnet_parse(net->gnn_ni, &msg->gnm_u.immediate.gnim_hdr,
+                               msg->gnm_srcnid, rx, 0);
+               repost = rc < 0;
+               break;
+
+       case GNILND_MSG_PUT_REQ:
+               rc = lnet_parse(net->gnn_ni, &msg->gnm_u.putreq.gnprm_hdr,
+                               msg->gnm_srcnid, rx, 1);
+               repost = rc < 0;
+               break;
+
+       case GNILND_MSG_PUT_NAK:
+               tx = kgnilnd_match_reply_either(conn, GNILND_MSG_PUT_REQ, GNILND_MSG_PUT_ACK,
+                                       msg->gnm_u.completion.gncm_cookie);
+               if (tx == NULL)
+                       break;
+
+               kgnilnd_complete_tx(tx, msg->gnm_u.completion.gncm_retval);
+               break;
+
+       case GNILND_MSG_PUT_ACK:
+               tx = kgnilnd_match_reply(conn, GNILND_MSG_PUT_REQ,
+                                       msg->gnm_u.putack.gnpam_src_cookie);
+               if (tx == NULL)
+                       break;
+
+               /* store putack data for later: deferred rdma or re-try */
+               tx->tx_putinfo = msg->gnm_u.putack;
+
+               saw_complete = 0;
+               spin_lock(&tx->tx_conn->gnc_list_lock);
+
+               GNITX_ASSERTF(tx, tx->tx_state & GNILND_TX_WAITING_REPLY,
+                       "not waiting for reply", NULL);
+
+               tx->tx_state &= ~GNILND_TX_WAITING_REPLY;
+
+               if (likely(!(tx->tx_state & GNILND_TX_WAITING_COMPLETION))) {
+                       kgnilnd_tx_del_state_locked(tx, NULL, conn, GNILND_TX_ALLOCD);
+                       /* sample under lock as follow on steps require gnc_list_lock
+                        * - or call kgnilnd_tx_done which requires no locks held over
+                        *   call to lnet_finalize */
+                       saw_complete = 1;
+               } else {
+                       /* cannot launch rdma if still waiting for fma-msg completion */
+                       CDEBUG(D_NET, "tx 0x%p type 0x%02x will need to "
+                                      "wait for SMSG completion\n", tx, tx->tx_msg.gnm_type);
+                       tx->tx_state |= GNILND_TX_PENDING_RDMA;
+               }
+               spin_unlock(&tx->tx_conn->gnc_list_lock);
+
+               if (saw_complete) {
+                       rc = kgnilnd_send_mapped_tx(tx, 0);
+                       if (rc < 0)
+                               kgnilnd_tx_done(tx, rc);
+               }
+               break;
+
+       case GNILND_MSG_PUT_DONE:
+               tx = kgnilnd_match_reply(conn, GNILND_MSG_PUT_ACK,
+                                       msg->gnm_u.completion.gncm_cookie);
+               if (tx == NULL)
+                       break;
+
+               GNITX_ASSERTF(tx, tx->tx_buftype == GNILND_BUF_PHYS_MAPPED ||
+                              tx->tx_buftype == GNILND_BUF_VIRT_MAPPED,
+                              "bad tx buftype %d", tx->tx_buftype);
+
+               kgnilnd_finalize_rx_done(tx, msg);
+               break;
+
+       case GNILND_MSG_GET_REQ:
+               rc = lnet_parse(net->gnn_ni, &msg->gnm_u.get.gngm_hdr,
+                               msg->gnm_srcnid, rx, 1);
+               repost = rc < 0;
+               break;
+
+       case GNILND_MSG_GET_NAK:
+               tx = kgnilnd_match_reply(conn, GNILND_MSG_GET_REQ,
+                                       msg->gnm_u.completion.gncm_cookie);
+               if (tx == NULL)
+                       break;
+
+               GNITX_ASSERTF(tx, tx->tx_buftype == GNILND_BUF_PHYS_MAPPED ||
+                              tx->tx_buftype == GNILND_BUF_VIRT_MAPPED,
+                              "bad tx buftype %d", tx->tx_buftype);
+
+               kgnilnd_complete_tx(tx, msg->gnm_u.completion.gncm_retval);
+               break;
+
+       case GNILND_MSG_GET_DONE:
+               tx = kgnilnd_match_reply(conn, GNILND_MSG_GET_REQ,
+                                       msg->gnm_u.completion.gncm_cookie);
+               if (tx == NULL)
+                       break;
+
+               GNITX_ASSERTF(tx, tx->tx_buftype == GNILND_BUF_PHYS_MAPPED ||
+                              tx->tx_buftype == GNILND_BUF_VIRT_MAPPED,
+                              "bad tx buftype %d", tx->tx_buftype);
+
+               lnet_set_reply_msg_len(net->gnn_ni, tx->tx_lntmsg[1],
+                                      msg->gnm_u.completion.gncm_retval);
+
+               kgnilnd_finalize_rx_done(tx, msg);
+               break;
+       }
+
+ out:
+       if (rc < 0)                             /* protocol/comms error */
+               kgnilnd_close_conn(conn, rc);
+
+       if (repost && rx != NULL) {
+               kgnilnd_consume_rx(rx);
+       }
+
+       /* we got an event so assume there are more and call for a reschedule */
+       if (rc >= 0)
+               kgnilnd_schedule_conn(conn);
+       EXIT;
+}
+
+/* Do the failure injections that we need to affect conn processing in the following function.
+ * When writing tests that use this function make sure to use a fail_loc with a fail mask.
+ * If you don't, you can cause the scheduler threads to spin on the conn without it leaving
+ * process_conns.
+ *
+ * intent is used to signal the calling function whether or not the conn needs to be rescheduled.
+ */
+
+static inline int
+kgnilnd_check_conn_fail_loc(kgn_device_t *dev, kgn_conn_t *conn, int *intent)
+{
+       int     rc = 0;
+
+       /* short circuit out when not set */
+       if (likely(!cfs_fail_loc)) {
+               RETURN(rc);
+       }
+
+       /* failure injection to test for stack reset clean ups */
+       if (CFS_FAIL_CHECK(CFS_FAIL_GNI_DROP_CLOSING)) {
+               /* we can't rely on busy loops being nice enough to get the
+                *  stack reset triggered - it'd just spin on this conn */
+               CFS_RACE(CFS_FAIL_GNI_DROP_CLOSING);
+               rc = 1;
+               *intent = 1;
+               GOTO(did_fail_loc, rc);
+       }
+
+       if (conn->gnc_state == GNILND_CONN_DESTROY_EP) {
+               /* DESTROY_EP set in kgnilnd_conn_decref on gnc_refcount = 1 */
+
+               if (CFS_FAIL_CHECK(CFS_FAIL_GNI_DROP_DESTROY_EP)) {
+                       CFS_RACE(CFS_FAIL_GNI_DROP_DESTROY_EP);
+                       rc = 1;
+                       *intent = 1;
+                       GOTO(did_fail_loc, rc);
+               }
+       }
+
+       /* CFS_FAIL_GNI_FINISH_PURG2 is used to stop a connection from fully closing. This scheduler
+        * will spin on the CFS_FAIL_TIMEOUT until the fail_loc is cleared at which time the connection
+        * will be closed by kgnilnd_complete_closed_conn.
+        */
+       if ((conn->gnc_state == GNILND_CONN_CLOSED) && CFS_FAIL_CHECK(CFS_FAIL_GNI_FINISH_PURG2)) {
+               while (CFS_FAIL_TIMEOUT(CFS_FAIL_GNI_FINISH_PURG2, 1)) {};
+               rc = 1;
+               *intent = 1;
+               GOTO(did_fail_loc, rc);
+       }
+
+       /* this one is a bit gross - we can't hold the mutex from process_conns
+        * across a CFS_RACE here - it'd block the conn threads from doing an ep_bind
+        * and moving onto finish_connect
+        * so, we'll just set the rc - kgnilnd_process_conns will clear
+        * found_work on a fail_loc, getting the scheduler thread to call schedule()
+        * and effectively getting this thread to sleep */
+       if ((conn->gnc_state == GNILND_CONN_CLOSED) && CFS_FAIL_CHECK(CFS_FAIL_GNI_FINISH_PURG)) {
+               rc = 1;
+               *intent = 1;
+               GOTO(did_fail_loc, rc);
+       }
+
+did_fail_loc:
+       RETURN(rc);
+}
+
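+/* best-effort send of the CLOSE message for a conn being torn down, then
+ * mark the conn CLOSED and reschedule it so the completion phase can run */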
+static inline void
+kgnilnd_send_conn_close(kgn_conn_t *conn)
+{
+       kgn_tx_t        *tx;
+
+       /* we are closing the conn - we will try to send the CLOSE msg
+        * but will not wait for anything else to flush */
+
+       /* send the close if not already done so or received one */
+       if (!conn->gnc_close_sent && !conn->gnc_close_recvd) {
+               /* set close_sent regardless of the success of the
+                * CLOSE message. We are going to try once and then
+                * kick him out of the sandbox */
+               conn->gnc_close_sent = 1;
+               mb();
+
+               /* EP might be null already if remote side initiated a new connection.
+                * kgnilnd_finish_connect destroys existing ep_handles before wiring up the new connection,
+                * so this check is here to make sure we don't attempt to send with a null ep_handle.
+                */
+               if (conn->gnc_ephandle != NULL) {
+                       int rc = 0;
+
+                       tx = kgnilnd_new_tx_msg(GNILND_MSG_CLOSE, conn->gnc_peer->gnp_net->gnn_ni->ni_nid);
+                       if (tx != NULL) {
+                               tx->tx_msg.gnm_u.completion.gncm_retval = conn->gnc_error;
+                               tx->tx_state = GNILND_TX_WAITING_COMPLETION;
+                               tx->tx_qtime = jiffies;
+
+                               if (tx->tx_id.txe_idx == 0) {
+                                       rc = kgnilnd_set_tx_id(tx, conn);
+                                       if (rc != 0) {
+                                               kgnilnd_tx_done(tx, rc);
+                                       }
+                               }
+
+                               CDEBUG(D_NETTRACE, "sending close with errno %d\n",
+                                               conn->gnc_error);
+
+                               if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CLOSE_SEND)) {
+                                       kgnilnd_tx_done(tx, -EAGAIN);
+                               } else if (!rc) {
+                                       rc = kgnilnd_sendmsg(tx, NULL, 0, NULL, GNILND_TX_FMAQ);
+                                       if (rc) {
+                                               /* It wasn't sent and we don't care. */
+                                               kgnilnd_tx_done(tx, rc);
+                                       }
+                               }
+
+                       }
+               }
+       }
+
+       conn->gnc_state = GNILND_CONN_CLOSED;
+       /* mark this conn as CLOSED now that we processed it
+        * do after TX, so we can use CLOSING in asserts */
+
+       mb();
+
+       if (CFS_FAIL_CHECK(CFS_FAIL_GNI_RX_CLOSE_CLOSED)) {
+               /* simulate a RX CLOSE after the timeout but before
+                * the scheduler thread gets it */
+               conn->gnc_close_recvd = GNILND_CLOSE_INJECT2;
+               conn->gnc_peer_error = -ETIMEDOUT;
+       }
+       /* schedule to allow potential CLOSE and get the complete phase run */
+       kgnilnd_schedule_conn(conn);
+}
+
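+/* retry TXs that previously failed to map: drain gnd_map_tx, resending each
+ * TX, and back off once retries pass 25% of the limit until the device map
+ * version changes (i.e. something was unmapped) */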
+int
+kgnilnd_process_mapped_tx(kgn_device_t *dev)
+{
+       int             found_work = 0;
+       int             rc = 0;
+       kgn_tx_t        *tx;
+       int             max_retrans = *kgnilnd_tunables.kgn_max_retransmits;
+       int             log_retrans, log_retrans_level;
+       static int      last_map_version;
+       ENTRY;
+
+       spin_lock(&dev->gnd_lock);
+       if (list_empty(&dev->gnd_map_tx)) {
+               spin_unlock(&dev->gnd_lock);
+               RETURN(0);
+       }
+
+       dev->gnd_sched_alive = jiffies;
+
+       /* we'll retry as fast as possible up to 25% of the limit, then we start
+        * backing off until our map version changes - indicating we unmapped
+        * something */
+       tx = list_first_entry(&dev->gnd_map_tx, kgn_tx_t, tx_list);
+       if ((tx->tx_retrans > (max_retrans / 4)) &&
+           (last_map_version == dev->gnd_map_version)) {
+               GNIDBG_TX(D_NET, tx, "waiting for mapping event to retry", NULL);
+               spin_unlock(&dev->gnd_lock);
+               RETURN(0);
+       }
+
+       /* stash the last map version to let us know when a good one was seen */
+       last_map_version = dev->gnd_map_version;
+
+       /* we need to take the lock and continually refresh the head of the list as
+        * kgnilnd_complete_closed_conn might be nuking stuff while we are cycling the lock,
+        * allowing them to squeeze in */
+
+       while (!list_empty(&dev->gnd_map_tx)) {
+               /* make sure we break out early on quiesce */
+               if (unlikely(kgnilnd_data.kgn_quiesce_trigger)) {
+                       /* always break with lock held - we unlock outside loop */
+                       break;
+               }
+
+               tx = list_first_entry(&dev->gnd_map_tx, kgn_tx_t, tx_list);
+
+               kgnilnd_tx_del_state_locked(tx, NULL, tx->tx_conn, GNILND_TX_ALLOCD);
+               found_work++;
+
+               /* sample with lock held, serializing with kgnilnd_complete_closed_conn */
+               if (tx->tx_conn->gnc_state != GNILND_CONN_ESTABLISHED) {
+                       /* if conn is dying, mark tx in tx_ref_table for
+                        * kgnilnd_complete_closed_conn to finish up */
+                       kgnilnd_tx_add_state_locked(tx, NULL, tx->tx_conn, GNILND_TX_DYING, 1);
+                       found_work++;
+
+                       /* tx was moved to DYING, get next */
+                       continue;
+               }
+
+               spin_unlock(&dev->gnd_lock);
+               rc = kgnilnd_send_mapped_tx(tx, 1);
+
+               /* We made it! skip error handling.. */
+               if (rc >= 0) {
+                       /* OK to continue on +ve errors as it won't get seen until
+                        * this function is called again - we operate on a copy of the original
+                        * list and not the live list */
+                       spin_lock(&dev->gnd_lock);
+                       continue;
+               } else if (rc != -ENOMEM) {
+                       /* carp, failure we can't handle */
+                       kgnilnd_tx_done(tx, rc);
+                       spin_lock(&dev->gnd_lock);
+                       continue;
+               }
+
+               /* time to handle the retry cases.. */
+               tx->tx_retrans++;
+               if (tx->tx_retrans == 1)
+                       tx->tx_qtime = jiffies;
+
+               /* only log occasionally once we've retried max / 2 */
+               log_retrans = (tx->tx_retrans >= (max_retrans / 2)) &&
+                             ((tx->tx_retrans % 32) == 0);
+               log_retrans_level = log_retrans ? D_NETERROR : D_NET;
+
+               /* make sure we are not off in the weeds with this tx */
+               if (tx->tx_retrans > *kgnilnd_tunables.kgn_max_retransmits) {
+                      GNIDBG_TX(D_NETERROR, tx,
+                              "giving up on TX, too many retries", NULL);
+                      kgnilnd_tx_done(tx, -ENOMEM);
+                      GOTO(get_out_mapped, rc);
+               } else {
+                      GNIDBG_TX(log_retrans_level, tx,
+                               "transient map failure #%d %d pages/%d bytes phys %u@%u "
+                               "virt %u@"LPU64" "
+                               "nq_map %d mdd# %d/%d GART %ld",
+                               tx->tx_retrans, tx->tx_phys_npages, tx->tx_nob,
+                               dev->gnd_map_nphys, dev->gnd_map_physnop * PAGE_SIZE,
+                               dev->gnd_map_nvirt, dev->gnd_map_virtnob,
+                               atomic_read(&dev->gnd_nq_map),
+                               atomic_read(&dev->gnd_n_mdd), atomic_read(&dev->gnd_n_mdd_held),
+                               atomic64_read(&dev->gnd_nbytes_map));
+               }
+
+               /* we need to stop processing the rest of the list, so add it back in */
+               spin_lock(&dev->gnd_lock);
+               kgnilnd_tx_add_state_locked(tx, NULL, tx->tx_conn, GNILND_TX_MAPQ, 0);
+               spin_unlock(&dev->gnd_lock);
+               GOTO(get_out_mapped, rc);
+       }
+       spin_unlock(&dev->gnd_lock);
+get_out_mapped:
+       RETURN(found_work);
+}
+
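+/* walk gnd_ready_conns and run per-conn work: failure injection checks,
+ * CLOSED/DESTROY_EP/CLOSING handling, and FMA RX plus FMA queue processing
+ * for established conns */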
+int
+kgnilnd_process_conns(kgn_device_t *dev)
+{
+       int              found_work = 0;
+       int              conn_sched;
+       int              intent = 0;
+       kgn_conn_t      *conn;
+
+       spin_lock(&dev->gnd_lock);
+       while (!list_empty(&dev->gnd_ready_conns)) {
+               dev->gnd_sched_alive = jiffies;
+
+               if (unlikely(kgnilnd_data.kgn_quiesce_trigger)) {
+                       /* break with lock held */
+                       break;
+               }
+
+               conn = list_first_entry(&dev->gnd_ready_conns, kgn_conn_t, gnc_schedlist);
+               list_del_init(&conn->gnc_schedlist);
+               spin_unlock(&dev->gnd_lock);
+
+               conn_sched = xchg(&conn->gnc_scheduled, GNILND_CONN_PROCESS);
+
+               LASSERTF(conn_sched != GNILND_CONN_IDLE &&
+                        conn_sched != GNILND_CONN_PROCESS,
+                        "conn %p on ready list but in bad state: %d\n",
+                        conn, conn_sched);
+
+               CDEBUG(D_INFO, "conn %p@%s for processing\n",
+                       conn, kgnilnd_conn_state2str(conn));
+
+               found_work++;
+               set_mb(conn->gnc_last_sched_do, jiffies);
+
+               if (kgnilnd_check_conn_fail_loc(dev, conn, &intent)) {
+
+                       /* based on intent see if we should run again. */
+                       kgnilnd_schedule_process_conn(conn, intent);
+
+                       /* drop ref from gnd_ready_conns */
+                       kgnilnd_conn_decref(conn);
+                       /* clear this so that scheduler thread doesn't spin */
+                       found_work = 0;
+                       /* break with lock held... */
+                       spin_lock(&dev->gnd_lock);
+                       break;
+               }
+
+               if (unlikely(conn->gnc_state == GNILND_CONN_CLOSED)) {
+                       /* CONN_CLOSED set in process_fmaq when CLOSE is sent */
+                       kgnilnd_complete_closed_conn(conn);
+               } else if (unlikely(conn->gnc_state == GNILND_CONN_DESTROY_EP)) {
+                       /* DESTROY_EP set in kgnilnd_conn_decref on gnc_refcount = 1 */
+                       /* serialize SMSG CQs with ep_bind and smsg_release */
+                       kgnilnd_destroy_conn_ep(conn);
+               } else if (unlikely(conn->gnc_state == GNILND_CONN_CLOSING)) {
+                       /* if we need to do some CLOSE sending, etc., do it here */
+                       kgnilnd_send_conn_close(conn);
+                       kgnilnd_check_fma_rx(conn);
+               } else if (atomic_read(&conn->gnc_peer->gnp_dirty_eps) == 0) {
+                       /* start moving traffic if the old conns are cleared out */
+                       kgnilnd_check_fma_rx(conn);
+                       kgnilnd_process_fmaq(conn);
+               }
+
+               kgnilnd_schedule_process_conn(conn, 0);
+
+               /* drop ref from gnd_ready_conns */
+               kgnilnd_conn_decref(conn);
+
+               /* check list again with lock held */
+               spin_lock(&dev->gnd_lock);
+       }
+       spin_unlock(&dev->gnd_lock);
+
+       RETURN(found_work);
+}
+
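+/* main scheduler thread: loop over the CQ checks, RDMA queue, mapped TX
+ * retries and ready conns, then yield or sleep on the device waitq when no
+ * work is found or the busy-loop budget is spent */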
+int
+kgnilnd_scheduler(void *arg)
+{
+       int               threadno = (long)arg;
+       kgn_device_t     *dev;
+       char              name[16];
+       int               busy_loops = 0;
+       DEFINE_WAIT(wait);
+
+       dev = &kgnilnd_data.kgn_devices[(threadno + 1) % kgnilnd_data.kgn_ndevs];
+
+       snprintf(name, sizeof(name), "kgnilnd_sd_%02d", threadno);
+       cfs_daemonize(name);
+       cfs_block_allsigs();
+
+       /* all gnilnd threads need to run fairly urgently */
+       set_user_nice(current, *kgnilnd_tunables.kgn_nice);
+
+       while (!kgnilnd_data.kgn_shutdown) {
+               int     found_work = 0;
+               /* Safe: kgn_shutdown only set when quiescent */
+
+               /* to quiesce or to not quiesce, that is the question */
+
+               if (unlikely(kgnilnd_data.kgn_quiesce_trigger)) {
+                       KGNILND_SPIN_QUIESCE;
+               }
+
+               /* tracking for when thread goes AWOL */
+               dev->gnd_sched_alive = jiffies;
+
+               /* let folks know we are up and kicking
+                * - they can use this for latency savings, etc
+                * - only change if IRQ; if IDLE leave it alone so that
+                *   schedule_device calls put us back to IRQ */
+               (void)cmpxchg(&dev->gnd_ready, GNILND_DEV_IRQ, GNILND_DEV_LOOP);
+
+               /* always check these - they are super low cost  */
+               found_work += kgnilnd_check_fma_send_cq(dev);
+               found_work += kgnilnd_check_fma_rcv_cq(dev);
+
+               /* rdma CQ doesn't care about eps */
+               found_work += kgnilnd_check_rdma_cq(dev);
+
+               /* move some RDMA ? */
+               found_work += kgnilnd_process_rdmaq(dev);
+
+               /* map some pending RDMA requests ? */
+               found_work += kgnilnd_process_mapped_tx(dev);
+
+               /* the EP for a conn is not destroyed until all the references
+                * to it are gone, so these checks should be safe
+                * even if run in parallel with the CQ checking functions
+                * _AND_ a thread that processes the CLOSED->DONE
+                * transition
+                * ...should.... */
+
+               /* process all conns ready now */
+               found_work += kgnilnd_process_conns(dev);
+
+               /* do an eager check to avoid the IRQ disabling in
+                * prepare_to_wait and friends */
+
+               if (found_work && busy_loops++ < *kgnilnd_tunables.kgn_loops) {
+                       found_work = 0;
+                       if ((busy_loops % 10) == 0) {
+                               /* tickle heartbeat and watchdog to ensure our
+                                * piggishness doesn't turn into heartbeat failure */
+                               touch_nmi_watchdog();
+                               if (kgnilnd_hssops.hb_to_l0 != NULL) {
+                                       kgnilnd_hssops.hb_to_l0();
+                               }
+                       }
+                       continue;
+               }
+
+               /* if we got here, found_work was zero or busy_loops is over the
+                * limit and we need to take a break. We'll clear gnd_ready but
+                * we'll check one last time if there is an IRQ that needs processing */
+
+               prepare_to_wait(&dev->gnd_waitq, &wait, TASK_INTERRUPTIBLE);
+
+               /* the first time this will go LOOP -> IDLE and let us do one final check
+                * during which we might get an IRQ, then IDLE->IDLE and schedule()
+                * - this might allow other threads to block us for a bit if they
+                *   try to get the mutex, but that is good as we'd need to wake
+                *   up soon to handle the CQ or other processing anyways */
+
+               found_work += xchg(&dev->gnd_ready, GNILND_DEV_IDLE);
+
+               if (busy_loops >= *kgnilnd_tunables.kgn_loops) {
+                       CDEBUG(D_INFO,
+                              "yielding: found_work %d busy_loops %d\n",
+                              found_work, busy_loops);
+                       busy_loops = 0;
+                       /* use yield if we are bailing due to busy_loops
+                        * - this will ensure we wake up soonish. This closes
+                        * a race with kgnilnd_device_callback - where it'd
+                        * not call wake_up() because gnd_ready == 1, but then
+                        * we come down and schedule() because of busy_loops.
+                        * We'd not be woken up until something poked our waitq
+                        * again. yield() ensures we wake up without another
+                        * waitq poke in that case */
+                       atomic_inc(&dev->gnd_n_yield);
+                       yield();
+                       CDEBUG(D_INFO, "awake after yield\n");
+               } else if (found_work == GNILND_DEV_IDLE) {
+                       /* busy_loops is low and there is nothing to do,
+                        * go to sleep and wait for a waitq poke */
+                       CDEBUG(D_INFO,
+                              "scheduling: found_work %d busy_loops %d\n",
+                              found_work, busy_loops);
+                       atomic_inc(&dev->gnd_n_schedule);
+                       schedule();
+                       CDEBUG(D_INFO, "awake after schedule\n");
+               }
+               finish_wait(&dev->gnd_waitq, &wait);
+       }
+
+       kgnilnd_thread_fini();
+       return 0;
+}
diff --git a/lnet/klnds/gnilnd/gnilnd_conn.c b/lnet/klnds/gnilnd/gnilnd_conn.c
new file mode 100644 (file)
index 0000000..38aee5b
--- /dev/null
@@ -0,0 +1,2408 @@
+/*
+ * Copyright (C) 2012 Cray, Inc.
+ *
+ *   Author: Igor Gorodetsky <iogordet@cray.com>
+ *   Author: Nic Henke <nic@cray.com>
+ *   Author: James Shimek <jshimek@cray.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#include "gnilnd.h"
+
+void
+kgnilnd_setup_smsg_attr(gni_smsg_attr_t *smsg_attr)
+{
+       smsg_attr->mbox_maxcredit = *kgnilnd_tunables.kgn_mbox_credits;
+       smsg_attr->msg_maxsize = GNILND_MAX_MSG_SIZE;
+       smsg_attr->msg_type = GNI_SMSG_TYPE_MBOX_AUTO_RETRANSMIT;
+}
+
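+/* register an FMA block with GNI; physical blocks are registered with
+ * GNI_MEM_PHYS_CONT and are not counted against the mapped-bytes total */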
+int
+kgnilnd_map_fmablk(kgn_device_t *device, kgn_fma_memblock_t *fma_blk)
+{
+       gni_return_t            rrc;
+       __u32                   flags = GNI_MEM_READWRITE;
+
+       if (fma_blk->gnm_state == GNILND_FMABLK_PHYS) {
+               flags |= GNI_MEM_PHYS_CONT;
+       }
+
+       /* make sure we are mapping a clean block */
+       LASSERTF(fma_blk->gnm_hndl.qword1 == 0UL, "fma_blk %p dirty\n", fma_blk);
+
+       rrc = kgnilnd_mem_register(device->gnd_handle, (__u64)fma_blk->gnm_block,
+                                  fma_blk->gnm_blk_size, device->gnd_rcv_fma_cqh,
+                                  flags, &fma_blk->gnm_hndl);
+       if (rrc != GNI_RC_SUCCESS) {
+               /* XXX Nic: need a way to silence this for runtime stuff that is ok to fail
+                * -- like when under MDD or GART pressure on big systems
+                */
+               CNETERR("register fmablk failed 0x%p mbox_size %d flags %u\n",
+                       fma_blk, fma_blk->gnm_mbox_size, flags);
+               RETURN(-ENOMEM);
+       }
+
+       /* PHYS_CONT memory isn't really mapped, at least not in GART -
+        *  but all mappings chew up a MDD
+        */
+       if (fma_blk->gnm_state != GNILND_FMABLK_PHYS) {
+               atomic64_add(fma_blk->gnm_blk_size, &device->gnd_nbytes_map);
+       }
+
+       atomic_inc(&device->gnd_n_mdd);
+       /* nfmablk is live (mapped) blocks */
+       atomic_inc(&device->gnd_nfmablk);
+
+       RETURN(0);
+}
+
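+/* allocate and map a new block of SMSG mailboxes; the gnd_fmablk_vers /
+ * gnd_fmablk_sem pairing below keeps a herd of threads from each allocating
+ * a block when a single new block would satisfy them all */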
+int
+kgnilnd_alloc_fmablk(kgn_device_t *device, int use_phys)
+{
+       int                     rc = 0;
+       int                     num_mbox;
+       kgn_fma_memblock_t     *fma_blk;
+       gni_smsg_attr_t         smsg_attr;
+       unsigned long           fmablk_vers;
+
+       /* we'll use fmablk_vers and the gnd_fmablk_sem to gate access
+        * to this allocation code. Everyone will sample the version
+        * before and after getting the semaphore. If it has changed,
+        * we'll bail out to check the lists again - this indicates that
+        * some sort of change was made to the lists and it is possible
+        * that there is a mailbox for us to find now. This should prevent
+        * a ton of spinning in the case where there are lots of threads
+        * that need a yet-to-be-allocated mailbox for a connection. */
+
+       fmablk_vers = atomic_read(&device->gnd_fmablk_vers);
+       down(&device->gnd_fmablk_sem);
+
+       if (fmablk_vers != atomic_read(&device->gnd_fmablk_vers)) {
+               /* version changed while we were waiting for semaphore,
+                * we'll recheck the lists assuming something nice happened */
+               up(&device->gnd_fmablk_sem);
+               return 0;
+       }
+
+       LIBCFS_ALLOC(fma_blk, sizeof(kgn_fma_memblock_t));
+       if (fma_blk == NULL) {
+               CNETERR("could not allocate fma block descriptor\n");
+               rc = -ENOMEM;
+               GOTO(out, rc);
+       }
+
+       INIT_LIST_HEAD(&fma_blk->gnm_bufflist);
+
+       kgnilnd_setup_smsg_attr(&smsg_attr);
+
+       gni_smsg_buff_size_needed(&smsg_attr, &fma_blk->gnm_mbox_size);
+
+       LASSERTF(fma_blk->gnm_mbox_size, "mbox size %d\n", fma_blk->gnm_mbox_size);
+
+       /* gni_smsg_buff_size_needed calculates the base mailbox size and since
+        * we want to hold kgn_peer_credits worth of messages in both directions,
+        * we add PAYLOAD to grow the mailbox size
+        */
+
+       fma_blk->gnm_mbox_size += GNILND_MBOX_PAYLOAD;
+
+       /* we'll only use physical during preallocate at startup -- this keeps it nice and
+        * clean for runtime decisions. We'll keep the PHYS ones around until shutdown
+        * as reallocating them is tough if there is memory fragmentation */
+
+       if (use_phys) {
+               fma_blk->gnm_block = cfs_mem_cache_alloc(kgnilnd_data.kgn_mbox_cache, CFS_ALLOC_ATOMIC);
+               if (fma_blk->gnm_block == NULL) {
+                       CNETERR("could not allocate physical SMSG mailbox memory\n");
+                       rc = -ENOMEM;
+                       GOTO(free_desc, rc);
+               }
+               fma_blk->gnm_blk_size = KMALLOC_MAX_SIZE;
+               num_mbox = fma_blk->gnm_blk_size / fma_blk->gnm_mbox_size;
+
+               LASSERTF(num_mbox >= 1,
+                        "num_mbox %d blk_size %u mbox_size %d\n",
+                         num_mbox, fma_blk->gnm_blk_size, fma_blk->gnm_mbox_size);
+
+               fma_blk->gnm_state = GNILND_FMABLK_PHYS;
+
+       } else {
+               num_mbox = *kgnilnd_tunables.kgn_mbox_per_block;
+               fma_blk->gnm_blk_size = num_mbox * fma_blk->gnm_mbox_size;
+
+               LASSERTF(num_mbox >= 1 && num_mbox >= *kgnilnd_tunables.kgn_mbox_per_block,
+                        "num_mbox %d blk_size %u mbox_size %d tunable %d\n",
+                        num_mbox, fma_blk->gnm_blk_size, fma_blk->gnm_mbox_size,
+                        *kgnilnd_tunables.kgn_mbox_per_block);
+
+               LIBCFS_ALLOC(fma_blk->gnm_block, fma_blk->gnm_blk_size);
+               if (fma_blk->gnm_block == NULL) {
+                       CNETERR("could not allocate virtual SMSG mailbox memory, %d bytes\n", fma_blk->gnm_blk_size);
+                       rc = -ENOMEM;
+                       GOTO(free_desc, rc);
+               }
+
+               fma_blk->gnm_state = GNILND_FMABLK_VIRT;
+       }
+
+       /* allocate just enough space for the bits to track the mailboxes */
+       LIBCFS_ALLOC(fma_blk->gnm_bit_array, BITS_TO_LONGS(num_mbox) * sizeof(unsigned long));
+       if (fma_blk->gnm_bit_array == NULL) {
+               CNETERR("could not allocate mailbox bitmask, %lu bytes for %d mbox\n",
+                      sizeof(unsigned long) * BITS_TO_LONGS(num_mbox), num_mbox);
+               rc = -ENOMEM;
+               GOTO(free_blk, rc);
+       }
+       bitmap_zero(fma_blk->gnm_bit_array, num_mbox);
+
+       /* now that the num_mbox is set based on allocation type, get debug info setup */
+       LIBCFS_ALLOC(fma_blk->gnm_mbox_info, sizeof(kgn_mbox_info_t) * num_mbox);
+       if (fma_blk->gnm_mbox_info == NULL) {
+               CNETERR("could not allocate mailbox debug, %lu bytes for %d mbox\n",
+                      sizeof(kgn_mbox_info_t) * num_mbox, num_mbox);
+               rc = -ENOMEM;
+               GOTO(free_bit, rc);
+       }
+
+       rc = kgnilnd_map_fmablk(device, fma_blk);
+       if (rc) {
+               GOTO(free_info, rc);
+       }
+
+       fma_blk->gnm_next_avail_mbox = 0;
+       fma_blk->gnm_avail_mboxs = fma_blk->gnm_num_mboxs = num_mbox;
+
+       CDEBUG(D_MALLOC, "alloc fmablk 0x%p num %d msg_maxsize %d credits %d "
+               "mbox_size %d MDD "LPX64"."LPX64"\n",
+               fma_blk, num_mbox, smsg_attr.msg_maxsize, smsg_attr.mbox_maxcredit,
+               fma_blk->gnm_mbox_size, fma_blk->gnm_hndl.qword1,
+               fma_blk->gnm_hndl.qword2);
+
+       /* the lock is protecting data structures, not the semaphore */
+
+       spin_lock(&device->gnd_fmablk_lock);
+       list_add_tail(&fma_blk->gnm_bufflist, &device->gnd_fma_buffs);
+
+       /* toggle under the lock so once they change the list is also
+        * ready for others to traverse */
+       atomic_inc(&device->gnd_fmablk_vers);
+
+       spin_unlock(&device->gnd_fmablk_lock);
+
+       up(&device->gnd_fmablk_sem);
+
+       return 0;
+
+free_info:
+       LIBCFS_FREE(fma_blk->gnm_mbox_info, sizeof(kgn_mbox_info_t)*num_mbox);
+free_bit:
+       LIBCFS_FREE(fma_blk->gnm_bit_array, BITS_TO_LONGS(num_mbox) * sizeof (unsigned long));
+free_blk:
+       if (fma_blk->gnm_state == GNILND_FMABLK_VIRT) {
+               LIBCFS_FREE(fma_blk->gnm_block, fma_blk->gnm_blk_size);
+       } else {
+               cfs_mem_cache_free(kgnilnd_data.kgn_mbox_cache, fma_blk->gnm_block);
+       }
+free_desc:
+       LIBCFS_FREE(fma_blk, sizeof(kgn_fma_memblock_t));
+out:
+       up(&device->gnd_fmablk_sem);
+       return rc;
+}
+
+void
+kgnilnd_unmap_fmablk(kgn_device_t *dev, kgn_fma_memblock_t *fma_blk)
+{
+       gni_return_t            rrc;
+
+       /* if some mailboxes are held, set hold_timeout from the conn timeouts used
+        * in this block - but not during shutdown, when we just nuke and pave */
+       if (fma_blk->gnm_held_mboxs && (!kgnilnd_data.kgn_shutdown)) {
+               fma_blk->gnm_hold_timeout = GNILND_TIMEOUT2DEADMAN;
+       }
+
+       /* we are changing the state of a block, tickle version to tell
+        * the proc code that the list is stale now */
+       atomic_inc(&dev->gnd_fmablk_vers);
+
+       rrc = kgnilnd_mem_deregister(dev->gnd_handle, &fma_blk->gnm_hndl, fma_blk->gnm_hold_timeout);
+
+       CDEBUG(rrc == GNI_RC_SUCCESS ? D_MALLOC : D_CONSOLE|D_NETERROR,
+              "unmap fmablk 0x%p@%s sz %u total %d avail %d held %d mbox_size %d "
+               "hold_timeout %d\n",
+              fma_blk, kgnilnd_fmablk_state2str(fma_blk->gnm_state),
+              fma_blk->gnm_blk_size, fma_blk->gnm_num_mboxs,
+              fma_blk->gnm_avail_mboxs, fma_blk->gnm_held_mboxs,
+              fma_blk->gnm_mbox_size, fma_blk->gnm_hold_timeout);
+
+       LASSERTF(rrc == GNI_RC_SUCCESS,
+               "tried to double unmap or something bad, fma_blk %p (rrc %d)\n",
+               fma_blk, rrc);
+
+       if (fma_blk->gnm_hold_timeout) {
+               atomic_inc(&dev->gnd_n_mdd_held);
+       } else {
+               atomic_dec(&dev->gnd_n_mdd);
+       }
+
+       /* PHYS blocks don't get mapped */
+       if (fma_blk->gnm_state != GNILND_FMABLK_PHYS) {
+               atomic64_sub(fma_blk->gnm_blk_size, &dev->gnd_nbytes_map);
+       } else if (kgnilnd_data.kgn_in_reset) {
+               /* in stack reset, clear MDD handle for PHYS blocks, as we'll
+                * re-use the fma_blk after reset so we don't have to drop/allocate
+                * all of those physical blocks */
+               fma_blk->gnm_hndl.qword1 = fma_blk->gnm_hndl.qword2 = 0UL;
+       }
+
+       /* Decrement here as this is the # of mapped blocks */
+       atomic_dec(&dev->gnd_nfmablk);
+}
+
+
+/* needs lock on gnd_fmablk_lock to cover gnd_fma_buffs */
+void
+kgnilnd_free_fmablk_locked(kgn_device_t *dev, kgn_fma_memblock_t *fma_blk)
+{
+       LASSERTF(fma_blk->gnm_avail_mboxs == fma_blk->gnm_num_mboxs,
+                "fma_blk %p@%d free in bad state (%d): blk total %d avail %d held %d\n",
+                fma_blk, fma_blk->gnm_state, fma_blk->gnm_hold_timeout, fma_blk->gnm_num_mboxs,
+               fma_blk->gnm_avail_mboxs, fma_blk->gnm_held_mboxs);
+
+       atomic_inc(&dev->gnd_fmablk_vers);
+
+       if (fma_blk->gnm_hold_timeout) {
+               CDEBUG(D_MALLOC, "mdd release fmablk 0x%p sz %u avail %d held %d "
+                       "mbox_size %d\n",
+                       fma_blk, fma_blk->gnm_blk_size, fma_blk->gnm_avail_mboxs,
+                       fma_blk->gnm_held_mboxs, fma_blk->gnm_mbox_size);
+
+               /* We leave MDD dangling over stack reset */
+               if (!kgnilnd_data.kgn_in_reset) {
+                       kgnilnd_mem_mdd_release(dev->gnd_handle, &fma_blk->gnm_hndl);
+               }
+               /* ignoring the return code - if kgni/ghal can't find it
+                * it must be released already */
+               atomic_dec(&dev->gnd_n_mdd_held);
+               atomic_dec(&dev->gnd_n_mdd);
+       }
+
+       /* we can't free the gnm_block until all the conns have released their
+        * purgatory holds. While we have purgatory holds, we might check the conn
+        * RX mailbox during the CLOSING process. It is possible that kgni might
+        * try to look into the RX side for credits when sending the CLOSE msg too */
+       CDEBUG(D_MALLOC, "fmablk %p free buffer %p mbox_size %d\n",
+               fma_blk, fma_blk->gnm_block, fma_blk->gnm_mbox_size);
+
+       if (fma_blk->gnm_state == GNILND_FMABLK_PHYS) {
+               cfs_mem_cache_free(kgnilnd_data.kgn_mbox_cache, fma_blk->gnm_block);
+       } else {
+               LIBCFS_FREE(fma_blk->gnm_block, fma_blk->gnm_blk_size);
+       }
+       fma_blk->gnm_state = GNILND_FMABLK_FREED;
+
+       list_del(&fma_blk->gnm_bufflist);
+
+       LIBCFS_FREE(fma_blk->gnm_mbox_info, sizeof(kgn_mbox_info_t)*fma_blk->gnm_num_mboxs);
+       LIBCFS_FREE(fma_blk->gnm_bit_array, BITS_TO_LONGS(fma_blk->gnm_num_mboxs) * sizeof (unsigned long));
+       LIBCFS_FREE(fma_blk, sizeof(kgn_fma_memblock_t));
+}
+
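+/* scan the device's FMA blocks for a free mailbox and, if one is found,
+ * claim it and point the conn's SMSG attributes at it */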
+void
+kgnilnd_find_free_mbox(kgn_conn_t *conn)
+{
+       kgn_device_t            *dev = conn->gnc_device;
+       gni_smsg_attr_t         *smsg_attr = &conn->gnpr_smsg_attr;
+       kgn_fma_memblock_t      *fma_blk;
+       kgn_mbox_info_t         *mbox = NULL;
+       int                     id;
+
+       spin_lock(&dev->gnd_fmablk_lock);
+
+       list_for_each_entry(fma_blk, &conn->gnc_device->gnd_fma_buffs,
+                           gnm_bufflist) {
+               if (fma_blk->gnm_avail_mboxs <= 0 ||
+                   fma_blk->gnm_state <= GNILND_FMABLK_IDLE) {
+                       continue;
+               }
+               /* look in bitarray for available mailbox */
+               do {
+                       id = find_next_zero_bit(
+                               fma_blk->gnm_bit_array,
+                               fma_blk->gnm_num_mboxs,
+                               fma_blk->gnm_next_avail_mbox);
+                       if (id == fma_blk->gnm_num_mboxs &&
+                           fma_blk->gnm_next_avail_mbox != 0) {
+                               /* wrap around */
+                               fma_blk->gnm_next_avail_mbox = 0;
+                       } else {
+                               break;
+                       }
+               } while (1);
+
+               LASSERTF(id < fma_blk->gnm_num_mboxs, "id %d max %d\n",
+                        id, fma_blk->gnm_num_mboxs);
+               set_bit(id, (volatile unsigned long *)fma_blk->gnm_bit_array);
+               conn->gnc_mbox_id = id;
+
+               fma_blk->gnm_next_avail_mbox =
+                       (id == (fma_blk->gnm_num_mboxs - 1)) ? 0 : (id + 1);
+               fma_blk->gnm_avail_mboxs--;
+               conn->gnc_fma_blk = fma_blk;
+
+               kgnilnd_setup_smsg_attr(smsg_attr);
+
+               smsg_attr->msg_buffer = fma_blk->gnm_block;
+               smsg_attr->mbox_offset = fma_blk->gnm_mbox_size * id;
+               smsg_attr->mem_hndl = fma_blk->gnm_hndl;
+               smsg_attr->buff_size = fma_blk->gnm_mbox_size;
+
+               /* We'll set the hndl to zero for PHYS blocks unmapped during stack
+                * reset and re-use the same fma_blk after stack reset. This ensures we've
+                * properly mapped it before we use it */
+               LASSERTF(fma_blk->gnm_hndl.qword1 != 0UL, "unmapped fma_blk %p, state %d\n",
+                        fma_blk, fma_blk->gnm_state);
+
+               CDEBUG(D_NET, "conn %p smsg %p fmablk %p "
+                       "allocating SMSG mbox %d buf %p "
+                       "offset %u hndl "LPX64"."LPX64"\n",
+                       conn, smsg_attr, fma_blk, id,
+                       smsg_attr->msg_buffer, smsg_attr->mbox_offset,
+                       fma_blk->gnm_hndl.qword1,
+                       fma_blk->gnm_hndl.qword2);
+
+               mbox = &fma_blk->gnm_mbox_info[id];
+               mbox->mbx_create_conn_memset = jiffies;
+
+               /* zero mbox to remove any old data from our last use.
+                * this better be safe, if not our purgatory timers
+                * are too short or a peer really is misbehaving */
+               memset(smsg_attr->msg_buffer + smsg_attr->mbox_offset,
+                      0, smsg_attr->buff_size);
+               break;
+       }
+
+       spin_unlock(&dev->gnd_fmablk_lock);
+}
+
+int
+kgnilnd_setup_mbox(kgn_conn_t *conn)
+{
+       gni_smsg_attr_t         *smsg_attr = &conn->gnpr_smsg_attr;
+       int                      err = 0;
+
+       smsg_attr->msg_buffer = NULL;
+       /* Look for available mbox */
+       do {
+               kgnilnd_find_free_mbox(conn);
+
+               /* nothing in the existing buffers, make a new one */
+               if (smsg_attr->msg_buffer == NULL) {
+                       /* for runtime allocations, we only want vmalloc */
+                       err = kgnilnd_alloc_fmablk(conn->gnc_device, 0);
+                       if (err) {
+                               break;
+                       }
+               }
+       } while (smsg_attr->msg_buffer == NULL);
+
+       if (err)
+               CNETERR("couldn't allocate SMSG mbox for conn %p Error: %d\n",
+                       conn, err);
+       return err;
+}
+
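+/* return a conn's mailbox to its FMA block: purgatory_hold > 0 holds it,
+ * == 0 just frees it, < 0 releases a prior hold; the block is unmapped and
+ * eventually freed once all of its mailboxes are unused */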
+void
+kgnilnd_release_mbox(kgn_conn_t *conn, int purgatory_hold)
+{
+       kgn_device_t           *dev = conn->gnc_device;
+       gni_smsg_attr_t        *smsg_attr = &conn->gnpr_smsg_attr;
+       kgn_fma_memblock_t     *fma_blk = NULL;
+       kgn_mbox_info_t        *mbox = NULL;
+       int                     found = 0;
+       int                     id;
+
+       /* if we failed to setup mbox and now destroying conn */
+       if (smsg_attr->msg_buffer == NULL) {
+               return;
+       }
+
+       id = conn->gnc_mbox_id;
+
+       spin_lock(&dev->gnd_fmablk_lock);
+       /* make sure our conn points at a valid fma_blk
+        * We use this instead of a mem block search out of smsg_attr
+        * because we could have freed a block for fma_blk #1 but the fma_blk
+        * is still in the list for a purgatory hold. This would induce a false
+        * match if that same block gets reallocated to fma_blk #2 */
+       list_for_each_entry(fma_blk, &dev->gnd_fma_buffs, gnm_bufflist) {
+               if (fma_blk == conn->gnc_fma_blk) {
+                       found = 1;
+                       break;
+               }
+       }
+       LASSERTF(found, "unable to find conn 0x%p with gnc_fma_blk %p "
+                "anywhere in the world\n", conn, conn->gnc_fma_blk);
+
+       LASSERTF(id < fma_blk->gnm_num_mboxs,
+               "bad id %d max %d\n",
+               id, fma_blk->gnm_num_mboxs);
+
+       /* < 0 - was held, now free it
+        * == 0 - just free it
+        * > 0 - hold it for now */
+       if (purgatory_hold == 0) {
+               CDEBUG(D_NET, "conn %p smsg %p fmablk %p freeing SMSG mbox %d "
+                       "hndl "LPX64"."LPX64"\n",
+                       conn, smsg_attr, fma_blk, id,
+                       fma_blk->gnm_hndl.qword1, fma_blk->gnm_hndl.qword2);
+               fma_blk->gnm_avail_mboxs++;
+
+       } else if (purgatory_hold > 0) {
+               CDEBUG(D_NET, "conn %p smsg %p fmablk %p holding SMSG mbox %d "
+                       "hndl "LPX64"."LPX64"\n",
+                       conn, smsg_attr, fma_blk, id,
+                       fma_blk->gnm_hndl.qword1, fma_blk->gnm_hndl.qword2);
+
+               fma_blk->gnm_held_mboxs++;
+               fma_blk->gnm_max_timeout = MAX(fma_blk->gnm_max_timeout,
+                                               conn->gnc_timeout);
+       } else {
+               CDEBUG(D_NET, "conn %p smsg %p fmablk %p release SMSG mbox %d "
+                       "hndl "LPX64"."LPX64"\n",
+                       conn, smsg_attr, fma_blk, id,
+                       fma_blk->gnm_hndl.qword1, fma_blk->gnm_hndl.qword2);
+
+               fma_blk->gnm_held_mboxs--;
+               fma_blk->gnm_avail_mboxs++;
+       }
+
+       if (purgatory_hold <= 0) {
+               /* if kgni is retransmitting, freeing the smsg block before the EP
+                * is destroyed gets messy. Bug 768295. */
+               LASSERTF(conn->gnc_ephandle == NULL,
+                        "can't release mbox before EP is nuked. conn 0x%p\n", conn);
+
+               mbox = &fma_blk->gnm_mbox_info[id];
+               mbox->mbx_release_from_purgatory = jiffies;
+
+               /* clear conn gnc_fmablk if it is gone - this allows us to
+                * not worry about state so much in kgnilnd_destroy_conn
+                * and makes the guaranteed cleanup of the resources easier */
+               LASSERTF(test_and_clear_bit(id, fma_blk->gnm_bit_array),
+                       "conn %p bit %d already cleared in fma_blk %p\n",
+                        conn, id, fma_blk);
+               conn->gnc_fma_blk = NULL;
+       }
+
+       if (CFS_FAIL_CHECK(CFS_FAIL_GNI_FMABLK_AVAIL)) {
+               CERROR("LBUGs in your future: forcibly marking fma_blk %p "
+                      "as mapped\n", fma_blk);
+               fma_blk->gnm_state = GNILND_FMABLK_VIRT;
+       }
+
+       /* we don't release or unmap PHYS blocks as part of the normal cycle --
+        * those are controlled manually from startup/shutdown */
+       if (fma_blk->gnm_state != GNILND_FMABLK_PHYS) {
+               /* we can unmap once all are unused (held or avail)
+                * but check hold_timeout to make sure we are not trying to double
+                * unmap this buffer. If there was no hold_timeout set due to
+                * held_mboxs, we'll free the mbox here shortly and won't have to
+                * worry about catching a double free for a 'clean' fma_blk */
+               if (((fma_blk->gnm_avail_mboxs + fma_blk->gnm_held_mboxs) == fma_blk->gnm_num_mboxs) &&
+                   (!fma_blk->gnm_hold_timeout)) {
+                       kgnilnd_unmap_fmablk(dev, fma_blk);
+               }
+
+               /* But we can only free once they are all avail */
+               if (fma_blk->gnm_avail_mboxs == fma_blk->gnm_num_mboxs &&
+                   fma_blk->gnm_held_mboxs == 0) {
+                       /* all mailboxes are released, free fma_blk */
+                       kgnilnd_free_fmablk_locked(dev, fma_blk);
+               }
+       }
+
+       spin_unlock(&dev->gnd_fmablk_lock);
+}
+
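+/* Count the mailboxes backed by PHYS (pre-allocated, persistently mapped)
+ * fma blocks; used below to decide how many more physical blocks to allocate. */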
+int
+kgnilnd_count_phys_mbox(kgn_device_t *device)
+{
+       int                     i = 0;
+       kgn_fma_memblock_t     *fma_blk;
+
+       spin_lock(&device->gnd_fmablk_lock);
+
+       list_for_each_entry(fma_blk, &device->gnd_fma_buffs, gnm_bufflist) {
+               if (fma_blk->gnm_state == GNILND_FMABLK_PHYS)
+                       i += fma_blk->gnm_num_mboxs;
+       }
+       spin_unlock(&device->gnd_fmablk_lock);
+
+       RETURN(i);
+}
+
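+/* Allocate fma blocks until the number of PHYS-backed mailboxes reaches the
+ * kgn_nphys_mbox tunable. */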
+int
+kgnilnd_allocate_phys_fmablk(kgn_device_t *device)
+{
+       int     rc;
+
+       while (kgnilnd_count_phys_mbox(device) < *kgnilnd_tunables.kgn_nphys_mbox) {
+
+               rc = kgnilnd_alloc_fmablk(device, 1);
+               if (rc) {
+                       CERROR("failed phys mbox allocation, stopping at %d, rc %d\n",
+                               kgnilnd_count_phys_mbox(device), rc);
+                       RETURN(rc);
+               }
+       }
+       RETURN(0);
+}
+
+int
+kgnilnd_map_phys_fmablk(kgn_device_t *device)
+{
+
+       int                     rc = 0;
+       kgn_fma_memblock_t     *fma_blk;
+
+       /* use sem to gate access to single thread, just in case */
+       down(&device->gnd_fmablk_sem);
+
+       spin_lock(&device->gnd_fmablk_lock);
+
+       list_for_each_entry(fma_blk, &device->gnd_fma_buffs, gnm_bufflist) {
+               if (fma_blk->gnm_state == GNILND_FMABLK_PHYS) {
+                       rc = kgnilnd_map_fmablk(device, fma_blk);
+                       if (rc)
+                               break;
+               }
+       }
+       spin_unlock(&device->gnd_fmablk_lock);
+
+       up(&device->gnd_fmablk_sem);
+
+       RETURN(rc);
+}
+
+void
+kgnilnd_unmap_phys_fmablk(kgn_device_t *device)
+{
+
+       kgn_fma_memblock_t      *fma_blk;
+
+       /* use sem to gate access to single thread, just in case */
+       down(&device->gnd_fmablk_sem);
+
+       spin_lock(&device->gnd_fmablk_lock);
+
+       list_for_each_entry(fma_blk, &device->gnd_fma_buffs, gnm_bufflist) {
+               if (fma_blk->gnm_state == GNILND_FMABLK_PHYS)
+                       kgnilnd_unmap_fmablk(device, fma_blk);
+       }
+       spin_unlock(&device->gnd_fmablk_lock);
+
+       up(&device->gnd_fmablk_sem);
+}
+
+void
+kgnilnd_free_phys_fmablk(kgn_device_t *device)
+{
+
+       kgn_fma_memblock_t      *fma_blk, *fma_blkN;
+
+       /* use sem to gate access to single thread, just in case */
+       down(&device->gnd_fmablk_sem);
+
+       spin_lock(&device->gnd_fmablk_lock);
+
+       list_for_each_entry_safe(fma_blk, fma_blkN, &device->gnd_fma_buffs, gnm_bufflist) {
+               if (fma_blk->gnm_state == GNILND_FMABLK_PHYS)
+                       kgnilnd_free_fmablk_locked(device, fma_blk);
+       }
+       spin_unlock(&device->gnd_fmablk_lock);
+
+       up(&device->gnd_fmablk_sem);
+}
+
+/* kgnilnd dgram nid->struct management */
+
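+/* hash a NID into one of the kgn_peer_hash_size dgram lists on the device */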
+static inline struct list_head *
+kgnilnd_nid2dgramlist(kgn_device_t *dev, lnet_nid_t nid)
+{
+       unsigned int hash = ((unsigned int)nid) % *kgnilnd_tunables.kgn_peer_hash_size;
+
+       RETURN(&dev->gnd_dgrams[hash]);
+}
+
+
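+/* return the first dgram posted to dst_nid that is not already being
+ * canceled or completed (state <= POSTED), or NULL if none is found */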
+/* needs dev->gnd_dgram_lock held */
+kgn_dgram_t *
+kgnilnd_find_dgram_locked(kgn_device_t *dev, lnet_nid_t dst_nid)
+{
+       struct list_head *dgram_list = kgnilnd_nid2dgramlist(dev, dst_nid);
+       kgn_dgram_t      *dgram;
+
+       list_for_each_entry(dgram, dgram_list, gndg_list) {
+
+               /* if state > POSTED, we are already handling cancel/completion */
+               if ((dgram->gndg_conn_out.gncr_dstnid != dst_nid) ||
+                    dgram->gndg_state > GNILND_DGRAM_POSTED)
+                       continue;
+
+               CDEBUG(D_NET, "got dgram [%p] -> %s\n",
+                      dgram, libcfs_nid2str(dst_nid));
+               return dgram;
+       }
+       return NULL;
+}
+
+int
+kgnilnd_find_and_cancel_dgram(kgn_device_t *dev, lnet_nid_t dst_nid)
+{
+       kgn_dgram_t     *dgram;
+
+       spin_lock(&dev->gnd_dgram_lock);
+       dgram = kgnilnd_find_dgram_locked(dev, dst_nid);
+
+       if (dgram) {
+               kgnilnd_cancel_dgram_locked(dgram);
+       }
+       spin_unlock(&dev->gnd_dgram_lock);
+
+       RETURN(!!(dgram == NULL));
+}
+
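+/* Fill in the connreq carried in the datagram payload. The CFS_FAIL checks
+ * deliberately corrupt individual fields so the receiver's error paths can be
+ * exercised in testing. For a REQ we also set up the FMA mailbox and
+ * advertise its SMSG attributes to the peer. */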
+int
+kgnilnd_pack_connreq(kgn_connreq_t *connreq, kgn_conn_t *conn,
+                    lnet_nid_t srcnid, lnet_nid_t dstnid,
+                    kgn_connreq_type_t type)
+{
+       int err = 0;
+
+       /* ensure we haven't violated max datagram size */
+       CLASSERT(sizeof(kgn_connreq_t) <= GNI_DATAGRAM_MAXSIZE);
+
+       /* no need to zero out, we do that when allocating dgram */
+       connreq->gncr_magic     = GNILND_MSG_MAGIC;
+
+       if (CFS_FAIL_CHECK(CFS_FAIL_GNI_PACK_SRCNID)) {
+               srcnid = 0xABADBABE;
+       } else if (CFS_FAIL_CHECK(CFS_FAIL_GNI_PACK_DSTNID)) {
+               dstnid = 0xDEFEC8ED;
+       }
+
+       connreq->gncr_srcnid    = srcnid;
+       connreq->gncr_dstnid    = dstnid;
+
+       if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CONNREQ_PROTO)) {
+               connreq->gncr_version = 99;
+       } else {
+               connreq->gncr_version   = GNILND_CONNREQ_VERSION;
+       }
+       if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CONNREQ_PROTO)) {
+               connreq->gncr_type = 99;
+       } else {
+               connreq->gncr_type      = type;
+       }
+       if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CONNREQ_PROTO)) {
+               connreq->gncr_peerstamp = 0;
+       } else {
+               connreq->gncr_peerstamp = kgnilnd_data.kgn_peerstamp;
+       }
+       if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CONNREQ_PROTO)) {
+               connreq->gncr_connstamp = 0;
+       } else {
+               connreq->gncr_connstamp = conn->gnc_my_connstamp;
+       }
+       if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CONNREQ_PROTO)) {
+               connreq->gncr_timeout = 0;
+       } else {
+               connreq->gncr_timeout   = conn->gnc_timeout;
+       }
+
+       /* the rest pack the data into the payload in other places */
+       if (type == GNILND_CONNREQ_REQ) {
+               kgn_gniparams_t       *req_params = &connreq->gncr_gnparams;
+               req_params->gnpr_host_id = conn->gnc_device->gnd_host_id;
+               req_params->gnpr_cqid = conn->gnc_cqid;
+
+               /* allocate mailbox for this connection */
+               err = kgnilnd_setup_mbox(conn);
+               if (err != 0) {
+                       CERROR("Failed to setup FMA mailbox (%d)\n", err);
+               }
+               req_params->gnpr_smsg_attr = conn->gnpr_smsg_attr;
+       }
+
+       /* XXX Nic: TBD - checksum computation */
+
+       return err;
+}
+
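+/* Validate (and byte-swap if needed) a connreq pulled off the wire. Returns
+ * -EBADF for packets we should drop silently and other negative errnos for
+ * packets that warrant a NAK back to the sender. */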
+int
+kgnilnd_unpack_connreq(kgn_dgram_t *dgram)
+{
+       kgn_connreq_t           *connreq = &dgram->gndg_conn_in;
+       int                      swab, rc = 0;
+       kgn_net_t               *net;
+
+       /* the following fields must be handled in a backwards compatible
+        * manner to ensure we can always send and interpret NAKs */
+
+       if (connreq->gncr_magic != GNILND_MSG_MAGIC &&
+           connreq->gncr_magic != __swab32(GNILND_MSG_MAGIC)) {
+               /* Unexpected magic! */
+               CERROR("Unexpected magic %08x\n",
+                      connreq->gncr_magic);
+               return -EBADF;
+       }
+
+       swab = (connreq->gncr_magic == __swab32(GNILND_MSG_MAGIC));
+       if (swab) {
+               __swab32s(&connreq->gncr_magic);
+               __swab32s(&connreq->gncr_cksum);
+               __swab16s(&connreq->gncr_type);
+               __swab16s(&connreq->gncr_version);
+               __swab32s(&connreq->gncr_timeout);
+               __swab64s(&connreq->gncr_srcnid);
+               __swab64s(&connreq->gncr_dstnid);
+               __swab64s(&connreq->gncr_peerstamp);
+               __swab64s(&connreq->gncr_connstamp);
+       }
+
+       /* Do NOT return anything but -EBADF before we munge
+        * connreq->gncr_srcnid - we need that to send the nak */
+
+       if (dgram->gndg_conn_out.gncr_dstnid != LNET_NID_ANY) {
+               lnet_nid_t      incoming = connreq->gncr_srcnid;
+
+               /* even if the incoming packet is hosed, we know who we sent
+                * the original and can set the srcnid so that we can properly
+                * look up our peer to close the loop on this connreq. We still use
+                * -EBADF to prevent a NAK - just in case there are issues with
+                * the payload coming from a random spot, etc. */
+               connreq->gncr_srcnid = dgram->gndg_conn_out.gncr_dstnid;
+
+               if (LNET_NIDADDR(dgram->gndg_conn_out.gncr_dstnid) !=
+                               LNET_NIDADDR(incoming)) {
+                       /* we got a datagram match for the wrong nid... */
+                       CERROR("matched datagram 0x%p with srcnid %s "
+                               "(%x), expecting %s (%x)\n",
+                               dgram,
+                               libcfs_nid2str(incoming),
+                               LNET_NIDADDR(incoming),
+                               libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid),
+                               LNET_NIDADDR(dgram->gndg_conn_out.gncr_dstnid));
+                       return -EBADF;
+               }
+       } else {
+               /* if we have a wildcard datagram it should match an
+                * incoming "active" datagram that should have a fully formed
+                * srcnid and dstnid. If we couldn't unpack it, we drop as
+                * corrupted packet, otherwise we'll just verify that the dstnid
+                * matches the NID for the NET that the dgram was posted */
+
+               /* make sure their wildcard didn't match ours, that is impossible */
+               LASSERTF(connreq->gncr_dstnid != LNET_NID_ANY,
+                        "dgram 0x%p from %s, connreq 0x%p; "
+                        "wildcard matched wildcard \n", dgram,
+                        libcfs_nid2str(connreq->gncr_srcnid), connreq);
+
+               rc = kgnilnd_find_net(connreq->gncr_dstnid, &net);
+
+               if (rc == -ESHUTDOWN) {
+                       CERROR("Looking up network: device is in shutdown\n");
+                       return rc;
+               } else if (rc == -ENONET) {
+                       CERROR("Connection data from %s: she sent "
+                       "dst_nid %s, but net lookup failed on "
+                       "dgram 0x%p@%s\n",
+                       libcfs_nid2str(connreq->gncr_srcnid),
+                       libcfs_nid2str(connreq->gncr_dstnid),
+                       dgram, kgnilnd_dgram_type2str(dgram));
+                       return rc;
+               }
+
+               if (net->gnn_ni->ni_nid != connreq->gncr_dstnid) {
+                       CERROR("Bad connection data from %s: she sent "
+                              "dst_nid %s, but I am %s with dgram 0x%p@%s\n",
+                              libcfs_nid2str(connreq->gncr_srcnid),
+                              libcfs_nid2str(connreq->gncr_dstnid),
+                              libcfs_nid2str(net->gnn_ni->ni_nid),
+                              dgram, kgnilnd_dgram_type2str(dgram));
+                       kgnilnd_net_decref(net);
+                       return -EBADSLT;
+               }
+
+               /* kgnilnd_find_net takes a ref on the net it finds; drop it once we are done with the net. */
+               kgnilnd_net_decref(net);
+       }
+
+       if (connreq->gncr_version != GNILND_CONNREQ_VERSION) {
+               CERROR("Unexpected version %d\n", connreq->gncr_version);
+               return -EPROTO;
+       }
+
+       /* XXX Nic: TBD - checksum validation */
+       if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CONNREQ_DROP)) {
+               return -EBADF;
+       }
+
+       if (swab && connreq->gncr_type == GNILND_CONNREQ_REQ) {
+               __u64 msg_addr = (__u64) connreq->gncr_gnparams.gnpr_smsg_attr.msg_buffer;
+
+               __swab32s(&connreq->gncr_gnparams.gnpr_host_id);
+               __swab32s(&connreq->gncr_gnparams.gnpr_cqid);
+               __swab32s(&connreq->gncr_gnparams.gnpr_smsg_attr.buff_size);
+               __swab16s(&connreq->gncr_gnparams.gnpr_smsg_attr.mbox_maxcredit);
+               __swab32s(&connreq->gncr_gnparams.gnpr_smsg_attr.mbox_offset);
+               __swab64s(&connreq->gncr_gnparams.gnpr_smsg_attr.mem_hndl.qword1);
+               __swab64s(&connreq->gncr_gnparams.gnpr_smsg_attr.mem_hndl.qword2);
+               __swab64s(&msg_addr);
+               __swab32s(&connreq->gncr_gnparams.gnpr_smsg_attr.msg_maxsize);
+               __swab32s(&connreq->gncr_gnparams.gnpr_smsg_attr.msg_type);
+       } else if (swab && connreq->gncr_type == GNILND_CONNREQ_NAK) {
+               __swab32s(&connreq->gncr_nakdata.gnnd_errno);
+       }
+
+       /* since we use a unique instance ID for each network, the driver
+        * will take care of dropping datagrams if we don't have that network.
+        */
+
+       /* few more idiot software or configuration checks */
+
+       switch (connreq->gncr_type) {
+       case GNILND_CONNREQ_REQ:
+               /* wire up EP and SMSG block - this will check the incoming data
+                * and barf a NAK back if need to */
+               rc = kgnilnd_set_conn_params(dgram);
+               if (rc)
+                       return rc;
+               break;
+       case GNILND_CONNREQ_NAK:
+       case GNILND_CONNREQ_CLOSE:
+               break;
+       default:
+               CERROR("unknown connreq packet type %d\n", connreq->gncr_type);
+               return -EPROTO;
+       }
+
+       if (connreq->gncr_peerstamp == 0 || connreq->gncr_connstamp == 0) {
+               CERROR("Received bad timestamps peer "LPU64" conn "LPU64"\n",
+               connreq->gncr_peerstamp, connreq->gncr_connstamp);
+               return -EPROTO;
+       }
+
+       if (connreq->gncr_timeout < GNILND_MIN_TIMEOUT) {
+               CERROR("Received timeout %d < MIN %d\n",
+                      connreq->gncr_timeout, GNILND_MIN_TIMEOUT);
+               return -EPROTO;
+       }
+
+       return 0;
+}
+
+int
+kgnilnd_alloc_dgram(kgn_dgram_t **dgramp, kgn_device_t *dev, kgn_dgram_type_t type)
+{
+       kgn_dgram_t         *dgram;
+
+       dgram = cfs_mem_cache_alloc(kgnilnd_data.kgn_dgram_cache,
+                                   CFS_ALLOC_ATOMIC);
+       if (dgram == NULL)
+               return -ENOMEM;
+
+       /* cache alloc'd memory is not zeroed */
+       memset((void *)dgram, 0, sizeof(*dgram));
+
+       INIT_LIST_HEAD(&dgram->gndg_list);
+       dgram->gndg_state = GNILND_DGRAM_USED;
+       dgram->gndg_type = type;
+       dgram->gndg_magic = GNILND_DGRAM_MAGIC;
+
+       atomic_inc(&dev->gnd_ndgrams);
+
+       CDEBUG(D_MALLOC|D_NETTRACE, "slab-alloced 'dgram': %lu at %p.\n",
+              sizeof(*dgram), dgram);
+
+       *dgramp = dgram;
+       return 0;
+}
+
+/* call this on a dgram that came back from kgnilnd_ep_postdata_test_by_id
+ * returns < 0 on dgram to be cleaned up
+ * > 0 on dgram that isn't done yet
+ * == 0 on dgram that is ok and needs connreq processing */
+int
+kgnilnd_process_dgram(kgn_dgram_t *dgram, gni_post_state_t post_state)
+{
+       int rc = 0;
+
+       switch (post_state) {
+       case GNI_POST_COMPLETED:
+               /* normal state for dgrams that need actual processing */
+               /* GOTO to avoid processing dgram as canceled/done */
+               GOTO(process_out, rc);
+
+       case GNI_POST_PENDING:
+               /* we should only see this if we are testing a WC dgram after a
+                * cancel - it means that it needs a full cycle of waiting
+                * for kgni_sm_task to finish moving it to TERMINATED */
+               LASSERTF((dgram->gndg_type == GNILND_DGRAM_WC_REQ) &&
+                         (dgram->gndg_state == GNILND_DGRAM_CANCELED),
+                        "POST_PENDING dgram 0x%p with bad type %d(%s) or state %d(%s)\n",
+                        dgram, dgram->gndg_type, kgnilnd_dgram_type2str(dgram),
+                        dgram->gndg_state, kgnilnd_dgram_state2str(dgram));
+
+               /* positive RC as this dgram isn't done yet */
+               rc = EINPROGRESS;
+
+               /* GOTO as this isn't done yet */
+               GOTO(process_out, rc);
+               break;
+
+       case GNI_POST_TERMINATED:
+               /* we've called cancel and it is done or remote guy called cancel and
+                * we've received it on a WC dgram */
+#if 0
+               /* we are seeing weird terminations on non WC dgrams when we have not
+                * canceled them */
+
+               LASSERTF(dgram->gndg_state == GNILND_DGRAM_CANCELED ||
+                        dgram->gndg_conn_out.gncr_dstnid == LNET_NID_ANY,
+                       "dgram 0x%p with bad state %d(%s) or dst nid %s\n",
+                       dgram, dgram->gndg_state, kgnilnd_dgram_state2str(dgram),
+                       libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid));
+#endif
+
+               CDEBUG(D_NETTRACE, "dgram 0x%p saw %s, cleaning it up\n", dgram,
+                      dgram->gndg_state == GNILND_DGRAM_CANCELED ?  "canceled" : "terminated");
+
+               rc = -ECANCELED;
+               break;
+
+       case GNI_POST_TIMEOUT:
+               /* we could have a timeout on a wildcard dgram too - if
+                * we got the incoming request but the remote node beefed
+                * before kgni could send the match data back. We'll just error
+                * on the active case and bail out gracefully */
+               if (dgram->gndg_conn_out.gncr_dstnid != LNET_NID_ANY) {
+                       CNETERR("hardware timeout for connect to "
+                              "%s after %lu seconds. Is node dead?\n",
+                              libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid),
+                              cfs_duration_sec(jiffies - dgram->gndg_post_time));
+               }
+
+               rc = -ETIMEDOUT;
+               break;
+
+       default:
+               CERROR("dgram 0x%p with bad post_state %d\n", dgram, post_state);
+               LBUG();
+       }
+
+       /* now finish cleaning up a dgram that is canceled/terminated and needs to
+        * go away */
+
+       /* If this was actively canceled, drop the count now that we are processing */
+       if (dgram->gndg_state == GNILND_DGRAM_CANCELED) {
+               atomic_dec(&dgram->gndg_conn->gnc_device->gnd_canceled_dgrams);
+               /* caller responsible for gndg_list removal */
+       }
+
+process_out:
+
+       RETURN(rc);
+}
+
+/* needs dev->gnd_dgram_lock held */
+void
+kgnilnd_cancel_dgram_locked(kgn_dgram_t *dgram)
+{
+       gni_return_t            grc;
+
+       if (dgram->gndg_state != GNILND_DGRAM_POSTED) {
+               return;
+       }
+
+       LASSERTF(dgram->gndg_conn != NULL,
+                "dgram 0x%p with NULL conn\n", dgram);
+
+       /* C.E - WC dgrams could be canceled immediately but
+        * if there was some match pending, we need to call
+        * test_by_id to clear it out. If that test returns
+        * POST_PENDING, it is half done and needs to go along
+        * with the rest of dgrams and go through a kgni_sm_task cycle
+        * and deliver a GNI_POST_TERMINATED event before they
+        * are actually canceled */
+
+       dgram->gndg_state = GNILND_DGRAM_CANCELED;
+
+       if (dgram->gndg_conn->gnc_state >= GNILND_CONN_ESTABLISHED) {
+               /* we don't need to cancel_by_id if the datagram was good */
+               return;
+       }
+
+       /* let folks know there are outstanding cancels */
+       atomic_inc(&dgram->gndg_conn->gnc_device->gnd_canceled_dgrams);
+       /* leave on nid list until cancel is done for debugging fun */
+       grc = kgnilnd_ep_postdata_cancel_by_id(dgram->gndg_conn->gnc_ephandle, (__u64) dgram);
+
+       /* if we don't get success here, we have hosed up the dgram tracking
+        * code and need to bail out */
+       LASSERTF(grc == GNI_RC_SUCCESS,
+                "postdata_cancel returned %d for conn 0x%p to %s\n",
+                grc, dgram->gndg_conn,
+                dgram->gndg_conn->gnc_peer ?
+                 libcfs_nid2str(dgram->gndg_conn->gnc_peer->gnp_nid)
+                 : "<?>");
+
+       CDEBUG(D_NETTRACE,
+               "canceled dgram 0x%p conn 0x%p ephandle 0x%p\n",
+               dgram, dgram->gndg_conn,
+               dgram->gndg_conn->gnc_ephandle);
+
+       if (dgram->gndg_type == GNILND_DGRAM_WC_REQ) {
+               gni_post_state_t         post_state;
+               int                      rc = 0;
+               __u32                    remote_addr = 0, remote_id = 0;
+
+               grc = kgnilnd_ep_postdata_test_by_id(dgram->gndg_conn->gnc_ephandle,
+                                                    (__u64)dgram, &post_state,
+                                                    &remote_addr, &remote_id);
+
+               LASSERTF(grc == GNI_RC_NO_MATCH || grc == GNI_RC_SUCCESS,
+                        "bad grc %d from test_by_id on dgram 0x%p\n",
+                       grc, dgram);
+
+               /* if WC was canceled immediately, we get NO_MATCH, if needs to go
+                * through full cycle, we get SUCCESS and need to parse post_state */
+
+               CDEBUG(D_NET, "grc %d dgram 0x%p type %s post_state %d "
+                       "remote_addr %u remote_id %u\n", grc, dgram,
+                       kgnilnd_dgram_type2str(dgram),
+                       post_state, remote_addr, remote_id);
+
+               if (grc == GNI_RC_NO_MATCH) {
+                       /* she's gone, reduce count and move along */
+                       dgram->gndg_state = GNILND_DGRAM_DONE;
+                       atomic_dec(&dgram->gndg_conn->gnc_device->gnd_canceled_dgrams);
+                       RETURN_EXIT;
+               }
+
+               rc = kgnilnd_process_dgram(dgram, post_state);
+
+               if (rc <= 0) {
+                       /* if for some weird reason we get a valid dgram back, just mark as done
+                        * so we can drop it and move along.
+                        * C.E - if it was completed, we'll just release the conn/mbox
+                        * back into the pool and it'll get reused. That said, we should only
+                        * be canceling a WC dgram on stack rest or shutdown, so that is moot */
+                       dgram->gndg_state = GNILND_DGRAM_DONE;
+                       atomic_dec(&dgram->gndg_conn->gnc_device->gnd_canceled_dgrams);
+
+                       /* caller context responsible for calling kgnilnd_release_dgram() */
+               } else {
+                       /* still pending, let it simmer until golden brown and delicious */
+               }
+       }
+
+       /* for non WC dgrams, they are still on the nid list but marked canceled waiting
+        * for kgni to return their ID to us via probe - that is when we'll complete their
+        * cancel processing */
+}
+
+void
+kgnilnd_cleanup_dgram(kgn_dgram_t *dgram)
+{
+       /* release the dgram ref on conn */
+       if (dgram->gndg_conn) {
+               kgnilnd_conn_decref(dgram->gndg_conn);
+               dgram->gndg_conn = NULL;
+       }
+}
+
+void
+kgnilnd_free_dgram(kgn_device_t *dev, kgn_dgram_t *dgram)
+{
+       LASSERTF(dgram->gndg_state == GNILND_DGRAM_USED ||
+                dgram->gndg_state == GNILND_DGRAM_DONE,
+                "dgram 0x%p with bad state %s\n",
+                dgram, kgnilnd_dgram_state2str(dgram));
+
+       /* bit of poisoning to help detect bad driver data */
+       dgram->gndg_magic = 0x6f5a6b5f;
+       atomic_dec(&dev->gnd_ndgrams);
+
+       cfs_mem_cache_free(kgnilnd_data.kgn_dgram_cache, dgram);
+       CDEBUG(D_MALLOC|D_NETTRACE, "slab-freed 'dgram': %lu at %p.\n",
+              sizeof(*dgram), dgram);
+}
+
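+/* Post a connection datagram (active REQ, wildcard REQ, or NAK) to dstnid.
+ * This allocates the dgram and its conn, binds the EP for targeted posts,
+ * packs the connreq, hands the dgram to kgni and tracks it on the per-nid
+ * dgram list. On failure the dgram and its conn ref are cleaned up before
+ * returning. */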
+int
+kgnilnd_post_dgram(kgn_device_t *dev, lnet_nid_t dstnid, kgn_connreq_type_t type,
+                  int data_rc)
+{
+       int              rc = 0;
+       kgn_dgram_t     *dgram = NULL;
+       kgn_dgram_t     *tmpdgram;
+       kgn_dgram_type_t dgtype;
+       gni_return_t     grc;
+       __u64            srcnid;
+       ENTRY;
+
+       switch (type) {
+       case GNILND_CONNREQ_REQ:
+               if (dstnid == LNET_NID_ANY)
+                       dgtype = GNILND_DGRAM_WC_REQ;
+               else
+                       dgtype = GNILND_DGRAM_REQ;
+               break;
+       case GNILND_CONNREQ_NAK:
+               LASSERTF(dstnid != LNET_NID_ANY, "can't NAK to LNET_NID_ANY\n");
+               dgtype = GNILND_DGRAM_NAK;
+               break;
+       default:
+               CERROR("unknown connreq type %d\n", type);
+               LBUG();
+       }
+
+       rc = kgnilnd_alloc_dgram(&dgram, dev, dgtype);
+       if (rc < 0) {
+               rc = -ENOMEM;
+               GOTO(post_failed, rc);
+       }
+
+       rc = kgnilnd_create_conn(&dgram->gndg_conn, dev);
+       if (rc) {
+               GOTO(post_failed, rc);
+       }
+
+       if (dgram->gndg_type == GNILND_DGRAM_WC_REQ) {
+               /* clear buffer for sanity on reuse of wildcard */
+               memset(&dgram->gndg_conn_in, 0, sizeof(kgn_connreq_t));
+       }
+
+       if (dstnid == LNET_NID_ANY) {
+               /* set here to reset any dgram re-use */
+               dgram->gndg_conn->gnc_state = GNILND_CONN_LISTEN;
+       } else {
+               __u32            host_id;
+
+               rc = kgnilnd_nid_to_nicaddrs(LNET_NIDADDR(dstnid), 1, &host_id);
+               if (rc <= 0) {
+                       rc = -ESRCH;
+                       GOTO(post_failed, rc);
+               }
+
+               dgram->gndg_conn->gnc_state = GNILND_CONN_CONNECTING;
+
+               /* don't need to serialize, there are no CQs for the dgram
+                * EP on the kgn_net_t */
+               grc = kgnilnd_ep_bind(dgram->gndg_conn->gnc_ephandle, host_id, dev->gnd_id);
+
+               if (grc != GNI_RC_SUCCESS) {
+                       rc = -ECONNABORTED;
+                       GOTO(post_failed, rc);
+               }
+
+       }
+
+       /* If we are posting wildcards, post using a net of 0; otherwise we'll use the
+        * net of the destination node.
+        */
+
+       if (dstnid == LNET_NID_ANY) {
+               srcnid = LNET_MKNID(LNET_MKNET(GNILND, 0), dev->gnd_nid);
+       } else {
+               srcnid = LNET_MKNID(LNET_NIDNET(dstnid), dev->gnd_nid);
+       }
+
+       rc = kgnilnd_pack_connreq(&dgram->gndg_conn_out, dgram->gndg_conn,
+                                 srcnid, dstnid, type);
+       if (rc) {
+               GOTO(post_failed, rc);
+       }
+
+       if (type == GNILND_CONNREQ_NAK)
+               dgram->gndg_conn_out.gncr_nakdata.gnnd_errno = data_rc;
+
+       dgram->gndg_post_time = jiffies;
+
+       /* XXX Nic: here is where we'd add in logical network multiplexing */
+
+       CDEBUG(D_NETTRACE, "dgram 0x%p type %s %s->%s cdm %d\n",
+              dgram, kgnilnd_dgram_type2str(dgram),
+              libcfs_nid2str(srcnid),
+              libcfs_nid2str(dstnid), dev->gnd_id);
+
+       /* this allocates memory, can't hold locks across */
+       grc = kgnilnd_ep_postdata_w_id(dgram->gndg_conn->gnc_ephandle,
+                                  &dgram->gndg_conn_out, sizeof(kgn_connreq_t),
+                                  &dgram->gndg_conn_in, sizeof(kgn_connreq_t),
+                                  (__u64)dgram);
+
+       if (grc != GNI_RC_SUCCESS) {
+               CNETERR("dropping failed dgram post id 0x%p type %s"
+                       " reqtype %s to %s: rc %d\n",
+                       dgram, kgnilnd_dgram_type2str(dgram),
+                       kgnilnd_connreq_type2str(&dgram->gndg_conn_out),
+                       libcfs_nid2str(dstnid), grc);
+               rc = (grc == GNI_RC_ERROR_NOMEM) ? -ENOMEM : -EBADR;
+               GOTO(post_failed, rc);
+       }
+
+       /* we don't need to add earlier - if someone does del_peer during post,
+        * that peer will get marked as unlinked and the callers will take care of it.
+        * The dgram code is largely kgn_peer_t ignorant, so at worst, we'll just drop
+        * the completed dgram later when we can't find a peer to stuff it into */
+
+       spin_lock(&dev->gnd_dgram_lock);
+
+       /* make sure we are not double posting targeted dgrams
+        * - we can multiple post WC dgrams to help with processing speed */
+       if (dstnid != LNET_NID_ANY) {
+               tmpdgram = kgnilnd_find_dgram_locked(dev, dstnid);
+
+               LASSERTF(tmpdgram == NULL,
+                       "dgram 0x%p->%s already posted\n",
+                        dgram, libcfs_nid2str(dstnid));
+       }
+
+       /* unmunge dstnid to help processing code cope... */
+       if (CFS_FAIL_CHECK(CFS_FAIL_GNI_PACK_DSTNID)) {
+               dgram->gndg_conn_out.gncr_dstnid = dstnid;
+       }
+
+       list_add_tail(&dgram->gndg_list, kgnilnd_nid2dgramlist(dev, dstnid));
+       dgram->gndg_state = GNILND_DGRAM_POSTED;
+       spin_unlock(&dev->gnd_dgram_lock);
+
+post_failed:
+       if (rc < 0 && dgram != NULL) {
+               kgnilnd_cleanup_dgram(dgram);
+               kgnilnd_free_dgram(dev, dgram);
+       }
+
+       RETURN(rc);
+}
+
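+/* Drop our use of a dgram: cancel it if it is still posted and release its
+ * conn ref. Unless it is waiting on a TERMINATED event from kgni we free it
+ * now, reposting a fresh wildcard first if this was a WC_REQ and the stack is
+ * not shutting down or in reset. */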
+void
+kgnilnd_release_dgram(kgn_device_t *dev, kgn_dgram_t *dgram)
+{
+       spin_lock(&dev->gnd_dgram_lock);
+       kgnilnd_cancel_dgram_locked(dgram);
+       spin_unlock(&dev->gnd_dgram_lock);
+
+       kgnilnd_cleanup_dgram(dgram);
+
+       /* if the dgram is 'canceled' we need to wait for the event
+        * from kgni that tells us it is safe to release it */
+       if (dgram->gndg_state != GNILND_DGRAM_CANCELED) {
+               dgram->gndg_state = GNILND_DGRAM_DONE;
+
+               LASSERTF(list_empty(&dgram->gndg_list), "dgram 0x%p on list\n", dgram);
+
+               /* if it is a wildcard and we are in an appropriate state, repost
+                * the wildcard */
+
+               if ((dgram->gndg_type == GNILND_DGRAM_WC_REQ) &&
+                   (!kgnilnd_data.kgn_wc_kill && !kgnilnd_data.kgn_in_reset)) {
+                       int     rerc;
+
+                       rerc = kgnilnd_post_dgram(dev, LNET_NID_ANY, GNILND_CONNREQ_REQ, 0);
+                       LASSERTF(rerc == 0,
+                               "error %d: dev %d could not repost wildcard datagram id 0x%p\n",
+                               rerc, dev->gnd_id, dgram);
+               }
+
+               /* always free the old dgram */
+               kgnilnd_free_dgram(dev, dgram);
+       }
+}
+
+
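+/* Ask kgni whether any posted datagram has completed. Returns 0 when nothing
+ * is ready, 1 when *dgramp has been set to a dgram needing connreq
+ * processing, or a negative errno for a dgram that is done or failed. */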
+int
+kgnilnd_probe_for_dgram(kgn_device_t *dev, kgn_dgram_t **dgramp)
+{
+       kgn_dgram_t             *dgram = NULL;
+       gni_post_state_t         post_state;
+       gni_return_t             grc;
+       int                      rc = 0;
+       __u64                    readyid;
+       __u32                    remote_addr = 0, remote_id = 0;
+       ENTRY;
+
+       /* Probe with the lock held. That way if we get a dgram we don't have it canceled
+        * between finding the ready dgram and grabbing the lock to remove it from the
+        * list. Otherwise we could be left in an inconsistent state. We own the dgram
+        * once it's off the list so we don't need to worry about others changing it at
+        * that point. */
+       spin_lock(&dev->gnd_dgram_lock);
+       grc = kgnilnd_postdata_probe_by_id(dev->gnd_handle, &readyid);
+       if (grc != GNI_RC_SUCCESS) {
+               spin_unlock(&dev->gnd_dgram_lock);
+               /* return 0 to indicate nothing happened */
+               RETURN(0);
+       }
+
+       CDEBUG(D_NET, "ready "LPX64" on device 0x%p\n",
+               readyid, dev);
+
+       dgram = (kgn_dgram_t *)readyid;
+
+       LASSERTF(dgram->gndg_magic == GNILND_DGRAM_MAGIC,
+                "dgram 0x%p from id "LPX64" with bad magic %x\n",
+                dgram, readyid, dgram->gndg_magic);
+
+       LASSERTF(dgram->gndg_state == GNILND_DGRAM_POSTED ||
+                dgram->gndg_state == GNILND_DGRAM_CANCELED,
+                "dgram 0x%p with bad state %s\n",
+                dgram, kgnilnd_dgram_state2str(dgram));
+
+       LASSERTF(!list_empty(&dgram->gndg_list),
+                "dgram 0x%p with bad list state %s\n",
+                dgram, kgnilnd_dgram_state2str(dgram));
+
+       /* now we know that the datagram structure is ok, so pull off list */
+       list_del_init(&dgram->gndg_list);
+
+       /* while we have the gnn_dgram_lock and BEFORE we call test_by_id
+        * change the state from POSTED to PROCESSING to ensure that
+        * nobody cancels it after we've pulled it from the wire */
+       if (dgram->gndg_state == GNILND_DGRAM_POSTED) {
+               dgram->gndg_state = GNILND_DGRAM_PROCESSING;
+       }
+
+       spin_unlock(&dev->gnd_dgram_lock);
+
+       /* we now "own" this datagram */
+
+       LASSERTF(dgram->gndg_conn != NULL,
+               "dgram 0x%p with NULL conn\n", dgram);
+
+       grc = kgnilnd_ep_postdata_test_by_id(dgram->gndg_conn->gnc_ephandle,
+                                            (__u64)dgram, &post_state,
+                                            &remote_addr, &remote_id);
+
+       LASSERTF(grc != GNI_RC_NO_MATCH, "kgni lied! probe_by_id told us that"
+                " id "LPU64" was ready\n", readyid);
+
+       CDEBUG(D_NET, "grc %d dgram 0x%p type %s post_state %d "
+               "remote_addr %u remote_id %u\n", grc, dgram,
+               kgnilnd_dgram_type2str(dgram),
+               post_state, remote_addr, remote_id);
+
+       if (unlikely(grc != GNI_RC_SUCCESS)) {
+               CNETERR("getting data for dgram 0x%p->%s failed rc %d. Dropping it\n",
+                       dgram, libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid),
+                       grc);
+               rc = -EINVAL;
+               GOTO(probe_for_out, rc);
+       }
+
+       rc = kgnilnd_process_dgram(dgram, post_state);
+
+       /* we should never get probe finding a dgram for us and then it
+        * being a WC dgram that is still in the middle of processing */
+       LASSERTF(rc <= 0, "bad rc %d from process_dgram 0x%p state %d\n",
+                rc, dgram, post_state);
+
+       if (rc == 0) {
+               /* dgram is good enough for the data to be used */
+               dgram->gndg_state = GNILND_DGRAM_PROCESSING;
+               /* fake rc to mark that we've done something */
+               rc = 1;
+       } else {
+               /* bring out your dead! */
+               dgram->gndg_state = GNILND_DGRAM_DONE;
+       }
+
+       *dgramp = dgram;
+       RETURN(rc);
+
+probe_for_out:
+
+       kgnilnd_release_dgram(dev, dgram);
+       RETURN(rc);
+}
+
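+/* post kgn_nwildcard wildcard (LNET_NID_ANY) datagrams so that incoming
+ * connection requests from peers have something to match */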
+int
+kgnilnd_setup_wildcard_dgram(kgn_device_t *dev)
+{
+       /* if kgn_nwildcard is zero, return error */
+       int     rc = -ENOENT, i;
+       ENTRY;
+
+       for (i = 0; i < *kgnilnd_tunables.kgn_nwildcard; i++) {
+               rc = kgnilnd_post_dgram(dev, LNET_NID_ANY, GNILND_CONNREQ_REQ, 0);
+               if (rc < 0) {
+                       CERROR("error %d: could not post wildcard datagram # %d\n",
+                               rc, i);
+                       rc = -EINVAL;
+                       GOTO(failed, rc);
+               }
+       }
+
+failed:
+       RETURN(rc);
+}
+
+int
+kgnilnd_cancel_net_dgrams(kgn_net_t *net)
+{
+       kgn_dgram_t            *dg, *dgN;
+       struct list_head        zombies;
+       int                     i;
+       ENTRY;
+
+       /* we want to cancel any outstanding dgrams - we don't want to rely
+        * on del_peer_or_conn catching all of them. This helps protect us in cases
+        * where we don't quite keep the peer->dgram mapping in sync due to some
+        * race conditions */
+
+       LASSERTF(net->gnn_shutdown || kgnilnd_data.kgn_in_reset,
+                "called with LND invalid state: net shutdown %d "
+                "in reset %d\n", net->gnn_shutdown,
+                kgnilnd_data.kgn_in_reset);
+
+       INIT_LIST_HEAD(&zombies);
+
+       spin_lock(&net->gnn_dev->gnd_dgram_lock);
+
+       for (i = 0; i < *kgnilnd_tunables.kgn_peer_hash_size; i++) {
+               list_for_each_entry_safe(dg, dgN, &net->gnn_dev->gnd_dgrams[i], gndg_list) {
+
+                       /* skip nids that are not on our net or that are wildcards */
+                       if (dg->gndg_type == GNILND_DGRAM_WC_REQ ||
+                               net->gnn_netnum != LNET_NETNUM(LNET_NIDNET(dg->gndg_conn_out.gncr_dstnid)))
+                               continue;
+
+                       kgnilnd_cancel_dgram_locked(dg);
+               }
+       }
+
+       spin_unlock(&net->gnn_dev->gnd_dgram_lock);
+
+       RETURN(0);
+}
+
+int
+kgnilnd_cancel_wc_dgrams(kgn_device_t *dev)
+{
+       kgn_dgram_t *dg, *dgN;
+       struct list_head zombies;
+       ENTRY;
+
+       /* Time to kill the outstanding WC's
+        * WC's exist on net 0 only but match on any net...
+        */
+
+       LASSERTF(kgnilnd_data.kgn_in_reset || kgnilnd_data.kgn_wc_kill,
+               "called with LND invalid state: WC shutdown %d "
+               "in reset %d\n", kgnilnd_data.kgn_wc_kill,
+               kgnilnd_data.kgn_in_reset);
+
+       INIT_LIST_HEAD(&zombies);
+       spin_lock(&dev->gnd_dgram_lock);
+
+       do {
+               dg = kgnilnd_find_dgram_locked(dev, LNET_NID_ANY);
+               if (dg != NULL) {
+                       LASSERTF(dg->gndg_type == GNILND_DGRAM_WC_REQ,
+                                "dgram 0x%p->%s with bad type %d (%s)\n",
+                               dg, libcfs_nid2str(dg->gndg_conn_out.gncr_dstnid),
+                               dg->gndg_type, kgnilnd_dgram_type2str(dg));
+
+                       kgnilnd_cancel_dgram_locked(dg);
+
+                       /* WC could be DONE already, check and if so add to list to be released */
+                       if (dg->gndg_state == GNILND_DGRAM_DONE) {
+                               list_del_init(&dg->gndg_list);
+                               list_add_tail(&dg->gndg_list, &zombies);
+                       }
+               }
+       } while (dg != NULL);
+
+       spin_unlock(&dev->gnd_dgram_lock);
+
+       list_for_each_entry_safe(dg, dgN, &zombies, gndg_list) {
+               list_del_init(&dg->gndg_list);
+               kgnilnd_release_dgram(dev, dg);
+       }
+       RETURN(0);
+}
+
+void
+kgnilnd_wait_for_canceled_dgrams(kgn_device_t *dev)
+{
+       int             i = 4;
+       int             rc;
+       gni_return_t    grc;
+       __u64           readyid;
+       kgn_dgram_t    *dgram;
+
+       /* use do while to get at least one check run to allow
+        * regression test for 762072 to hit bug if there */
+
+       /* This function races with the dgram mover during shutdown so it is possible for
+        * a dgram to be seen in kgnilnd_postdata_probe_wait_by_id but be handled in the
+        * dgram mover thread instead of inside of this function.
+        */
+
+       /* This should only be called from within shutdown, baseshutdown, or stack reset.
+        * There are no assertions here to verify that, since by the time base_shutdown
+        * runs the net is already gone and there is nothing left to check.
+        */
+
+       do {
+               i++;
+               CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET,
+                       "Waiting for %d canceled datagrams to clear on device %d\n",
+                       atomic_read(&dev->gnd_canceled_dgrams), dev->gnd_id);
+
+               /* check once a second */
+               grc = kgnilnd_postdata_probe_wait_by_id(dev->gnd_handle,
+                      250, &readyid);
+
+               if (grc != GNI_RC_SUCCESS)
+                       continue;
+
+               CDEBUG(D_NET, "ready "LPX64" on device %d->0x%p\n",
+                       readyid, dev->gnd_id, dev);
+
+               rc = kgnilnd_probe_for_dgram(dev, &dgram);
+               if (rc != 0) {
+                       /* if we got a valid dgram or one that is now done, clean up */
+                       kgnilnd_release_dgram(dev, dgram);
+               }
+       } while (atomic_read(&dev->gnd_canceled_dgrams));
+}
+
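+/* Initiate an active connect: move the peer from CONNECT to POSTING, post a
+ * REQ datagram to its NID, then mark it POSTED (or KILL if it was unlinked
+ * while we were posting). A positive return tells the caller not to attempt
+ * any dgram cleanup. */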
+int
+kgnilnd_start_connect(kgn_peer_t *peer)
+{
+       int              rc = 0;
+       /* sync point for kgnilnd_del_peer_locked - do an early check to
+        * catch the most common hits where del_peer is done by the
+        * time we get here */
+       if (CFS_FAIL_CHECK(CFS_FAIL_GNI_GNP_CONNECTING1)) {
+               while (CFS_FAIL_TIMEOUT(CFS_FAIL_GNI_GNP_CONNECTING1, 1)) {};
+       }
+
+       write_lock(&kgnilnd_data.kgn_peer_conn_lock);
+       if (!kgnilnd_peer_active(peer) || peer->gnp_connecting != GNILND_PEER_CONNECT) {
+               /* raced with peer getting unlinked */
+               write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+               rc = ESTALE;
+               GOTO(out, rc);
+       }
+       peer->gnp_connecting = GNILND_PEER_POSTING;
+       write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+
+       set_mb(peer->gnp_last_dgram_time, jiffies);
+       if (CFS_FAIL_CHECK(CFS_FAIL_GNI_GNP_CONNECTING2)) {
+               while (CFS_FAIL_TIMEOUT(CFS_FAIL_GNI_GNP_CONNECTING2, 1)) {};
+       }
+
+       if (CFS_FAIL_CHECK(CFS_FAIL_GNI_GNP_CONNECTING3)) {
+               while (CFS_FAIL_TIMEOUT(CFS_FAIL_GNI_GNP_CONNECTING3, 1)) {};
+               rc = cfs_fail_val ? cfs_fail_val : -ENOMEM;
+       } else {
+               rc = kgnilnd_post_dgram(peer->gnp_net->gnn_dev,
+                                       peer->gnp_nid, GNILND_CONNREQ_REQ, 0);
+       }
+       if (rc < 0) {
+               set_mb(peer->gnp_last_dgram_errno, rc);
+               GOTO(failed, rc);
+       }
+
+       /* while we're posting someone could have decided this peer/dgram needed to
+        * die a quick death, so we check for state change and process accordingly */
+
+       write_lock(&kgnilnd_data.kgn_peer_conn_lock);
+       if (!kgnilnd_peer_active(peer) || peer->gnp_connecting == GNILND_PEER_NEEDS_DEATH) {
+               if (peer->gnp_connecting == GNILND_PEER_NEEDS_DEATH) {
+                       peer->gnp_connecting = GNILND_PEER_KILL;
+               }
+               write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+               /* positive RC to avoid dgram cleanup - we'll have to
+                * wait for the kgni GNI_POST_TERMINATED event to
+                * finish cleaning up */
+               rc = ESTALE;
+               kgnilnd_find_and_cancel_dgram(peer->gnp_net->gnn_dev, peer->gnp_nid);
+               GOTO(out, rc);
+       }
+       peer->gnp_connecting = GNILND_PEER_POSTED;
+       write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+       /* reaper thread will take care of any timeouts */
+       CDEBUG(D_NET, "waiting for connect to finish to %s rc %d\n",
+              libcfs_nid2str(peer->gnp_nid), rc);
+
+       RETURN(rc);
+
+failed:
+       CDEBUG(D_NET, "connect to %s failed: rc %d \n",
+              libcfs_nid2str(peer->gnp_nid), rc);
+out:
+       RETURN(rc);
+}
+
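+/* A connreq REQ completed: find or create the peer for the source NID, drop
+ * duplicate and stale connections, install the new conn in the peer and CQID
+ * tables, kick off a NOOP plus any TX queued waiting for a connection, and
+ * notify LNET that the peer is alive. */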
+int
+kgnilnd_finish_connect(kgn_dgram_t *dgram)
+{
+       kgn_conn_t        *conn = dgram->gndg_conn;
+       lnet_nid_t         her_nid = dgram->gndg_conn_in.gncr_srcnid;
+       kgn_peer_t        *new_peer, *peer = NULL;
+       kgn_tx_t          *tx;
+       kgn_tx_t          *txn;
+       kgn_mbox_info_t   *mbox;
+       int                rc;
+       int                nstale;
+
+       /* try to find a peer that matches the nid we got in the connreq
+        * kgnilnd_unpack_connreq makes sure that conn_in.gncr_srcnid is
+        * HER and conn_out.gncr_srcnid is ME for both active and WC dgrams */
+
+       /* assume this is a new peer  - it makes locking cleaner when it isn't */
+       /* no holding kgn_net_rw_sem - already are at the kgnilnd_dgram_mover level */
+
+       rc = kgnilnd_create_peer_safe(&new_peer, her_nid, NULL);
+       if (rc != 0) {
+               CERROR("Can't create peer for %s\n", libcfs_nid2str(her_nid));
+               return rc;
+       }
+
+       write_lock(&kgnilnd_data.kgn_peer_conn_lock);
+
+       /* this transfers ref from create_peer to the kgn_peer table */
+       kgnilnd_add_peer_locked(her_nid, new_peer, &peer);
+
+       /* if we found an existing peer, is it really ready for a new conn ? */
+       if (peer != new_peer) {
+               /* if this was an active connect attempt but we can't find a peer waiting for it
+                * we will dump in the trash */
+
+               if (peer->gnp_connecting == GNILND_PEER_IDLE && dgram->gndg_conn_out.gncr_dstnid != LNET_NID_ANY) {
+                       CDEBUG(D_NET, "dropping completed connreq for %s peer 0x%p->%s\n",
+                              libcfs_nid2str(her_nid), peer, libcfs_nid2str(peer->gnp_nid));
+                       write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+                       rc = ECANCELED;
+                       GOTO(out, rc);
+               }
+
+               /* check to see if we can catch a connecting peer before it is
+                * removed from the connd_peers list - if not, we need to
+                * let the connreqs race and be handled by kgnilnd_conn_isdup_locked() */
+               if (peer->gnp_connecting != GNILND_PEER_IDLE) {
+                       spin_lock(&peer->gnp_net->gnn_dev->gnd_connd_lock);
+                       if (!list_empty(&peer->gnp_connd_list)) {
+                               list_del_init(&peer->gnp_connd_list);
+                               /* drop connd ref */
+                               kgnilnd_peer_decref(peer);
+                       }
+                       spin_unlock(&peer->gnp_net->gnn_dev->gnd_connd_lock);
+                       /* clear rc to make sure we don't have fake error */
+                       rc = 0;
+               }
+
+               /* no matter what, we are no longer waiting to connect this peer now */
+               peer->gnp_connecting = GNILND_PEER_IDLE;
+
+               /* Refuse to duplicate an existing connection (both sides might try to
+                * connect at once).  NB we return success!  We _are_ connected so we
+                * _don't_ have any blocked txs to complete with failure. */
+               rc = kgnilnd_conn_isdup_locked(peer, conn);
+               if (rc != 0) {
+                       write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+                       CDEBUG(D_NET, "Not creating duplicate connection to %s: %d\n",
+                             libcfs_nid2str(her_nid), rc);
+                       rc = EALREADY;
+                       GOTO(out, rc);
+               }
+       }
+
+       nstale = kgnilnd_close_stale_conns_locked(peer, conn);
+
+       /* either way with peer (new or existing), we are ok with ref counts here as the
+        * kgnilnd_add_peer_locked will use our ref on new_peer (from create_peer_safe) as the
+        * ref for the peer table. */
+
+       /* at this point, the connection request is a winner */
+
+       /* mark 'DONE' to avoid cancel being called from release */
+       dgram->gndg_state = GNILND_DGRAM_DONE;
+
+       /* initialise timestamps before reaper looks at them */
+       conn->gnc_last_rx = conn->gnc_last_rx_cq = jiffies;
+
+       /* last_tx is initialized to jiffies - (keepalive*2) so that if the NOOP fails it will
+        * immediately send a NOOP in the reaper thread during the call to
+        * kgnilnd_check_conn_timeouts_locked
+        */
+       conn->gnc_last_tx = jiffies - (cfs_time_seconds(GNILND_TO2KA(conn->gnc_timeout)) * 2);
+       conn->gnc_state = GNILND_CONN_ESTABLISHED;
+
+       /* refs are not transferred from dgram to tables, so increment to
+        * take ownership */
+       kgnilnd_conn_addref(conn);
+       kgnilnd_peer_addref(peer);
+       conn->gnc_peer = peer;
+       list_add_tail(&conn->gnc_list, &peer->gnp_conns);
+
+       kgnilnd_conn_addref(conn);               /* +1 ref for conn table */
+       list_add_tail(&conn->gnc_hashlist,
+                     kgnilnd_cqid2connlist(conn->gnc_cqid));
+       kgnilnd_data.kgn_conn_version++;
+
+       /* Don't send NOOP if fail_loc is set
+        */
+       if (!CFS_FAIL_CHECK(CFS_FAIL_GNI_ONLY_NOOP)) {
+               tx = kgnilnd_new_tx_msg(GNILND_MSG_NOOP, peer->gnp_net->gnn_ni->ni_nid);
+               if (tx == NULL) {
+                       CNETERR("can't get TX to initiate NOOP to %s\n",
+                               libcfs_nid2str(peer->gnp_nid));
+               } else {
+                       kgnilnd_queue_tx(conn, tx);
+               }
+       }
+
+       /* Schedule all packets blocking for a connection */
+       list_for_each_entry_safe(tx, txn, &peer->gnp_tx_queue, tx_list) {
+               /* lock held here is the peer_conn lock */
+               kgnilnd_tx_del_state_locked(tx, peer, NULL, GNILND_TX_ALLOCD);
+               kgnilnd_queue_tx(conn, tx);
+       }
+
+       /* If this is an active connection lets mark its timestamp on the MBoX */
+       if (dgram->gndg_conn_out.gncr_dstnid != LNET_NID_ANY) {
+               mbox = &conn->gnc_fma_blk->gnm_mbox_info[conn->gnc_mbox_id];
+               /* conn->gnc_last_rx was just set to jiffies above, so it is valid here */
+               mbox->mbx_release_purg_active_dgram = conn->gnc_last_rx;
+       }
+
+       /* Bug 765042: wake up scheduler for a race with finish_connect and
+        * complete_conn_closed with a conn in purgatory
+        * since we can't use CFS_RACE due to mutex_holds in kgnilnd_process_conns,
+        * we just check for set and then clear */
+       if (CFS_FAIL_CHECK(CFS_FAIL_GNI_FINISH_PURG)) {
+               cfs_fail_loc = 0x0;
+               /* get scheduler thread moving again */
+               kgnilnd_schedule_device(conn->gnc_device);
+       }
+
+       CDEBUG(D_NET, "New conn 0x%p->%s dev %d\n",
+              conn, libcfs_nid2str(her_nid), conn->gnc_device->gnd_id);
+
+       /* make sure we reset peer reconnect interval now that we have a good conn */
+       kgnilnd_peer_alive(peer);
+       peer->gnp_reconnect_interval = 0;
+
+       /* clear the unlink attribute - if we don't clear it, kgnilnd_del_conn_or_peer will wait
+        * on the atomic forever
+        */
+       if (peer->gnp_pending_unlink) {
+               peer->gnp_pending_unlink = 0;
+               kgnilnd_admin_decref(kgnilnd_data.kgn_npending_unlink);
+               CDEBUG(D_NET, "Clearing peer unlink %p\n", peer);
+       }
+
+       /* add ref to make it hang around until after we drop the lock */
+       kgnilnd_conn_addref(conn);
+
+       /* Once the peer_conn lock is dropped, the conn could actually move into
+        * CLOSING->CLOSED->DONE in the scheduler thread, so hold the
+        * lock until we are really done */
+       write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+
+       /* Notify LNET that we now have a working connection to this peer.
+        * This is a Cray extension to the "standard" LND behavior. */
+       lnet_notify(peer->gnp_net->gnn_ni, peer->gnp_nid,
+                    1, cfs_time_current());
+
+       /* schedule the conn to pick up any SMSG sent by peer before we could
+        * process this dgram */
+       kgnilnd_schedule_conn(conn);
+
+       /* drop our 'hold' ref */
+       kgnilnd_conn_decref(conn);
+
+out:
+       RETURN(rc);
+}
+
+void
+kgnilnd_send_nak(kgn_device_t *dev, lnet_nid_t dst_nid, int error)
+{
+       int              rc = 0;
+       ENTRY;
+
+       LASSERTF(dst_nid != LNET_NID_ANY, "bad dst_nid %s\n", libcfs_nid2str(dst_nid));
+
+       CDEBUG(D_NET, "NAK to %s errno %d\n", libcfs_nid2str(dst_nid), error);
+
+       rc = kgnilnd_post_dgram(dev, dst_nid, GNILND_CONNREQ_NAK, error);
+
+       if (rc < 0) {
+               CDEBUG(D_NET, "NAK to %s failed: rc %d \n", libcfs_nid2str(dst_nid), rc);
+       }
+       EXIT;
+}
+
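+/* Handle a NAK from a peer. If we are no longer connecting to it, close any
+ * stale conns matching the NAKed stamps; otherwise cancel the pending connect
+ * datagram, or drop the NAK if a newer connect attempt is already queued. */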
+int
+kgnilnd_process_nak(kgn_dgram_t *dgram)
+{
+       kgn_connreq_t     *connreq = &dgram->gndg_conn_in;
+       lnet_nid_t         src_nid = connreq->gncr_srcnid;
+       int                errno = connreq->gncr_nakdata.gnnd_errno;
+       kgn_peer_t        *peer;
+       int                rc = 0;
+
+       write_lock(&kgnilnd_data.kgn_peer_conn_lock);
+
+       peer = kgnilnd_find_peer_locked(src_nid);
+       if (peer == NULL) {
+               /* we likely dropped him from bad data when we processed
+                * the original REQ */
+               write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+               return -EBADSLT;
+       }
+
+       /* need to check peerstamp/connstamp against the ones we find
+        * to make sure we don't close new (and good?) conns that we
+        * formed after this connreq failed */
+       if (peer->gnp_connecting == GNILND_PEER_IDLE) {
+               kgn_conn_t        conn;
+
+               if (list_empty(&peer->gnp_conns)) {
+                       /* assume we already processed the datagram and it barfed up
+                        * on this side too */
+                       CDEBUG(D_NET, "dropping NAK from %s; "
+                              "peer %s is already not connected\n",
+                               libcfs_nid2str(connreq->gncr_srcnid),
+                               libcfs_nid2str(connreq->gncr_dstnid));
+                       write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+                       return 0;
+               }
+
+               /* stub up a connection with the connreq XXX_stamps to allow
+                * us to use close_stale_conns_locked */
+               conn.gnc_peerstamp = connreq->gncr_peerstamp;
+               conn.gnc_my_connstamp = connreq->gncr_connstamp;
+               conn.gnc_peer_connstamp = connreq->gncr_connstamp;
+               conn.gnc_device = peer->gnp_net->gnn_dev;
+
+               rc = kgnilnd_close_stale_conns_locked(peer, &conn);
+
+               LCONSOLE_INFO("Received NAK from %s for %s errno %d; "
+                       "closed %d connections\n",
+                       libcfs_nid2str(connreq->gncr_srcnid),
+                       libcfs_nid2str(connreq->gncr_dstnid), errno, rc);
+       } else {
+               rc = 0;
+               spin_lock(&dgram->gndg_conn->gnc_device->gnd_connd_lock);
+
+               if (list_empty(&peer->gnp_connd_list)) {
+                       /* if peer isn't on waiting list, try to find one to nuke */
+                       rc = kgnilnd_find_and_cancel_dgram(peer->gnp_net->gnn_dev,
+                                                          peer->gnp_nid);
+
+                       if (rc) {
+                               LCONSOLE_INFO("Received NAK from %s for %s errno %d; "
+                                       "canceled pending connect request\n",
+                                       libcfs_nid2str(connreq->gncr_srcnid),
+                                       libcfs_nid2str(connreq->gncr_dstnid), errno);
+                       }
+
+                       /* if we can't find a waiting dgram, we just drop the nak - the conn
+                        * connect must have failed (didn't find conn above and clear connecting
+                        * -- so nothing to do besides drop */
+               } else {
+                       /* peer is on list, meaning it is a new connect attempt from the one
+                        * we started that generated the NAK - so just drop NAK */
+
+                       /* use negative to prevent error message */
+                       rc = -EAGAIN;
+               }
+               spin_unlock(&dgram->gndg_conn->gnc_device->gnd_connd_lock);
+       }
+
+       /* success! we found a peer and at least marked pending_nak */
+       write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+
+       return 0;
+}
+
+int
+kgnilnd_process_connreq(kgn_dgram_t *dgram, int *needs_nak)
+{
+       int                      rc;
+
+       rc = kgnilnd_unpack_connreq(dgram);
+       if (rc < 0) {
+               if (rc != -EBADF) {
+                       /* only NAK if we have good srcnid to use */
+                       *needs_nak = 1;
+               }
+               goto connreq_out;
+       }
+
+       switch (dgram->gndg_conn_in.gncr_type) {
+       case GNILND_CONNREQ_REQ:
+               /* wire up peer & conn, send queued TX */
+               rc = kgnilnd_finish_connect(dgram);
+
+               /* don't nak when the nid is hosed */
+               if (rc < 0) {
+                       *needs_nak = 1;
+               }
+
+               break;
+       case GNILND_CONNREQ_NAK:
+               rc = kgnilnd_process_nak(dgram);
+               /* return early to prevent reconnect bump */
+               return rc;
+       default:
+               CERROR("unexpected connreq type %s (%d) from %s\n",
+                       kgnilnd_connreq_type2str(&dgram->gndg_conn_in),
+                       dgram->gndg_conn_in.gncr_type,
+                       libcfs_nid2str(dgram->gndg_conn_in.gncr_srcnid));
+               rc = -EINVAL;
+               *needs_nak = 1;
+               break;
+       }
+
+connreq_out:
+       RETURN(rc);
+}
+
+int
+kgnilnd_probe_and_process_dgram(kgn_device_t *dev)
+{
+       int                      rc;
+       int                      needs_nak = 0;
+       lnet_nid_t               nak_dstnid = LNET_NID_ANY;
+       lnet_nid_t               orig_dstnid;
+       kgn_dgram_t             *dgram = NULL;
+       kgn_peer_t              *peer;
+       ENTRY;
+
+       if (CFS_FAIL_CHECK(CFS_FAIL_GNI_PAUSE_DGRAM_COMP)) {
+               rc = 0;
+       } else {
+               rc = kgnilnd_probe_for_dgram(dev, &dgram);
+       }
+
+       if (rc == 0) {
+               RETURN(0);
+       } else if (rc < 0) {
+               GOTO(inform_peer, rc);
+       } else {
+               /* positive rc means it did something, reset for this func */
+               rc = 0;
+       }
+
+       switch (dgram->gndg_type) {
+       case GNILND_DGRAM_WC_REQ:
+       case GNILND_DGRAM_REQ:
+               rc = kgnilnd_process_connreq(dgram, &needs_nak);
+               break;
+       case GNILND_DGRAM_NAK:
+               CDEBUG(D_NETTRACE, "NAK to %s done\n",
+                       libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid));
+               break;
+       default:
+               CERROR("unknown datagram type %s (%d)\n",
+                      kgnilnd_dgram_type2str(dgram), dgram->gndg_type);
+               break;
+       }
+
+       /* stash data to use after releasing current datagram */
+       /* don't stash net - we are operating on a net already,
+        * so the lock on rw_net_lock is sufficient */
+
+       nak_dstnid = dgram->gndg_conn_in.gncr_srcnid;
+
+inform_peer:
+       LASSERTF(dgram != NULL, "dgram 0x%p rc %d needs_nak %d\n", dgram, rc, needs_nak);
+
+       orig_dstnid = dgram->gndg_conn_out.gncr_dstnid;
+
+       kgnilnd_release_dgram(dev, dgram);
+
+       CDEBUG(D_NET, "cleaning up dgram to %s, rc %d\n",
+              libcfs_nid2str(orig_dstnid), rc);
+
+       /* if this was a WC_REQ that matched an existing peer, it'll get marked done
+        * in kgnilnd_finish_connect - if errors are from before we get to there,
+        * we just drop as it is a WC_REQ - the peer CAN'T be waiting for it */
+       if ((orig_dstnid != LNET_NID_ANY) && (rc < 0)) {
+               /* if we have a negative rc, we want to find a peer to inform about
+                * the bad connection attempt. Sorry buddy, better luck next time! */
+
+               write_lock(&kgnilnd_data.kgn_peer_conn_lock);
+               peer = kgnilnd_find_peer_locked(orig_dstnid);
+
+               if (peer != NULL) {
+                       /* add ref to make sure he stays around past the possible unlink
+                        * so we can tell LNet about him */
+                       kgnilnd_peer_addref(peer);
+
+                       /* if he still cares about the outstanding connect */
+                       if (peer->gnp_connecting >= GNILND_PEER_CONNECT) {
+                               /* check if he is on the connd list and remove.. */
+                               spin_lock(&peer->gnp_net->gnn_dev->gnd_connd_lock);
+                               if (!list_empty(&peer->gnp_connd_list)) {
+                                       list_del_init(&peer->gnp_connd_list);
+                                       /* drop connd ref */
+                                       kgnilnd_peer_decref(peer);
+                               }
+                               spin_unlock(&peer->gnp_net->gnn_dev->gnd_connd_lock);
+
+                               /* clear gnp_connecting so we don't have a non-connecting peer
+                                * on gnd_connd_list */
+                               peer->gnp_connecting = GNILND_PEER_IDLE;
+
+                               set_mb(peer->gnp_last_dgram_errno, rc);
+
+                               kgnilnd_peer_increase_reconnect_locked(peer);
+                       }
+               }
+               write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+
+               /* now that we are outside the lock, tell Mommy */
+               if (peer != NULL) {
+                       kgnilnd_peer_notify(peer, rc);
+                       kgnilnd_peer_decref(peer);
+               }
+       }
+
+       if (needs_nak) {
+               kgnilnd_send_nak(dev, nak_dstnid, rc);
+       }
+
+       RETURN(1);
+}
+
+void
+kgnilnd_reaper_dgram_check(kgn_device_t *dev)
+{
+       kgn_dgram_t    *dgram, *tmp;
+       int             i;
+
+       spin_lock(&dev->gnd_dgram_lock);
+
+       for (i = 0; i < (*kgnilnd_tunables.kgn_peer_hash_size - 1); i++) {
+               list_for_each_entry_safe(dgram, tmp, &dev->gnd_dgrams[i], gndg_list) {
+                       unsigned long            now = jiffies;
+                       unsigned long            timeout;
+
+                       /* don't timeout stuff if the network is mucked or shutting down */
+                       if (kgnilnd_check_hw_quiesce()) {
+                               break;
+                       }
+
+                       if ((dgram->gndg_state != GNILND_DGRAM_POSTED) ||
+                           (dgram->gndg_type == GNILND_DGRAM_WC_REQ)) {
+                               continue;
+                       }
+                       CDEBUG(D_NETTRACE, "checking dgram 0x%p type %s "
+                               "state %s conn 0x%p to %s age %lus\n",
+                               dgram, kgnilnd_dgram_type2str(dgram),
+                               kgnilnd_dgram_state2str(dgram), dgram->gndg_conn,
+                               libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid),
+                               cfs_duration_sec(now - dgram->gndg_post_time));
+
+                       timeout = cfs_time_seconds(*kgnilnd_tunables.kgn_timeout);
+
+                       if (time_before(now, (dgram->gndg_post_time + timeout)))
+                               continue;
+
+                       CNETERR("%s datagram to %s timed out @ %lus dgram "
+                               "0x%p state %s conn 0x%p\n",
+                               kgnilnd_dgram_type2str(dgram),
+                               libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid),
+                               cfs_duration_sec(now - dgram->gndg_post_time),
+                               dgram, kgnilnd_dgram_state2str(dgram),
+                               dgram->gndg_conn);
+
+                       kgnilnd_cancel_dgram_locked(dgram);
+               }
+       }
+       spin_unlock(&dev->gnd_dgram_lock);
+}
+
+
+/* use a thread for the possibly long-blocking wait_by_id to prevent
+ * stalling the global workqueues */
+int
+kgnilnd_dgram_waitq(void *arg)
+{
+       kgn_device_t     *dev = (kgn_device_t *) arg;
+       char              name[16];
+       gni_return_t      grc;
+       __u64             readyid;
+       DEFINE_WAIT(mover_done);
+
+       snprintf(name, sizeof(name), "kgnilnd_dgn_%02d", dev->gnd_id);
+       cfs_daemonize(name);
+       cfs_block_allsigs();
+
+       /* all gnilnd threads need to run fairly urgently */
+       set_user_nice(current, *kgnilnd_tunables.kgn_nice);
+
+       /* we don't shut down until the device shuts down ... */
+       while (!kgnilnd_data.kgn_shutdown) {
+               /* to quiesce or to not quiesce, that is the question */
+               if (unlikely(kgnilnd_data.kgn_quiesce_trigger)) {
+                       KGNILND_SPIN_QUIESCE;
+               }
+
+               while (CFS_FAIL_TIMEOUT(CFS_FAIL_GNI_PAUSE_DGRAM_COMP, 1)) {}
+
+               /* check once a second */
+               grc = kgnilnd_postdata_probe_wait_by_id(dev->gnd_handle,
+                                                      1000, &readyid);
+
+               if (grc == GNI_RC_SUCCESS) {
+                       CDEBUG(D_INFO, "waking up dgram mover thread\n");
+                       kgnilnd_schedule_dgram(dev);
+
+                       /* wait for dgram thread to ping us before spinning again */
+                       prepare_to_wait(&dev->gnd_dgping_waitq, &mover_done,
+                                       TASK_INTERRUPTIBLE);
+
+                       /* don't sleep if we need to quiesce */
+                       if (likely(!kgnilnd_data.kgn_quiesce_trigger)) {
+                               schedule();
+                       }
+                       finish_wait(&dev->gnd_dgping_waitq, &mover_done);
+               }
+       }
+
+       kgnilnd_thread_fini();
+       return 0;
+}
+
+int
+kgnilnd_start_outbound_dgrams(kgn_device_t *dev)
+{
+       int                      did_something = 0, rc;
+       kgn_peer_t              *peer = NULL;
+
+       spin_lock(&dev->gnd_connd_lock);
+
+       /* Active connect - we added this in kgnilnd_launch_tx */
+       while (!list_empty(&dev->gnd_connd_peers)) {
+               peer = list_first_entry(&dev->gnd_connd_peers,
+                                       kgn_peer_t, gnp_connd_list);
+
+               /* ref for connd removed in if/else below */
+               list_del_init(&peer->gnp_connd_list);
+
+               /* gnp_connecting and membership on gnd_connd_peers should be
+                * done coherently to avoid double adding, etc */
+               /* don't need kgnilnd_data.kgn_peer_conn_lock here as that is only needed
+                * to get the peer to gnp_connecting in the first place. We just need to
+                * rely on gnd_connd_lock to serialize someone pulling him from the list
+                * BEFORE clearing gnp_connecting */
+               LASSERTF(peer->gnp_connecting != GNILND_PEER_IDLE, "peer 0x%p->%s not connecting\n",
+                        peer, libcfs_nid2str(peer->gnp_nid));
+
+               spin_unlock(&dev->gnd_connd_lock);
+
+               CDEBUG(D_NET, "processing connect to %s\n",
+                      libcfs_nid2str(peer->gnp_nid));
+
+               did_something += 1;
+               rc = kgnilnd_start_connect(peer);
+
+               if (likely(rc >= 0)) {
+                       /* 0 on success, positive on 'just drop peer' errors */
+                       kgnilnd_peer_decref(peer);
+               } else if (rc == -ENOMEM) {
+                       /* if we are out of wildcards, add back to
+                        * connd_list - then break out and we'll try again later;
+                        * if other errors, we'll bail & cancel pending tx */
+                       write_lock(&kgnilnd_data.kgn_peer_conn_lock);
+                       if (peer->gnp_connecting == GNILND_PEER_POSTING) {
+                               peer->gnp_connecting = GNILND_PEER_CONNECT;
+                               spin_lock(&dev->gnd_connd_lock);
+                               list_add_tail(&peer->gnp_connd_list,
+                                             &dev->gnd_connd_peers);
+                       } else {
+                               /* connecting changed while we were posting */
+
+                               LASSERTF(peer->gnp_connecting == GNILND_PEER_NEEDS_DEATH, "Peer is in invalid"
+                                       " state 0x%p->%s, connecting %d\n",
+                                       peer, libcfs_nid2str(peer->gnp_nid), peer->gnp_connecting);
+                               peer->gnp_connecting = GNILND_PEER_KILL;
+                               spin_lock(&dev->gnd_connd_lock);
+                               /* remove the peer ref from the connd list */
+                               kgnilnd_peer_decref(peer);
+                               /* let the system handle itself */
+                       }
+                       write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+                       /* the datagrams are a global pool,
+                        * so break out of trying and hope some free
+                        * up soon */
+                       did_something -= 1;
+                       break;
+               } else {
+                       /* something bad happened, you lose */
+                       CNETERR("could not start connecting to %s "
+                               "rc %d: Will retry until TX timeout\n",
+                              libcfs_nid2str(peer->gnp_nid), rc);
+                       /* It didn't post, so just set connecting back to idle now.
+                        * The reaper will reattempt the connection if it needs to.
+                        * If the peer needs death, set it so the reaper will clean up.
+                        */
+                       write_lock(&kgnilnd_data.kgn_peer_conn_lock);
+                       if (peer->gnp_connecting == GNILND_PEER_POSTING) {
+                               peer->gnp_connecting = GNILND_PEER_IDLE;
+                               kgnilnd_peer_increase_reconnect_locked(peer);
+                       } else {
+                               LASSERTF(peer->gnp_connecting == GNILND_PEER_NEEDS_DEATH, "Peer is in invalid"
+                                       " state 0x%p->%s, connecting %d\n",
+                                       peer, libcfs_nid2str(peer->gnp_nid), peer->gnp_connecting);
+                               peer->gnp_connecting = GNILND_PEER_KILL;
+                       }
+                       write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+
+                       /* hold onto ref until we are really done - if it was
+                        * unlinked this could result in a destroy */
+                       kgnilnd_peer_decref(peer);
+               }
+               spin_lock(&dev->gnd_connd_lock);
+       }
+
+       spin_unlock(&dev->gnd_connd_lock);
+       RETURN(did_something);
+}
+
+static void
+kgnilnd_dgram_poke_with_stick(unsigned long arg)
+{
+       int             dev_id = arg;
+       kgn_device_t    *dev = &kgnilnd_data.kgn_devices[dev_id];
+
+       wake_up(&dev->gnd_dgram_waitq);
+}
+
+/* use single thread for dgrams - should be sufficient for performance */
+int
+kgnilnd_dgram_mover(void *arg)
+{
+       kgn_device_t            *dev = (kgn_device_t *)arg;
+       char                     name[16];
+       int                      rc, did_something;
+       unsigned long            next_purge_check = jiffies - 1;
+       unsigned long            timeout;
+       struct timer_list        timer;
+       DEFINE_WAIT(wait);
+
+       snprintf(name, sizeof(name), "kgnilnd_dg_%02d", dev->gnd_id);
+       cfs_daemonize(name);
+       cfs_block_allsigs();
+       /* all gnilnd threads need to run fairly urgently */
+       set_user_nice(current, *kgnilnd_tunables.kgn_nice);
+
+       /* we are ok not locking for these variables as the dgram waitq threads
+        * will block both due to tying up net (kgn_shutdown) and the completion
+        * event for the dgram_waitq (kgn_quiesce_trigger) */
+
+       while (!kgnilnd_data.kgn_shutdown) {
+               /* Safe: kgn_shutdown only set when quiescent */
+
+               /* race with stack reset - we want to hold off seeing any new incoming dgrams
+                * so we can force a dirty WC dgram for Bug 762072 - put right before
+                * quiesce check so that it'll go right into that and not do any
+                * dgram mucking */
+               CFS_RACE(CFS_FAIL_GNI_WC_DGRAM_FREE);
+
+               /* to quiesce or to not quiesce, that is the question */
+               if (unlikely(kgnilnd_data.kgn_quiesce_trigger)) {
+                       KGNILND_SPIN_QUIESCE;
+               }
+               did_something = 0;
+
+               CFS_RACE(CFS_FAIL_GNI_QUIESCE_RACE);
+
+               /* process any newly completed dgrams */
+               down_read(&kgnilnd_data.kgn_net_rw_sem);
+
+               rc = kgnilnd_probe_and_process_dgram(dev);
+               if (rc > 0) {
+                       did_something += rc;
+               }
+
+               up_read(&kgnilnd_data.kgn_net_rw_sem);
+
+               /* start new outbound dgrams */
+               did_something += kgnilnd_start_outbound_dgrams(dev);
+
+               /* find dead dgrams */
+               if (time_after_eq(jiffies, next_purge_check)) {
+                       /* these don't need to be checked that often */
+                       kgnilnd_reaper_dgram_check(dev);
+
+                       next_purge_check = (long) jiffies +
+                                     cfs_time_seconds(kgnilnd_data.kgn_new_min_timeout / 4);
+               }
+
+               /* careful with the jiffy wrap... */
+               timeout = (long)(next_purge_check - jiffies);
+
+               CDEBUG(D_INFO, "did %d timeout %lu next %lu jiffies %lu\n",
+                      did_something, timeout, next_purge_check, jiffies);
+
+               if (did_something || timeout <= 0) {
+                       did_something = 0;
+                       continue;
+               }
+
+               prepare_to_wait(&dev->gnd_dgram_waitq, &wait, TASK_INTERRUPTIBLE);
+
+               setup_timer(&timer, kgnilnd_dgram_poke_with_stick, dev->gnd_id);
+               mod_timer(&timer, (long) jiffies + timeout);
+
+               /* last second chance for others to poke us */
+               did_something += xchg(&dev->gnd_dgram_ready, GNILND_DGRAM_IDLE);
+
+               /* check flag variables before committing */
+               if (!did_something &&
+                   !kgnilnd_data.kgn_shutdown &&
+                   !kgnilnd_data.kgn_quiesce_trigger) {
+                       CDEBUG(D_INFO, "schedule timeout %ld (%lu sec)\n",
+                              timeout, cfs_duration_sec(timeout));
+                       wake_up_all(&dev->gnd_dgping_waitq);
+                       schedule();
+                       CDEBUG(D_INFO, "awake after schedule\n");
+               }
+
+               del_singleshot_timer_sync(&timer);
+               finish_wait(&dev->gnd_dgram_waitq, &wait);
+       }
+
+       kgnilnd_thread_fini();
+       return 0;
+}
+
diff --git a/lnet/klnds/gnilnd/gnilnd_debug.c b/lnet/klnds/gnilnd/gnilnd_debug.c
new file mode 100644 (file)
index 0000000..8230d98
--- /dev/null
@@ -0,0 +1,108 @@
+/*
+ * Copyright (C) 2009-2012 Cray, Inc.
+ *
+ *   Author: Nic Henke <nic@cray.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+#include "gnilnd.h"
+
+void
+_kgnilnd_debug_msg(kgn_msg_t *msg, struct libcfs_debug_msg_data *msgdata,
+                  const char *fmt, ...)
+{
+       va_list args;
+
+       va_start(args, fmt);
+       /* XXX Nic TBD: add handling of gnm_u ? */
+       libcfs_debug_vmsg2(msgdata, fmt, args,
+                          " msg@0x%p m/v/ty/ck/pck/pl %08x/%d/%d/%x/%x/%d x%d:%s\n",
+                          msg, msg->gnm_magic, msg->gnm_version, msg->gnm_type,
+                          msg->gnm_cksum, msg->gnm_payload_cksum,
+                          msg->gnm_payload_len, msg->gnm_seq,
+                          kgnilnd_msgtype2str(msg->gnm_type));
+       va_end(args);
+}
+
+void
+_kgnilnd_debug_conn(kgn_conn_t *conn, struct libcfs_debug_msg_data *msgdata,
+                   const char *fmt, ...)
+{
+       va_list args;
+
+       va_start(args, fmt);
+       libcfs_debug_vmsg2(msgdata, fmt, args,
+               " conn@0x%p->%s:%s cq %u, to %ds, "
+               " RX %d @ %lu/%lus; TX %d @ %lus/%lus; "
+               " NOOP %lus/%lus/%lus; sched %lus/%lus/%lus ago\n",
+               conn, conn->gnc_peer ? libcfs_nid2str(conn->gnc_peer->gnp_nid) :
+               "<?>", kgnilnd_conn_state2str(conn),
+               conn->gnc_cqid, conn->gnc_timeout,
+               conn->gnc_rx_seq,
+               cfs_duration_sec(jiffies - conn->gnc_last_rx),
+               cfs_duration_sec(jiffies - conn->gnc_last_rx_cq),
+               conn->gnc_tx_seq,
+               cfs_duration_sec(jiffies - conn->gnc_last_tx),
+               cfs_duration_sec(jiffies - conn->gnc_last_tx_cq),
+               cfs_duration_sec(jiffies - conn->gnc_last_noop_want),
+               cfs_duration_sec(jiffies - conn->gnc_last_noop_sent),
+               cfs_duration_sec(jiffies - conn->gnc_last_noop_cq),
+               cfs_duration_sec(jiffies - conn->gnc_last_sched_ask),
+               cfs_duration_sec(jiffies - conn->gnc_last_sched_do),
+               cfs_duration_sec(jiffies - conn->gnc_device->gnd_sched_alive));
+
+
+       va_end(args);
+}
+
+void
+_kgnilnd_debug_tx(kgn_tx_t *tx, struct libcfs_debug_msg_data *msgdata,
+                 const char *fmt, ...)
+{
+       kgn_tx_ev_id_t  *id   = &tx->tx_id;
+       char            *nid = "<?>";
+       va_list          args;
+
+       if (tx->tx_conn && tx->tx_conn->gnc_peer) {
+               nid = libcfs_nid2str(tx->tx_conn->gnc_peer->gnp_nid);
+       }
+
+       va_start(args, fmt);
+       libcfs_debug_vmsg2(msgdata, fmt, args,
+               " tx@0x%p->%s id "LPX64"/%u/%d:%d msg %x/%s/%d q %s@%lds->0x%p f %x re %d\n",
+               tx, nid, id->txe_cookie, id->txe_smsg_id, id->txe_cqid,
+               id->txe_idx, tx->tx_msg.gnm_type,
+               kgnilnd_msgtype2str(tx->tx_msg.gnm_type), tx->tx_buftype,
+               kgnilnd_tx_state2str(tx->tx_list_state),
+               cfs_duration_sec((long)jiffies - tx->tx_qtime), tx->tx_list_p,
+               tx->tx_state, tx->tx_retrans);
+       va_end(args);
+}
+
+void
+_kgnilnd_api_rc_lbug(const char* rcstr, int rc, struct libcfs_debug_msg_data *msgdata,
+                       const char *fmt, ...)
+{
+       va_list args;
+
+       va_start(args, fmt);
+       libcfs_debug_vmsg2(msgdata, fmt, args,
+                          " GNI API violated? Unexpected rc %s(%d)!\n",
+                          rcstr, rc);
+       va_end(args);
+       LBUG();
+}
diff --git a/lnet/klnds/gnilnd/gnilnd_hss_ops.h b/lnet/klnds/gnilnd/gnilnd_hss_ops.h
new file mode 100644 (file)
index 0000000..ec75177
--- /dev/null
@@ -0,0 +1,284 @@
+/*
+ * Copyright (C) 2010-2012 Cray, Inc.
+ *   Author: Nic Henke <nic@cray.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+#ifndef _GNILND_HSS_OPS_H
+#define _GNILND_HSS_OPS_H
+
+/* for krca nid & nic translation */
+#include <krca_lib.h>
+#include <linux/typecheck.h>
+
+/* the SimNow nodes can't load rca.ko, so we need to detect this
+ * and fake a table that'd work for lookups there */
+
+typedef struct kgn_nid_entry {
+       __u32   nid;
+       __u32   nicaddr;
+} kgn_nid_entry_t;
+
+typedef struct kgn_hssops
+{
+       /* function pointers for nid and nic conversion */
+       /* from krca_lib.h */
+       int     (*nid_to_nicaddr)(__u32 nid, int numnic, __u32 *nicaddr);
+       int     (*nicaddr_to_nid)(__u32 nicaddr, __u32 *nid);
+       void    (*hb_to_l0)(void);
+} kgn_hssops_t;
+
+/* pull in static store in gnilnd.c */
+extern kgn_hssops_t             kgnilnd_hssops;
+
+#define GNILND_NO_RCA           0xdeadbeef
+#define GNILND_NO_QUIESCE       0xdeadbeef
+
+static inline int
+kgnilnd_lookup_rca_funcs(void)
+{
+        void    *funcp;
+
+       funcp = __symbol_get("send_hb_2_l0");
+       if (funcp == 0) {
+               CERROR("couldn't find send_hb_2_l0\n");
+               /* not fatal for now */
+       } else {
+               kgnilnd_hssops.hb_to_l0 = funcp;
+       }
+
+       /* if we find one, we should get the other */
+
+       funcp = __symbol_get("krca_nid_to_nicaddrs");
+       if (funcp == 0) {
+               kgnilnd_hssops.nid_to_nicaddr = (void *)GNILND_NO_RCA;
+               kgnilnd_hssops.nicaddr_to_nid = (void *)GNILND_NO_RCA;
+               LCONSOLE_INFO("using SimNow nid table for RCA translation\n");
+               return 0;
+       }
+       kgnilnd_hssops.nid_to_nicaddr = funcp;
+
+       funcp = __symbol_get("krca_nicaddr_to_nid");
+       if (funcp == 0) {
+               CERROR("found krca_nid_to_nicaddrs but not "
+                      "krca_nicaddr_to_nid\n");
+               return -ESRCH;
+       }
+       kgnilnd_hssops.nicaddr_to_nid = funcp;
+       return 0;
+}
+
+#if defined(CONFIG_CRAY_GEMINI)
+/* Gemini SimNow has a hard coded table to use - no RCA there */
+#define GNILND_MAX_NID_TABLE    0xffffffff
+/* this is all of the nodes defined in the Baker SimNow "sim_platforms" page */
+static kgn_nid_entry_t kgn_nid_table[] = {
+       {0x1, 0x100}, {0x2, 0x101}, {0x3, 0x104}, {0x4, 0x105},
+       {0x5, 0x108}, {0x6, 0x109}, {0x7, 0x10c}, {0x8, 0x10d},
+       {0x9, 0x110}, {0xa, 0x111}, {0xb, 0x114}, {0xc, 0x115},
+       {0xd, 0x118}, {0xe, 0x119}, {0xf, 0x11c}, {0x10, 0x11d},
+       {0x11, 0x120}, {0x12, 0x121}, {0x13, 0x124}, {0x14, 0x125},
+       {0x15, 0x128}, {0x16, 0x129}, {0x17, 0x12c}, {0x18, 0x12d},
+       {0x19, 0x130}, {0x1a, 0x131}, {0x1b, 0x134}, {0x1c, 0x135},
+       {0x1d, 0x138}, {0x1e, 0x139}, {0x1f, 0x13c}, {0x20, 0x13d},
+       {0x21, 0x140}, {0x22, 0x141}, {0x23, 0x144}, {0x24, 0x145},
+       {0x25, 0x148}, {0x26, 0x149}, {0x27, 0x14c}, {0x28, 0x14d},
+       {0x29, 0x150}, {0x2a, 0x151}, {0x2b, 0x154}, {0x2c, 0x155},
+       {0x2d, 0x158}, {0x2e, 0x159}, {0x2f, 0x15c}, {0x30, 0x15d},
+       {0x31, 0x160}, {0x32, 0x161}, {0x33, 0x164}, {0x3d, 0x178},
+       {0x34, 0x165}, {0x3e, 0x179}, {0x35, 0x168}, {0x3f, 0x17c},
+       {0x36, 0x169}, {0x40, 0x17d}, {0x37, 0x16c}, {0x41, 0x180},
+       {0x38, 0x16d}, {0x42, 0x181}, {0x39, 0x170}, {0x3a, 0x171},
+       {0x3b, 0x174}, {0x3c, 0x175}, {0x43, 0x184}, {0x44, 0x185},
+       {0x45, 0x188}, {0x46, 0x189}, {0x47, 0x18c}, {0x48, 0x18d},
+       /* entries after this are for 'dead' peer tests */
+       {0x63, 0x1ff}, {0x111, 0x209},
+       {GNILND_MAX_NID_TABLE, GNILND_MAX_NID_TABLE}
+};
+static int
+gemini_nid_to_nicaddr(__u32 nid, int numnic, __u32 *nicaddr)
+{
+       int i;
+
+       /* GNILND_NO_RCA, so use hardcoded table for Gemini SimNow */
+       if (numnic > 1) {
+               CERROR("manual nid2nic translation doesn't support "
+                      "multiple nic addrs (you asked for %d)\n",
+                       numnic);
+               return -EINVAL;
+       }
+
+       for (i = 0;;i++) {
+               if (kgn_nid_table[i].nid == GNILND_MAX_NID_TABLE) {
+                       CERROR("could not translate %u to a NIC "
+                              "address\n", nid);
+                       return -ESRCH;
+               }
+               if (kgn_nid_table[i].nid == nid) {
+                       *nicaddr = kgn_nid_table[i].nicaddr;
+                       return 1;
+               }
+       }
+}
+
+static int
+gemini_nicaddr_to_nid(__u32 nicaddr, __u32 *nid)
+{
+       int i;
+
+       /* GNILND_NO_RCA, so use hardcoded table for SimNow */
+       for (i = 0;;i++) {
+               if (kgn_nid_table[i].nicaddr == GNILND_MAX_NID_TABLE) {
+                       CERROR("could not translate NIC address "
+                               "%u\n",
+                               nicaddr);
+                       return -ESRCH;
+               }
+               if (kgn_nid_table[i].nicaddr == nicaddr) {
+                       *nid = kgn_nid_table[i].nid;
+                       return 1;
+               }
+       }
+}
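+
+/* Worked example of the SimNow table lookups above (values taken straight from
+ * kgn_nid_table): gemini_nid_to_nicaddr(0x5, 1, &nicaddr) walks the table until
+ * it hits the {0x5, 0x108} entry and returns 1 with nicaddr = 0x108, while
+ * gemini_nicaddr_to_nid(0x108, &nid) reverses it and yields nid = 0x5. A nid
+ * that has no entry (e.g. 0x200) falls through to the GNILND_MAX_NID_TABLE
+ * sentinel and returns -ESRCH. */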
+
+static inline int
+kgnilnd_setup_nic_translation(__u32 device_id)
+{
+        int rc;
+
+       /* do lookup on first use */
+       if (unlikely(kgnilnd_hssops.nid_to_nicaddr == NULL)) {
+               rc = kgnilnd_lookup_rca_funcs();
+               if (rc)
+                       return rc;
+       }
+
+       /* if we have a real function, return - we'll use those going forward */
+       if (likely(kgnilnd_hssops.nid_to_nicaddr != (void *)GNILND_NO_RCA))
+               return 0;
+
+       kgnilnd_hssops.nid_to_nicaddr = gemini_nid_to_nicaddr;
+       kgnilnd_hssops.nicaddr_to_nid = gemini_nicaddr_to_nid;
+       return 0;
+}
+
+#elif defined(CONFIG_CRAY_ARIES)
+/* for libcfs_ipif_query */
+#include <libcfs/libcfs.h>
+
+/* Aries Sim doesn't have hardcoded tables, so we'll hijack the nic_pe
+ * and decode our address and nic addr from that - the rest are just offsets */
+static __u32 aries_sim_base_nid;
+static __u32 aries_sim_nic;
+
+static int
+aries_nid_to_nicaddr(__u32 nid, int numnic, __u32 *nicaddr)
+{
+       if (numnic > 1) {
+               CERROR("manual nid2nic translation doesn't support "
+                      "multiple nic addrs (you asked for %d)\n",
+                       numnic);
+               return -EINVAL;
+       }
+       if (nid < aries_sim_base_nid) {
+               CERROR("Request for invalid nid translation %u, minimum %u\n",
+                      nid, aries_sim_base_nid);
+               return -ESRCH;
+       }
+
+       *nicaddr = nid - aries_sim_base_nid;
+       return 1;
+}
+
+static int
+aries_nicaddr_to_nid(__u32 nicaddr, __u32 *nid)
+{
+       *nid = aries_sim_base_nid + nicaddr;
+       return 1;
+}
+
+/* XXX Nic: This does not support multiple devices!!!! */
+static inline int
+kgnilnd_setup_nic_translation(__u32 device_id)
+{
+       char              *if_name = "ipogif0";
+       __u32              ipaddr, netmask, my_nid;
+       int                up, rc;
+
+       /* do lookup on first use */
+       if (unlikely(kgnilnd_hssops.nid_to_nicaddr == NULL)) {
+               rc = kgnilnd_lookup_rca_funcs();
+               if (rc)
+                       return rc;
+       }
+
+       /* if we have a real function, return - we'll use those going forward */
+       if (likely(kgnilnd_hssops.nid_to_nicaddr != (void *)GNILND_NO_RCA))
+               return 0;
+
+       rc = libcfs_ipif_query(if_name, &up, &ipaddr, &netmask);
+       if (rc != 0) {
+               CERROR("can't get IP interface for %s: %d\n", if_name, rc);
+               return rc;
+       }
+       if (!up) {
+               CERROR("IP interface %s is down\n", if_name);
+               return -ENODEV;
+       }
+
+       my_nid = ((ipaddr >> 8) & 0xFF) + (ipaddr & 0xFF);
+       aries_sim_nic = device_id;
+       aries_sim_base_nid = my_nid - aries_sim_nic;
+
+       kgnilnd_hssops.nid_to_nicaddr = aries_nid_to_nicaddr;
+       kgnilnd_hssops.nicaddr_to_nid = aries_nicaddr_to_nid;
+
+       return 0;
+}
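+
+/* Worked example of the derivation above (the IP address is illustrative only,
+ * and assumes libcfs_ipif_query returns it in host byte order): if ipogif0
+ * reports 10.128.0.31, then ((ipaddr >> 8) & 0xFF) = 0 and (ipaddr & 0xFF) = 31,
+ * so my_nid = 31; with device_id 0 that makes aries_sim_base_nid = 31, and
+ * aries_nid_to_nicaddr(31, 1, &nicaddr) then returns 1 with nicaddr = 0. */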
+#else
+#error "Undefined Network Type"
+#endif
+
+/* we use RCA types here to get the compiler to whine when we have
+ * mismatched types */
+static inline int
+kgnilnd_nid_to_nicaddrs(rca_nid_t nid, int numnic, nic_addr_t *nicaddrs)
+{
+       /* compile time checks to ensure that the RCA types match
+        * the LNet idea of NID and NIC */
+       typecheck(__u32, nid);
+       typecheck(__u32, *nicaddrs);
+
+       LASSERTF(kgnilnd_hssops.nid_to_nicaddr != NULL, "missing setup?\n");
+
+       return kgnilnd_hssops.nid_to_nicaddr(nid, numnic, nicaddrs);
+}
+
+static inline int
+kgnilnd_nicaddr_to_nid(nic_addr_t nicaddr, rca_nid_t *nid)
+{
+       /* compile time checks to ensure that the RCA types match
+        * the LNet idea of NID and NIC */
+       typecheck(__u32, nicaddr);
+       typecheck(__u32, nid[0]);
+
+       LASSERTF(kgnilnd_hssops.nicaddr_to_nid != NULL, "missing setup?\n");
+
+       return kgnilnd_hssops.nicaddr_to_nid(nicaddr, nid);
+}
+
+#endif /* _GNILND_HSS_OPS_H */
diff --git a/lnet/klnds/gnilnd/gnilnd_modparams.c b/lnet/klnds/gnilnd/gnilnd_modparams.c
new file mode 100644 (file)
index 0000000..17cbfd6
--- /dev/null
@@ -0,0 +1,500 @@
+/*
+ * Copyright (C) 2004 Cluster File Systems, Inc.
+ *
+ * Copyright (C) 2009-2012 Cray, Inc.
+ *
+ *   Derived from work by: Eric Barton <eric@bartonsoftware.com>
+ *   Author: Nic Henke <nic@cray.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#include "gnilnd.h"
+
+static int credits = 256;
+CFS_MODULE_PARM(credits, "i", int, 0444,
+               "# concurrent sends");
+
+static int peer_credits = 16;
+CFS_MODULE_PARM(peer_credits, "i", int, 0444,
+               "# LNet peer credits");
+
+/* NB - we'll not actually limit sends to this, we just size the mailbox buffer
+ * such that at most we'll have concurrent_sends messages of up to max_immediate
+ * bytes each in the mailbox */
+static int concurrent_sends = 0;
+CFS_MODULE_PARM(concurrent_sends, "i", int, 0444,
+               "# concurrent HW sends to 1 peer");
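+
+/* Rough sizing sketch using the defaults in this file: concurrent_sends = 0
+ * falls back to peer_credits (16) in kgnilnd_tunables_init(), and
+ * max_immediate = 2 KiB, so the mailbox buffer must cover roughly
+ * 16 * 2048 = 32768 bytes of in-flight immediate data per peer. */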
+
+/* default for 2k nodes @ 16 peer credits */
+static int fma_cq_size = 32768;
+CFS_MODULE_PARM(fma_cq_size, "i", int, 0444,
+               "size of the completion queue");
+
+static int timeout = GNILND_BASE_TIMEOUT;
+/* can't change @ runtime because LNet gets NI data at startup from
+ * this value */
+CFS_MODULE_PARM(timeout, "i", int, 0444,
+               "communications timeout (seconds)");
+
+/* time to wait between datagram timeout and sending of next dgram */
+static int min_reconnect_interval = GNILND_MIN_RECONNECT_TO;
+CFS_MODULE_PARM(min_reconnect_interval, "i", int, 0644,
+               "minimum connection retry interval (seconds)");
+
+/* if this goes longer than timeout, we'll timeout the TX before
+ * the dgram */
+static int max_reconnect_interval = GNILND_MAX_RECONNECT_TO;
+CFS_MODULE_PARM(max_reconnect_interval, "i", int, 0644,
+               "maximum connection retry interval (seconds)");
+
+static int max_immediate = (2<<10);
+CFS_MODULE_PARM(max_immediate, "i", int, 0644,
+               "immediate/RDMA breakpoint");
+
+#ifdef CONFIG_CRAY_GEMINI
+static int checksum = GNILND_CHECKSUM_SMSG_BTE;
+#else
+static int checksum = 0;
+#endif
+CFS_MODULE_PARM(checksum, "i", int, 0644,
+               "0: None, 1: headers, 2: short msg, 3: all traffic");
+
+static int checksum_dump = 0;
+CFS_MODULE_PARM(checksum_dump, "i", int, 0644,
+               "0: None, 1: dump log on failure, 2: payload data to D_INFO log");
+
+static int bte_hash = 1;
+CFS_MODULE_PARM(bte_hash, "i", int, 0644,
+               "enable hashing for BTE (RDMA) transfers");
+
+static int bte_adapt = 1;
+CFS_MODULE_PARM(bte_adapt, "i", int, 0644,
+               "enable adaptive request and response for BTE (RDMA) transfers");
+
+static int bte_relaxed_ordering = 1;
+CFS_MODULE_PARM(bte_relaxed_ordering, "i", int, 0644,
+               "enable relaxed ordering (PASSPW) for BTE (RDMA) transfers");
+
+static int ptag = GNI_PTAG_LND;
+CFS_MODULE_PARM(ptag, "i", int, 0444,
+               "ptag for Gemini CDM");
+
+static int max_retransmits = 1024;
+CFS_MODULE_PARM(max_retransmits, "i", int, 0644,
+               "max retransmits for FMA");
+
+static int nwildcard = 4;
+CFS_MODULE_PARM(nwildcard, "i", int, 0444,
+               "# wildcard datagrams to post per net (interface)");
+
+static int nice = -20;
+CFS_MODULE_PARM(nice, "i", int, 0444,
+               "nice value for kgnilnd threads, default -20");
+
+static int rdmaq_intervals = 4;
+CFS_MODULE_PARM(rdmaq_intervals, "i", int, 0644,
+               "# intervals per second for rdmaq throttling, default 4, 0 to disable");
+
+static int loops = 100;
+CFS_MODULE_PARM(loops, "i", int, 0644,
+               "# of loops before scheduler is friendly, default 100");
+
+static int hash_size = 503;
+CFS_MODULE_PARM(hash_size, "i", int, 0444,
+               "prime number for peer/conn hash sizing, default 503");
+
+static int peer_health = 0;
+CFS_MODULE_PARM(peer_health, "i", int, 0444,
+               "Disable peer timeout for LNet peer health, default off, > 0 to enable");
+
+static int vmap_cksum = 0;
+CFS_MODULE_PARM(vmap_cksum, "i", int, 0644,
+               "use vmap for all kiov checksumming, default off");
+
+static int mbox_per_block = GNILND_FMABLK;
+CFS_MODULE_PARM(mbox_per_block, "i", int, 0644,
+               "mailboxes per block");
+
+static int nphys_mbox = 0;
+CFS_MODULE_PARM(nphys_mbox, "i", int, 0444,
+               "# mbox to preallocate from physical memory, default 0");
+
+static int mbox_credits = GNILND_MBOX_CREDITS;
+CFS_MODULE_PARM(mbox_credits, "i", int, 0644,
+               "number of credits per mailbox");
+
+static int sched_threads = GNILND_SCHED_THREADS;
+CFS_MODULE_PARM(sched_threads, "i", int, 0444,
+               "number of threads for moving data");
+
+static int net_hash_size = 11;
+CFS_MODULE_PARM(net_hash_size, "i", int, 0444,
+               "prime number for net hash sizing, default 11");
+
+static int hardware_timeout = GNILND_HARDWARE_TIMEOUT;
+CFS_MODULE_PARM(hardware_timeout, "i", int, 0444,
+               "maximum time for traffic to get from one node to another");
+
+static int mdd_timeout = GNILND_MDD_TIMEOUT;
+CFS_MODULE_PARM(mdd_timeout, "i", int, 0644,
+               "maximum time (in minutes) for mdd to be held");
+
+kgn_tunables_t kgnilnd_tunables = {
+       .kgn_min_reconnect_interval = &min_reconnect_interval,
+       .kgn_max_reconnect_interval = &max_reconnect_interval,
+       .kgn_credits                = &credits,
+       .kgn_peer_credits           = &peer_credits,
+       .kgn_concurrent_sends       = &concurrent_sends,
+       .kgn_fma_cq_size            = &fma_cq_size,
+       .kgn_timeout                = &timeout,
+       .kgn_max_immediate          = &max_immediate,
+       .kgn_checksum               = &checksum,
+       .kgn_checksum_dump          = &checksum_dump,
+       .kgn_bte_hash               = &bte_hash,
+       .kgn_bte_adapt              = &bte_adapt,
+       .kgn_bte_relaxed_ordering   = &bte_relaxed_ordering,
+       .kgn_ptag                   = &ptag,
+       .kgn_max_retransmits        = &max_retransmits,
+       .kgn_nwildcard              = &nwildcard,
+       .kgn_nice                   = &nice,
+       .kgn_rdmaq_intervals        = &rdmaq_intervals,
+       .kgn_loops                  = &loops,
+       .kgn_peer_hash_size         = &hash_size,
+       .kgn_peer_health            = &peer_health,
+       .kgn_vmap_cksum             = &vmap_cksum,
+       .kgn_mbox_per_block         = &mbox_per_block,
+       .kgn_nphys_mbox             = &nphys_mbox,
+       .kgn_mbox_credits           = &mbox_credits,
+       .kgn_sched_threads          = &sched_threads,
+       .kgn_net_hash_size          = &net_hash_size,
+       .kgn_hardware_timeout       = &hardware_timeout,
+       .kgn_mdd_timeout            = &mdd_timeout
+};
+
+#if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM
+static cfs_sysctl_table_t kgnilnd_ctl_table[] = {
+       {
+               INIT_CTL_NAME(2)
+               .procname = "min_reconnect_interval",
+               .data     = &min_reconnect_interval,
+               .maxlen   = sizeof(int),
+               .mode     = 0644,
+               .proc_handler = &proc_dointvec
+       },
+       {
+               INIT_CTL_NAME(3)
+               .procname = "max_reconnect_interval",
+               .data     = &max_reconnect_interval,
+               .maxlen   = sizeof(int),
+               .mode     = 0644,
+               .proc_handler = &proc_dointvec
+       },
+       {
+               INIT_CTL_NAME(5)
+               .procname = "credits",
+               .data     = &credits,
+               .maxlen   = sizeof(int),
+               .mode     = 0444,
+               .proc_handler = &proc_dointvec
+       },
+       {
+               INIT_CTL_NAME(6)
+               .procname = "peer_credits",
+               .data     = &peer_credits,
+               .maxlen   = sizeof(int),
+               .mode     = 0444,
+               .proc_handler = &proc_dointvec
+       },
+       {
+               INIT_CTL_NAME(7)
+               .procname = "fma_cq_size",
+               .data     = &fma_cq_size,
+               .maxlen   = sizeof(int),
+               .mode     = 0444,
+               .proc_handler = &proc_dointvec
+       },
+       {
+               INIT_CTL_NAME(8)
+               .procname = "timeout",
+               .data     = &timeout,
+               .maxlen   = sizeof(int),
+               .mode     = 0444,
+               .proc_handler = &proc_dointvec
+       },
+       {
+               INIT_CTL_NAME(9)
+               .procname = "max_immediate",
+               .data     = &max_immediate,
+               .maxlen   = sizeof(int),
+               .mode     = 0444,
+               .proc_handler = &proc_dointvec
+       },
+       {
+               INIT_CTL_NAME(10)
+               .procname = "checksum",
+               .data     = &checksum,
+               .maxlen   = sizeof(int),
+               .mode     = 0644,
+               .proc_handler = &proc_dointvec
+       },
+       {
+               INIT_CTL_NAME(11)
+               .procname = "bte_hash",
+               .data     = &bte_hash,
+               .maxlen   = sizeof(int),
+               .mode     = 0644,
+               .proc_handler = &proc_dointvec
+       },
+       {
+               INIT_CTL_NAME(12)
+               .procname = "bte_adapt",
+               .data     = &bte_adapt,
+               .maxlen   = sizeof(int),
+               .mode     = 0644,
+               .proc_handler = &proc_dointvec
+       },
+       {
+               INIT_CTL_NAME(13)
+               .procname = "ptag",
+               .data     = &ptag,
+               .maxlen   = sizeof(int),
+               .mode     = 0444,
+               .proc_handler = &proc_dointvec
+       },
+       {
+               INIT_CTL_NAME(14)
+               .procname = "nwildcard",
+               .data     = &nwildcard,
+               .maxlen   = sizeof(int),
+               .mode     = 0444,
+               .proc_handler = &proc_dointvec
+       },
+       {
+               INIT_CTL_NAME(15)
+               .procname = "bte_relaxed_ordering",
+               .data     = &bte_relaxed_ordering,
+               .maxlen   = sizeof(int),
+               .mode     = 0644,
+               .proc_handler = &proc_dointvec
+       },
+       {
+               INIT_CTL_NAME(16)
+               .procname = "checksum_dump",
+               .data     = &checksum_dump,
+               .maxlen   = sizeof(int),
+               .mode     = 0644,
+               .proc_handler = &proc_dointvec
+       },
+       {
+               INIT_CTL_NAME(17)
+               .procname = "nice",
+               .data     = &nice,
+               .maxlen   = sizeof(int),
+               .mode     = 0444,
+               .proc_handler = &proc_dointvec
+       },
+       {
+               INIT_CTL_NAME(18)
+               .procname = "rdmaq_intervals",
+               .data     = &rdmaq_intervals,
+               .maxlen   = sizeof(int),
+               .mode     = 0644,
+               .proc_handler = &proc_dointvec
+       },
+       {
+               INIT_CTL_NAME(19)
+               .procname = "loops",
+               .data     = &loops,
+               .maxlen   = sizeof(int),
+               .mode     = 0644,
+               .proc_handler = &proc_dointvec
+       },
+       {
+               INIT_CTL_NAME(20)
+               .procname = "hash_size",
+               .data     = &hash_size,
+               .maxlen   = sizeof(int),
+               .mode     = 0444,
+               .proc_handler = &proc_dointvec
+       },
+       {
+               INIT_CTL_NAME(21)
+               .procname = "peer_health",
+               .data     = &peer_health,
+               .maxlen   = sizeof(int),
+               .mode     = 0444,
+               .proc_handler = &proc_dointvec
+       },
+       {
+               INIT_CTL_NAME(22)
+               .procname = "vmap_cksum",
+               .data     = &vmap_cksum,
+               .maxlen   = sizeof(int),
+               .mode     = 0644,
+               .proc_handler = &proc_dointvec
+       },
+       {
+               INIT_CTL_NAME(23)
+               .procname = "mbox_per_block",
+               .data     = &mbox_per_block,
+               .maxlen   = sizeof(int),
+               .mode     = 0644,
+               .proc_handler = &proc_dointvec
+       },
+       {
+               INIT_CTL_NAME(24)
+               .procname = "mbox_credits",
+               .data     = &mbox_credits,
+               .maxlen   = sizeof(int),
+               .mode     = 0644,
+               .proc_handler = &proc_dointvec
+       },
+       {
+               INIT_CTL_NAME(25)
+               .procname = "sched_threads",
+               .data     = &sched_threads,
+               .maxlen   = sizeof(int),
+               .mode     = 0444,
+               .proc_handler = &proc_dointvec
+       },
+       {
+               INIT_CTL_NAME(26)
+               .procname = "net_hash_size",
+               .data     = &net_hash_size,
+               .maxlen   = sizeof(int),
+               .mode     = 0444,
+               .proc_handler = &proc_dointvec
+       },
+       {
+               INIT_CTL_NAME(27)
+               .procname = "hardware_timeout",
+               .data     = &hardware_timeout,
+               .maxlen   = sizeof(int),
+               .mode     = 0444,
+               .proc_handler = &proc_dointvec
+       },
+       {
+               INIT_CTL_NAME(28)
+               .procname = "mdd_timeout",
+               .data     = &mdd_timeout,
+               .maxlen   = sizeof(int),
+               .mode     = 0644,
+               .proc_handler = &proc_dointvec
+       },
+       {
+               INIT_CTL_NAME(29)
+               .procname = "max_retransmits",
+               .data     = &max_retransmits,
+               .maxlen   = sizeof(int),
+               .mode     = 0444,
+               .proc_handler = &proc_dointvec
+       },
+       {
+               INIT_CTL_NAME(30)
+               .procname = "concurrent_sends",
+               .data     = &concurrent_sends,
+               .maxlen   = sizeof(int),
+               .mode     = 0444,
+               .proc_handler = &proc_dointvec
+       },
+       {
+               INIT_CTL_NAME(31)
+               .procname = "nphys_mbox",
+               .data     = &nphys_mbox,
+               .maxlen   = sizeof(int),
+               .mode     = 0444,
+               .proc_handler = &proc_dointvec
+       },
+       {0}
+};
+
+static cfs_sysctl_table_t kgnilnd_top_ctl_table[] = {
+       {
+               INIT_CTL_NAME(202)
+               .procname = "gnilnd",
+               .data     = NULL,
+               .maxlen   = 0,
+               .mode     = 0555,
+               .child    = kgnilnd_ctl_table
+       },
+       {       INIT_CTL_NAME(0)   }
+};
+#endif
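+
+/* Usage sketch (paths are assumptions, not part of this patch): when built with
+ * CONFIG_SYSCTL and without CFS_SYSFS_MODULE_PARM, the table above is expected
+ * to appear as /proc/sys/gnilnd/<name>, e.g.
+ *     cat /proc/sys/gnilnd/timeout
+ *     echo 120 > /proc/sys/gnilnd/mdd_timeout
+ * otherwise the same values are reachable as ordinary module parameters. */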
+
+int
+kgnilnd_tunables_init()
+{
+       int rc = 0;
+
+#if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM
+       kgnilnd_tunables.kgn_sysctl =
+               cfs_register_sysctl_table(kgnilnd_top_ctl_table, 0);
+
+       if (kgnilnd_tunables.kgn_sysctl == NULL)
+               CWARN("Can't setup /proc tunables\n");
+#endif
+       switch (*kgnilnd_tunables.kgn_checksum) {
+       default:
+               CERROR("Invalid checksum module parameter: %d\n",
+                      *kgnilnd_tunables.kgn_checksum);
+               rc = -EINVAL;
+               GOTO(out, rc);
+       case GNILND_CHECKSUM_OFF:
+               /* no checksumming */
+               break;
+       case GNILND_CHECKSUM_SMSG_HEADER:
+               LCONSOLE_INFO("SMSG header only checksumming enabled\n");
+               break;
+       case GNILND_CHECKSUM_SMSG:
+               LCONSOLE_INFO("SMSG checksumming enabled\n");
+               break;
+       case GNILND_CHECKSUM_SMSG_BTE:
+               LCONSOLE_INFO("SMSG + BTE checksumming enabled\n");
+               break;
+       }
+
+       if (*kgnilnd_tunables.kgn_max_immediate > GNILND_MAX_IMMEDIATE) {
+               LCONSOLE_ERROR("kgnilnd module parameter 'max_immediate' too large %d > %d\n",
+               *kgnilnd_tunables.kgn_max_immediate, GNILND_MAX_IMMEDIATE);
+               rc = -EINVAL;
+               GOTO(out, rc);
+       }
+
+       if (*kgnilnd_tunables.kgn_mbox_per_block < 1) {
+               *kgnilnd_tunables.kgn_mbox_per_block = 1;
+       }
+
+       if (*kgnilnd_tunables.kgn_concurrent_sends == 0) {
+               *kgnilnd_tunables.kgn_concurrent_sends = *kgnilnd_tunables.kgn_peer_credits;
+       } else if (*kgnilnd_tunables.kgn_concurrent_sends > *kgnilnd_tunables.kgn_peer_credits) {
+               LCONSOLE_ERROR("kgnilnd parameter 'concurrent_sends' too large: %d > %d (peer_credits)\n",
+                              *kgnilnd_tunables.kgn_concurrent_sends, *kgnilnd_tunables.kgn_peer_credits);
+               rc = -EINVAL;
+       }
+out:
+       return rc;
+}
+
+void
+kgnilnd_tunables_fini()
+{
+#if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM
+       if (kgnilnd_tunables.kgn_sysctl != NULL)
+               cfs_unregister_sysctl_table(kgnilnd_tunables.kgn_sysctl);
+#endif
+}
diff --git a/lnet/klnds/gnilnd/gnilnd_proc.c b/lnet/klnds/gnilnd/gnilnd_proc.c
new file mode 100644 (file)
index 0000000..f161224
--- /dev/null
@@ -0,0 +1,1260 @@
+/*
+ * Copyright (C) 2009-2012 Cray, Inc.
+ *
+ *   Author: Nic Henke <nic@cray.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/* this code liberated and modified from lnet/lnet/router_proc.c */
+
+#define DEBUG_SUBSYSTEM S_LND
+#include "gnilnd.h"
+#include <linux/seq_file.h>
+
+#define GNILND_PROC_STATS       "stats"
+#define GNILND_PROC_MDD         "mdd"
+#define GNILND_PROC_SMSG        "smsg"
+#define GNILND_PROC_CONN        "conn"
+#define GNILND_PROC_PEER        "peer"
+#define GNILND_PROC_CKSUM_TEST  "cksum_test"
+
+static int
+_kgnilnd_proc_run_cksum_test(int caseno, int nloops, int nob)
+{
+       lnet_kiov_t              *src, *dest;
+       struct timespec          begin, end, diff;
+       int                      niov;
+       int                      i = 0, n;
+       __u16                    cksum, cksum2;
+       __u64                    mbytes;
+
+       LIBCFS_ALLOC(src, LNET_MAX_IOV * sizeof(lnet_kiov_t));
+       LIBCFS_ALLOC(dest, LNET_MAX_IOV * sizeof(lnet_kiov_t));
+
+       if (src == NULL || dest == NULL) {
+               CERROR("couldn't allocate iovs\n");
+               GOTO(unwind, -ENOMEM);
+       }
+
+       for (i = 0; i < LNET_MAX_IOV; i++) {
+               src[i].kiov_offset = 0;
+               src[i].kiov_len = CFS_PAGE_SIZE;
+               src[i].kiov_page = cfs_alloc_page(CFS_ALLOC_STD|CFS_ALLOC_ZERO);
+
+               if (src[i].kiov_page == NULL) {
+                       CERROR("couldn't allocate page %d\n", i);
+                       GOTO(unwind, -ENOMEM);
+               }
+
+               dest[i].kiov_offset = 0;
+               dest[i].kiov_len = CFS_PAGE_SIZE;
+               dest[i].kiov_page = cfs_alloc_page(CFS_ALLOC_STD|CFS_ALLOC_ZERO);
+
+               if (dest[i].kiov_page == NULL) {
+                       CERROR("couldn't allocate page %d\n", i);
+                       GOTO(unwind, -ENOMEM);
+               }
+       }
+
+       /* add extra 2 pages - one for offset of src, 2nd to allow dest offset */
+       niov = (nob / PAGE_SIZE) + 2;
+       if (niov > LNET_MAX_IOV) {
+               CERROR("bytes %d too large, requires niov %d > %d\n",
+                       nob, niov, LNET_MAX_IOV);
+               GOTO(unwind, -E2BIG);
+       }
+
+       /* setup real data */
+       src[0].kiov_offset = 317;
+       dest[0].kiov_offset = 592;
+       switch (caseno) {
+       default:
+               /* odd -> even */
+               break;
+       case 1:
+               /* odd -> odd */
+               dest[0].kiov_offset -= 1;
+               break;
+       case 2:
+               /* even -> even */
+               src[0].kiov_offset += 1;
+               break;
+       case 3:
+               /* even -> odd */
+               src[0].kiov_offset += 1;
+               dest[0].kiov_offset -= 1;
+       }
+       src[0].kiov_len = PAGE_SIZE - src[0].kiov_offset;
+       dest[0].kiov_len = PAGE_SIZE - dest[0].kiov_offset;
+
+       for (i = 0; i < niov; i++) {
+               memset(page_address(src[i].kiov_page) + src[i].kiov_offset,
+                      0xf0 + i, src[i].kiov_len);
+       }
+
+       lnet_copy_kiov2kiov(niov, dest, 0, niov, src, 0, nob);
+
+       getnstimeofday(&begin);
+
+       for (n = 0; n < nloops; n++) {
+               CDEBUG(D_BUFFS, "case %d loop %d src %d dest %d nob %d niov %d\n",
+                      caseno, n, src[0].kiov_offset, dest[0].kiov_offset, nob, niov);
+               cksum = kgnilnd_cksum_kiov(niov, src, 0, nob - n, 1);
+               cksum2 = kgnilnd_cksum_kiov(niov, dest, 0, nob - n, 1);
+
+               if (cksum != cksum2) {
+                       CERROR("case %d loop %d different checksums %x expected %x\n",
+                              caseno, n, cksum2, cksum);
+                       GOTO(unwind, -ENOKEY);
+               }
+       }
+
+       getnstimeofday(&end);
+
+       mbytes = (nloops * nob * 2) / (1024*1024);
+
+       diff = kgnilnd_ts_sub(end, begin);
+
+       LCONSOLE_INFO("running "LPD64"MB took %ld.%ld seconds\n",
+                     mbytes, diff.tv_sec, diff.tv_nsec);
+
+unwind:
+       CDEBUG(D_NET, "freeing %d pages\n", i);
+       for (i -= 1; i >= 0; i--) {
+               if (src[i].kiov_page != NULL) {
+                       cfs_free_page(src[i].kiov_page);
+               }
+               if (dest[i].kiov_page != NULL) {
+                       cfs_free_page(dest[i].kiov_page);
+               }
+       }
+
+       if (src != NULL)
+               LIBCFS_FREE(src, LNET_MAX_IOV * sizeof(lnet_kiov_t));
+       if (dest != NULL)
+               LIBCFS_FREE(dest, LNET_MAX_IOV * sizeof(lnet_kiov_t));
+       return 0;
+}
+
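+/*
+ * proc write handler for the checksum self-test.  The input is parsed as
+ * "testno:nloops:nbytes", e.g. writing "0:1000:16384" runs the default
+ * (odd->even offset) case with 1000 checksum loops over a 16KB buffer.
+ * (Example values only; the proc file itself is created as
+ * GNILND_PROC_CKSUM_TEST in kgnilnd_proc_init() below.)
+ */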
+static int
+kgnilnd_proc_cksum_test_write(struct file *file, const char *ubuffer,
+                             unsigned long count, void *data)
+{
+       char                    dummy[256 + 1] = { '\0' };
+       int                     testno, nloops, nbytes;
+       int                     rc;
+
+       if (kgnilnd_data.kgn_init < GNILND_INIT_ALL) {
+               CERROR("can't run cksum test, kgnilnd is not initialized yet\n");
+               return -ENOSYS;
+       }
+
+       if (count >= sizeof(dummy) || count == 0)
+               return -EINVAL;
+
+       if (copy_from_user(dummy, ubuffer, count))
+               return -EFAULT;
+
+       if (sscanf(dummy, "%d:%d:%d", &testno, &nloops, &nbytes) == 3) {
+               rc = _kgnilnd_proc_run_cksum_test(testno, nloops, nbytes);
+               if (rc < 0) {
+                       RETURN(rc);
+               } else {
+                       /* spurious, but lets us know the parse was ok */
+                       RETURN(count);
+               }
+       }
+       RETURN(count);
+}
+
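+/* proc read handler: dump the global counters and the first device's
+ * counters, one "name: value" pair per line. */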
+static int
+kgnilnd_proc_stats_read(char *page, char **start, off_t off,
+                            int count, int *eof, void *data)
+{
+       kgn_device_t           *dev;
+       struct timeval          now;
+       int                     rc;
+
+       if (kgnilnd_data.kgn_init < GNILND_INIT_ALL) {
+               rc = sprintf(page,
+                       "kgnilnd is not initialized yet\n");
+               return rc;
+       }
+
+       /* only do the first device */
+       dev = &kgnilnd_data.kgn_devices[0];
+
+       /* sampling is racy, but so is reading this file! */
+       smp_rmb();
+       do_gettimeofday(&now);
+
+       rc = sprintf(page, "time: %lu.%lu\n"
+                          "ntx: %d\n"
+                          "npeers: %d\n"
+                          "nconns: %d\n"
+                          "nEPs: %d\n"
+                          "ndgrams: %d\n"
+                          "nfmablk: %d\n"
+                          "n_mdd: %d\n"
+                          "n_mdd_held: %d\n"
+                          "GART map bytes: %ld\n"
+                          "TX queued maps: %d\n"
+                          "TX phys nmaps: %d\n"
+                          "TX phys bytes: %lu\n"
+                          "TX virt nmaps: %d\n"
+                          "TX virt bytes: "LPU64"\n"
+                          "RDMAQ bytes_auth: %ld\n"
+                          "RDMAQ bytes_left: %ld\n"
+                          "RDMAQ nstalls: %d\n"
+                          "dev mutex delay: %ld\n"
+                          "dev n_yield: %d\n"
+                          "dev n_schedule: %d\n"
+                          "SMSG fast_try: %d\n"
+                          "SMSG fast_ok: %d\n"
+                          "SMSG fast_block: %d\n"
+                          "SMSG ntx: %d\n"
+                          "SMSG tx_bytes: %ld\n"
+                          "SMSG nrx: %d\n"
+                          "SMSG rx_bytes: %ld\n"
+                          "RDMA ntx: %d\n"
+                          "RDMA tx_bytes: %ld\n"
+                          "RDMA nrx: %d\n"
+                          "RDMA rx_bytes: %ld\n"
+                          "VMAP short: %d\n"
+                          "VMAP cksum: %d\n"
+                          "KMAP short: %d\n",
+               now.tv_sec, now.tv_usec,
+               atomic_read(&kgnilnd_data.kgn_ntx),
+               atomic_read(&kgnilnd_data.kgn_npeers),
+               atomic_read(&kgnilnd_data.kgn_nconns),
+               atomic_read(&dev->gnd_neps),
+               atomic_read(&dev->gnd_ndgrams),
+               atomic_read(&dev->gnd_nfmablk),
+               atomic_read(&dev->gnd_n_mdd), atomic_read(&dev->gnd_n_mdd_held),
+               atomic64_read(&dev->gnd_nbytes_map),
+               atomic_read(&dev->gnd_nq_map),
+               dev->gnd_map_nphys, dev->gnd_map_physnop * PAGE_SIZE,
+               dev->gnd_map_nvirt, dev->gnd_map_virtnob,
+               atomic64_read(&dev->gnd_rdmaq_bytes_out),
+               atomic64_read(&dev->gnd_rdmaq_bytes_ok),
+               atomic_read(&dev->gnd_rdmaq_nstalls),
+               dev->gnd_mutex_delay,
+               atomic_read(&dev->gnd_n_yield), atomic_read(&dev->gnd_n_schedule),
+               atomic_read(&dev->gnd_fast_try), atomic_read(&dev->gnd_fast_ok),
+               atomic_read(&dev->gnd_fast_block),
+               atomic_read(&dev->gnd_short_ntx), atomic64_read(&dev->gnd_short_txbytes),
+               atomic_read(&dev->gnd_short_nrx), atomic64_read(&dev->gnd_short_rxbytes),
+               atomic_read(&dev->gnd_rdma_ntx), atomic64_read(&dev->gnd_rdma_txbytes),
+               atomic_read(&dev->gnd_rdma_nrx), atomic64_read(&dev->gnd_rdma_rxbytes),
+               atomic_read(&kgnilnd_data.kgn_nvmap_short),
+               atomic_read(&kgnilnd_data.kgn_nvmap_cksum),
+               atomic_read(&kgnilnd_data.kgn_nkmap_short));
+
+       return rc;
+}
+
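+/* proc write handler: any write to the stats file (the written data is
+ * ignored) resets the device-0 traffic and scheduling counters plus the
+ * global vmap/kmap counters reported above. */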
+static int
+kgnilnd_proc_stats_write(struct file *file, const char *ubuffer,
+                    unsigned long count, void *data)
+{
+       kgn_device_t           *dev;
+
+       if (kgnilnd_data.kgn_init < GNILND_INIT_ALL) {
+               CERROR("kgnilnd is not initialized for stats write\n");
+               return -EINVAL;
+       }
+
+       /* only do the first device */
+       dev = &kgnilnd_data.kgn_devices[0];
+
+       atomic_set(&dev->gnd_short_ntx, 0);
+       atomic_set(&dev->gnd_short_nrx, 0);
+       atomic64_set(&dev->gnd_short_txbytes, 0);
+       atomic64_set(&dev->gnd_short_rxbytes, 0);
+       atomic_set(&dev->gnd_rdma_ntx, 0);
+       atomic_set(&dev->gnd_rdma_nrx, 0);
+       atomic_set(&dev->gnd_fast_ok, 0);
+       atomic_set(&dev->gnd_fast_try, 0);
+       atomic_set(&dev->gnd_fast_block, 0);
+       atomic64_set(&dev->gnd_rdma_txbytes, 0);
+       atomic64_set(&dev->gnd_rdma_rxbytes, 0);
+       atomic_set(&dev->gnd_rdmaq_nstalls, 0);
+       set_mb(dev->gnd_mutex_delay, 0);
+       atomic_set(&dev->gnd_n_yield, 0);
+       atomic_set(&dev->gnd_n_schedule, 0);
+       atomic_set(&kgnilnd_data.kgn_nvmap_short, 0);
+       atomic_set(&kgnilnd_data.kgn_nvmap_cksum, 0);
+       atomic_set(&kgnilnd_data.kgn_nkmap_short, 0);
+       /* sampling is racy, but so is writing this file! */
+       smp_wmb();
+       return count;
+}
+
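+/* seq_file iterator state for the MDD proc file: walks device 0's list of
+ * TX descriptors that currently have memory mapped (gnd_map_list). */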
+typedef struct {
+       kgn_device_t           *gmdd_dev;
+       kgn_tx_t               *gmdd_tx;
+       loff_t                  gmdd_off;
+} kgn_mdd_seq_iter_t;
+
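+/* position the iterator at entry 'off'.  Offset 0 is reserved for the
+ * header line, so real entries start at 1; if the requested offset is
+ * behind the current one, the walk restarts from the head of the list. */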
+int
+kgnilnd_mdd_seq_seek(kgn_mdd_seq_iter_t *gseq, loff_t off)
+{
+       kgn_tx_t                *tx;
+       struct list_head        *r;
+       loff_t                  here;
+       int                     rc = 0;
+
+       if (off == 0) {
+               gseq->gmdd_tx = NULL;
+               gseq->gmdd_off = 0;
+               return 0;
+       }
+
+       tx = gseq->gmdd_tx;
+
+       if (tx == NULL || gseq->gmdd_off > off) {
+               /* search from start */
+               r = gseq->gmdd_dev->gnd_map_list.next;
+               here = 1;
+       } else {
+               /* continue current search */
+               r = &tx->tx_map_list;
+               here = gseq->gmdd_off;
+       }
+
+       gseq->gmdd_off = off;
+
+       while (r != &gseq->gmdd_dev->gnd_map_list) {
+               kgn_tx_t      *t;
+
+               t = list_entry(r, kgn_tx_t, tx_map_list);
+
+               if (here == off) {
+                       gseq->gmdd_tx = t;
+                       rc = 0;
+                       goto out;
+               }
+               r = r->next;
+               here++;
+       }
+
+       gseq->gmdd_tx = NULL;
+       rc = -ENOENT;
+out:
+       return rc;
+}
+
+static void *
+kgnilnd_mdd_seq_start(struct seq_file *s, loff_t *pos)
+{
+
+       kgn_mdd_seq_iter_t      *gseq;
+       int                      rc;
+
+       if (kgnilnd_data.kgn_init < GNILND_INIT_ALL) {
+               return NULL;
+       }
+
+       LIBCFS_ALLOC(gseq, sizeof(*gseq));
+       if (gseq == NULL) {
+               CERROR("could not allocate mdd sequence iterator\n");
+               return NULL;
+       }
+
+       /* only doing device 0 for now */
+       gseq->gmdd_dev = &kgnilnd_data.kgn_devices[0];
+       gseq->gmdd_tx = NULL;
+
+       /* need to lock map while we poke - huge disturbance
+        * but without it, no way to get the data printed */
+       spin_lock(&gseq->gmdd_dev->gnd_map_lock);
+
+       /* set private to gseq for stop */
+       s->private = gseq;
+
+       rc = kgnilnd_mdd_seq_seek(gseq, *pos);
+       if (rc == 0)
+               return gseq;
+       else
+               return NULL;
+}
+
+static void
+kgnilnd_mdd_seq_stop(struct seq_file *s, void *iter)
+{
+       kgn_mdd_seq_iter_t     *gseq = s->private;
+
+       if (gseq != NULL) {
+               spin_unlock(&gseq->gmdd_dev->gnd_map_lock);
+               LIBCFS_FREE(gseq, sizeof(*gseq));
+       }
+}
+
+static void *
+kgnilnd_mdd_seq_next(struct seq_file *s, void *iter, loff_t *pos)
+{
+       kgn_mdd_seq_iter_t     *gseq = iter;
+       int                     rc;
+       loff_t                  next = *pos + 1;
+
+       rc = kgnilnd_mdd_seq_seek(gseq, next);
+       if (rc != 0) {
+               return NULL;
+       }
+       *pos = next;
+       return gseq;
+}
+
+static int
+kgnilnd_mdd_seq_show(struct seq_file *s, void *iter)
+{
+       kgn_mdd_seq_iter_t     *gseq = iter;
+       kgn_tx_t               *tx;
+       __u64                   nob;
+       __u32                   physnop;
+       int                     id;
+       int                     buftype;
+       gni_mem_handle_t        hndl;
+
+       if (gseq->gmdd_off == 0) {
+               seq_printf(s, "%s %22s %16s %8s %8s %37s\n",
+                       "tx", "tx_id", "nob", "physnop",
+                       "buftype", "mem handle");
+               return 0;
+       }
+
+       tx = gseq->gmdd_tx;
+       LASSERT(tx != NULL);
+
+       id = tx->tx_id.txe_smsg_id;
+       nob = tx->tx_nob;
+       physnop = tx->tx_phys_npages;
+       buftype = tx->tx_buftype;
+       hndl.qword1 = tx->tx_map_key.qword1;
+       hndl.qword2 = tx->tx_map_key.qword2;
+
+       seq_printf(s, "%p %x %16"LPF64"u %8d %#8x "LPX64"."LPX64"\n",
+               tx, id, nob, physnop, buftype,
+               hndl.qword1, hndl.qword2);
+
+       return 0;
+}
+
+static struct seq_operations kgn_mdd_sops = {
+       .start = kgnilnd_mdd_seq_start,
+       .stop  = kgnilnd_mdd_seq_stop,
+       .next  = kgnilnd_mdd_seq_next,
+       .show  = kgnilnd_mdd_seq_show,
+};
+
+static int
+kgnilnd_mdd_seq_open(struct inode *inode, struct file *file)
+{
+       struct seq_file       *sf;
+       int                    rc;
+
+       rc = seq_open(file, &kgn_mdd_sops);
+       if (rc == 0) {
+               sf = file->private_data;
+
+               /* stays NULL until kgnilnd_mdd_seq_start() installs the iterator */
+               sf->private = NULL;
+       }
+       return rc;
+}
+
+static struct file_operations kgn_mdd_fops = {
+       .owner   = THIS_MODULE,
+       .open    = kgnilnd_mdd_seq_open,
+       .read    = seq_read,
+       .llseek  = seq_lseek,
+       .release = seq_release,
+};
+
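+/* seq_file iterator state for the SMSG proc file: walks device 0's FMA
+ * memory block list (gnd_fma_buffs).  gsmsg_version snapshots
+ * gnd_fmablk_vers so a list change between reads is reported as -ESTALE. */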
+typedef struct {
+       __u64                   gsmsg_version;
+       kgn_device_t           *gsmsg_dev;
+       kgn_fma_memblock_t     *gsmsg_fmablk;
+       loff_t                  gsmsg_off;
+} kgn_smsg_seq_iter_t;
+
+int
+kgnilnd_smsg_seq_seek(kgn_smsg_seq_iter_t *gseq, loff_t off)
+{
+       kgn_fma_memblock_t             *fmablk;
+       kgn_device_t                   *dev;
+       struct list_head               *r;
+       loff_t                          here;
+       int                             rc = 0;
+
+       /* offset 0 is the header, so we start real entries at
+        * here == off == 1 */
+       if (off == 0) {
+               gseq->gsmsg_fmablk = NULL;
+               gseq->gsmsg_off = 0;
+               return 0;
+       }
+
+       fmablk = gseq->gsmsg_fmablk;
+       dev = gseq->gsmsg_dev;
+
+       spin_lock(&dev->gnd_fmablk_lock);
+
+       if (fmablk != NULL &&
+               gseq->gsmsg_version != atomic_read(&dev->gnd_fmablk_vers)) {
+               /* list changed */
+               rc = -ESTALE;
+               goto out;
+       }
+
+       if (fmablk == NULL || gseq->gsmsg_off > off) {
+               /* search from start */
+               r = dev->gnd_fma_buffs.next;
+               here = 1;
+       } else {
+               /* continue current search */
+               r = &fmablk->gnm_bufflist;
+               here = gseq->gsmsg_off;
+       }
+
+       gseq->gsmsg_version = atomic_read(&dev->gnd_fmablk_vers);
+       gseq->gsmsg_off = off;
+
+       while (r != &dev->gnd_fma_buffs) {
+               kgn_fma_memblock_t      *t;
+
+               t = list_entry(r, kgn_fma_memblock_t, gnm_bufflist);
+
+               if (here == off) {
+                       gseq->gsmsg_fmablk = t;
+                       rc = 0;
+                       goto out;
+               }
+               r = r->next;
+               here++;
+       }
+
+       gseq->gsmsg_fmablk = NULL;
+       rc = -ENOENT;
+out:
+       spin_unlock(&dev->gnd_fmablk_lock);
+       return rc;
+}
+
+static void *
+kgnilnd_smsg_seq_start(struct seq_file *s, loff_t *pos)
+{
+
+       kgn_smsg_seq_iter_t     *gseq;
+       int                      rc;
+
+       if (kgnilnd_data.kgn_init < GNILND_INIT_ALL) {
+               return NULL;
+       }
+
+       LIBCFS_ALLOC(gseq, sizeof(*gseq));
+       if (gseq == NULL) {
+               CERROR("could not allocate smsg sequence iterator\n");
+               return NULL;
+       }
+
+       /* only doing device 0 for now */
+       gseq->gsmsg_dev = &kgnilnd_data.kgn_devices[0];
+       gseq->gsmsg_fmablk = NULL;
+       rc = kgnilnd_smsg_seq_seek(gseq, *pos);
+       if (rc == 0)
+               return gseq;
+
+       LIBCFS_FREE(gseq, sizeof(*gseq));
+       return NULL;
+}
+
+static void
+kgnilnd_smsg_seq_stop(struct seq_file *s, void *iter)
+{
+       kgn_smsg_seq_iter_t     *gseq = iter;
+
+       if (gseq != NULL)
+               LIBCFS_FREE(gseq, sizeof(*gseq));
+}
+
+static void *
+kgnilnd_smsg_seq_next(struct seq_file *s, void *iter, loff_t *pos)
+{
+       kgn_smsg_seq_iter_t    *gseq = iter;
+       int                     rc;
+       loff_t                  next = *pos + 1;
+
+       rc = kgnilnd_smsg_seq_seek(gseq, next);
+       if (rc != 0) {
+               LIBCFS_FREE(gseq, sizeof(*gseq));
+               return NULL;
+       }
+       *pos = next;
+       return gseq;
+}
+
+static int
+kgnilnd_smsg_seq_show(struct seq_file *s, void *iter)
+{
+       kgn_smsg_seq_iter_t    *gseq = iter;
+       kgn_fma_memblock_t     *fmablk;
+       kgn_device_t           *dev;
+       int                     avail_mboxs, held_mboxs, num_mboxs;
+       unsigned int            blk_size;
+       int                     live;
+       kgn_fmablk_state_t      state;
+       gni_mem_handle_t        hndl;
+
+       if (gseq->gsmsg_off == 0) {
+               seq_printf(s, "%5s %4s %6s/%5s/%5s %9s %18s %37s\n",
+                       "blk#", "type", "avail", "held", "total", "size",
+                       "fmablk", "mem handle");
+               return 0;
+       }
+
+       fmablk = gseq->gsmsg_fmablk;
+       dev = gseq->gsmsg_dev;
+       LASSERT(fmablk != NULL);
+
+       spin_lock(&dev->gnd_fmablk_lock);
+
+       if (gseq->gsmsg_version != atomic_read(&dev->gnd_fmablk_vers)) {
+               /* list changed */
+               spin_unlock(&dev->gnd_fmablk_lock);
+               return -ESTALE;
+       }
+
+       live = fmablk->gnm_hold_timeout == 0;
+       /* none are available if it isn't live... */
+       avail_mboxs = live ? fmablk->gnm_avail_mboxs : 0;
+       held_mboxs = fmablk->gnm_held_mboxs;
+       num_mboxs = fmablk->gnm_num_mboxs;
+       blk_size = fmablk->gnm_blk_size;
+       state = fmablk->gnm_state;
+       hndl.qword1 = fmablk->gnm_hndl.qword1;
+       hndl.qword2 = fmablk->gnm_hndl.qword2;
+
+       spin_unlock(&dev->gnd_fmablk_lock);
+
+       if (live) {
+               seq_printf(s, "%5d %4s %6d/%5d/%5d %9d %18p   "LPX64"."LPX64"\n",
+                          (int) gseq->gsmsg_off, kgnilnd_fmablk_state2str(state),
+                          avail_mboxs, held_mboxs, num_mboxs, blk_size,
+                          fmablk, hndl.qword1, hndl.qword2);
+       } else {
+               seq_printf(s, "%5d %4s %6d/%5d/%5d %9d %18p %37s\n",
+                          (int) gseq->gsmsg_off, kgnilnd_fmablk_state2str(state),
+                          avail_mboxs, held_mboxs, num_mboxs, blk_size,
+                          fmablk, "PURGATORY.HOLD");
+       }
+
+       return 0;
+}
+
+static struct seq_operations kgn_smsg_sops = {
+       .start = kgnilnd_smsg_seq_start,
+       .stop  = kgnilnd_smsg_seq_stop,
+       .next  = kgnilnd_smsg_seq_next,
+       .show  = kgnilnd_smsg_seq_show,
+};
+
+static int
+kgnilnd_smsg_seq_open(struct inode *inode, struct file *file)
+{
+       struct proc_dir_entry *dp = PDE(inode);
+       struct seq_file       *sf;
+       int                    rc;
+
+       rc = seq_open(file, &kgn_smsg_sops);
+       if (rc == 0) {
+               sf = file->private_data;
+               sf->private = dp->data;
+       }
+
+       return rc;
+}
+
+static struct file_operations kgn_smsg_fops = {
+       .owner   = THIS_MODULE,
+       .open    = kgnilnd_smsg_seq_open,
+       .read    = seq_read,
+       .llseek  = seq_lseek,
+       .release = seq_release,
+};
+
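+/* seq_file iterator state for the conn proc file: walks the connection
+ * hash (kgn_conns) bucket by bucket.  gconn_version snapshots
+ * kgn_conn_version so a list change between reads is reported as -ESTALE. */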
+typedef struct {
+       __u64                   gconn_version;
+       struct list_head       *gconn_list;
+       kgn_conn_t             *gconn_conn;
+       loff_t                  gconn_off;
+       int                     gconn_hashidx;
+} kgn_conn_seq_iter_t;
+
+int
+kgnilnd_conn_seq_seek(kgn_conn_seq_iter_t *gseq, loff_t off)
+{
+       struct list_head       *list, *tmp;
+       loff_t                  here = 0;
+       int                     rc = 0;
+
+       if (off == 0) {
+               gseq->gconn_hashidx = 0;
+               gseq->gconn_list = NULL;
+       }
+
+       if (off > atomic_read(&kgnilnd_data.kgn_nconns)) {
+               gseq->gconn_list = NULL;
+               rc = -ENOENT;
+       }
+
+       read_lock(&kgnilnd_data.kgn_peer_conn_lock);
+       if (gseq->gconn_list != NULL &&
+               gseq->gconn_version != kgnilnd_data.kgn_conn_version) {
+               /* list changed */
+               rc = -ESTALE;
+               goto out;
+       }
+
+       if ((gseq->gconn_list == NULL) ||
+               (gseq->gconn_off > off) ||
+               (gseq->gconn_hashidx >= *kgnilnd_tunables.kgn_peer_hash_size)) {
+               /* search from start */
+               gseq->gconn_hashidx = 0;
+               list = &kgnilnd_data.kgn_conns[gseq->gconn_hashidx];
+               here = 0;
+       } else {
+               /* continue current search */
+               list = gseq->gconn_list;
+       }
+
+       gseq->gconn_version = kgnilnd_data.kgn_conn_version;
+       gseq->gconn_off = off;
+
+start_list:
+
+       list_for_each(tmp, list) {
+               if (here == off) {
+                       kgn_conn_t *conn;
+                       conn = list_entry(tmp, kgn_conn_t, gnc_hashlist);
+                       gseq->gconn_conn = conn;
+                       rc = 0;
+                       goto out;
+               }
+               here++;
+       }
+       /* if we got through this hash bucket with 'off' still to go, try next*/
+       gseq->gconn_hashidx++;
+       if ((here <= off) &&
+               (gseq->gconn_hashidx < *kgnilnd_tunables.kgn_peer_hash_size)) {
+               list = &kgnilnd_data.kgn_conns[gseq->gconn_hashidx];
+               goto start_list;
+       }
+
+       gseq->gconn_list = NULL;
+       rc = -ENOENT;
+out:
+       read_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+       return rc;
+}
+
+static void *
+kgnilnd_conn_seq_start(struct seq_file *s, loff_t *pos)
+{
+
+       kgn_conn_seq_iter_t     *gseq;
+       int                      rc;
+
+       if (kgnilnd_data.kgn_init < GNILND_INIT_ALL) {
+               return NULL;
+       }
+
+       LIBCFS_ALLOC(gseq, sizeof(*gseq));
+       if (gseq == NULL) {
+               CERROR("could not allocate conn sequence iterator\n");
+               return NULL;
+       }
+
+       /* only doing device 0 for now */
+       gseq->gconn_list = NULL;
+       rc = kgnilnd_conn_seq_seek(gseq, *pos);
+       if (rc == 0)
+               return gseq;
+
+       LIBCFS_FREE(gseq, sizeof(*gseq));
+       return NULL;
+}
+
+static void
+kgnilnd_conn_seq_stop(struct seq_file *s, void *iter)
+{
+       kgn_conn_seq_iter_t     *gseq = iter;
+
+       if (gseq != NULL)
+               LIBCFS_FREE(gseq, sizeof(*gseq));
+}
+
+static void *
+kgnilnd_conn_seq_next(struct seq_file *s, void *iter, loff_t *pos)
+{
+       kgn_conn_seq_iter_t    *gseq = iter;
+       int                     rc;
+       loff_t                  next = *pos + 1;
+
+       rc = kgnilnd_conn_seq_seek(gseq, next);
+       if (rc != 0) {
+               LIBCFS_FREE(gseq, sizeof(*gseq));
+               return NULL;
+       }
+       *pos = next;
+       return gseq;
+}
+
+static int
+kgnilnd_conn_seq_show(struct seq_file *s, void *iter)
+{
+       kgn_conn_seq_iter_t    *gseq = iter;
+       kgn_peer_t             *peer = NULL;
+       kgn_conn_t             *conn;
+
+       /* there is no header data for conns, so offset 0 is the first
+        * real entry. */
+
+       conn = gseq->gconn_conn;
+       LASSERT(conn != NULL);
+
+       read_lock(&kgnilnd_data.kgn_peer_conn_lock);
+       if (gseq->gconn_list != NULL &&
+               gseq->gconn_version != kgnilnd_data.kgn_conn_version) {
+               /* list changed */
+               read_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+               return -ESTALE;
+       }
+
+       /* instead of saving off the data, just refcount */
+       kgnilnd_conn_addref(conn);
+       if (conn->gnc_peer) {
+               /* don't use link - after unlock it could get nuked */
+               peer = conn->gnc_peer;
+               kgnilnd_peer_addref(peer);
+       }
+
+       read_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+
+       seq_printf(s, "%p->%s [%d] q %d/%d/%d "
+               "tx sq %u %dms/%dms "
+               "rx sq %u %dms/%dms "
+               "noop r/s %d/%d w/s/cq %lds/%lds/%lds "
+               "sched a/d %lds/%lds "
+               "tx_re "LPD64" TO %ds %s\n",
+               conn, peer ? libcfs_nid2str(peer->gnp_nid) : "<?>",
+               atomic_read(&conn->gnc_refcount),
+               kgnilnd_count_list(&conn->gnc_fmaq),
+               atomic_read(&conn->gnc_nlive_fma),
+               atomic_read(&conn->gnc_nlive_rdma),
+               conn->gnc_tx_seq,
+               jiffies_to_msecs(jiffies - conn->gnc_last_tx),
+               jiffies_to_msecs(jiffies - conn->gnc_last_tx_cq),
+               conn->gnc_rx_seq,
+               jiffies_to_msecs(jiffies - conn->gnc_last_rx),
+               jiffies_to_msecs(jiffies - conn->gnc_last_rx_cq),
+               atomic_read(&conn->gnc_reaper_noop),
+               atomic_read(&conn->gnc_sched_noop),
+               cfs_duration_sec(jiffies - conn->gnc_last_noop_want),
+               cfs_duration_sec(jiffies - conn->gnc_last_noop_sent),
+               cfs_duration_sec(jiffies - conn->gnc_last_noop_cq),
+               cfs_duration_sec(jiffies - conn->gnc_last_sched_ask),
+               cfs_duration_sec(jiffies - conn->gnc_last_sched_do),
+               conn->gnc_tx_retrans, conn->gnc_timeout,
+               kgnilnd_conn_state2str(conn));
+
+       if (peer)
+               kgnilnd_peer_decref(peer);
+       kgnilnd_conn_decref(conn);
+
+       return 0;
+}
+
+static struct seq_operations kgn_conn_sops = {
+       .start = kgnilnd_conn_seq_start,
+       .stop  = kgnilnd_conn_seq_stop,
+       .next  = kgnilnd_conn_seq_next,
+       .show  = kgnilnd_conn_seq_show,
+};
+
+static int
+kgnilnd_conn_seq_open(struct inode *inode, struct file *file)
+{
+       struct proc_dir_entry *dp = PDE(inode);
+       struct seq_file       *sf;
+       int                    rc;
+
+       rc = seq_open(file, &kgn_conn_sops);
+       if (rc == 0) {
+               sf = file->private_data;
+               sf->private = dp->data;
+       }
+
+       return rc;
+}
+
+static struct file_operations kgn_conn_fops = {
+       .owner   = THIS_MODULE,
+       .open    = kgnilnd_conn_seq_open,
+       .read    = seq_read,
+       .llseek  = seq_lseek,
+       .release = seq_release,
+};
+
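+/* seq_file iterator state for the peer proc file: the same bucket-by-bucket
+ * walk as the conn iterator, but over the peer hash (kgn_peers) and
+ * versioned against kgn_peer_version. */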
+typedef struct {
+       __u64                   gpeer_version;
+       struct list_head       *gpeer_list;
+       kgn_peer_t             *gpeer_peer;
+       loff_t                  gpeer_off;
+       int                     gpeer_hashidx;
+} kgn_peer_seq_iter_t;
+
+int
+kgnilnd_peer_seq_seek(kgn_peer_seq_iter_t *gseq, loff_t off)
+{
+       struct list_head       *list, *tmp;
+       loff_t                  here = 0;
+       int                     rc = 0;
+
+       if (off == 0) {
+               gseq->gpeer_hashidx = 0;
+               gseq->gpeer_list = NULL;
+       }
+
+       if (off > atomic_read(&kgnilnd_data.kgn_npeers)) {
+               gseq->gpeer_list = NULL;
+               rc = -ENOENT;
+       }
+
+       read_lock(&kgnilnd_data.kgn_peer_conn_lock);
+       if (gseq->gpeer_list != NULL &&
+               gseq->gpeer_version != kgnilnd_data.kgn_peer_version) {
+               /* list changed */
+               rc = -ESTALE;
+               goto out;
+       }
+
+       if ((gseq->gpeer_list == NULL) ||
+               (gseq->gpeer_off > off) ||
+               (gseq->gpeer_hashidx >= *kgnilnd_tunables.kgn_peer_hash_size)) {
+               /* search from start */
+               gseq->gpeer_hashidx = 0;
+               list = &kgnilnd_data.kgn_peers[gseq->gpeer_hashidx];
+               here = 0;
+       } else {
+               /* continue current search */
+               list = gseq->gpeer_list;
+       }
+
+       gseq->gpeer_version = kgnilnd_data.kgn_peer_version;
+       gseq->gpeer_off = off;
+
+start_list:
+
+       list_for_each(tmp, list) {
+               if (here == off) {
+                       kgn_peer_t *peer;
+                       peer = list_entry(tmp, kgn_peer_t, gnp_list);
+                       gseq->gpeer_peer = peer;
+                       rc = 0;
+                       goto out;
+               }
+               here++;
+       }
+       /* if we got through this hash bucket with 'off' still to go, try next*/
+       gseq->gpeer_hashidx++;
+       if ((here <= off) &&
+               (gseq->gpeer_hashidx < *kgnilnd_tunables.kgn_peer_hash_size)) {
+               list = &kgnilnd_data.kgn_peers[gseq->gpeer_hashidx];
+               goto start_list;
+       }
+
+       gseq->gpeer_list = NULL;
+       rc = -ENOENT;
+out:
+       read_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+       return rc;
+}
+
+static void *
+kgnilnd_peer_seq_start(struct seq_file *s, loff_t *pos)
+{
+
+       kgn_peer_seq_iter_t     *gseq;
+       int                      rc;
+
+       if (kgnilnd_data.kgn_init < GNILND_INIT_ALL) {
+               return NULL;
+       }
+
+       LIBCFS_ALLOC(gseq, sizeof(*gseq));
+       if (gseq == NULL) {
+               CERROR("could not allocate peer sequence iterator\n");
+               return NULL;
+       }
+
+       /* only doing device 0 for now */
+       gseq->gpeer_list = NULL;
+       rc = kgnilnd_peer_seq_seek(gseq, *pos);
+       if (rc == 0)
+               return gseq;
+
+       LIBCFS_FREE(gseq, sizeof(*gseq));
+       return NULL;
+}
+
+static void
+kgnilnd_peer_seq_stop(struct seq_file *s, void *iter)
+{
+       kgn_peer_seq_iter_t     *gseq = iter;
+
+       if (gseq != NULL)
+               LIBCFS_FREE(gseq, sizeof(*gseq));
+}
+
+static void *
+kgnilnd_peer_seq_next(struct seq_file *s, void *iter, loff_t *pos)
+{
+       kgn_peer_seq_iter_t    *gseq = iter;
+       int                     rc;
+       loff_t                  next = *pos + 1;
+
+       rc = kgnilnd_peer_seq_seek(gseq, next);
+       if (rc != 0) {
+               LIBCFS_FREE(gseq, sizeof(*gseq));
+               return NULL;
+       }
+       *pos = next;
+       return gseq;
+}
+
+static int
+kgnilnd_peer_seq_show(struct seq_file *s, void *iter)
+{
+       kgn_peer_seq_iter_t    *gseq = iter;
+       kgn_peer_t             *peer;
+       kgn_conn_t             *conn;
+       char                   conn_str;
+       int                    purg_count = 0;
+       /* there is no header data for peers, so offset 0 is the first
+        * real entry. */
+
+       peer = gseq->gpeer_peer;
+       LASSERT(peer != NULL);
+
+       read_lock(&kgnilnd_data.kgn_peer_conn_lock);
+       if (gseq->gpeer_list != NULL &&
+               gseq->gpeer_version != kgnilnd_data.kgn_peer_version) {
+               /* list changed */
+               read_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+               return -ESTALE;
+       }
+
+       /* instead of saving off the data, just refcount */
+       kgnilnd_peer_addref(peer);
+       conn = kgnilnd_find_conn_locked(peer);
+
+       if (peer->gnp_connecting) {
+               conn_str = 'S';
+       } else if (conn != NULL) {
+               conn_str = 'C';
+       } else {
+               conn_str = 'D';
+       }
+
+       list_for_each_entry(conn, &peer->gnp_conns, gnc_list) {
+               if (conn->gnc_in_purgatory) {
+                       purg_count++;
+               }
+       }
+
+       read_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+
+       seq_printf(s, "%p->%s [%d] NIC 0x%x q %d conn %c purg %d "
+               "last %d@%dms dgram %d@%dms "
+               "reconn %dms to %lus\n",
+               peer, libcfs_nid2str(peer->gnp_nid),
+               atomic_read(&peer->gnp_refcount),
+               peer->gnp_host_id,
+               kgnilnd_count_list(&peer->gnp_tx_queue),
+               conn_str,
+               purg_count,
+               peer->gnp_last_errno,
+               jiffies_to_msecs(jiffies - peer->gnp_last_alive),
+               peer->gnp_last_dgram_errno,
+               jiffies_to_msecs(jiffies - peer->gnp_last_dgram_time),
+               peer->gnp_reconnect_interval != 0
+                       ? jiffies_to_msecs(jiffies - peer->gnp_reconnect_time)
+                       : 0,
+               peer->gnp_reconnect_interval);
+
+       kgnilnd_peer_decref(peer);
+
+       return 0;
+}
+
+static struct seq_operations kgn_peer_sops = {
+       .start = kgnilnd_peer_seq_start,
+       .stop  = kgnilnd_peer_seq_stop,
+       .next  = kgnilnd_peer_seq_next,
+       .show  = kgnilnd_peer_seq_show,
+};
+
+static int
+kgnilnd_peer_seq_open(struct inode *inode, struct file *file)
+{
+       struct proc_dir_entry *dp = PDE(inode);
+       struct seq_file       *sf;
+       int                    rc;
+
+       rc = seq_open(file, &kgn_peer_sops);
+       if (rc == 0) {
+               sf = file->private_data;
+               sf->private = dp->data;
+       }
+
+       return rc;
+}
+
+static struct file_operations kgn_peer_fops = {
+       .owner   = THIS_MODULE,
+       .open    = kgnilnd_peer_seq_open,
+       .read    = seq_read,
+       .llseek  = seq_lseek,
+       .release = seq_release,
+};
+
+static struct proc_dir_entry *kgn_proc_root;
+
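+/* create the per-LND proc directory plus the cksum_test, stats, mdd, smsg,
+ * conn and peer entries.  On any failure the entries created so far, and
+ * the directory itself, are removed again. */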
+void
+kgnilnd_proc_init(void)
+{
+       struct proc_dir_entry *pde;
+       int             rc = 0;
+       ENTRY;
+
+       /* setup dir */
+       kgn_proc_root = proc_mkdir(libcfs_lnd2modname(GNILND), NULL);
+       if (kgn_proc_root == NULL) {
+               CERROR("couldn't create proc dir %s\n",
+                       libcfs_lnd2modname(GNILND));
+               return;
+       }
+
+       /* Initialize CKSUM_TEST */
+       pde = create_proc_entry(GNILND_PROC_CKSUM_TEST, 0200, kgn_proc_root);
+       if (pde == NULL) {
+               CERROR("couldn't create proc entry %s\n", GNILND_PROC_CKSUM_TEST);
+               rc = -ENOENT;
+               GOTO(remove_dir, rc);
+       }
+
+       pde->data = NULL;
+       pde->write_proc = kgnilnd_proc_cksum_test_write;
+
+       /* Initialize STATS */
+       pde = create_proc_entry(GNILND_PROC_STATS, 0644, kgn_proc_root);
+       if (pde == NULL) {
+               CERROR("couldn't create proc entry %s\n", GNILND_PROC_STATS);
+               rc = -ENOENT;
+               GOTO(remove_test, rc);
+       }
+
+       pde->data = NULL;
+       pde->read_proc = kgnilnd_proc_stats_read;
+       pde->write_proc = kgnilnd_proc_stats_write;
+
+       /* Initialize MDD */
+       pde = create_proc_entry(GNILND_PROC_MDD, 0444, kgn_proc_root);
+       if (pde == NULL) {
+               CERROR("couldn't create proc entry %s\n", GNILND_PROC_MDD);
+               rc = -ENOENT;
+               GOTO(remove_stats, rc);
+       }
+
+       pde->data = NULL;
+       pde->proc_fops = &kgn_mdd_fops;
+
+       /* Initialize SMSG */
+       pde = create_proc_entry(GNILND_PROC_SMSG, 0444, kgn_proc_root);
+       if (pde == NULL) {
+               CERROR("couldn't create proc entry %s\n", GNILND_PROC_SMSG);
+               rc = -ENOENT;
+               GOTO(remove_mdd, rc);
+       }
+
+       pde->data = NULL;
+       pde->proc_fops = &kgn_smsg_fops;
+
+       /* Initialize CONN */
+       pde = create_proc_entry(GNILND_PROC_CONN, 0444, kgn_proc_root);
+       if (pde == NULL) {
+               CERROR("couldn't create proc entry %s\n", GNILND_PROC_CONN);
+               rc = -ENOENT;
+               GOTO(remove_smsg, rc);
+       }
+
+       pde->data = NULL;
+       pde->proc_fops = &kgn_conn_fops;
+
+       /* Initialize PEER */
+       pde = create_proc_entry(GNILND_PROC_PEER, 0444, kgn_proc_root);
+       if (pde == NULL) {
+               CERROR("couldn't create proc entry %s\n", GNILND_PROC_PEER);
+               rc = -ENOENT;
+               GOTO(remove_conn, rc);
+       }
+
+       pde->data = NULL;
+       pde->proc_fops = &kgn_peer_fops;
+       RETURN_EXIT;
+
+remove_conn:
+       remove_proc_entry(GNILND_PROC_CONN, kgn_proc_root);
+remove_smsg:
+       remove_proc_entry(GNILND_PROC_SMSG, kgn_proc_root);
+remove_mdd:
+       remove_proc_entry(GNILND_PROC_MDD, kgn_proc_root);
+remove_stats:
+       remove_proc_entry(GNILND_PROC_STATS, kgn_proc_root);
+remove_test:
+       remove_proc_entry(GNILND_PROC_CKSUM_TEST, kgn_proc_root);
+remove_dir:
+       remove_proc_entry(kgn_proc_root->name, NULL);
+
+       RETURN_EXIT;
+}
+
+void
+kgnilnd_proc_fini(void)
+{
+       remove_proc_entry(GNILND_PROC_PEER, kgn_proc_root);
+       remove_proc_entry(GNILND_PROC_CONN, kgn_proc_root);
+       remove_proc_entry(GNILND_PROC_MDD, kgn_proc_root);
+       remove_proc_entry(GNILND_PROC_SMSG, kgn_proc_root);
+       remove_proc_entry(GNILND_PROC_STATS, kgn_proc_root);
+       remove_proc_entry(GNILND_PROC_CKSUM_TEST, kgn_proc_root);
+       remove_proc_entry(kgn_proc_root->name, NULL);
+}
diff --git a/lnet/klnds/gnilnd/gnilnd_stack.c b/lnet/klnds/gnilnd/gnilnd_stack.c
new file mode 100644 (file)
index 0000000..10ae493
--- /dev/null
@@ -0,0 +1,564 @@
+/*
+ * Copyright (C) 2012 Cray, Inc.
+ *
+ *   Author: Nic Henke <nic@cray.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+#include "gnilnd.h"
+
+/* Advance all timeouts by nap_time seconds. */
+void
+kgnilnd_bump_timeouts(__u32 nap_time, char *reason)
+{
+       int                     i, j;
+       kgn_peer_t             *peer;
+       kgn_conn_t             *conn;
+       kgn_tx_t               *tx;
+       kgn_device_t           *dev;
+       kgn_dgram_t            *dgram;
+
+       LCONSOLE_INFO("%s: bumping all timeouts by %ds\n", reason, nap_time);
+
+       LASSERTF(GNILND_IS_QUIESCED, "gnilnd not quiesced %d != %d\n",
+                atomic_read(&kgnilnd_data.kgn_nquiesce),
+                atomic_read(&kgnilnd_data.kgn_nthreads));
+
+       /* requiring that the threads are paused ensures a couple of things:
+        * - combined code paths for stack reset and quiesce event as stack reset
+        *   runs with the threads paused
+        * - prevents traffic to the Gemini during a quiesce period
+        * - reduces the locking requirements
+       */
+
+       for (i = 0; i < *kgnilnd_tunables.kgn_peer_hash_size; i++) {
+               list_for_each_entry(peer, &kgnilnd_data.kgn_peers[i], gnp_list) {
+
+                       /* we can reconnect again at any time */
+                       peer->gnp_reconnect_time = jiffies;
+                       /* reset now that network is healthy */
+                       peer->gnp_reconnect_interval = 0;
+                       /* tell LNet dude is still alive */
+                       kgnilnd_peer_alive(peer);
+
+                       list_for_each_entry(tx, &peer->gnp_tx_queue, tx_list) {
+                               tx->tx_qtime = jiffies;
+                       }
+
+                       list_for_each_entry(conn, &peer->gnp_conns, gnc_list) {
+                               unsigned long           timeout;
+
+                               timeout = cfs_time_seconds(conn->gnc_timeout);
+
+                               /* bump last_rx/last_rx_cq on all conns - including
+                                * closed ones, this will have the effect of
+                                * bumping the purgatory timers for those */
+                               conn->gnc_last_rx = conn->gnc_last_rx_cq = jiffies;
+
+                               /* we don't timeout based on old gnc_last_tx, so
+                                * we'll back it up and schedule the conn to trigger
+                                * a NOOP */
+                               conn->gnc_last_tx = jiffies - timeout;
+                               kgnilnd_schedule_conn(conn);
+                       }
+               }
+       }
+
+       for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) {
+               dev = &kgnilnd_data.kgn_devices[i];
+               for (j = 0; j < (*kgnilnd_tunables.kgn_peer_hash_size - 1); j++) {
+                       list_for_each_entry(dgram, &dev->gnd_dgrams[j], gndg_list) {
+                               dgram->gndg_post_time = jiffies;
+                       }
+               }
+       }
+}
+
+/* Quiesce or wake up the stack.  The caller must hold the kgn_quiesce_sem semaphore
+ * on entry, which holds off any pending stack shutdown.   */
+void
+kgnilnd_quiesce_wait(char *reason)
+{
+       int             i;
+
+       if (kgnilnd_data.kgn_quiesce_trigger) {
+               unsigned long   quiesce_deadline, quiesce_to;
+               /* FREEZE TAG!!!! */
+
+               /* morning sunshine */
+               spin_lock(&kgnilnd_data.kgn_reaper_lock);
+               wake_up_all(&kgnilnd_data.kgn_reaper_waitq);
+               spin_unlock(&kgnilnd_data.kgn_reaper_lock);
+
+               for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) {
+                       kgn_device_t *dev = &kgnilnd_data.kgn_devices[i];
+
+                       wake_up_all(&dev->gnd_waitq);
+                       wake_up_all(&dev->gnd_dgram_waitq);
+                       wake_up_all(&dev->gnd_dgping_waitq);
+               }
+
+               /* we'll wait for 10x the timeout for the threads to pause */
+               quiesce_to = cfs_time_seconds(*kgnilnd_tunables.kgn_timeout * 10);
+               quiesce_deadline = (long) jiffies + quiesce_to;
+
+               /* wait for everyone to check-in as quiesced */
+               i = 1;
+               while (!GNILND_IS_QUIESCED) {
+                       i++;
+                       LCONSOLE((((i) & (-i)) == i) ? D_WARNING : D_NET,
+                                "%s: Waiting for %d threads to pause\n",
+                                reason,
+                                atomic_read(&kgnilnd_data.kgn_nthreads) -
+                                atomic_read(&kgnilnd_data.kgn_nquiesce));
+                       CFS_RACE(CFS_FAIL_GNI_QUIESCE_RACE);
+                       cfs_pause(cfs_time_seconds(1 * i));
+
+                       LASSERTF(quiesce_deadline > jiffies,
+                                "couldn't quiesce threads in %lu seconds, falling over now\n",
+                                cfs_duration_sec(quiesce_to));
+               }
+
+               LCONSOLE_WARN("%s: All threads paused!\n", reason);
+               /* XXX Nic: Is there a set of counters we can grab here to
+                * ensure that there is no traffic until quiesce is over? */
+       } else {
+               /* GO! GO! GO! */
+
+               for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) {
+                       kgn_device_t *dev = &kgnilnd_data.kgn_devices[i];
+                       kgnilnd_schedule_dgram(dev);
+               }
+
+               /* wait for everyone to check-in as running - they will be spinning
+                * and looking, so no need to poke any waitq */
+               i = 1;
+               while (atomic_read(&kgnilnd_data.kgn_nquiesce) > 0) {
+                       i++;
+                       LCONSOLE((((i) & (-i)) == i) ? D_WARNING : D_NET,
+                                "%s: Waiting for %d threads to wake up\n",
+                                 reason,
+                                 atomic_read(&kgnilnd_data.kgn_nquiesce));
+                       cfs_pause(cfs_time_seconds(1 * i));
+               }
+
+               LCONSOLE_WARN("%s: All threads awake!\n", reason);
+       }
+}
+
+/* Reset the stack: quiesce all threads, tear down every conn, datagram and
+ * HW resource on the known devices, bring the devices back up, and bump all
+ * timeouts by however long the reset took. */
+void
+kgnilnd_reset_stack(void)
+{
+       int              i, rc = 0;
+       kgn_net_t       *net;
+       kgn_peer_t      *peer, *peerN;
+       LIST_HEAD        (souls);
+       char            *reason = "critical hardware error";
+       __u32            seconds;
+       unsigned long    start, end;
+       ENTRY;
+
+       /* Race with del_peer and its atomics */
+       CFS_RACE(CFS_FAIL_GNI_RACE_RESET);
+
+       if (kgnilnd_data.kgn_init != GNILND_INIT_ALL) {
+               CERROR("can't reset the stack, gnilnd is not initialized\n");
+               RETURN_EXIT;
+       }
+
+       /* First make sure we are not already quiesced - we panic if so,
+        * as that could leave software in a bad state */
+       LASSERTF(kgnilnd_data.kgn_quiesce_trigger == GNILND_QUIESCE_IDLE,
+               "can't reset the stack, already doing so: trigger %d\n",
+                kgnilnd_data.kgn_quiesce_trigger);
+
+       set_mb(kgnilnd_data.kgn_quiesce_trigger, GNILND_QUIESCE_RESET);
+
+       /* wake up the dgram waitq thread - but only after the trigger is set,
+        * to make sure it goes into quiesce */
+       CFS_RACE(CFS_FAIL_GNI_WC_DGRAM_FREE);
+       /* same for the scheduler that is dropping state transitions */
+       CFS_RACE(CFS_FAIL_GNI_DROP_CLOSING);
+       CFS_RACE(CFS_FAIL_GNI_DROP_DESTROY_EP);
+
+       kgnilnd_quiesce_wait(reason);
+
+       start = jiffies;
+
+       kgnilnd_data.kgn_in_reset = 1;
+       kgnilnd_data.kgn_nresets++;
+       LCONSOLE_WARN("%s: resetting all resources (count %d)\n",
+                     reason, kgnilnd_data.kgn_nresets);
+
+       for (i = 0; i < *kgnilnd_tunables.kgn_net_hash_size; i++) {
+               list_for_each_entry(net, &kgnilnd_data.kgn_nets[i], gnn_list) {
+                       rc = kgnilnd_cancel_net_dgrams(net);
+                       LASSERTF(rc == 0, "couldn't cleanup datagrams: %d\n", rc);
+               }
+       }
+
+       /* error -ENOTRECOVERABLE is stack reset */
+       kgnilnd_del_conn_or_peer(NULL, LNET_NID_ANY, GNILND_DEL_CONN, -ENOTRECOVERABLE);
+
+       for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) {
+               kgn_device_t    *dev = &kgnilnd_data.kgn_devices[i];
+               kgnilnd_cancel_wc_dgrams(dev);
+               kgnilnd_wait_for_canceled_dgrams(dev);
+       }
+
+       /* manually do some conn processing ala kgnilnd_process_conns */
+       for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) {
+               kgn_device_t    *dev = &kgnilnd_data.kgn_devices[i];
+               kgn_conn_t      *conn;
+               int              conn_sched;
+
+               /* go find all the closed conns that need to be nuked - the
+                * scheduler thread isn't running to do this for us */
+
+               CDEBUG(D_NET, "will try to clear up %d ready_conns\n",
+                       kgnilnd_count_list(&dev->gnd_ready_conns));
+
+               /* use while/list_first_entry loop to ensure we can handle any
+                * DESTROY_EP conns added from kgnilnd_complete_closed_conn */
+               while (!list_empty(&dev->gnd_ready_conns)) {
+                       conn = list_first_entry(&dev->gnd_ready_conns,
+                                               kgn_conn_t, gnc_schedlist);
+                       conn_sched = xchg(&conn->gnc_scheduled, GNILND_CONN_PROCESS);
+
+                       LASSERTF(conn_sched != GNILND_CONN_IDLE &&
+                                conn_sched != GNILND_CONN_PROCESS,
+                                "conn %p on ready list but in bad state: %d\n",
+                                conn, conn_sched);
+
+                       list_del_init(&conn->gnc_schedlist);
+
+                       if (conn->gnc_state == GNILND_CONN_CLOSING) {
+                               /* bump to CLOSED to fake out send of CLOSE */
+                               conn->gnc_state = GNILND_CONN_CLOSED;
+                               conn->gnc_close_sent = 1;
+                       }
+
+                       if (conn->gnc_state == GNILND_CONN_DESTROY_EP) {
+                               kgnilnd_destroy_conn_ep(conn);
+                       } else {
+                               kgnilnd_complete_closed_conn(conn);
+                       }
+
+                       /* there really shouldn't be any other states here -
+                        * they would have been cleared out in the del_conn_or_peer or the dgram
+                        * aborts above.
+                        * there is an LASSERTF in kgnilnd_complete_closed_conn that will take
+                        * care of catching anything else for us */
+
+                       kgnilnd_schedule_process_conn(conn, -1);
+
+                       kgnilnd_conn_decref(conn);
+               }
+       }
+
+       /* don't let the little weaselly purgatory conns hide from us */
+       for (i = 0; i < *kgnilnd_tunables.kgn_peer_hash_size; i++) {
+               list_for_each_entry_safe(peer, peerN, &kgnilnd_data.kgn_peers[i], gnp_list) {
+                       kgn_conn_t       *conn, *connN;
+
+                       list_for_each_entry_safe(conn, connN, &peer->gnp_conns, gnc_list) {
+                               kgnilnd_detach_purgatory_locked(conn, &souls);
+                       }
+               }
+       }
+
+       CDEBUG(D_NET, "about to release %d purgatory entries\n",
+               kgnilnd_count_list(&souls));
+
+       kgnilnd_release_purgatory_list(&souls);
+
+       /* validate we are now clean */
+       for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) {
+               kgn_device_t    *dev = &kgnilnd_data.kgn_devices[i];
+
+               /* now all the cons/mboxes should be cleaned up, including purgatory
+                * so go through and release the MDDs for our persistent PHYS fma_blks
+                */
+               kgnilnd_unmap_phys_fmablk(dev);
+
+               LASSERTF(atomic_read(&dev->gnd_nfmablk) == 0,
+                       "reset failed: fma blocks still live %d\n",
+                       atomic_read(&dev->gnd_nfmablk));
+
+               LASSERTF(atomic_read(&dev->gnd_neps) == 0,
+                       "reset failed: EP handles still live %d\n",
+                       atomic_read(&dev->gnd_neps));
+       }
+
+       LASSERTF(atomic_read(&kgnilnd_data.kgn_nconns) == 0,
+               "reset failed: conns left %d\n",
+               atomic_read(&kgnilnd_data.kgn_nconns));
+
+       /* fine to have peers left - they are waiting for new conns
+        * but should not be holding any open HW resources */
+
+       /* like the last part of kgnilnd_base_shutdown() */
+
+       CFS_RACE(CFS_FAIL_GNI_SR_DOWN_RACE);
+
+       for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) {
+               kgnilnd_dev_fini(&kgnilnd_data.kgn_devices[i]);
+       }
+
+       /* no need to free and recreate the TX descriptors
+        * we nuked all the ones that could be using HW resources in
+        * kgnilnd_close_matching_conns and asserted it worked in
+        * kgnilnd_dev_fini */
+
+       /* At this point, all HW is torn down, start to reset */
+
+       /* only reset our known devs */
+       for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) {
+               kgn_device_t    *dev = &kgnilnd_data.kgn_devices[i];
+               rc = kgnilnd_dev_init(dev);
+               LASSERTF(rc == 0, "dev_init failed for dev %d\n", i);
+               rc = kgnilnd_map_phys_fmablk(dev);
+               LASSERTF(rc == 0, "map_phys_fmablk failed for dev %d\n", i);
+               rc = kgnilnd_setup_wildcard_dgram(dev);
+               LASSERTF(rc == 0, "couldn't setup datagrams on dev %d: %d\n",
+                       i, rc);
+       }
+
+       /* Now the fun restarts... - release the hounds! */
+
+       end = jiffies;
+       seconds = cfs_duration_sec((long)end - start);
+       kgnilnd_bump_timeouts(seconds, reason);
+
+       kgnilnd_data.kgn_in_reset = 0;
+       set_mb(kgnilnd_data.kgn_quiesce_trigger, GNILND_QUIESCE_IDLE);
+       kgnilnd_quiesce_wait(reason);
+       LCONSOLE_WARN("%s reset of all hardware resources\n",
+               rc ? "failed" : "successful");
+
+       RETURN_EXIT;
+}
+
+/* A thread that handles quiesce and hardware reset events.
+ * We do the same thing regardless of which device reported the event. */
+int
+kgnilnd_ruhroh_thread(void *arg)
+{
+       int                i = 1;
+       DEFINE_WAIT(wait);
+
+       cfs_daemonize("kgnilnd_rr");
+       cfs_block_allsigs();
+       set_user_nice(current, *kgnilnd_tunables.kgn_nice);
+       kgnilnd_data.kgn_ruhroh_running = 1;
+
+       while (1) {
+
+               /* Block until there's a request..  A reset request could come in
+                * while we're handling a quiesce one, or vice versa.
+                * Keep processing requests until there are none.*/
+               prepare_to_wait(&kgnilnd_data.kgn_ruhroh_waitq, &wait, TASK_INTERRUPTIBLE);
+               while (!(kgnilnd_data.kgn_ruhroh_shutdown ||
+                               kgnilnd_data.kgn_needs_reset || kgnilnd_data.kgn_needs_pause))
+                       schedule();
+               finish_wait(&kgnilnd_data.kgn_ruhroh_waitq, &wait);
+
+               /* Exit if the driver is shutting down. */
+               if (kgnilnd_data.kgn_ruhroh_shutdown)
+                       break;
+
+               /* Serialize with driver startup and shutdown. */
+               down(&kgnilnd_data.kgn_quiesce_sem);
+
+               CDEBUG(D_NET, "trigger %d reset %d to_bump %d pause %d\n",
+                       kgnilnd_data.kgn_quiesce_trigger,
+                       kgnilnd_data.kgn_needs_reset,
+                       kgnilnd_data.kgn_bump_info_rdy,
+                       kgnilnd_data.kgn_needs_pause);
+
+               /* Do we need to do a pause/quiesce? */
+               if (kgnilnd_data.kgn_needs_pause) {
+
+                       /* Pause all other kgnilnd threads. */
+                       set_mb(kgnilnd_data.kgn_quiesce_trigger, GNILND_QUIESCE_HW_QUIESCE);
+                       kgnilnd_quiesce_wait("hardware quiesce flag");
+
+                       /* If the hardware quiesce flag is set, wait for it to clear.
+                        * This should happen relatively quickly, so we wait for it.
+                        * This will hold up the eventd thread, but on everything but
+                        * the simulator, this is ok-- there is one thread per core.
+                        *
+                        * Handle (possibly multiple) quiesce events while we wait. The
+                        * memory barrier ensures that the core doesn't start fetching
+                        * kgn_bump_info_rdy before it fetches kgn_needs_pause, and
+                        * matches the second mb in kgnilnd_quiesce_end_callback(). */
+                       smp_rmb();
+                       while (kgnilnd_hw_in_quiesce() || kgnilnd_data.kgn_bump_info_rdy) {
+
+                               i++;
+                               LCONSOLE((((i) & (-i)) == i) ? D_WARNING : D_NET,
+                                               "Waiting for hardware quiesce flag to clear\n");
+                               cfs_pause(cfs_time_seconds(1 * i));
+
+                               /* If we got a quiesce event with bump info, DO THE BUMP!. */
+                               if (kgnilnd_data.kgn_bump_info_rdy) {
+                                       /* reset console rate limiting for each event */
+                                       i = 1;
+
+                                       /* Make sure the core doesn't start fetching
+                                        * kgn_quiesce_secs until after it sees
+                                        * kgn_bump_info_rdy set.  This is the match to the
+                                        * first mb in kgnilnd_quiesce_end_callback(). */
+                                       smp_rmb();
+                                       kgnilnd_bump_timeouts(kgnilnd_data.kgn_quiesce_secs,
+                                                             "hardware quiesce callback");
+                                       set_mb(kgnilnd_data.kgn_quiesce_secs, 0);
+                                       set_mb(kgnilnd_data.kgn_bump_info_rdy, 0);
+                               }
+                       }
+
+                       /* Reset the kgn_needs_pause flag before coming out of
+                        * the pause.  This ordering avoids a race with the
+                        * setting of this flag in kgnilnd_pause_threads().  */
+                       set_mb(kgnilnd_data.kgn_needs_pause, 0);
+
+                       /* ok, let the kids back into the pool */
+                       set_mb(kgnilnd_data.kgn_quiesce_trigger, GNILND_QUIESCE_IDLE);
+                       kgnilnd_quiesce_wait("hardware quiesce");
+               }
+
+               /* Do a stack reset if needed. */
+               if (kgnilnd_data.kgn_needs_reset) {
+                       kgnilnd_reset_stack();
+                       set_mb(kgnilnd_data.kgn_needs_reset, 0);
+               }
+
+               up(&kgnilnd_data.kgn_quiesce_sem);
+       }
+
+       kgnilnd_data.kgn_ruhroh_running = 0;
+       return 0;
+}
+
+/* Set pause request flag.  Any functions that
+ * call this one are responsible for ensuring that
+ * variables they set up are visible on other cores before
+ * this flag setting.  This executes in interrupt or kernel
+ * thread context.  */
+void
+kgnilnd_pause_threads(void)
+{
+       /* only device 0 gets the handle, see kgnilnd_dev_init */
+       kgn_device_t  *dev = &kgnilnd_data.kgn_devices[0];
+       LASSERTF(dev != NULL, "dev 0 is NULL\n");
+
+       /* If we're currently in a pause triggered by the pause flag,
+        * there's no need to set it again.  We clear the kgn_needs_pause
+        * flag before we reset kgn_quiesce_trigger to avoid a race.  The
+        * read memory barrier matches the set_mb() on the trigger in
+        * kgnilnd_ruhroh_thread(). */
+       smp_rmb();
+       if (!(kgnilnd_data.kgn_quiesce_trigger == GNILND_QUIESCE_HW_QUIESCE &&
+                       GNILND_IS_QUIESCED)) {
+                CDEBUG(D_NET, "requesting thread pause\n");
+
+               kgnilnd_data.kgn_needs_pause = 1;
+
+               wake_up(&kgnilnd_data.kgn_ruhroh_waitq);
+       } else {
+               CDEBUG(D_NET, "thread pause already underway\n");
+       }
+}
+
+/* Return non-zero if the GNI hardware quiesce flag is set */
+int
+kgnilnd_hw_in_quiesce(void)
+{
+       /* only device 0 gets the handle, see kgnilnd_dev_init */
+       kgn_device_t      *dev0 = &kgnilnd_data.kgn_devices[0];
+
+       LASSERTF(dev0 != NULL, "dev 0 is NULL\n");
+
+       smp_rmb();
+       return kgnilnd_get_quiesce_status(dev0->gnd_handle) != 0;
+}
+
+
+/* If the GNI hardware quiesce flag is set, initiate our pause (unless the
+ * stack is shutting down) and return non-zero. */
+int
+kgnilnd_check_hw_quiesce(void)
+{
+       if (likely(!kgnilnd_hw_in_quiesce()))
+               return 0;
+
+       if (!kgnilnd_data.kgn_ruhroh_shutdown) {
+               CDEBUG(D_NET, "initiating thread pause\n");
+               kgnilnd_pause_threads();
+       } else {
+               CDEBUG(D_NET, "thread pause bypassed because of shutdown\n");
+       }
+
+       return 1;
+}
+
+/* Callback from kgni with the quiesce duration.  This executes
+ * in interrupt context. */
+void
+kgnilnd_quiesce_end_callback(gni_nic_handle_t nic_handle, uint64_t msecs)
+{
+       /* only device 0 gets the handle, see kgnilnd_dev_init */
+       kgn_device_t  *dev = &kgnilnd_data.kgn_devices[0];
+       LASSERTF(dev != NULL, "dev 0 is NULL\n");
+
+       if (!kgnilnd_data.kgn_ruhroh_shutdown) {
+
+               CDEBUG(D_NET, "requesting timeout bump by "LPD64" msecs\n", msecs);
+
+               /* Save the bump interval and request the bump.
+                * The memory barrier ensures that the interval is in place before
+                * the bump flag can be seen (in case a core is already running the
+                * ruhroh task), and that the bump request flag is in place before
+                * the pause request can be seen (to ensure a core doesn't miss the bump
+                * request flag). */
+               /* If another callback occurred before the ruhroh task finished
+                * processing the first bump request, we'd overwrite its info.
+                * Nic says that callbacks occur so slowly that this isn't an issue. */
+               set_mb(kgnilnd_data.kgn_quiesce_secs, msecs / MSEC_PER_SEC);
+               set_mb(kgnilnd_data.kgn_bump_info_rdy, 1);
+               kgnilnd_pause_threads();
+       } else {
+               CDEBUG(D_NET, "timeout bump bypassed because of shutdown\n");
+       }
+}
+
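+/* Ask the ruhroh thread to perform a full stack reset.  This path is also
+ * exercised by the stack_reset sysctl handler in gnilnd_sysctl.c. */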
+void
+kgnilnd_critical_error(struct gni_err *err_handle)
+{
+       /* only device 0 gets the handle, see kgnilnd_dev_init */
+       kgn_device_t  *dev = &kgnilnd_data.kgn_devices[0];
+       LASSERTF(dev != NULL, "dev 0 is NULL\n");
+
+       if (!kgnilnd_data.kgn_ruhroh_shutdown) {
+               CDEBUG(D_NET, "requesting stack reset\n");
+               kgnilnd_data.kgn_needs_reset = 1;
+               wake_up(&kgnilnd_data.kgn_ruhroh_waitq);
+       } else {
+               CDEBUG(D_NET, "stack reset bypassed because of shutdown\n");
+       }
+}
diff --git a/lnet/klnds/gnilnd/gnilnd_sysctl.c b/lnet/klnds/gnilnd/gnilnd_sysctl.c
new file mode 100644 (file)
index 0000000..cd33d3e
--- /dev/null
@@ -0,0 +1,252 @@
+/*
+ * Copyright (C) 2012 Cray, Inc.
+ *
+ *   Author: Nic Henke <nic@cray.com>
+ *   Author: James Shimek <jshimek@cray.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/* this code liberated and modified from Lustre */
+
+#define DEBUG_SUBSYSTEM S_LND
+
+#include "gnilnd.h"
+
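+/* Backing store for the writable proc/sysctl entries below; the handlers
+ * validate the LND state and then push these values into kgnilnd_data. */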
+typedef struct kgn_sysctl_data {
+       int                     ksd_pause_trigger;
+       int                     ksd_quiesce_secs;
+       int                     ksd_rdmaq_override;
+} kgn_sysctl_data_t;
+
+static kgn_sysctl_data_t        kgnilnd_sysctl;
+
+#if defined(CONFIG_SYSCTL)
+
+static cfs_sysctl_table_header_t *kgnilnd_table_header = NULL;
+#ifndef HAVE_SYSCTL_UNNUMBERED
+
+enum {
+       GNILND_VERSION = 1,
+       GNILND_THREAD_PAUSE,
+       GNILND_HW_QUIESCE,
+       GNILND_STACK_RESET,
+       GNILND_RDMAQ_OVERRIDE,
+};
+#else
+#define GNILND_VERSION             CTL_UNNUMBERED
+#define GNILND_THREAD_PAUSE        CTL_UNNUMBERED
+#define GNILND_HW_QUIESCE          CTL_UNNUMBERED
+#define GNILND_STACK_RESET         CTL_UNNUMBERED
+#define GNILND_RDMAQ_OVERRIDE      CTL_UNNUMBERED
+#endif
+
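+/* Admin control for pausing the kgnilnd threads: when the written value
+ * changes, publish it as kgn_quiesce_trigger under kgn_quiesce_sem and
+ * wait for the threads to quiesce. */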
+static int LL_PROC_PROTO(proc_toggle_thread_pause)
+{
+       int  old_val = kgnilnd_sysctl.ksd_pause_trigger;
+       int  rc = 0;
+       ENTRY;
+
+       rc = ll_proc_dointvec(table, write, filp, buffer, lenp, ppos);
+       if (!write) {
+               /* read */
+               RETURN(rc);
+       }
+
+       if (kgnilnd_data.kgn_init != GNILND_INIT_ALL) {
+               rc = -EINVAL;
+               RETURN(rc);
+       }
+
+       if (old_val != kgnilnd_sysctl.ksd_pause_trigger) {
+               down(&kgnilnd_data.kgn_quiesce_sem);
+               CDEBUG(D_NET, "setting quiesce_trigger %d\n",
+                      kgnilnd_sysctl.ksd_pause_trigger);
+               kgnilnd_data.kgn_quiesce_trigger = kgnilnd_sysctl.ksd_pause_trigger;
+               kgnilnd_quiesce_wait("admin sysctl");
+               up(&kgnilnd_data.kgn_quiesce_sem);
+       }
+
+       RETURN(rc);
+}
+
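+/* Simulate the end of a hardware quiesce: feed the admin-supplied duration
+ * (in seconds) to kgnilnd_quiesce_end_callback() as if kgni had reported it. */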
+static int LL_PROC_PROTO(proc_hw_quiesce)
+{
+       int              rc = 0;
+       kgn_device_t    *dev;
+       ENTRY;
+
+       rc = ll_proc_dointvec(table, write, filp, buffer, lenp, ppos);
+       if (!write) {
+               /* read */
+               RETURN(rc);
+       }
+
+       if (kgnilnd_data.kgn_init != GNILND_INIT_ALL) {
+               rc = -EINVAL;
+               RETURN(rc);
+       }
+
+
+       /* only device 0 gets the handle, see kgnilnd_dev_init */
+       dev = &kgnilnd_data.kgn_devices[0];
+
+       LASSERTF(dev != NULL, "dev 0 is NULL\n");
+
+       kgnilnd_quiesce_end_callback(dev->gnd_handle,
+                                    kgnilnd_sysctl.ksd_quiesce_secs * MSEC_PER_SEC);
+
+       RETURN(rc);
+}
+
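+/* Force a stack reset via kgnilnd_critical_error(); the write only returns
+ * once the ruhroh thread has cleared kgn_needs_reset. */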
+int LL_PROC_PROTO(proc_trigger_stack_reset)
+{
+       int              rc = 0;
+       int              i = 1;
+       kgn_device_t    *dev;
+       ENTRY;
+
+       if (!write) {
+               /* read */
+               rc = ll_proc_dointvec(table, write, filp, buffer, lenp, ppos);
+               RETURN(rc);
+       }
+
+       /* only device 0 gets the handle, see kgnilnd_dev_init */
+       dev = &kgnilnd_data.kgn_devices[0];
+
+       LASSERTF(dev != NULL, "dev 0 is NULL\n");
+
+       kgnilnd_critical_error(dev->gnd_err_handle);
+
+       /* Wait for the reset to complete.  This prevents any races in testing
+        * where we'd immediately try to send traffic again */
+       while (kgnilnd_data.kgn_needs_reset != 0) {
+              i++;
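+              /* (i & -i) == i only when i is a power of two, so the
+               * console message rate drops off as the wait drags on. */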
+              LCONSOLE((((i) & (-i)) == i) ? D_WARNING : D_NET,
+                              "Waiting for stack reset request to clear\n");
+              cfs_pause(cfs_time_seconds(1 * i));
+       }
+
+       RETURN(rc);
+}
+
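+/* The override is configured in Mbytes/sec but used internally in bytes/sec,
+ * so convert before publishing it in kgn_rdmaq_override. */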
+static int LL_PROC_PROTO(proc_toggle_rdmaq_override)
+{
+       int  old_val = kgnilnd_sysctl.ksd_rdmaq_override;
+       int  rc = 0;
+       ENTRY;
+
+       rc = ll_proc_dointvec(table, write, filp, buffer, lenp, ppos);
+       if (!write) {
+               /* read */
+               RETURN(rc);
+       }
+
+       if (kgnilnd_data.kgn_init != GNILND_INIT_ALL) {
+               rc = -EINVAL;
+               RETURN(rc);
+       }
+
+       if (old_val != kgnilnd_sysctl.ksd_rdmaq_override) {
+               long    new_mb = kgnilnd_sysctl.ksd_rdmaq_override * (long)(1024*1024);
+               LCONSOLE_INFO("changing RDMAQ override to %d mbytes/sec\n",
+                             kgnilnd_sysctl.ksd_rdmaq_override);
+               /* override proc is mbytes, but we calc in bytes */
+               kgnilnd_data.kgn_rdmaq_override = new_mb;
+               smp_wmb();
+       }
+
+       RETURN(rc);
+}
+
+static cfs_sysctl_table_t kgnilnd_table[] = {
+       /*
+        * NB No .strategy entries have been provided since sysctl(8) prefers
+        * to go via /proc for portability.
+        */
+       {
+               INIT_CTL_NAME(GNILND_VERSION)
+               .procname = "version",
+               .data     = KGNILND_BUILD_REV,
+               .maxlen   = sizeof(KGNILND_BUILD_REV),
+               .mode     = 0444,
+               .proc_handler = &proc_dostring
+       },
+       {
+               INIT_CTL_NAME(GNILND_THREAD_PAUSE)
+               .procname = "thread_pause",
+               .data     = &kgnilnd_sysctl.ksd_pause_trigger,
+               .maxlen   = sizeof(int),
+               .mode     = 0644,
+               .proc_handler = &proc_toggle_thread_pause,
+       },
+       {
+               INIT_CTL_NAME(GNILND_HW_QUIESCE)
+               .procname = "hw_quiesce",
+               .data     = &kgnilnd_sysctl.ksd_quiesce_secs,
+               .maxlen   = sizeof(__u32),
+               .mode     = 0644,
+               .proc_handler = &proc_hw_quiesce,
+       },
+       {
+               INIT_CTL_NAME(GNILND_STACK_RESET)
+               .procname = "stack_reset",
+               .data     = NULL,
+               .maxlen   = sizeof(int),
+               .mode     = 0600,
+               .proc_handler = &proc_trigger_stack_reset,
+       },
+       {
+               INIT_CTL_NAME(GNILND_RDMAQ_OVERRIDE)
+               .procname = "rdmaq_override",
+               .data     = &kgnilnd_sysctl.ksd_rdmaq_override,
+               .maxlen   = sizeof(int),
+               .mode     = 0644,
+               .proc_handler = &proc_toggle_rdmaq_override,
+       },
+       {       INIT_CTL_NAME(0)   }
+};
+
+static cfs_sysctl_table_t kgnilnd_top_table[2] = {
+       {
+               INIT_CTL_NAME(CTL_GNILND)
+               .procname = "kgnilnd",
+               .data     = NULL,
+               .maxlen   = 0,
+               .mode     = 0555,
+               .child    = kgnilnd_table
+       },
+       {       INIT_CTL_NAME(0)   }
+};
+
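+/* Registering the top-level "kgnilnd" entry should expose the table above
+ * under /proc/sys/kgnilnd/ (for example, writing a number of seconds to
+ * /proc/sys/kgnilnd/hw_quiesce simulates a hardware quiesce of that length). */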
+void kgnilnd_insert_sysctl(void)
+{
+       if (kgnilnd_table_header == NULL)
+               kgnilnd_table_header = cfs_register_sysctl_table(kgnilnd_top_table, 0);
+}
+
+void kgnilnd_remove_sysctl(void)
+{
+       if (kgnilnd_table_header != NULL)
+               cfs_unregister_sysctl_table(kgnilnd_table_header);
+
+       kgnilnd_table_header = NULL;
+}
+
+#else
+void kgnilnd_insert_sysctl(void) {}
+void kgnilnd_remove_sysctl(void) {}
+#endif
diff --git a/lnet/klnds/gnilnd/gnilnd_version.h b/lnet/klnds/gnilnd/gnilnd_version.h
new file mode 100644 (file)
index 0000000..10f6278
--- /dev/null
@@ -0,0 +1 @@
+#define KGNILND_BUILD_REV        SVN_CODE_REV
index 21b5de5..d720f3d 100644 (file)
@@ -856,6 +856,7 @@ static struct mod_paths {
        { "kmxlnd", "lnet/klnds/mxlnd" },
        { "ko2iblnd", "lnet/klnds/o2iblnd" },
        { "kptllnd", "lnet/klnds/ptllnd" },
+       { "kgnilnd", "lnet/klnds/gnilnd" },
        { "kqswlnd", "lnet/klnds/qswlnd" },
        { "kralnd", "lnet/klnds/ralnd" },
        { "ksocklnd", "lnet/klnds/socklnd" },
index dedb75d..3c09a8d 100644 (file)
@@ -567,7 +567,7 @@ jt_ptl_print_peers (int argc, char **argv)
         int                      rc;
 
         if (!g_net_is_compatible (argv[0], SOCKLND, RALND, PTLLND, MXLND,
-                                  O2IBLND, 0))
+                                 O2IBLND, GNILND, 0))
                 return -1;
 
         for (index = 0;;index++) {
@@ -620,6 +620,26 @@ jt_ptl_print_peers (int argc, char **argv)
                                ptl_ipaddr_2_str(data.ioc_u32[0], buffer[1],
                                                 sizeof(buffer[1]), 1),
                                 data.ioc_u32[1]); /* peer port */
+               } else if (g_net_is_compatible(NULL, GNILND, 0)) {
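+                       /* ioc_flags packs two fields: a non-zero upper 16
+                        * bits selects "D", otherwise the low 16 bits pick
+                        * "C" vs "U" (see the state comment below). */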
+                       int disconn = data.ioc_flags >> 16;
+                       char *state;
+
+                       if (disconn)
+                               state = "D";
+                       else
+                               state = data.ioc_flags & 0xffff ? "C" : "U";
+
+                       printf ("%-20s (%d) %s [%d] "LPU64" "
+                               "sq %d/%d tx %d/%d/%d\n",
+                               libcfs_nid2str(data.ioc_nid), /* peer nid */
+                               data.ioc_net, /* gemini device id */
+                               state, /* peer is Connecting, Up, or Down */
+                               data.ioc_count,   /* peer refcount */
+                               data.ioc_u64[0], /* peerstamp */
+                               data.ioc_u32[2], data.ioc_u32[3], /* tx and rx seq */
+                               /* fmaq, nfma, nrdma */
+                               data.ioc_u32[0], data.ioc_u32[1], data.ioc_u32[4]
+                               );
                 } else {
                         printf ("%-20s [%d]\n",
                                 libcfs_nid2str(data.ioc_nid), data.ioc_count);
@@ -647,11 +667,12 @@ jt_ptl_add_peer (int argc, char **argv)
         int                      port = 0;
         int                      rc;
 
-        if (!g_net_is_compatible (argv[0], SOCKLND, RALND, 0))
+       if (!g_net_is_compatible (argv[0], SOCKLND, RALND,
+                                 GNILND, 0))
                 return -1;
 
         if (argc != 4) {
-                fprintf (stderr, "usage(tcp,ra): %s nid ipaddr port\n",
+               fprintf (stderr, "usage(tcp,ra,gni): %s nid ipaddr port\n",
                          argv[0]);
                 return 0;
         }
@@ -699,7 +720,7 @@ jt_ptl_del_peer (int argc, char **argv)
         int                      rc;
 
         if (!g_net_is_compatible (argv[0], SOCKLND, RALND, MXLND, PTLLND,
-                                  O2IBLND, 0))
+                                 O2IBLND, GNILND, 0))
                 return -1;
 
         if (g_net_is_compatible(NULL, SOCKLND, 0)) {
@@ -768,7 +789,8 @@ jt_ptl_print_connections (int argc, char **argv)
         int                      index;
         int                      rc;
 
-        if (!g_net_is_compatible (argv[0], SOCKLND, RALND, MXLND, O2IBLND, 0))
+       if (!g_net_is_compatible (argv[0], SOCKLND, RALND, MXLND, O2IBLND,
+                                 GNILND, 0))
                 return -1;
 
         for (index = 0; ; index++) {
@@ -808,6 +830,10 @@ jt_ptl_print_connections (int argc, char **argv)
                         printf ("%s mtu %d\n",
                                 libcfs_nid2str(data.ioc_nid),
                                 data.ioc_u32[0]); /* path MTU */
+               } else if (g_net_is_compatible (NULL, GNILND, 0)) {
+                       printf ("%-20s [%d]\n",
+                               libcfs_nid2str(data.ioc_nid),
+                               data.ioc_u32[0] /* device id */);
                 } else {
                         printf ("%s\n", libcfs_nid2str(data.ioc_nid));
                 }
@@ -837,7 +863,8 @@ int jt_ptl_disconnect(int argc, char **argv)
                 return 0;
         }
 
-        if (!g_net_is_compatible (NULL, SOCKLND, RALND, MXLND, O2IBLND, 0))
+       if (!g_net_is_compatible (NULL, SOCKLND, RALND, MXLND, O2IBLND,
+                                 GNILND, 0))
                 return 0;
 
         if (argc >= 2 &&
@@ -879,7 +906,7 @@ int jt_ptl_push_connection (int argc, char **argv)
                 return 0;
         }
 
-        if (!g_net_is_compatible (argv[0], SOCKLND, 0))
+       if (!g_net_is_compatible (argv[0], SOCKLND, GNILND, 0))
                 return -1;
 
         if (argc > 1 &&