From: James Simmons Date: Wed, 5 Dec 2012 18:54:39 +0000 (-0500) Subject: LU-1419 lnet: Add support for Cray's Gemini interconnect X-Git-Tag: 2.3.58~20 X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=4d381ef9f179b21217c237ad1cc83055a2448550 LU-1419 lnet: Add support for Cray's Gemini interconnect This patch adds LNET support to use Cray's Gemini interconnect on their newer systems. The gnilnd was originally based off of the ralnd. Signed-off-by: James Simmons Signed-off-by: Chris Horn Signed-off-by: Cory Spitz Change-Id: Ia98a44f4f3d68773438d820c49fe554a3d551dc5 Reviewed-on: http://review.whamcloud.com/3381 Tested-by: Hudson Tested-by: Maloo Reviewed-by: Isaac Huang Reviewed-by: Doug Oucharek Reviewed-by: Oleg Drokin --- diff --git a/lnet/autoconf/lustre-lnet.m4 b/lnet/autoconf/lustre-lnet.m4 index 4cd30eb..9d9ca11 100644 --- a/lnet/autoconf/lustre-lnet.m4 +++ b/lnet/autoconf/lustre-lnet.m4 @@ -526,13 +526,88 @@ AC_SUBST(RACPPFLAGS) AC_SUBST(RALND) ]) +# +# LN_CONFIG_GNILND +# +# check whether to use the Gemini Network Interface lnd +# +AC_DEFUN([LN_CONFIG_GNILND], +[#### Gemini Network Interface +AC_MSG_CHECKING([whether to enable GNI lnd]) +AC_ARG_ENABLE([gni], + AC_HELP_STRING([--enable-gni], + [enable GNI lnd]), + [],[enable_gni='no']) +AC_MSG_RESULT([$enable_gni]) + +if test x$enable_gni = xyes ; then + AC_MSG_CHECKING([if GNI kernel headers are present]) + # placeholder + # GNICPPFLAGS was set in spec file + EXTRA_KCFLAGS_save="$EXTRA_KCFLAGS" + EXTRA_KCFLAGS="$EXTRA_KCFLAGS $GNICPPFLAGS" + LB_LINUX_TRY_COMPILE([ + #include + #include + ],[ + gni_cdm_handle_t kgni_domain; + gni_return_t rc; + int rrc; + + rc = gni_cdm_create(0, 1, 1, 0, &kgni_domain); + + rrc = (rc == GNI_RC_SUCCESS) ? 0 : 1; + + return rrc; + ],[ + AC_MSG_RESULT([yes]) + GNILND="gnilnd" + ],[ + AC_MSG_RESULT([no]) + AC_MSG_ERROR([can't compile gnilnd with given GNICPPFLAGS: $GNICPPFLAGS]) + ]) + # at this point, we have gnilnd basic support, now check for extra features + AC_MSG_CHECKING([to use RCA in gnilnd]) + LB_LINUX_TRY_COMPILE([ + #include + #include + #include + ],[ + gni_cdm_handle_t kgni_domain; + gni_return_t rc; + krca_ticket_t ticket = KRCA_NULL_TICKET; + int rrc; + __u32 nid = 0, nic_addr; + + rc = gni_cdm_create(0, 1, 1, 0, &kgni_domain); + + rrc = (rc == GNI_RC_SUCCESS) ? 0 : 1; + + rrc += krca_nid_to_nicaddrs(nid, 1, &nic_addr); + + rrc += krca_register(&ticket, RCA_MAKE_SERVICE_INDEX(RCA_IO_CLASS, 9), 99, 0); + + return rrc; + ],[ + AC_MSG_RESULT([yes]) + GNICPPFLAGS="$GNICPPFLAGS -DGNILND_USE_RCA=1" + GNILNDRCA="gnilndrca" + ],[ + AC_MSG_RESULT([no]) + ]) + EXTRA_KCFLAGS="$EXTRA_KCFLAGS_save" +fi +AC_SUBST(GNICPPFLAGS) +AC_SUBST(GNILNDRCA) +AC_SUBST(GNILND) +]) # # # LN_CONFIG_USERSPACE # -# This is defined but empty because it is called from +# This is defined but empty because it is called from # build/autconf/lustre-build.m4 which is shared by all branches. 
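The configure check above appends -DGNILND_USE_RCA=1 to GNICPPFLAGS only when the RCA kernel interfaces compile, so RCA support in the C sources becomes a compile-time gate. Below is a minimal, stand-alone illustration of consuming such a define; it is not code from this patch, and the gnilnd sources may gate RCA support differently.

/* Illustrative only: a compile-time gate driven by -DGNILND_USE_RCA=1,
 * the flag the configure check above adds to GNICPPFLAGS.
 * Build once plainly and once with -DGNILND_USE_RCA=1 to see both paths. */
#include <stdio.h>

#ifdef GNILND_USE_RCA
static const char *rca_status = "RCA support compiled in";
#else
static const char *rca_status = "RCA support not compiled in";
#endif

int main(void)
{
	printf("%s\n", rca_status);
	return 0;
}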
# AC_DEFUN([LN_CONFIG_USERSPACE], @@ -598,6 +673,7 @@ LN_CONFIG_BACKOFF LN_CONFIG_QUADRICS LN_CONFIG_O2IB LN_CONFIG_RALND +LN_CONFIG_GNILND LN_CONFIG_PTLLND LN_CONFIG_MX # 2.6.32 @@ -740,6 +816,8 @@ AC_DEFUN([LN_CONDITIONALS], AM_CONDITIONAL(BUILD_MXLND, test x$MXLND = "xmxlnd") AM_CONDITIONAL(BUILD_O2IBLND, test x$O2IBLND = "xo2iblnd") AM_CONDITIONAL(BUILD_RALND, test x$RALND = "xralnd") +AM_CONDITIONAL(BUILD_GNILND, test x$GNILND = "xgnilnd") +AM_CONDITIONAL(BUILD_GNILND_RCA, test x$GNILNDRCA = "xgnilndrca") AM_CONDITIONAL(BUILD_PTLLND, test x$PTLLND = "xptllnd") AM_CONDITIONAL(BUILD_USOCKLND, test x$USOCKLND = "xusocklnd") ]) @@ -769,6 +847,8 @@ lnet/klnds/qswlnd/Makefile lnet/klnds/qswlnd/autoMakefile lnet/klnds/ralnd/Makefile lnet/klnds/ralnd/autoMakefile +lnet/klnds/gnilnd/Makefile +lnet/klnds/gnilnd/autoMakefile lnet/klnds/socklnd/Makefile lnet/klnds/socklnd/autoMakefile lnet/klnds/ptllnd/Makefile diff --git a/lnet/klnds/Makefile.in b/lnet/klnds/Makefile.in index f0586ae..0d99a87 100644 --- a/lnet/klnds/Makefile.in +++ b/lnet/klnds/Makefile.in @@ -1,5 +1,6 @@ @BUILD_MXLND_TRUE@subdir-m += mxlnd @BUILD_RALND_TRUE@subdir-m += ralnd +@BUILD_GNILND_TRUE@subdir-m += gnilnd @BUILD_O2IBLND_TRUE@subdir-m += o2iblnd @BUILD_QSWLND_TRUE@subdir-m += qswlnd @BUILD_PTLLND_TRUE@subdir-m += ptllnd diff --git a/lnet/klnds/autoMakefile.am b/lnet/klnds/autoMakefile.am index 57d709c..1591d87 100644 --- a/lnet/klnds/autoMakefile.am +++ b/lnet/klnds/autoMakefile.am @@ -34,4 +34,4 @@ # Lustre is a trademark of Sun Microsystems, Inc. # -SUBDIRS = socklnd qswlnd mxlnd ralnd ptllnd o2iblnd +SUBDIRS = socklnd qswlnd mxlnd ralnd gnilnd ptllnd o2iblnd diff --git a/lnet/klnds/gnilnd/Makefile.in b/lnet/klnds/gnilnd/Makefile.in new file mode 100644 index 0000000..14e8c30 --- /dev/null +++ b/lnet/klnds/gnilnd/Makefile.in @@ -0,0 +1,9 @@ +MODULES := kgnilnd +kgnilnd-objs := gnilnd.o gnilnd_cb.o gnilnd_modparams.o gnilnd_debug.o gnilnd_proc.o \ + gnilnd_sysctl.o gnilnd_stack.o gnilnd_conn.o + +EXTRA_POST_CFLAGS := -D"SVN_CODE_REV=KBUILD_STR(${SVN_CODE_REV})" @GNICPPFLAGS@ + +EXTRA_DIST = $(kgnilnd-objs:%.o=%.c) gnilnd.h gnilnd_api_wrap.h + +@INCLUDE_RULES@ diff --git a/lnet/klnds/gnilnd/autoMakefile.am b/lnet/klnds/gnilnd/autoMakefile.am new file mode 100644 index 0000000..888b68e --- /dev/null +++ b/lnet/klnds/gnilnd/autoMakefile.am @@ -0,0 +1,12 @@ +# Copyright (C) 2009 Cray, Inc. +# +# This code is issued under the GNU General Public License. +# See the file COPYING in this distribution + +if MODULES +if BUILD_GNILND +modulenet_DATA = kgnilnd$(KMODEXT) +endif +endif + +MOSTLYCLEANFILES = @MOSTLYCLEANFILES@ diff --git a/lnet/klnds/gnilnd/gnilnd.c b/lnet/klnds/gnilnd/gnilnd.c new file mode 100644 index 0000000..fcc05fa --- /dev/null +++ b/lnet/klnds/gnilnd/gnilnd.c @@ -0,0 +1,2698 @@ +/* + * Copyright (C) 2012 Cray, Inc. + * + * Author: Igor Gorodetsky + * Author: Nic Henke + * Author: James Shimek + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ +#include "gnilnd.h" + +/* Primary entry points from LNET. There are no guarantees against reentrance. */ +lnd_t the_kgnilnd = { + .lnd_type = GNILND, + .lnd_startup = kgnilnd_startup, + .lnd_shutdown = kgnilnd_shutdown, + .lnd_ctl = kgnilnd_ctl, + .lnd_send = kgnilnd_send, + .lnd_recv = kgnilnd_recv, + .lnd_eager_recv = kgnilnd_eager_recv, + .lnd_query = kgnilnd_query, +}; + +kgn_data_t kgnilnd_data; +kgn_hssops_t kgnilnd_hssops; + +/* needs write_lock on kgn_peer_conn_lock */ +int +kgnilnd_close_stale_conns_locked(kgn_peer_t *peer, kgn_conn_t *newconn) +{ + kgn_conn_t *conn; + struct list_head *ctmp, *cnxt; + int loopback; + int count = 0; + + loopback = peer->gnp_nid == peer->gnp_net->gnn_ni->ni_nid; + + list_for_each_safe(ctmp, cnxt, &peer->gnp_conns) { + conn = list_entry(ctmp, kgn_conn_t, gnc_list); + + if (conn->gnc_state != GNILND_CONN_ESTABLISHED) + continue; + + if (conn == newconn) + continue; + + if (conn->gnc_device != newconn->gnc_device) + continue; + + /* This is a two connection loopback - one talking to the other */ + if (loopback && + newconn->gnc_my_connstamp == conn->gnc_peer_connstamp && + newconn->gnc_peer_connstamp == conn->gnc_my_connstamp) { + CDEBUG(D_NET, "skipping prune of %p, " + "loopback and matching stamps" + " connstamp "LPU64"("LPU64")" + " peerstamp "LPU64"("LPU64")\n", + conn, newconn->gnc_my_connstamp, + conn->gnc_peer_connstamp, + newconn->gnc_peer_connstamp, + conn->gnc_my_connstamp); + continue; + } + + if (conn->gnc_peerstamp != newconn->gnc_peerstamp) { + LASSERTF(conn->gnc_peerstamp < newconn->gnc_peerstamp, + "conn 0x%p peerstamp "LPU64" >= " + "newconn 0x%p peerstamp "LPU64"\n", + conn, conn->gnc_peerstamp, + newconn, newconn->gnc_peerstamp); + + CDEBUG(D_NET, "Closing stale conn nid: %s " + " peerstamp:"LPX64"("LPX64")\n", + libcfs_nid2str(peer->gnp_nid), + conn->gnc_peerstamp, newconn->gnc_peerstamp); + } else { + + LASSERTF(conn->gnc_peer_connstamp < newconn->gnc_peer_connstamp, + "conn 0x%p peer_connstamp "LPU64" >= " + "newconn 0x%p peer_connstamp "LPU64"\n", + conn, conn->gnc_peer_connstamp, + newconn, newconn->gnc_peer_connstamp); + + CDEBUG(D_NET, "Closing stale conn nid: %s" + " connstamp:"LPU64"("LPU64")\n", + libcfs_nid2str(peer->gnp_nid), + conn->gnc_peer_connstamp, newconn->gnc_peer_connstamp); + } + + count++; + kgnilnd_close_conn_locked(conn, -ESTALE); + } + + if (count != 0) { + CWARN("Closed %d stale conns to %s\n", count, libcfs_nid2str(peer->gnp_nid)); + } + + RETURN(count); +} + +int +kgnilnd_conn_isdup_locked(kgn_peer_t *peer, kgn_conn_t *newconn) +{ + kgn_conn_t *conn; + struct list_head *tmp; + int loopback; + ENTRY; + + loopback = peer->gnp_nid == peer->gnp_net->gnn_ni->ni_nid; + + list_for_each(tmp, &peer->gnp_conns) { + conn = list_entry(tmp, kgn_conn_t, gnc_list); + CDEBUG(D_NET, "checking conn 0x%p for peer %s" + " lo %d new "LPU64" existing "LPU64 + " new peer "LPU64" existing peer "LPU64 + " new dev %p existing dev %p\n", + conn, libcfs_nid2str(peer->gnp_nid), + loopback, + newconn->gnc_peerstamp, conn->gnc_peerstamp, + newconn->gnc_peer_connstamp, conn->gnc_peer_connstamp, + newconn->gnc_device, conn->gnc_device); + + /* conn is in the process of closing */ + if (conn->gnc_state != GNILND_CONN_ESTABLISHED) + continue; + + /* 'newconn' is from an earlier version of 'peer'!!! 
*/ + if (newconn->gnc_peerstamp < conn->gnc_peerstamp) + RETURN(1); + + /* 'conn' is from an earlier version of 'peer': it will be + * removed when we cull stale conns later on... */ + if (newconn->gnc_peerstamp > conn->gnc_peerstamp) + continue; + + /* Different devices are OK */ + if (conn->gnc_device != newconn->gnc_device) + continue; + + /* It's me connecting to myself */ + if (loopback && + newconn->gnc_my_connstamp == conn->gnc_peer_connstamp && + newconn->gnc_peer_connstamp == conn->gnc_my_connstamp) + continue; + + /* 'newconn' is an earlier connection from 'peer'!!! */ + if (newconn->gnc_peer_connstamp < conn->gnc_peer_connstamp) + RETURN(2); + + /* 'conn' is an earlier connection from 'peer': it will be + * removed when we cull stale conns later on... */ + if (newconn->gnc_peer_connstamp > conn->gnc_peer_connstamp) + continue; + + /* 'newconn' has the SAME connection stamp; 'peer' isn't + * playing the game... */ + RETURN(3); + } + + RETURN(0); +} + +int +kgnilnd_create_conn(kgn_conn_t **connp, kgn_device_t *dev) +{ + kgn_conn_t *conn; + gni_return_t rrc; + int rc = 0; + + LASSERT (!in_interrupt()); + atomic_inc(&kgnilnd_data.kgn_nconns); + + /* divide by 2 to allow for complete reset and immediate reconnect */ + if (atomic_read(&kgnilnd_data.kgn_nconns) >= GNILND_MAX_CQID/2) { + CERROR("Too many conn are live: %d > %d\n", + atomic_read(&kgnilnd_data.kgn_nconns), GNILND_MAX_CQID/2); + atomic_dec(&kgnilnd_data.kgn_nconns); + return -E2BIG; + } + + LIBCFS_ALLOC(conn, sizeof(*conn)); + if (conn == NULL) { + atomic_dec(&kgnilnd_data.kgn_nconns); + return -ENOMEM; + } + + LIBCFS_ALLOC(conn->gnc_tx_ref_table, GNILND_MAX_MSG_ID * sizeof(void *)); + if (conn->gnc_tx_ref_table == NULL) { + CERROR("Can't allocate conn tx_ref_table\n"); + rc = -ENOMEM; + GOTO(failed, rc); + } + + atomic_set(&conn->gnc_refcount, 1); + atomic_set(&conn->gnc_reaper_noop, 0); + atomic_set(&conn->gnc_sched_noop, 0); + INIT_LIST_HEAD(&conn->gnc_list); + INIT_LIST_HEAD(&conn->gnc_hashlist); + INIT_LIST_HEAD(&conn->gnc_schedlist); + INIT_LIST_HEAD(&conn->gnc_fmaq); + INIT_LIST_HEAD(&conn->gnc_mdd_list); + spin_lock_init(&conn->gnc_list_lock); + spin_lock_init(&conn->gnc_tx_lock); + + /* set tx id to nearly the end to make sure we find wrapping + * issues soon */ + conn->gnc_next_tx = (int) GNILND_MAX_MSG_ID - 10; + + /* if this fails, we have conflicts and MAX_TX is too large */ + CLASSERT(GNILND_MAX_MSG_ID < GNILND_MSGID_CLOSE); + + /* get a new unique CQ id for this conn */ + write_lock(&kgnilnd_data.kgn_peer_conn_lock); + conn->gnc_my_connstamp = kgnilnd_data.kgn_connstamp++; + conn->gnc_cqid = kgnilnd_get_cqid_locked(); + write_unlock(&kgnilnd_data.kgn_peer_conn_lock); + + if (conn->gnc_cqid == 0) { + CERROR("Could not allocate unique CQ ID for conn 0x%p\n", conn); + rc = -E2BIG; + GOTO(failed, rc); + } + + CDEBUG(D_NET, "alloc cqid %u for conn 0x%p\n", + conn->gnc_cqid, conn); + + /* need to be set before gnc_ephandle to allow kgnilnd_destroy_conn_ep to + * check context */ + conn->gnc_device = dev; + + conn->gnc_timeout = MAX(*kgnilnd_tunables.kgn_timeout, + GNILND_MIN_TIMEOUT); + kgnilnd_update_reaper_timeout(conn->gnc_timeout); + + /* this is the ep_handle for doing SMSG & BTE */ + mutex_lock(&dev->gnd_cq_mutex); + rrc = kgnilnd_ep_create(dev->gnd_handle, dev->gnd_snd_fma_cqh, + &conn->gnc_ephandle); + mutex_unlock(&dev->gnd_cq_mutex); + if (rrc != GNI_RC_SUCCESS) { + rc = -ENETDOWN; + GOTO(failed, rc); + } + + CDEBUG(D_NET, "created conn 0x%p ep_hndl 0x%p\n", + conn, conn->gnc_ephandle); + + /* add ref 
for EP canceling */ + kgnilnd_conn_addref(conn); + atomic_inc(&dev->gnd_neps); + + *connp = conn; + return 0; + +failed: + atomic_dec(&kgnilnd_data.kgn_nconns); + LIBCFS_FREE(conn->gnc_tx_ref_table, GNILND_MAX_MSG_ID * sizeof(void *)); + LIBCFS_FREE(conn, sizeof(*conn)); + return rc; +} + +/* needs to be called with kgn_peer_conn_lock held (read or write) */ +kgn_conn_t * +kgnilnd_find_conn_locked(kgn_peer_t *peer) +{ + kgn_conn_t *conn = NULL; + ENTRY; + + /* if we are in reset, this conn is going to die soon */ + if (unlikely(kgnilnd_data.kgn_in_reset)) { + RETURN(NULL); + } + + /* just return the first ESTABLISHED connection */ + list_for_each_entry(conn, &peer->gnp_conns, gnc_list) { + /* kgnilnd_finish_connect doesn't put connections on the + * peer list until they are actually established */ + LASSERTF(conn->gnc_state >= GNILND_CONN_ESTABLISHED, + "found conn %p state %s on peer %p (%s)\n", + conn, kgnilnd_conn_state2str(conn), peer, + libcfs_nid2str(peer->gnp_nid)); + if (conn->gnc_state != GNILND_CONN_ESTABLISHED) + continue; + + RETURN(conn); + } + RETURN(NULL); +} + +/* needs write_lock on kgn_peer_conn_lock held */ +kgn_conn_t * +kgnilnd_find_or_create_conn_locked(kgn_peer_t *peer) { + + kgn_device_t *dev = peer->gnp_net->gnn_dev; + kgn_conn_t *conn; + + conn = kgnilnd_find_conn_locked(peer); + + if (conn != NULL) { + return conn; + } + + /* if the peer was previously connecting, check if we should + * trigger another connection attempt yet. */ + if (time_before(jiffies, peer->gnp_reconnect_time)) { + return NULL; + } + + /* This check prevents us from creating a new connection to a peer while we are + * still in the process of closing an existing connection to the peer. + */ + list_for_each_entry(conn, &peer->gnp_conns, gnc_list) { + if (conn->gnc_ephandle != NULL) { + CDEBUG(D_NET, "Not connecting non-null ephandle found peer 0x%p->%s\n", peer, + libcfs_nid2str(peer->gnp_nid)); + return NULL; + } + } + + if (peer->gnp_connecting != GNILND_PEER_IDLE) { + /* if we are not connecting, fire up a new connection */ + /* or if we are anything but IDLE DONT start a new connection */ + return NULL; + } + + CDEBUG(D_NET, "starting connect to %s\n", + libcfs_nid2str(peer->gnp_nid)); + peer->gnp_connecting = GNILND_PEER_CONNECT; + kgnilnd_peer_addref(peer); /* extra ref for connd */ + + spin_lock(&dev->gnd_connd_lock); + list_add_tail(&peer->gnp_connd_list, &dev->gnd_connd_peers); + spin_unlock(&dev->gnd_connd_lock); + + kgnilnd_schedule_dgram(dev); + CDEBUG(D_NETTRACE, "scheduling new connect\n"); + + return NULL; +} + +/* Caller is responsible for deciding if/when to call this */ +void +kgnilnd_destroy_conn_ep(kgn_conn_t *conn) +{ + gni_return_t rrc; + gni_ep_handle_t tmp_ep; + + /* only if we actually initialized it, + * then set NULL to tell kgnilnd_destroy_conn to leave it alone */ + + tmp_ep = xchg(&conn->gnc_ephandle, NULL); + if (tmp_ep != NULL) { + /* we never re-use the EP, so unbind is not needed */ + mutex_lock(&conn->gnc_device->gnd_cq_mutex); + rrc = kgnilnd_ep_destroy(tmp_ep); + + mutex_unlock(&conn->gnc_device->gnd_cq_mutex); + + /* if this fails, it could hork up kgni smsg retransmit and others + * since we could free the SMSG mbox memory, etc. 
*/ + LASSERTF(rrc == GNI_RC_SUCCESS, "rrc %d conn 0x%p ep 0x%p\n", + rrc, conn, conn->gnc_ephandle); + + atomic_dec(&conn->gnc_device->gnd_neps); + + /* clear out count added in kgnilnd_close_conn_locked + * conn will have a peer once it hits finish_connect, where it + * is the first spot we'll mark it ESTABLISHED as well */ + if (conn->gnc_peer) { + kgnilnd_admin_decref(conn->gnc_peer->gnp_dirty_eps); + } + + /* drop ref for EP */ + kgnilnd_conn_decref(conn); + } +} + +void +kgnilnd_destroy_conn(kgn_conn_t *conn) +{ + LASSERTF(!in_interrupt() && + !conn->gnc_scheduled && + !conn->gnc_in_purgatory && + conn->gnc_ephandle == NULL && + list_empty(&conn->gnc_list) && + list_empty(&conn->gnc_hashlist) && + list_empty(&conn->gnc_schedlist) && + list_empty(&conn->gnc_mdd_list), + "conn 0x%p->%s IRQ %d sched %d purg %d ep 0x%p lists %d/%d/%d/%d\n", + conn, conn->gnc_peer ? libcfs_nid2str(conn->gnc_peer->gnp_nid) + : "", + !!in_interrupt(), conn->gnc_scheduled, + conn->gnc_in_purgatory, + conn->gnc_ephandle, + list_empty(&conn->gnc_list), + list_empty(&conn->gnc_hashlist), + list_empty(&conn->gnc_schedlist), + list_empty(&conn->gnc_mdd_list)); + + /* Tripping these is especially bad, as it means we have items on the + * lists that didn't keep their refcount on the connection - or + * somebody evil released their own */ + LASSERTF(list_empty(&conn->gnc_fmaq) && + atomic_read(&conn->gnc_nlive_fma) == 0 && + atomic_read(&conn->gnc_nlive_rdma) == 0, + "conn 0x%p fmaq %d@0x%p nfma %d nrdma %d\n", + conn, kgnilnd_count_list(&conn->gnc_fmaq), &conn->gnc_fmaq, + atomic_read(&conn->gnc_nlive_fma), atomic_read(&conn->gnc_nlive_rdma)); + + CDEBUG(D_NET, "destroying conn %p ephandle %p error %d\n", + conn, conn->gnc_ephandle, conn->gnc_error); + + /* if there is an FMA blk left here, we'll tear it down */ + if (conn->gnc_fma_blk) { + kgnilnd_release_mbox(conn, 0); + } + + if (conn->gnc_peer != NULL) + kgnilnd_peer_decref(conn->gnc_peer); + + if (conn->gnc_tx_ref_table != NULL) { + LIBCFS_FREE(conn->gnc_tx_ref_table, + GNILND_MAX_MSG_ID * sizeof(void *)); + } + + LIBCFS_FREE(conn, sizeof(*conn)); + atomic_dec(&kgnilnd_data.kgn_nconns); +} + +/* peer_alive and peer_notify done in the style of the o2iblnd */ +void +kgnilnd_peer_alive(kgn_peer_t *peer) +{ + set_mb(peer->gnp_last_alive, jiffies); +} + +void +kgnilnd_peer_notify(kgn_peer_t *peer, int error) +{ + int tell_lnet = 0; + int nnets = 0; + int rc; + int i, j; + kgn_conn_t *conn; + kgn_net_t **nets; + kgn_net_t *net; + + + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_DONT_NOTIFY)) + return; + + /* Tell LNet we are giving ups on this peer - but only + * if it isn't already reconnected or trying to reconnect */ + read_lock(&kgnilnd_data.kgn_peer_conn_lock); + + /* use kgnilnd_find_conn_locked to avoid any conns in the process of being nuked + * + * don't tell LNet if we are in reset - we assume that everyone will be able to + * reconnect just fine + */ + conn = kgnilnd_find_conn_locked(peer); + + CDEBUG(D_NETTRACE, "peer 0x%p->%s ting %d conn 0x%p, rst %d error %d\n", + peer, libcfs_nid2str(peer->gnp_nid), peer->gnp_connecting, conn, + kgnilnd_data.kgn_in_reset, error); + + if ((peer->gnp_connecting == GNILND_PEER_IDLE) && + (conn == NULL) && + (!kgnilnd_data.kgn_in_reset) && + (!kgnilnd_conn_clean_errno(error))) { + tell_lnet = 1; + } + + read_unlock(&kgnilnd_data.kgn_peer_conn_lock); + + if (!tell_lnet) { + /* short circuit if we dont need to notify Lnet */ + return; + } + + rc = down_read_trylock(&kgnilnd_data.kgn_net_rw_sem); + + if (rc) { + /* dont do this if this 
fails since LNET is in shutdown or something else + */ + + for (i = 0; i < *kgnilnd_tunables.kgn_net_hash_size; i++) { + list_for_each_entry(net , &kgnilnd_data.kgn_nets[i], gnn_list) { + /* if gnn_shutdown set for any net shutdown is in progress just return */ + if (net->gnn_shutdown) { + up_read(&kgnilnd_data.kgn_net_rw_sem); + return; + } + nnets++; + } + } + + if (nnets == 0) { + /* shutdown in progress most likely */ + up_read(&kgnilnd_data.kgn_net_rw_sem); + return; + } + + LIBCFS_ALLOC(nets, nnets * sizeof(*nets)); + + if (nets == NULL) { + up_read(&kgnilnd_data.kgn_net_rw_sem); + CERROR("Failed to allocate nets[%d]\n", nnets); + return; + } + + j = 0; + for (i = 0; i < *kgnilnd_tunables.kgn_net_hash_size; i++) { + list_for_each_entry(net, &kgnilnd_data.kgn_nets[i], gnn_list) { + nets[j] = net; + kgnilnd_net_addref(net); + j++; + } + } + up_read(&kgnilnd_data.kgn_net_rw_sem); + + for (i = 0; i < nnets; i++) { + lnet_nid_t peer_nid; + + net = nets[i]; + + peer_nid = kgnilnd_lnd2lnetnid(net->gnn_ni->ni_nid, + peer->gnp_nid); + + CDEBUG(D_NET, "peer 0x%p->%s last_alive %lu (%lus ago)\n", + peer, libcfs_nid2str(peer_nid), peer->gnp_last_alive, + cfs_duration_sec(jiffies - peer->gnp_last_alive)); + + lnet_notify(net->gnn_ni, peer_nid, 0, peer->gnp_last_alive); + + + kgnilnd_net_decref(net); + } + + LIBCFS_FREE(nets, nnets * sizeof(*nets)); + } +} + +/* need write_lock on kgn_peer_conn_lock */ +void +kgnilnd_close_conn_locked(kgn_conn_t *conn, int error) +{ + kgn_peer_t *peer = conn->gnc_peer; + ENTRY; + + LASSERT(!in_interrupt()); + + /* store error for tx completion */ + conn->gnc_error = error; + peer->gnp_last_errno = error; + + /* use real error from peer if possible */ + if (error == -ECONNRESET) { + error = conn->gnc_peer_error; + } + + /* if we NETERROR, make sure it is rate limited */ + if (!kgnilnd_conn_clean_errno(error)) { + CNETERR("closing conn to %s: error %d\n", + libcfs_nid2str(peer->gnp_nid), error); + } else { + CDEBUG(D_NET, "closing conn to %s: error %d\n", + libcfs_nid2str(peer->gnp_nid), error); + } + + LASSERTF(conn->gnc_state == GNILND_CONN_ESTABLISHED, + "conn %p to %s with bogus state %s\n", conn, + libcfs_nid2str(conn->gnc_peer->gnp_nid), + kgnilnd_conn_state2str(conn)); + LASSERT(!list_empty(&conn->gnc_hashlist)); + LASSERT(!list_empty(&conn->gnc_list)); + + + /* mark peer count here so any place the EP gets destroyed will + * open up the peer count so that a new ESTABLISHED conn is then free + * to send new messages -- sending before the previous EPs are destroyed + * could end up with messages on the network for the old conn _after_ + * the new conn and break the mbox safety protocol */ + kgnilnd_admin_addref(conn->gnc_peer->gnp_dirty_eps); + + /* Remove from conn hash table: no new callbacks */ + list_del_init(&conn->gnc_hashlist); + kgnilnd_data.kgn_conn_version++; + + /* if we are in reset, go right to CLOSED as there is no scheduler + * thread to move from CLOSING to CLOSED */ + if (unlikely(kgnilnd_data.kgn_in_reset)) { + conn->gnc_state = GNILND_CONN_CLOSED; + } else { + conn->gnc_state = GNILND_CONN_CLOSING; + } + + /* leave on peer->gnp_conns to make sure we don't let the reaper + * or others try to unlink this peer until the conn is fully + * processed for closing */ + + if (kgnilnd_check_purgatory_conn(conn)) { + kgnilnd_add_purgatory_locked(conn, conn->gnc_peer); + } + + /* Reset RX timeout to ensure we wait for an incoming CLOSE + * for the full timeout. If we get a CLOSE we know the + * peer has stopped all RDMA. 
Otherwise if we wait for + * the full timeout we can also be sure all RDMA has stopped. */ + conn->gnc_last_rx = conn->gnc_last_rx_cq = jiffies; + mb(); + + /* schedule sending CLOSE - if we are in quiesce, this adds to + * gnd_ready_conns and allows us to find it in quiesce processing */ + kgnilnd_schedule_conn(conn); + + /* lose peer's ref */ + kgnilnd_conn_decref(conn); + /* -1 for conn table */ + kgnilnd_conn_decref(conn); + + EXIT; +} + +void +kgnilnd_close_conn(kgn_conn_t *conn, int error) +{ + write_lock(&kgnilnd_data.kgn_peer_conn_lock); + /* need to check the state here - this call is racy and we don't + * know the state until after the lock is grabbed */ + if (conn->gnc_state == GNILND_CONN_ESTABLISHED) { + kgnilnd_close_conn_locked(conn, error); + } + write_unlock(&kgnilnd_data.kgn_peer_conn_lock); +} + +void +kgnilnd_complete_closed_conn(kgn_conn_t *conn) +{ + LIST_HEAD (sinners); + kgn_tx_t *tx, *txn; + int nlive = 0; + int nrdma = 0; + int nq_rdma = 0; + int logmsg; + ENTRY; + + /* Dump log on cksum error - wait until complete phase to let + * RX of error happen */ + if (*kgnilnd_tunables.kgn_checksum_dump && + (conn != NULL && conn->gnc_peer_error == -ENOKEY)) { + libcfs_debug_dumplog(); + } + + /* _CLOSED set in kgnilnd_process_fmaq once we decide to + * send the CLOSE or not */ + LASSERTF(conn->gnc_state == GNILND_CONN_CLOSED, + "conn 0x%p->%s with bad state %s\n", + conn, conn->gnc_peer ? + libcfs_nid2str(conn->gnc_peer->gnp_nid) : + "", + kgnilnd_conn_state2str(conn)); + + LASSERT(list_empty(&conn->gnc_hashlist)); + + /* we've sent the close, start nuking */ + + /* we don't use lists to track things that we can get out of the + * tx_ref table... */ + + /* need to hold locks for tx_list_state, sampling it is too racy: + * - the lock actually protects tx != NULL, but we can't take the proper + * lock until we check tx_list_state, which would be too late and + * we could have the TX change under us. + * gnd_rdmaq_lock and gnd_lock and not used together, so taking both + * should be fine */ + spin_lock(&conn->gnc_device->gnd_rdmaq_lock); + spin_lock(&conn->gnc_device->gnd_lock); + + for (nrdma = 0; nrdma < GNILND_MAX_MSG_ID; nrdma++) { + tx = conn->gnc_tx_ref_table[nrdma]; + + if (tx != NULL) { + /* only print the first error and if not CLOSE, we often don't see + * CQ events for that by the time we get here... and really don't care */ + if (nlive || tx->tx_msg.gnm_type == GNILND_MSG_CLOSE) + tx->tx_state |= GNILND_TX_QUIET_ERROR; + nlive++; + GNIDBG_TX(D_NET, tx, "cleaning up on close, nlive %d", nlive); + + /* don't worry about gnc_lock here as nobody else should be + * touching this conn */ + kgnilnd_tx_del_state_locked(tx, NULL, conn, GNILND_TX_ALLOCD); + list_add_tail(&tx->tx_list, &sinners); + } + } + spin_unlock(&conn->gnc_device->gnd_lock); + spin_unlock(&conn->gnc_device->gnd_rdmaq_lock); + + /* nobody should have marked this as needing scheduling after + * we called close - so only ref should be us handling it */ + LASSERTF(conn->gnc_scheduled == GNILND_CONN_PROCESS, + "conn 0x%p scheduled %d\n", conn, conn->gnc_scheduled); + + /* now reset a few to actual counters... 
*/ + nrdma = atomic_read(&conn->gnc_nlive_rdma); + nq_rdma = atomic_read(&conn->gnc_nq_rdma); + + if (!list_empty(&sinners)) { + list_for_each_entry_safe(tx, txn, &sinners, tx_list) { + /* clear tx_list to make tx_add_list_locked happy */ + list_del_init(&tx->tx_list); + /* The error codes determine if we hold onto the MDD */ + kgnilnd_tx_done(tx, conn->gnc_error); + } + } + + logmsg = (nlive + nrdma + nq_rdma); + + if (logmsg) { + if (conn->gnc_peer_error != 0) { + CNETERR("Closed conn 0x%p->%s (errno %d, peer errno %d): " + "canceled %d TX, %d/%d RDMA\n", + conn, libcfs_nid2str(conn->gnc_peer->gnp_nid), + conn->gnc_error, conn->gnc_peer_error, + nlive, nq_rdma, nrdma); + } else { + CNETERR("Closed conn 0x%p->%s (errno %d): " + "canceled %d TX, %d/%d RDMA\n", + conn, libcfs_nid2str(conn->gnc_peer->gnp_nid), + conn->gnc_error, + nlive, nq_rdma, nrdma); + } + } + + kgnilnd_destroy_conn_ep(conn); + + /* Bug 765042 - race this with completing a new conn to same peer - we need + * finish_connect to detach purgatory before we can do it ourselves here */ + CFS_RACE(CFS_FAIL_GNI_FINISH_PURG); + + /* now it is safe to remove from peer list - anyone looking at + * gnp_conns now is free to unlink if not on purgatory */ + write_lock(&kgnilnd_data.kgn_peer_conn_lock); + + conn->gnc_state = GNILND_CONN_DONE; + + /* Decrement counter if we are marked by del_conn_or_peers for closing + */ + if (conn->gnc_needs_closing) + kgnilnd_admin_decref(kgnilnd_data.kgn_npending_conns); + + /* Remove from peer's list of valid connections if its not in purgatory */ + if (!conn->gnc_in_purgatory) { + list_del_init(&conn->gnc_list); + } + + /* NB - only unlinking if we set pending in del_peer_locked from admin or + * shutdown */ + if (kgnilnd_peer_active(conn->gnc_peer) && + conn->gnc_peer->gnp_pending_unlink && + kgnilnd_can_unlink_peer_locked(conn->gnc_peer)) { + kgnilnd_unlink_peer_locked(conn->gnc_peer); + } + + write_unlock(&kgnilnd_data.kgn_peer_conn_lock); + + /* I'm telling Mommy! - use peer_error if they initiated close */ + kgnilnd_peer_notify(conn->gnc_peer, + conn->gnc_error == -ECONNRESET ? conn->gnc_peer_error + : conn->gnc_error); + + EXIT; +} + +int +kgnilnd_set_conn_params(kgn_dgram_t *dgram) +{ + kgn_conn_t *conn = dgram->gndg_conn; + kgn_connreq_t *connreq = &dgram->gndg_conn_in; + kgn_gniparams_t *rem_param = &connreq->gncr_gnparams; + gni_return_t rrc; + int rc = 0; + + /* set timeout vals in conn early so we can use them for the NAK */ + + /* use max of the requested and our timeout, peer will do the same */ + conn->gnc_timeout = MAX(conn->gnc_timeout, connreq->gncr_timeout); + + /* only ep_bind really mucks around with the CQ */ + /* only ep bind if we are not connecting to ourself and the dstnid is not a wildcard. this check + * is necessary as you can only bind an ep once and we must make sure we dont bind when already bound. 
+ */ + if (connreq->gncr_dstnid != LNET_NID_ANY && dgram->gndg_conn_out.gncr_dstnid != connreq->gncr_srcnid) { + mutex_lock(&conn->gnc_device->gnd_cq_mutex); + rrc = kgnilnd_ep_bind(conn->gnc_ephandle, + connreq->gncr_gnparams.gnpr_host_id, + conn->gnc_cqid); + mutex_unlock(&conn->gnc_device->gnd_cq_mutex); + if (rrc != GNI_RC_SUCCESS) { + rc = -ECONNABORTED; + goto return_out; + } + } + + rrc = kgnilnd_ep_set_eventdata(conn->gnc_ephandle, conn->gnc_cqid, + connreq->gncr_gnparams.gnpr_cqid); + if (rrc != GNI_RC_SUCCESS) { + rc = -ECONNABORTED; + goto cleanup_out; + } + + /* Initialize SMSG */ + rrc = kgnilnd_smsg_init(conn->gnc_ephandle, &conn->gnpr_smsg_attr, + &connreq->gncr_gnparams.gnpr_smsg_attr); + if (unlikely(rrc == GNI_RC_INVALID_PARAM)) { + gni_smsg_attr_t *local = &conn->gnpr_smsg_attr; + gni_smsg_attr_t *remote = &connreq->gncr_gnparams.gnpr_smsg_attr; + /* help folks figure out if there is a tunable off, etc. */ + LCONSOLE_ERROR("SMSG attribute mismatch. Data from local/remote:" + " type %d/%d msg_maxsize %u/%u" + " mbox_maxcredit %u/%u. Please check kgni" + " logs for further data\n", + local->msg_type, remote->msg_type, + local->msg_maxsize, remote->msg_maxsize, + local->mbox_maxcredit, remote->mbox_maxcredit); + } + if (rrc != GNI_RC_SUCCESS) { + rc = -ECONNABORTED; + goto cleanup_out; + } + + /* log this for help in debuggin SMSG buffer re-use */ + CDEBUG(D_NET, "conn %p src %s dst %s smsg %p acquired" + " local cqid %u SMSG %p->%u hndl "LPX64"."LPX64 + " remote cqid %u SMSG %p->%u hndl "LPX64"."LPX64"\n", + conn, libcfs_nid2str(connreq->gncr_srcnid), + libcfs_nid2str(connreq->gncr_dstnid), + &conn->gnpr_smsg_attr, + conn->gnc_cqid, + conn->gnpr_smsg_attr.msg_buffer, + conn->gnpr_smsg_attr.mbox_offset, + conn->gnpr_smsg_attr.mem_hndl.qword1, + conn->gnpr_smsg_attr.mem_hndl.qword2, + rem_param->gnpr_cqid, + rem_param->gnpr_smsg_attr.msg_buffer, + rem_param->gnpr_smsg_attr.mbox_offset, + rem_param->gnpr_smsg_attr.mem_hndl.qword1, + rem_param->gnpr_smsg_attr.mem_hndl.qword2); + + conn->gnc_peerstamp = connreq->gncr_peerstamp; + conn->gnc_peer_connstamp = connreq->gncr_connstamp; + + /* We update the reaper timeout once we have a valid conn and timeout */ + kgnilnd_update_reaper_timeout(GNILND_TO2KA(conn->gnc_timeout)); + + return 0; + +cleanup_out: + rrc = kgnilnd_ep_unbind(conn->gnc_ephandle); + /* not sure I can just let this fly */ + LASSERTF(rrc == GNI_RC_SUCCESS, + "bad rc from gni_ep_unbind trying to cleanup: %d\n", rrc); + +return_out: + LASSERTF(rc != 0, "SOFTWARE BUG: rc == 0\n"); + CERROR("Error setting connection params from %s: %d\n", + libcfs_nid2str(connreq->gncr_srcnid), rc); + return rc; +} + +/* needs down_read on kgn_net_rw_sem held from before this call until + * after the write_lock on kgn_peer_conn_lock - this ensures we stay sane + * with kgnilnd_shutdown - it'll get the sem and set shutdown, then get the + * kgn_peer_conn_lock to start del_peer'ing. If we hold the sem until after + * kgn_peer_conn_lock is held, we guarantee that nobody calls + * kgnilnd_add_peer_locked without checking gnn_shutdown */ +int +kgnilnd_create_peer_safe(kgn_peer_t **peerp, lnet_nid_t nid, kgn_net_t *net) +{ + kgn_peer_t *peer; + int rc; + + LASSERT(nid != LNET_NID_ANY); + + /* We dont pass the net around in the dgram anymore so here is where we find it + * this will work unless its in shutdown or the nid has a net that is invalid. + * Either way error code needs to be returned in that case. 
+ * + * If the net passed in is not NULL then we can use it, this alleviates looking it + * when the calling function has access to the data. + */ + if (net == NULL) { + rc = kgnilnd_find_net(nid, &net); + if (rc < 0) + return rc; + } else { + /* find net adds a reference on the net if we are not using + * it we must do it manually so the net references are + * correct when tearing down the net + */ + kgnilnd_net_addref(net); + } + + LIBCFS_ALLOC(peer, sizeof(*peer)); + if (peer == NULL) { + kgnilnd_net_decref(net); + return -ENOMEM; + } + peer->gnp_nid = nid; + + /* translate from nid to nic addr & store */ + rc = kgnilnd_nid_to_nicaddrs(LNET_NIDADDR(nid), 1, &peer->gnp_host_id); + if (rc <= 0) { + kgnilnd_net_decref(net); + LIBCFS_FREE(peer, sizeof(*peer)); + return -ESRCH; + } + CDEBUG(D_NET, "peer 0x%p->%s -> NIC 0x%x\n", peer, + libcfs_nid2str(nid), peer->gnp_host_id); + + atomic_set(&peer->gnp_refcount, 1); /* 1 ref for caller */ + atomic_set(&peer->gnp_dirty_eps, 0); + + INIT_LIST_HEAD(&peer->gnp_list); + INIT_LIST_HEAD(&peer->gnp_connd_list); + INIT_LIST_HEAD(&peer->gnp_conns); + INIT_LIST_HEAD(&peer->gnp_tx_queue); + + /* the first reconnect should happen immediately, so we leave + * gnp_reconnect_interval set to 0 */ + + LASSERTF(net != NULL, "peer 0x%p->%s with NULL net\n", + peer, libcfs_nid2str(nid)); + + /* must have kgn_net_rw_sem held for this... */ + if (net->gnn_shutdown) { + /* shutdown has started already */ + kgnilnd_net_decref(net); + LIBCFS_FREE(peer, sizeof(*peer)); + return -ESHUTDOWN; + } + + peer->gnp_net = net; + + atomic_inc(&kgnilnd_data.kgn_npeers); + + *peerp = peer; + return 0; +} + +void +kgnilnd_destroy_peer(kgn_peer_t *peer) +{ + CDEBUG(D_NET, "peer %s %p deleted\n", + libcfs_nid2str(peer->gnp_nid), peer); + LASSERTF(atomic_read(&peer->gnp_refcount) == 0, + "peer 0x%p->%s refs %d\n", + peer, libcfs_nid2str(peer->gnp_nid), + atomic_read(&peer->gnp_refcount)); + LASSERTF(atomic_read(&peer->gnp_dirty_eps) == 0, + "peer 0x%p->%s dirty eps %d\n", + peer, libcfs_nid2str(peer->gnp_nid), + atomic_read(&peer->gnp_dirty_eps)); + LASSERTF(peer->gnp_net != NULL, "peer %p (%s) with NULL net\n", + peer, libcfs_nid2str(peer->gnp_nid)); + LASSERTF(!kgnilnd_peer_active(peer), + "peer 0x%p->%s\n", + peer, libcfs_nid2str(peer->gnp_nid)); + LASSERTF(peer->gnp_connecting == GNILND_PEER_IDLE || peer->gnp_connecting == GNILND_PEER_KILL, + "peer 0x%p->%s, connecting %d\n", + peer, libcfs_nid2str(peer->gnp_nid), peer->gnp_connecting); + LASSERTF(list_empty(&peer->gnp_conns), + "peer 0x%p->%s\n", + peer, libcfs_nid2str(peer->gnp_nid)); + LASSERTF(list_empty(&peer->gnp_tx_queue), + "peer 0x%p->%s\n", + peer, libcfs_nid2str(peer->gnp_nid)); + LASSERTF(list_empty(&peer->gnp_connd_list), + "peer 0x%p->%s\n", + peer, libcfs_nid2str(peer->gnp_nid)); + + /* NB a peer's connections keep a reference on their peer until + * they are destroyed, so we can be assured that _all_ state to do + * with this peer has been cleaned up when its refcount drops to + * zero. 
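The assertions in kgnilnd_destroy_peer above encode the usual reference-counting discipline: the creator holds one reference, each connection holds one, and the peer is freed only when the count reaches zero. Below is a stand-alone model of that pattern; the names are illustrative, and the real kgnilnd_peer_addref/decref helpers use kernel atomics rather than C11 ones.

/* Model of the rule stated above: connections keep a reference on their peer,
 * so the peer is freed only after the last reference is dropped.
 * Illustrative only -- not the kgnilnd implementation. */
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct demo_peer {
	atomic_int refcount;
};

static struct demo_peer *peer_create(void)
{
	struct demo_peer *p = calloc(1, sizeof(*p));

	if (p == NULL)
		abort();
	atomic_init(&p->refcount, 1);	/* one reference for the creator */
	return p;
}

static void peer_addref(struct demo_peer *p)
{
	atomic_fetch_add(&p->refcount, 1);
}

static void peer_decref(struct demo_peer *p)
{
	if (atomic_fetch_sub(&p->refcount, 1) == 1) {
		printf("last reference dropped, freeing peer\n");
		free(p);
	}
}

int main(void)
{
	struct demo_peer *p = peer_create();

	peer_addref(p);		/* a connection takes its reference */
	peer_decref(p);		/* the connection is destroyed */
	peer_decref(p);		/* creator's reference: the peer is freed here */
	return 0;
}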
*/ + + atomic_dec(&kgnilnd_data.kgn_npeers); + kgnilnd_net_decref(peer->gnp_net); + + LIBCFS_FREE(peer, sizeof(*peer)); +} + +/* the conn might not have made it all the way through to a connected + * state - but we need to purgatory any conn that a remote peer might + * have seen through a posted dgram as well */ +void +kgnilnd_add_purgatory_locked(kgn_conn_t *conn, kgn_peer_t *peer) +{ + kgn_mbox_info_t *mbox = NULL; + ENTRY; + + /* NB - the caller should own conn by removing him from the + * scheduler thread when finishing the close */ + + LASSERTF(peer != NULL, "conn %p with NULL peer\n", conn); + + /* If this is still true, need to add the calls to unlink back in and + * figure out how to close the hole on loopback conns */ + LASSERTF(kgnilnd_peer_active(peer), "can't use inactive peer %s (%p)" + " we'll never recover the resources\n", + libcfs_nid2str(peer->gnp_nid), peer); + + CDEBUG(D_NET, "conn %p peer %p dev %p\n", conn, peer, + conn->gnc_device); + + /* add ref for mbox purgatory hold */ + kgnilnd_peer_addref(peer); + kgnilnd_conn_addref(conn); + conn->gnc_in_purgatory = 1; + + mbox = &conn->gnc_fma_blk->gnm_mbox_info[conn->gnc_mbox_id]; + mbox->mbx_prev_nid = peer->gnp_nid; + mbox->mbx_add_purgatory = jiffies; + kgnilnd_release_mbox(conn, 1); + + LASSERTF(list_empty(&conn->gnc_mdd_list), + "conn 0x%p->%s with active purgatory hold MDD %d\n", + conn, libcfs_nid2str(peer->gnp_nid), + kgnilnd_count_list(&conn->gnc_mdd_list)); + + EXIT; +} + +/* Instead of detaching everything from purgatory here we just mark the conn as needing + * detach, when the reaper checks the conn the next time it will detach it. + * Calling function requires write_lock held on kgn_peer_conn_lock + */ +void +kgnilnd_mark_for_detach_purgatory_all_locked(kgn_peer_t *peer) { + kgn_conn_t *conn; + + list_for_each_entry(conn, &peer->gnp_conns, gnc_list) { + if (conn->gnc_in_purgatory && !conn->gnc_needs_detach) { + conn->gnc_needs_detach = 1; + kgnilnd_admin_addref(kgnilnd_data.kgn_npending_detach); + } + } +} + +/* Calling function needs a write_lock held on kgn_peer_conn_lock */ +void +kgnilnd_detach_purgatory_locked(kgn_conn_t *conn, struct list_head *conn_list) +{ + kgn_mbox_info_t *mbox = NULL; + + /* if needed, add the conn purgatory data to the list passed in */ + if (conn->gnc_in_purgatory) { + CDEBUG(D_NET, "peer %p->%s purg_conn %p@%s mdd_list #tx %d\n", + conn->gnc_peer, libcfs_nid2str(conn->gnc_peer->gnp_nid), + conn, kgnilnd_conn_state2str(conn), + kgnilnd_count_list(&conn->gnc_mdd_list)); + + mbox = &conn->gnc_fma_blk->gnm_mbox_info[conn->gnc_mbox_id]; + mbox->mbx_detach_of_purgatory = jiffies; + + /* conn->gnc_list is the entry point on peer->gnp_conns, so detaching it + * here removes it from the list of 'valid' peer connections. + * We put the current conn onto a list of conns to call kgnilnd_release_purgatory_locked() + * and as such the caller of kgnilnd_detach_purgatory_locked() now owns that conn, since its not + * on the peer's conn_list anymore. + */ + + kgnilnd_peer_decref(conn->gnc_peer); + list_del_init(&conn->gnc_list); + + /* NB - only unlinking if we set pending in del_peer_locked from admin or + * shutdown */ + if (kgnilnd_peer_active(conn->gnc_peer) && + conn->gnc_peer->gnp_pending_unlink && + kgnilnd_can_unlink_peer_locked(conn->gnc_peer)) { + kgnilnd_unlink_peer_locked(conn->gnc_peer); + } + /* The reaper will not call detach unless the conn is fully through kgnilnd_complete_closed_conn. 
+ * If the conn is not in a DONE state somehow we are attempting to detach even though + * the conn has not been fully cleaned up. If we detach while the conn is still closing + * we will end up with an orphaned connection that has valid ep_handle, that is not on a + * peer. + */ + + LASSERTF(conn->gnc_state == GNILND_CONN_DONE, "Conn in invalid state %p@%s \n", + conn, kgnilnd_conn_state2str(conn)); + + /* move from peer to the delayed release list */ + list_add_tail(&conn->gnc_list, conn_list); + } +} + +void +kgnilnd_release_purgatory_list(struct list_head *conn_list) +{ + kgn_device_t *dev; + kgn_conn_t *conn, *connN; + kgn_mdd_purgatory_t *gmp, *gmpN; + + list_for_each_entry_safe(conn, connN, conn_list, gnc_list) { + dev = conn->gnc_device; + + kgnilnd_release_mbox(conn, -1); + conn->gnc_in_purgatory = 0; + + list_del_init(&conn->gnc_list); + + /* gnc_needs_detach is set in kgnilnd_del_conn_or_peer. It is used to keep track + * of conns that have been marked for detach by kgnilnd_del_conn_or_peer. + * The function uses kgn_npending_detach to verify the conn has + * actually been detached. + */ + + if (conn->gnc_needs_detach) + kgnilnd_admin_decref(kgnilnd_data.kgn_npending_detach); + + /* if this guy is really dead (we are doing release from reaper), + * make sure we tell LNet - if this is from other context, + * the checks in the function will prevent an errant + * notification */ + kgnilnd_peer_notify(conn->gnc_peer, conn->gnc_error); + + list_for_each_entry_safe(gmp, gmpN, &conn->gnc_mdd_list, + gmp_list) { + CDEBUG(D_NET, + "dev %p releasing held mdd "LPX64"."LPX64"\n", + conn->gnc_device, gmp->gmp_map_key.qword1, + gmp->gmp_map_key.qword2); + + atomic_dec(&dev->gnd_n_mdd_held); + kgnilnd_mem_mdd_release(conn->gnc_device->gnd_handle, + &gmp->gmp_map_key); + /* ignoring the return code - if kgni/ghal can't find it + * it must be released already */ + + list_del_init(&gmp->gmp_list); + LIBCFS_FREE(gmp, sizeof(*gmp)); + } + /* lose conn ref for purgatory */ + kgnilnd_conn_decref(conn); + } +} + +/* needs write_lock on kgnilnd_data.kgn_peer_conn_lock held */ +void +kgnilnd_peer_increase_reconnect_locked(kgn_peer_t *peer) +{ + int current_to; + + current_to = peer->gnp_reconnect_interval; + + /* we'll try to reconnect fast the first time, then back-off */ + if (current_to == 0) { + peer->gnp_reconnect_time = jiffies - 1; + current_to = *kgnilnd_tunables.kgn_min_reconnect_interval; + } else { + peer->gnp_reconnect_time = jiffies + cfs_time_seconds(current_to); + /* add 50% of min timeout & retry */ + current_to += *kgnilnd_tunables.kgn_min_reconnect_interval / 2; + } + + current_to = MIN(current_to, + *kgnilnd_tunables.kgn_max_reconnect_interval); + + peer->gnp_reconnect_interval = current_to; + CDEBUG(D_NET, "peer %s can reconnect at %lu interval %lu\n", + libcfs_nid2str(peer->gnp_nid), peer->gnp_reconnect_time, + peer->gnp_reconnect_interval); +} + +/* needs kgnilnd_data.kgn_peer_conn_lock held */ +kgn_peer_t * +kgnilnd_find_peer_locked(lnet_nid_t nid) +{ + struct list_head *peer_list = kgnilnd_nid2peerlist(nid); + kgn_peer_t *peer; + + /* Chopping nid down to only NIDADDR using LNET_NIDADDR so we only + * have a single peer per device instead of a peer per nid/net combo. 
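The comment above is the key to peer lookup: only the address half of the NID is compared, so one physical peer reached over several LNet networks collapses to a single kgn_peer_t. The sketch below models that rule with local stand-in macros; the split assumed here (network number in the high 32 bits, address in the low 32 bits of the 64-bit NID) follows the standard LNET_NIDNET/LNET_NIDADDR layout, but the macros are not the libcfs ones.

/* Model of matching peers by NID address only, as kgnilnd_find_peer_locked
 * does below.  DEMO_NIDADDR/DEMO_NIDNET are stand-ins for the LNet macros. */
#include <stdint.h>
#include <stdio.h>

#define DEMO_NIDADDR(nid)	((uint32_t)((nid) & 0xffffffffULL))
#define DEMO_NIDNET(nid)	((uint32_t)((nid) >> 32))

static int same_peer(uint64_t a, uint64_t b)
{
	return DEMO_NIDADDR(a) == DEMO_NIDADDR(b);
}

int main(void)
{
	uint64_t nid_net1 = ((uint64_t)1 << 32) | 0x2a;	/* address 42 on net 1 */
	uint64_t nid_net2 = ((uint64_t)2 << 32) | 0x2a;	/* address 42 on net 2 */

	printf("nets %u and %u, same gnilnd peer: %d\n",
	       DEMO_NIDNET(nid_net1), DEMO_NIDNET(nid_net2),
	       same_peer(nid_net1, nid_net2));
	return 0;
}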
+ */ + + list_for_each_entry(peer, peer_list, gnp_list) { + if (LNET_NIDADDR(nid) != LNET_NIDADDR(peer->gnp_nid)) + continue; + + CDEBUG(D_NET, "got peer [%p] -> %s c %d (%d)\n", + peer, libcfs_nid2str(nid), + peer->gnp_connecting, + atomic_read(&peer->gnp_refcount)); + return peer; + } + return NULL; +} + +/* need write_lock on kgn_peer_conn_lock */ +void +kgnilnd_unlink_peer_locked(kgn_peer_t *peer) +{ + LASSERTF(list_empty(&peer->gnp_conns), + "peer 0x%p->%s\n", + peer, libcfs_nid2str(peer->gnp_nid)); + LASSERTF(list_empty(&peer->gnp_tx_queue), + "peer 0x%p->%s\n", + peer, libcfs_nid2str(peer->gnp_nid)); + LASSERTF(kgnilnd_peer_active(peer), + "peer 0x%p->%s\n", + peer, libcfs_nid2str(peer->gnp_nid)); + CDEBUG(D_NET, "unlinking peer 0x%p->%s\n", + peer, libcfs_nid2str(peer->gnp_nid)); + + list_del_init(&peer->gnp_list); + kgnilnd_data.kgn_peer_version++; + kgnilnd_admin_decref(kgnilnd_data.kgn_npending_unlink); + /* lose peerlist's ref */ + kgnilnd_peer_decref(peer); +} + +int +kgnilnd_get_peer_info(int index, + kgn_peer_t **found_peer, + lnet_nid_t *id, __u32 *nic_addr, + int *refcount, int *connecting) +{ + struct list_head *ptmp; + kgn_peer_t *peer; + int i; + int rc = -ENOENT; + + read_lock(&kgnilnd_data.kgn_peer_conn_lock); + + for (i = 0; i < *kgnilnd_tunables.kgn_peer_hash_size; i++) { + + list_for_each(ptmp, &kgnilnd_data.kgn_peers[i]) { + peer = list_entry(ptmp, kgn_peer_t, gnp_list); + + if (peer->gnp_nid != *id) + continue; + + if (index-- > 0) + continue; + + CDEBUG(D_NET, "found peer %p (%s) at index %d\n", + peer, libcfs_nid2str(peer->gnp_nid), index); + + *found_peer = peer; + *id = peer->gnp_nid; + *nic_addr = peer->gnp_host_id; + *refcount = atomic_read(&peer->gnp_refcount); + *connecting = peer->gnp_connecting; + + rc = 0; + goto out; + } + } +out: + read_unlock(&kgnilnd_data.kgn_peer_conn_lock); + if (rc) + CDEBUG(D_NET, "no gni peer at index %d\n", index); + return rc; +} + +/* requires write_lock on kgn_peer_conn_lock held */ +void +kgnilnd_add_peer_locked(lnet_nid_t nid, kgn_peer_t *new_stub_peer, kgn_peer_t **peerp) +{ + kgn_peer_t *peer, *peer2; + + LASSERTF(new_stub_peer != NULL, "bad stub peer for nid %s\n", + libcfs_nid2str(nid)); + + peer2 = kgnilnd_find_peer_locked(nid); + if (peer2 != NULL) { + /* A peer was created during the lock transition, so drop + * the new one we created */ + kgnilnd_peer_decref(new_stub_peer); + peer = peer2; + } else { + peer = new_stub_peer; + /* peer table takes existing ref on peer */ + + LASSERTF(!kgnilnd_peer_active(peer), + "peer 0x%p->%s already in peer table\n", + peer, libcfs_nid2str(peer->gnp_nid)); + list_add_tail(&peer->gnp_list, + kgnilnd_nid2peerlist(nid)); + kgnilnd_data.kgn_peer_version++; + } + + LASSERTF(peer->gnp_net != NULL, "peer 0x%p->%s with NULL net\n", + peer, libcfs_nid2str(peer->gnp_nid)); + *peerp = peer; +} + +int +kgnilnd_add_peer(kgn_net_t *net, lnet_nid_t nid, kgn_peer_t **peerp) +{ + kgn_peer_t *peer; + int rc; + ENTRY; + + if (nid == LNET_NID_ANY) + return -EINVAL; + + /* NB - this will not block during normal operations - + * the only writer of this is in the startup/shutdown path. 
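kgnilnd_peer_increase_reconnect_locked, a little earlier in this file, reconnects immediately after the first failure and then waits the stored interval, adding half of the minimum interval after each subsequent failure and capping at the maximum. Below is a stand-alone sketch of that schedule; the 60 s/600 s values are assumed examples, not the kgn_min_reconnect_interval/kgn_max_reconnect_interval defaults.

/* Sketch of the back-off in kgnilnd_peer_increase_reconnect_locked: the first
 * failure retries at once, later failures wait the stored interval, which
 * grows by min/2 each time and is capped at max.
 * The tunable values below are assumed for illustration only. */
#include <stdio.h>

int main(void)
{
	const int min_interval = 60;	/* assumed seconds */
	const int max_interval = 600;	/* assumed seconds */
	int interval = 0;
	int failure;

	for (failure = 1; failure <= 22; failure++) {
		int wait = interval;	/* 0 on the first failure: retry at once */

		printf("failure %2d: next attempt in %3d s\n", failure, wait);

		if (interval == 0)
			interval = min_interval;
		else
			interval += min_interval / 2;
		if (interval > max_interval)
			interval = max_interval;
	}
	return 0;
}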
*/ + rc = down_read_trylock(&kgnilnd_data.kgn_net_rw_sem); + if (!rc) { + rc = -ESHUTDOWN; + RETURN(rc); + } + rc = kgnilnd_create_peer_safe(&peer, nid, net); + if (rc != 0) { + up_read(&kgnilnd_data.kgn_net_rw_sem); + RETURN(rc); + } + + write_lock(&kgnilnd_data.kgn_peer_conn_lock); + up_read(&kgnilnd_data.kgn_net_rw_sem); + + kgnilnd_add_peer_locked(nid, peer, peerp); + + CDEBUG(D_NET, "peer 0x%p->%s connecting %d\n", + peerp, libcfs_nid2str((*peerp)->gnp_nid), + (*peerp)->gnp_connecting); + + write_unlock(&kgnilnd_data.kgn_peer_conn_lock); + RETURN(0); +} + +/* needs write_lock on kgn_peer_conn_lock */ +void +kgnilnd_cancel_peer_connect_locked(kgn_peer_t *peer, struct list_head *zombies) +{ + kgn_tx_t *tx, *txn; + + /* we do care about state of gnp_connecting - we could be between + * reconnect attempts, so try to find the dgram and cancel the TX + * anyways. If we are in the process of posting DONT do anything; + * once it fails or succeeds we can nuke the connect attempt. + * We have no idea where in kgnilnd_post_dgram we are so we cant + * attempt to cancel until the function is done. + */ + + /* make sure peer isn't in process of connecting or waiting for connect*/ + spin_lock(&peer->gnp_net->gnn_dev->gnd_connd_lock); + if (!(list_empty(&peer->gnp_connd_list))) { + list_del_init(&peer->gnp_connd_list); + /* remove connd ref */ + kgnilnd_peer_decref(peer); + } + spin_unlock(&peer->gnp_net->gnn_dev->gnd_connd_lock); + + if (peer->gnp_connecting == GNILND_PEER_POSTING || peer->gnp_connecting == GNILND_PEER_NEEDS_DEATH) { + peer->gnp_connecting = GNILND_PEER_NEEDS_DEATH; + /* We are in process of posting right now the xchg set it up for us to + * cancel the connect so we are finished for now */ + } else { + /* no need for exchange we have the peer lock and its ready for us to nuke */ + LASSERTF(peer->gnp_connecting != GNILND_PEER_POSTING, + "Peer in invalid state 0x%p->%s, connecting %d\n", + peer, libcfs_nid2str(peer->gnp_nid), peer->gnp_connecting); + peer->gnp_connecting = GNILND_PEER_IDLE; + set_mb(peer->gnp_last_dgram_errno, -ETIMEDOUT); + kgnilnd_find_and_cancel_dgram(peer->gnp_net->gnn_dev, + peer->gnp_nid); + } + + /* The least we can do is nuke the tx's no matter what.... 
*/ + list_for_each_entry_safe(tx, txn, &peer->gnp_tx_queue, tx_list) { + kgnilnd_tx_del_state_locked(tx, peer, NULL, + GNILND_TX_ALLOCD); + list_add_tail(&tx->tx_list, zombies); + } +} + +/* needs write_lock on kgn_peer_conn_lock */ +void +kgnilnd_del_peer_locked(kgn_peer_t *peer, int error) +{ + /* this peer could be passive and only held for purgatory, + * take a ref to ensure it doesn't disappear in this function */ + kgnilnd_peer_addref(peer); + + CFS_RACE(CFS_FAIL_GNI_FIND_TARGET); + + /* if purgatory release cleared it out, don't try again */ + if (kgnilnd_peer_active(peer)) { + /* always do this to allow kgnilnd_start_connect and + * kgnilnd_finish_connect to catch this before they + * wrap up their operations */ + if (kgnilnd_can_unlink_peer_locked(peer)) { + /* already released purgatory, so only active + * conns hold it */ + kgnilnd_unlink_peer_locked(peer); + } else { + kgnilnd_close_peer_conns_locked(peer, error); + /* peer unlinks itself when last conn is closed */ + } + } + + /* we are done, release back to the wild */ + kgnilnd_peer_decref(peer); +} + +int +kgnilnd_del_conn_or_peer(kgn_net_t *net, lnet_nid_t nid, int command, + int error) +{ + LIST_HEAD (souls); + LIST_HEAD (zombies); + struct list_head *ptmp, *pnxt; + kgn_peer_t *peer; + int lo; + int hi; + int i; + int rc = -ENOENT; + + write_lock(&kgnilnd_data.kgn_peer_conn_lock); + + if (nid != LNET_NID_ANY) + lo = hi = kgnilnd_nid2peerlist(nid) - kgnilnd_data.kgn_peers; + else { + lo = 0; + hi = *kgnilnd_tunables.kgn_peer_hash_size - 1; + /* wildcards always succeed */ + rc = 0; + } + + for (i = lo; i <= hi; i++) { + list_for_each_safe(ptmp, pnxt, &kgnilnd_data.kgn_peers[i]) { + peer = list_entry(ptmp, kgn_peer_t, gnp_list); + + LASSERTF(peer->gnp_net != NULL, + "peer %p (%s) with NULL net\n", + peer, libcfs_nid2str(peer->gnp_nid)); + + if (net != NULL && peer->gnp_net != net) + continue; + + if (!(nid == LNET_NID_ANY || LNET_NIDADDR(peer->gnp_nid) == LNET_NIDADDR(nid))) + continue; + + /* In both cases, we want to stop any in-flight + * connect attempts */ + kgnilnd_cancel_peer_connect_locked(peer, &zombies); + + switch (command) { + case GNILND_DEL_CONN: + kgnilnd_close_peer_conns_locked(peer, error); + break; + case GNILND_DEL_PEER: + peer->gnp_pending_unlink = 1; + kgnilnd_admin_addref(kgnilnd_data.kgn_npending_unlink); + kgnilnd_mark_for_detach_purgatory_all_locked(peer); + kgnilnd_del_peer_locked(peer, error); + break; + case GNILND_CLEAR_PURGATORY: + /* Mark everything ready for detach reaper will cleanup + * once we release the kgn_peer_conn_lock + */ + kgnilnd_mark_for_detach_purgatory_all_locked(peer); + peer->gnp_last_errno = -EISCONN; + /* clear reconnect so he can reconnect soon */ + peer->gnp_reconnect_time = 0; + peer->gnp_reconnect_interval = 0; + break; + default: + CERROR("bad command %d\n", command); + LBUG(); + } + /* we matched something */ + rc = 0; + } + } + + write_unlock(&kgnilnd_data.kgn_peer_conn_lock); + + /* release all of the souls found held in purgatory */ + kgnilnd_release_purgatory_list(&souls); + + /* nuke peer TX */ + kgnilnd_txlist_done(&zombies, error); + + /* This function does not return until the commands it initiated have completed, + * since they have to work there way through the other threads. In the case of shutdown + * threads are not woken up until after this call is initiated so we cannot wait, we just + * need to return. The same applies for stack reset we shouldnt wait as the reset thread + * handles closing. 
+ */ + + CFS_RACE(CFS_FAIL_GNI_RACE_RESET); + + if (error == -ENOTRECOVERABLE || error == -ESHUTDOWN) { + return rc; + } + + i = 4; + while (atomic_read(&kgnilnd_data.kgn_npending_conns) || + atomic_read(&kgnilnd_data.kgn_npending_detach) || + atomic_read(&kgnilnd_data.kgn_npending_unlink)) { + + cfs_pause(cfs_time_seconds(1)); + i++; + + CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, "Waiting on %d peers %d closes %d detaches\n", + atomic_read(&kgnilnd_data.kgn_npending_unlink), + atomic_read(&kgnilnd_data.kgn_npending_conns), + atomic_read(&kgnilnd_data.kgn_npending_detach)); + } + + return rc; +} + +kgn_conn_t * +kgnilnd_get_conn_by_idx(int index) +{ + kgn_peer_t *peer; + struct list_head *ptmp; + kgn_conn_t *conn; + struct list_head *ctmp; + int i; + + + for (i = 0; i < *kgnilnd_tunables.kgn_peer_hash_size; i++) { + read_lock(&kgnilnd_data.kgn_peer_conn_lock); + list_for_each(ptmp, &kgnilnd_data.kgn_peers[i]) { + + peer = list_entry(ptmp, kgn_peer_t, gnp_list); + + list_for_each(ctmp, &peer->gnp_conns) { + conn = list_entry(ctmp, kgn_conn_t, gnc_list); + + if (conn->gnc_state != GNILND_CONN_ESTABLISHED) + continue; + + if (index-- > 0) + continue; + + CDEBUG(D_NET, "++conn[%p] -> %s (%d)\n", conn, + libcfs_nid2str(conn->gnc_peer->gnp_nid), + atomic_read(&conn->gnc_refcount)); + kgnilnd_conn_addref(conn); + read_unlock(&kgnilnd_data.kgn_peer_conn_lock); + return conn; + } + } + read_unlock(&kgnilnd_data.kgn_peer_conn_lock); + } + + return NULL; +} + +int +kgnilnd_get_conn_info(kgn_peer_t *peer, + int *device_id, __u64 *peerstamp, + int *tx_seq, int *rx_seq, + int *fmaq_len, int *nfma, int *nrdma) +{ + kgn_conn_t *conn; + int rc = 0; + + read_lock(&kgnilnd_data.kgn_peer_conn_lock); + + conn = kgnilnd_find_conn_locked(peer); + if (conn == NULL) { + rc = -ENOENT; + goto out; + } + + *device_id = conn->gnc_device->gnd_host_id; + *peerstamp = conn->gnc_peerstamp; + *tx_seq = conn->gnc_tx_seq; + *rx_seq = conn->gnc_rx_seq; + *fmaq_len = kgnilnd_count_list(&conn->gnc_fmaq); + *nfma = atomic_read(&conn->gnc_nlive_fma); + *nrdma = atomic_read(&conn->gnc_nlive_rdma); +out: + read_unlock(&kgnilnd_data.kgn_peer_conn_lock); + return rc; +} + +/* needs write_lock on kgn_peer_conn_lock */ +int +kgnilnd_close_peer_conns_locked(kgn_peer_t *peer, int why) +{ + kgn_conn_t *conn; + struct list_head *ctmp, *cnxt; + int count = 0; + + list_for_each_safe(ctmp, cnxt, &peer->gnp_conns) { + conn = list_entry(ctmp, kgn_conn_t, gnc_list); + + if (conn->gnc_state != GNILND_CONN_ESTABLISHED) + continue; + + count++; + /* we mark gnc_needs closing and increment kgn_npending_conns so that + * kgnilnd_del_conn_or_peer can wait on the other threads closing + * and cleaning up the connection. 
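The wait loop in kgnilnd_del_conn_or_peer above polls the kgn_npending_* counters once a second and picks its log level with ((i & (-i)) == i), which is true exactly when i is a power of two, so the WARNING-level message fires on exponentially spaced iterations rather than every second. A quick stand-alone illustration of the idiom:

/* (i & -i) isolates the lowest set bit of i; it equals i only when i is a
 * power of two, which is what throttles the louder log level above. */
#include <stdio.h>

int main(void)
{
	int i;

	for (i = 4; i <= 64; i++) {
		if ((i & (-i)) == i)
			printf("iteration %2d would log at WARNING level\n", i);
	}
	return 0;
}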
+ */ + if (!conn->gnc_needs_closing) { + conn->gnc_needs_closing = 1; + kgnilnd_admin_addref(kgnilnd_data.kgn_npending_conns); + } + kgnilnd_close_conn_locked(conn, why); + } + return count; +} + +int +kgnilnd_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg) +{ + struct libcfs_ioctl_data *data = arg; + kgn_net_t *net = ni->ni_data; + int rc = -EINVAL; + + LASSERT(ni == net->gnn_ni); + + switch (cmd) { + case IOC_LIBCFS_GET_PEER: { + lnet_nid_t nid = 0; + kgn_peer_t *peer = NULL; + __u32 nic_addr = 0; + __u64 peerstamp = 0; + int peer_refcount = 0, peer_connecting = 0; + int device_id = 0; + int tx_seq = 0, rx_seq = 0; + int fmaq_len = 0, nfma = 0, nrdma = 0; + + rc = kgnilnd_get_peer_info(data->ioc_count, &peer, + &nid, &nic_addr, &peer_refcount, + &peer_connecting); + if (rc) + break; + + /* Barf */ + /* LNET_MKNID is used to mask from lnet the multiplexing/demultiplexing of connections and peers + * LNET assumes a conn and peer per net, the LNET_MKNID/LNET_NIDADDR allows us to let Lnet see what it + * wants to see instead of the underlying network that is being used to send the data + */ + data->ioc_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), LNET_NIDADDR(nid)); + data->ioc_flags = peer_connecting; + data->ioc_count = peer_refcount; + + rc = kgnilnd_get_conn_info(peer, &device_id, &peerstamp, + &tx_seq, &rx_seq, &fmaq_len, + &nfma, &nrdma); + + /* This is allowable - a persistent peer could not + * have a connection */ + if (rc) { + /* flag to indicate we are not connected - + * need to print as such */ + data->ioc_flags |= (1<<16); + rc = 0; + } else { + /* still barf */ + data->ioc_net = device_id; + data->ioc_u64[0] = peerstamp; + data->ioc_u32[0] = fmaq_len; + data->ioc_u32[1] = nfma; + data->ioc_u32[2] = tx_seq; + data->ioc_u32[3] = rx_seq; + data->ioc_u32[4] = nrdma; + } + break; + } + case IOC_LIBCFS_ADD_PEER: { + /* just dummy value to allow using common interface */ + kgn_peer_t *peer; + rc = kgnilnd_add_peer(net, data->ioc_nid, &peer); + break; + } + case IOC_LIBCFS_DEL_PEER: { + /* NULL is passed in so it affects all peers in existence without regard to network + * as the peer may not exist on the network LNET believes it to be on. + */ + rc = kgnilnd_del_conn_or_peer(NULL, data->ioc_nid, + GNILND_DEL_PEER, -EUCLEAN); + break; + } + case IOC_LIBCFS_GET_CONN: { + kgn_conn_t *conn = kgnilnd_get_conn_by_idx(data->ioc_count); + + if (conn == NULL) + rc = -ENOENT; + else { + rc = 0; + /* LNET_MKNID is used to build the correct address based on what LNET wants to see instead of + * the generic connection that is used to send the data + */ + data->ioc_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), LNET_NIDADDR(conn->gnc_peer->gnp_nid)); + data->ioc_u32[0] = conn->gnc_device->gnd_id; + kgnilnd_conn_decref(conn); + } + break; + } + case IOC_LIBCFS_CLOSE_CONNECTION: { + /* use error = -ENETRESET to indicate it was lctl disconnect */ + /* NULL is passed in so it affects all the nets as the connection is virtual + * and may not exist on the network LNET believes it to be on. 
+ */ + rc = kgnilnd_del_conn_or_peer(NULL, data->ioc_nid, + GNILND_DEL_CONN, -ENETRESET); + break; + } + case IOC_LIBCFS_PUSH_CONNECTION: { + /* we use this to flush purgatory */ + rc = kgnilnd_del_conn_or_peer(NULL, data->ioc_nid, + GNILND_CLEAR_PURGATORY, -EUCLEAN); + break; + } + case IOC_LIBCFS_REGISTER_MYNID: { + /* Ignore if this is a noop */ + if (data->ioc_nid == ni->ni_nid) { + rc = 0; + } else { + CERROR("obsolete IOC_LIBCFS_REGISTER_MYNID: %s(%s)\n", + libcfs_nid2str(data->ioc_nid), + libcfs_nid2str(ni->ni_nid)); + rc = -EINVAL; + } + break; + } + } + + return rc; +} + +void +kgnilnd_query(lnet_ni_t *ni, lnet_nid_t nid, cfs_time_t *when) +{ + kgn_net_t *net = ni->ni_data; + kgn_tx_t *tx; + kgn_peer_t *peer = NULL; + kgn_conn_t *conn = NULL; + lnet_process_id_t id = {.nid = nid, .pid = LUSTRE_SRV_LNET_PID}; + ENTRY; + + /* I expect to find him, so only take a read lock */ + read_lock(&kgnilnd_data.kgn_peer_conn_lock); + peer = kgnilnd_find_peer_locked(nid); + if (peer != NULL) { + /* LIE if in a quiesce - we will update the timeouts after, + * but we don't want sends failing during it */ + if (kgnilnd_data.kgn_quiesce_trigger) { + *when = jiffies; + read_unlock(&kgnilnd_data.kgn_peer_conn_lock); + GOTO(out, 0); + } + + /* Update to best guess, might refine on later checks */ + *when = peer->gnp_last_alive; + + /* we have a peer, how about a conn? */ + conn = kgnilnd_find_conn_locked(peer); + + if (conn == NULL) { + /* if there is no conn, check peer last errno to see if clean disconnect + * - if it was, we lie to LNet because we believe a TX would complete + * on reconnect */ + if (kgnilnd_conn_clean_errno(peer->gnp_last_errno)) { + *when = jiffies; + } + /* we still want to fire a TX and new conn in this case */ + } else { + /* gnp_last_alive is valid, run for the hills */ + read_unlock(&kgnilnd_data.kgn_peer_conn_lock); + GOTO(out, 0); + } + } + /* if we get here, either we have no peer or no conn for him, so fire off + * new TX to trigger conn setup */ + read_unlock(&kgnilnd_data.kgn_peer_conn_lock); + + /* if we couldn't find him, we'll fire up a TX and get connected - + * if we don't do this, after ni_peer_timeout, LNet will declare him dead. + * So really we treat kgnilnd_query as a bit of a 'connect now' type + * event because it'll only do this when it wants to send + * + * Use a real TX for this to get the proper gnp_tx_queue behavior, etc + * normally we'd use kgnilnd_send_ctlmsg for this, but we don't really + * care that this goes out quickly since we already know we need a new conn + * formed */ + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_NOOP_SEND)) + return; + + tx = kgnilnd_new_tx_msg(GNILND_MSG_NOOP, ni->ni_nid); + if (tx != NULL) { + kgnilnd_launch_tx(tx, net, &id); + } +out: + CDEBUG(D_NETTRACE, "peer 0x%p->%s when %lu\n", peer, + libcfs_nid2str(nid), *when); + EXIT; +} + +int +kgnilnd_dev_init(kgn_device_t *dev) +{ + gni_return_t rrc; + int rc = 0; + unsigned int cq_size; + ENTRY; + + /* size of these CQs should be able to accommodate the outgoing + * RDMA and SMSG transactions. Since we really don't know what we + * really need here, we'll take credits * 2 * 3 to allow a bunch. + * We need to dig into this more with the performance work. 
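+ * For a rough sense of scale only: if kgn_credits were set to 256,
+ * this sizing would give 256 * 2 * 3 = 1536 entries per send CQ;
+ * 256 is purely an illustrative value here, not the module default.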
*/ + cq_size = *kgnilnd_tunables.kgn_credits * 2 * 3; + + rrc = kgnilnd_cdm_create(dev->gnd_id, *kgnilnd_tunables.kgn_ptag, + GNILND_COOKIE, 0, + &dev->gnd_domain); + if (rrc != GNI_RC_SUCCESS) { + CERROR("Can't create CDM %d (%d)\n", dev->gnd_id, rrc); + rc = -ENODEV; + GOTO(failed, rc); + } + + rrc = kgnilnd_cdm_attach(dev->gnd_domain, dev->gnd_id, + &dev->gnd_host_id, &dev->gnd_handle); + if (rrc != GNI_RC_SUCCESS) { + CERROR("Can't attach CDM to device %d (%d)\n", + dev->gnd_id, rrc); + rc = -ENODEV; + GOTO(failed, rc); + } + + rc = kgnilnd_setup_nic_translation(dev->gnd_host_id); + if (rc != 0) { + rc = -ENODEV; + GOTO(failed, rc); + } + + /* only dev 0 gets the errors - no need to reset the stack twice + * - this works because we have a single PTAG, if we had more + * then we'd need to have multiple handlers */ + if (dev->gnd_id == 0) { + rrc = kgnilnd_subscribe_errors(dev->gnd_handle, GNI_ERRMASK_CRITICAL, + 0, NULL, kgnilnd_critical_error, + &dev->gnd_err_handle); + if (rrc != GNI_RC_SUCCESS) { + CERROR("Can't subscribe for errors on device %d: rc %d\n", + dev->gnd_id, rrc); + rc = -ENODEV; + GOTO(failed, rc); + } + + rc = kgnilnd_set_quiesce_callback(dev->gnd_handle, + kgnilnd_quiesce_end_callback); + if (rc != GNI_RC_SUCCESS) { + CERROR("Can't subscribe for quiesce callback on device %d: rc %d\n", + dev->gnd_id, rrc); + rc = -ENODEV; + GOTO(failed, rc); + } + } + + rc = kgnilnd_nicaddr_to_nid(dev->gnd_host_id, &dev->gnd_nid); + if (rc < 0) { + /* log messages during startup */ + if (kgnilnd_data.kgn_init < GNILND_INIT_ALL) { + CERROR("couldn't translate host_id 0x%x to nid. rc %d\n", + dev->gnd_host_id, rc); + } + rc = -ESRCH; + GOTO(failed, rc); + } + CDEBUG(D_NET, "NIC %x -> NID %d\n", dev->gnd_host_id, dev->gnd_nid); + + rrc = kgnilnd_cq_create(dev->gnd_handle, cq_size, + 0, kgnilnd_device_callback, + dev->gnd_id, &dev->gnd_snd_rdma_cqh); + if (rrc != GNI_RC_SUCCESS) { + CERROR("Can't create rdma send cq size %u for device " + "%d (%d)\n", cq_size, dev->gnd_id, rrc); + rc = -EINVAL; + GOTO(failed, rc); + } + + rrc = kgnilnd_cq_create(dev->gnd_handle, cq_size, + 0, kgnilnd_device_callback, dev->gnd_id, + &dev->gnd_snd_fma_cqh); + if (rrc != GNI_RC_SUCCESS) { + CERROR("Can't create fma send cq size %u for device %d (%d)\n", + cq_size, dev->gnd_id, rrc); + rc = -EINVAL; + GOTO(failed, rc); + } + + /* This one we size differently - overflows are possible and it needs to be + * sized based on machine size */ + rrc = kgnilnd_cq_create(dev->gnd_handle, + *kgnilnd_tunables.kgn_fma_cq_size, + 0, kgnilnd_device_callback, dev->gnd_id, + &dev->gnd_rcv_fma_cqh); + if (rrc != GNI_RC_SUCCESS) { + CERROR("Can't create fma cq size %d for device %d (%d)\n", + *kgnilnd_tunables.kgn_fma_cq_size, dev->gnd_id, rrc); + rc = -EINVAL; + GOTO(failed, rc); + } + + RETURN(0); + +failed: + kgnilnd_dev_fini(dev); + RETURN(rc); +} + +void +kgnilnd_dev_fini(kgn_device_t *dev) +{ + gni_return_t rrc; + ENTRY; + + /* At quiesce or rest time, need to loop through and clear gnd_ready_conns ?*/ + LASSERTF(list_empty(&dev->gnd_ready_conns) && + list_empty(&dev->gnd_map_tx) && + list_empty(&dev->gnd_rdmaq), + "dev 0x%p ready_conns %d@0x%p map_tx %d@0x%p rdmaq %d@0x%p\n", + dev, kgnilnd_count_list(&dev->gnd_ready_conns), &dev->gnd_ready_conns, + kgnilnd_count_list(&dev->gnd_map_tx), &dev->gnd_map_tx, + kgnilnd_count_list(&dev->gnd_rdmaq), &dev->gnd_rdmaq); + + /* These should follow from tearing down all connections */ + LASSERTF(dev->gnd_map_nphys == 0 && dev->gnd_map_physnop == 0, + "%d physical mappings of %d 
pages still mapped\n", + dev->gnd_map_nphys, dev->gnd_map_physnop); + + LASSERTF(dev->gnd_map_nvirt == 0 && dev->gnd_map_virtnob == 0, + "%d virtual mappings of "LPU64" bytes still mapped\n", + dev->gnd_map_nvirt, dev->gnd_map_virtnob); + + LASSERTF(atomic_read(&dev->gnd_n_mdd) == 0 && + atomic_read(&dev->gnd_n_mdd_held) == 0 && + atomic64_read(&dev->gnd_nbytes_map) == 0, + "%d SMSG mappings of %ld bytes still mapped or held %d\n", + atomic_read(&dev->gnd_n_mdd), + atomic64_read(&dev->gnd_nbytes_map), atomic_read(&dev->gnd_n_mdd_held)); + + LASSERT(list_empty(&dev->gnd_map_list)); + + /* What other assertions needed to ensure all connections torn down ? */ + + /* check all counters == 0 (EP, MDD, etc) */ + + /* if we are resetting due to quiese (stack reset), don't check + * thread states */ + LASSERTF(kgnilnd_data.kgn_quiesce_trigger || + atomic_read(&kgnilnd_data.kgn_nthreads) == 0, + "tried to shutdown with threads active\n"); + + if (dev->gnd_rcv_fma_cqh) { + rrc = kgnilnd_cq_destroy(dev->gnd_rcv_fma_cqh); + LASSERTF(rrc == GNI_RC_SUCCESS, + "bad rc from gni_cq_destroy on rcv_fma_cqh: %d\n", rrc); + dev->gnd_rcv_fma_cqh = NULL; + } + + if (dev->gnd_snd_rdma_cqh) { + rrc = kgnilnd_cq_destroy(dev->gnd_snd_rdma_cqh); + LASSERTF(rrc == GNI_RC_SUCCESS, + "bad rc from gni_cq_destroy on send_rdma_cqh: %d\n", rrc); + dev->gnd_snd_rdma_cqh = NULL; + } + + if (dev->gnd_snd_fma_cqh) { + rrc = kgnilnd_cq_destroy(dev->gnd_snd_fma_cqh); + LASSERTF(rrc == GNI_RC_SUCCESS, + "bad rc from gni_cq_destroy on snd_fma_cqh: %d\n", rrc); + dev->gnd_snd_fma_cqh = NULL; + } + + if (dev->gnd_err_handle) { + rrc = kgnilnd_release_errors(dev->gnd_err_handle); + LASSERTF(rrc == GNI_RC_SUCCESS, + "bad rc from gni_release_errors: %d\n", rrc); + dev->gnd_err_handle = NULL; + } + + if (dev->gnd_domain) { + rrc = kgnilnd_cdm_destroy(dev->gnd_domain); + LASSERTF(rrc == GNI_RC_SUCCESS, + "bad rc from gni_cdm_destroy: %d\n", rrc); + dev->gnd_domain = NULL; + } + + EXIT; +} + + +int kgnilnd_base_startup(void) +{ + struct timeval tv; + int pkmem = atomic_read(&libcfs_kmemory); + int rc; + int i; + kgn_device_t *dev; + struct task_struct *thrd; + ENTRY; + + LASSERTF(kgnilnd_data.kgn_init == GNILND_INIT_NOTHING, + "init %d\n", kgnilnd_data.kgn_init); + + /* zero pointers, flags etc */ + memset(&kgnilnd_data, 0, sizeof(kgnilnd_data)); + memset(&kgnilnd_hssops, 0, sizeof(kgnilnd_hssops)); + + /* CAVEAT EMPTOR: Every 'Fma' message includes the sender's NID and + * a unique (for all time) connstamp so we can uniquely identify + * the sender. The connstamp is an incrementing counter + * initialised with seconds + microseconds at startup time. So we + * rely on NOT creating connections more frequently on average than + * 1MHz to ensure we don't use old connstamps when we reboot. 
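+ * Worked example: booting at tv_sec = 1354730000, tv_usec = 500000
+ * gives an initial stamp of 1354730000 * 1000000 + 500000 =
+ * 1354730000500000, and each new connection then increments from there.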
*/ + do_gettimeofday(&tv); + kgnilnd_data.kgn_connstamp = + kgnilnd_data.kgn_peerstamp = + (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec; + + init_rwsem(&kgnilnd_data.kgn_net_rw_sem); + + for (i = 0; i < GNILND_MAXDEVS; i++) { + kgn_device_t *dev = &kgnilnd_data.kgn_devices[i]; + + dev->gnd_id = i; + INIT_LIST_HEAD(&dev->gnd_ready_conns); + INIT_LIST_HEAD(&dev->gnd_map_tx); + INIT_LIST_HEAD(&dev->gnd_fma_buffs); + mutex_init(&dev->gnd_cq_mutex); + sema_init(&dev->gnd_fmablk_sem, 1); + spin_lock_init(&dev->gnd_fmablk_lock); + init_waitqueue_head(&dev->gnd_waitq); + init_waitqueue_head(&dev->gnd_dgram_waitq); + init_waitqueue_head(&dev->gnd_dgping_waitq); + spin_lock_init(&dev->gnd_lock); + INIT_LIST_HEAD(&dev->gnd_map_list); + spin_lock_init(&dev->gnd_map_lock); + atomic_set(&dev->gnd_nfmablk, 0); + atomic_set(&dev->gnd_fmablk_vers, 1); + atomic_set(&dev->gnd_neps, 0); + atomic_set(&dev->gnd_canceled_dgrams, 0); + INIT_LIST_HEAD(&dev->gnd_connd_peers); + spin_lock_init(&dev->gnd_connd_lock); + spin_lock_init(&dev->gnd_dgram_lock); + spin_lock_init(&dev->gnd_rdmaq_lock); + INIT_LIST_HEAD(&dev->gnd_rdmaq); + + /* alloc & setup nid based dgram table */ + LIBCFS_ALLOC(dev->gnd_dgrams, + sizeof(struct list_head) * *kgnilnd_tunables.kgn_peer_hash_size); + + if (dev->gnd_dgrams == NULL) { + rc = -ENOMEM; + GOTO(failed, rc); + } + + for (i = 0; i < *kgnilnd_tunables.kgn_peer_hash_size; i++) { + INIT_LIST_HEAD(&dev->gnd_dgrams[i]); + } + atomic_set(&dev->gnd_ndgrams, 0); + + /* setup timer for RDMAQ processing */ + setup_timer(&dev->gnd_rdmaq_timer, kgnilnd_schedule_device_timer, + (unsigned long)dev); + } + + /* CQID 0 isn't allowed, set to MAX_MSG_ID - 1 to check for conflicts early */ + kgnilnd_data.kgn_next_cqid = GNILND_MAX_MSG_ID - 1; + kgnilnd_data.kgn_new_min_timeout = *kgnilnd_tunables.kgn_timeout; + init_waitqueue_head(&kgnilnd_data.kgn_reaper_waitq); + init_waitqueue_head(&kgnilnd_data.kgn_ruhroh_waitq); + spin_lock_init(&kgnilnd_data.kgn_reaper_lock); + + sema_init(&kgnilnd_data.kgn_quiesce_sem, 1); + atomic_set(&kgnilnd_data.kgn_nquiesce, 0); + atomic_set(&kgnilnd_data.kgn_npending_conns, 0); + atomic_set(&kgnilnd_data.kgn_npending_unlink, 0); + atomic_set(&kgnilnd_data.kgn_npending_detach, 0); + /* OK to call kgnilnd_api_shutdown() to cleanup now */ + kgnilnd_data.kgn_init = GNILND_INIT_DATA; + PORTAL_MODULE_USE; + + rwlock_init(&kgnilnd_data.kgn_peer_conn_lock); + + LIBCFS_ALLOC(kgnilnd_data.kgn_peers, + sizeof(struct list_head) * *kgnilnd_tunables.kgn_peer_hash_size); + + if (kgnilnd_data.kgn_peers == NULL) { + rc = -ENOMEM; + GOTO(failed, rc); + } + + for (i = 0; i < *kgnilnd_tunables.kgn_peer_hash_size; i++) { + INIT_LIST_HEAD(&kgnilnd_data.kgn_peers[i]); + } + + LIBCFS_ALLOC(kgnilnd_data.kgn_conns, + sizeof(struct list_head) * *kgnilnd_tunables.kgn_peer_hash_size); + + if (kgnilnd_data.kgn_conns == NULL) { + rc = -ENOMEM; + GOTO(failed, rc); + } + + for (i = 0; i < *kgnilnd_tunables.kgn_peer_hash_size; i++) { + INIT_LIST_HEAD(&kgnilnd_data.kgn_conns[i]); + } + + LIBCFS_ALLOC(kgnilnd_data.kgn_nets, + sizeof(struct list_head) * *kgnilnd_tunables.kgn_net_hash_size); + + if (kgnilnd_data.kgn_nets == NULL) { + rc = -ENOMEM; + GOTO(failed, rc); + } + + for (i = 0; i < *kgnilnd_tunables.kgn_net_hash_size; i++) { + INIT_LIST_HEAD(&kgnilnd_data.kgn_nets[i]); + } + + kgnilnd_data.kgn_mbox_cache = + cfs_mem_cache_create("kgn_mbox_block", + KMALLOC_MAX_SIZE, + 0, /* offset */ + SLAB_HWCACHE_ALIGN); /* flags */ + if (kgnilnd_data.kgn_mbox_cache == NULL) { + CERROR("Can't create slab for physical 
mbox blocks\n"); + rc = -ENOMEM; + GOTO(failed, rc); + } + + kgnilnd_data.kgn_rx_cache = + cfs_mem_cache_create("kgn_rx_t", + sizeof(kgn_rx_t), + 0, /* offset */ + 0); /* flags */ + if (kgnilnd_data.kgn_rx_cache == NULL) { + CERROR("Can't create slab for kgn_rx_t descriptors\n"); + rc = -ENOMEM; + GOTO(failed, rc); + } + + kgnilnd_data.kgn_tx_cache = + cfs_mem_cache_create("kgn_tx_t", + sizeof(kgn_tx_t), + 0, /* offset */ + 0); /* flags */ + if (kgnilnd_data.kgn_tx_cache == NULL) { + CERROR("Can't create slab for kgn_tx_t\n"); + rc = -ENOMEM; + GOTO(failed, rc); + } + + kgnilnd_data.kgn_tx_phys_cache = + cfs_mem_cache_create("kgn_tx_phys", + LNET_MAX_IOV * sizeof(gni_mem_segment_t), + 0, /* offset */ + 0); /* flags */ + if (kgnilnd_data.kgn_tx_phys_cache == NULL) { + CERROR("Can't create slab for kgn_tx_phys\n"); + rc = -ENOMEM; + GOTO(failed, rc); + } + + kgnilnd_data.kgn_dgram_cache = + cfs_mem_cache_create("kgn_dgram_t", + sizeof(kgn_dgram_t), + 0, /* offset */ + 0); /* flags */ + if (kgnilnd_data.kgn_dgram_cache == NULL) { + CERROR("Can't create slab for outgoing datagrams\n"); + rc = -ENOMEM; + GOTO(failed, rc); + } + + /* allocate a MAX_IOV array of page pointers for each cpu */ + kgnilnd_data.kgn_cksum_map_pages = kmalloc(num_possible_cpus() * sizeof (struct page *), + GFP_KERNEL); + if (kgnilnd_data.kgn_cksum_map_pages == NULL) { + CERROR("Can't allocate vmap cksum pages\n"); + rc = -ENOMEM; + GOTO(failed, rc); + } + kgnilnd_data.kgn_cksum_npages = num_possible_cpus(); + memset(kgnilnd_data.kgn_cksum_map_pages, 0, + kgnilnd_data.kgn_cksum_npages * sizeof (struct page *)); + + for (i = 0; i < kgnilnd_data.kgn_cksum_npages; i++) { + kgnilnd_data.kgn_cksum_map_pages[i] = kmalloc(LNET_MAX_IOV * sizeof (struct page *), + GFP_KERNEL); + if (kgnilnd_data.kgn_cksum_map_pages[i] == NULL) { + CERROR("Can't allocate vmap cksum pages for cpu %d\n", i); + rc = -ENOMEM; + GOTO(failed, rc); + } + } + + LASSERT(kgnilnd_data.kgn_ndevs == 0); + + /* Use all available GNI devices */ + for (i = 0; i < GNILND_MAXDEVS; i++) { + dev = &kgnilnd_data.kgn_devices[kgnilnd_data.kgn_ndevs]; + + rc = kgnilnd_dev_init(dev); + if (rc == 0) { + /* Increment here so base_shutdown cleans it up */ + kgnilnd_data.kgn_ndevs++; + + rc = kgnilnd_allocate_phys_fmablk(dev); + if (rc) { + GOTO(failed, rc); + } + } + } + + if (kgnilnd_data.kgn_ndevs == 0) { + CERROR("Can't initialise any GNI devices\n"); + rc = -ENODEV; + GOTO(failed, rc); + } + + rc = kgnilnd_thread_start(kgnilnd_reaper, NULL, "kgnilnd_rpr", 0); + if (rc != 0) { + CERROR("Can't spawn gnilnd reaper: %d\n", rc); + GOTO(failed, rc); + } + + /* + * Start ruhroh thread. We can't use kgnilnd_thread_start() because + * we don't want this thread included in kgnilnd_data.kgn_nthreads + * count. This thread controls quiesce, so it mustn't + * quiesce itself. 
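+ * (kgn_nthreads is the counter kgnilnd_base_shutdown() spins on while
+ * waiting for worker threads to exit; the ruhroh thread instead uses
+ * its own kgn_ruhroh_shutdown / kgn_ruhroh_running handshake there.)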
+ */ + thrd = kthread_run(kgnilnd_ruhroh_thread, NULL, "%s_%02d", "kgnilnd_rr", 0); + if (IS_ERR(thrd)) { + rc = PTR_ERR(thrd); + CERROR("Can't spawn gnilnd ruhroh thread: %d\n", rc); + GOTO(failed, rc); + } + + /* threads will load balance across devs as they are available */ + for (i = 0; i < *kgnilnd_tunables.kgn_sched_threads; i++) { + rc = kgnilnd_thread_start(kgnilnd_scheduler, (void *)((long)i), + "kgnilnd_sd", i); + if (rc != 0) { + CERROR("Can't spawn gnilnd scheduler[%d]: %d\n", + i, rc); + GOTO(failed, rc); + } + } + + for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) { + dev = &kgnilnd_data.kgn_devices[i]; + rc = kgnilnd_thread_start(kgnilnd_dgram_mover, dev, + "kgnilnd_dg", dev->gnd_id); + if (rc != 0) { + CERROR("Can't spawn gnilnd dgram_mover[%d]: %d\n", + dev->gnd_id, rc); + GOTO(failed, rc); + } + + rc = kgnilnd_thread_start(kgnilnd_dgram_waitq, dev, + "kgnilnd_dgn", dev->gnd_id); + if (rc != 0) { + CERROR("Can't spawn gnilnd dgram_waitq[%d]: %d\n", + dev->gnd_id, rc); + GOTO(failed, rc); + } + + rc = kgnilnd_setup_wildcard_dgram(dev); + + if (rc != 0) { + CERROR("Can't create wildcard dgrams[%d]: %d\n", + dev->gnd_id, rc); + GOTO(failed, rc); + } + } + + + + /* flag everything initialised */ + kgnilnd_data.kgn_init = GNILND_INIT_ALL; + /*****************************************************/ + + CDEBUG(D_MALLOC, "initial kmem %d\n", pkmem); + RETURN(0); + +failed: + kgnilnd_base_shutdown(); + kgnilnd_data.kgn_init = GNILND_INIT_NOTHING; + RETURN(rc); +} + +void +kgnilnd_base_shutdown(void) +{ + int i; + ENTRY; + + while (CFS_FAIL_TIMEOUT(CFS_FAIL_GNI_PAUSE_SHUTDOWN, 1)) {}; + + kgnilnd_data.kgn_wc_kill = 1; + + for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) { + kgn_device_t *dev = &kgnilnd_data.kgn_devices[i]; + kgnilnd_cancel_wc_dgrams(dev); + kgnilnd_del_conn_or_peer(NULL, LNET_NID_ANY, GNILND_DEL_PEER, -ESHUTDOWN); + kgnilnd_wait_for_canceled_dgrams(dev); + } + + /* Peer state all cleaned up BEFORE setting shutdown, so threads don't + * have to worry about shutdown races. NB connections may be created + * while there are still active connds, but these will be temporary + * since peer creation always fails after the listener has started to + * shut down. + * all peers should have been cleared out on the nets */ + LASSERTF(atomic_read(&kgnilnd_data.kgn_npeers) == 0, + "peers left %d\n", atomic_read(&kgnilnd_data.kgn_npeers)); + + /* Wait for the ruhroh thread to shut down. */ + kgnilnd_data.kgn_ruhroh_shutdown = 1; + wake_up(&kgnilnd_data.kgn_ruhroh_waitq); + i = 2; + while (kgnilnd_data.kgn_ruhroh_running != 0) { + i++; + CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, + "Waiting for ruhroh thread to terminate\n"); + cfs_pause(cfs_time_seconds(1)); + } + + /* Flag threads to terminate */ + kgnilnd_data.kgn_shutdown = 1; + + for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) { + kgn_device_t *dev = &kgnilnd_data.kgn_devices[i]; + + /* should clear all the MDDs */ + kgnilnd_unmap_phys_fmablk(dev); + + kgnilnd_schedule_device(dev); + wake_up_all(&dev->gnd_dgram_waitq); + wake_up_all(&dev->gnd_dgping_waitq); + LASSERT(list_empty(&dev->gnd_connd_peers)); + } + + spin_lock(&kgnilnd_data.kgn_reaper_lock); + wake_up_all(&kgnilnd_data.kgn_reaper_waitq); + spin_unlock(&kgnilnd_data.kgn_reaper_lock); + + /* Wait for threads to exit */ + i = 2; + while (atomic_read(&kgnilnd_data.kgn_nthreads) != 0) { + i++; + CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? 
*/ + "Waiting for %d threads to terminate\n", + atomic_read(&kgnilnd_data.kgn_nthreads)); + cfs_pause(cfs_time_seconds(1)); + } + + LASSERTF(atomic_read(&kgnilnd_data.kgn_npeers) == 0, + "peers left %d\n", atomic_read(&kgnilnd_data.kgn_npeers)); + + if (kgnilnd_data.kgn_peers != NULL) { + for (i = 0; i < *kgnilnd_tunables.kgn_peer_hash_size; i++) + LASSERT(list_empty(&kgnilnd_data.kgn_peers[i])); + + LIBCFS_FREE(kgnilnd_data.kgn_peers, + sizeof (struct list_head) * + *kgnilnd_tunables.kgn_peer_hash_size); + } + + down_write(&kgnilnd_data.kgn_net_rw_sem); + if (kgnilnd_data.kgn_nets != NULL) { + for (i = 0; i < *kgnilnd_tunables.kgn_net_hash_size; i++) + LASSERT(list_empty(&kgnilnd_data.kgn_nets[i])); + + LIBCFS_FREE(kgnilnd_data.kgn_nets, + sizeof (struct list_head) * + *kgnilnd_tunables.kgn_net_hash_size); + } + up_write(&kgnilnd_data.kgn_net_rw_sem); + + LASSERTF(atomic_read(&kgnilnd_data.kgn_nconns) == 0, + "conns left %d\n", atomic_read(&kgnilnd_data.kgn_nconns)); + + if (kgnilnd_data.kgn_conns != NULL) { + for (i = 0; i < *kgnilnd_tunables.kgn_peer_hash_size; i++) + LASSERT(list_empty(&kgnilnd_data.kgn_conns[i])); + + LIBCFS_FREE(kgnilnd_data.kgn_conns, + sizeof (struct list_head) * + *kgnilnd_tunables.kgn_peer_hash_size); + } + + for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) { + kgn_device_t *dev = &kgnilnd_data.kgn_devices[i]; + kgnilnd_dev_fini(dev); + + LASSERTF(atomic_read(&dev->gnd_ndgrams) == 0, + "dgrams left %d\n", atomic_read(&dev->gnd_ndgrams)); + + if (dev->gnd_dgrams != NULL) { + for (i = 0; i < *kgnilnd_tunables.kgn_peer_hash_size; i++) + LASSERT(list_empty(&dev->gnd_dgrams[i])); + + LIBCFS_FREE(dev->gnd_dgrams, + sizeof (struct list_head) * + *kgnilnd_tunables.kgn_peer_hash_size); + } + + kgnilnd_free_phys_fmablk(dev); + } + + if (kgnilnd_data.kgn_mbox_cache != NULL) { + i = cfs_mem_cache_destroy(kgnilnd_data.kgn_mbox_cache); + LASSERTF(i == 0, "rc %d destroying kgn_mbox_cache\n", i); + } + + if (kgnilnd_data.kgn_rx_cache != NULL) { + i = cfs_mem_cache_destroy(kgnilnd_data.kgn_rx_cache); + LASSERTF(i == 0, "rc %d destroying kgn_rx_cache\n", i); + } + + if (kgnilnd_data.kgn_tx_cache != NULL) { + i = cfs_mem_cache_destroy(kgnilnd_data.kgn_tx_cache); + LASSERTF(i == 0, "rc %d destroying kgn_tx_cache\n", i); + } + + if (kgnilnd_data.kgn_tx_phys_cache != NULL) { + i = cfs_mem_cache_destroy(kgnilnd_data.kgn_tx_phys_cache); + LASSERTF(i == 0, "rc %d destroying kgn_tx_phys_cache\n", i); + } + + if (kgnilnd_data.kgn_dgram_cache != NULL) { + i = cfs_mem_cache_destroy(kgnilnd_data.kgn_dgram_cache); + LASSERTF(i == 0, "rc %d destroying kgn_dgram_cache\n", i); + } + + if (kgnilnd_data.kgn_cksum_map_pages != NULL) { + for (i = 0; i < kgnilnd_data.kgn_cksum_npages; i++) { + if (kgnilnd_data.kgn_cksum_map_pages[i] != NULL) { + kfree(kgnilnd_data.kgn_cksum_map_pages[i]); + } + } + kfree(kgnilnd_data.kgn_cksum_map_pages); + } + + CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n", + atomic_read(&libcfs_kmemory)); + + kgnilnd_data.kgn_init = GNILND_INIT_NOTHING; + PORTAL_MODULE_UNUSE; + + EXIT; +} + +int +kgnilnd_startup(lnet_ni_t *ni) +{ + int rc, devno; + kgn_net_t *net; + ENTRY; + + LASSERTF(ni->ni_lnd == &the_kgnilnd, + "bad LND 0x%p != the_kgnilnd @ 0x%p\n", + ni->ni_lnd, &the_kgnilnd); + + if (kgnilnd_data.kgn_init == GNILND_INIT_NOTHING) { + rc = kgnilnd_base_startup(); + if (rc != 0) + RETURN(rc); + } + + /* Serialize with shutdown. 
*/ + down(&kgnilnd_data.kgn_quiesce_sem); + + LIBCFS_ALLOC(net, sizeof(*net)); + if (net == NULL) { + CERROR("could not allocate net for new interface instance\n"); + rc = -ENOMEM; + /* no need to cleanup the CDM... */ + GOTO(failed, rc); + } + INIT_LIST_HEAD(&net->gnn_list); + ni->ni_data = net; + net->gnn_ni = ni; + ni->ni_maxtxcredits = *kgnilnd_tunables.kgn_credits; + ni->ni_peertxcredits = *kgnilnd_tunables.kgn_peer_credits; + + if (*kgnilnd_tunables.kgn_peer_health) { + int fudge; + + /* give this a bit of leeway - we don't have a hard timeout + * as we only check timeouts periodically - see comment in kgnilnd_reaper */ + fudge = (GNILND_TO2KA(*kgnilnd_tunables.kgn_timeout) / GNILND_REAPER_NCHECKS); + + ni->ni_peertimeout = *kgnilnd_tunables.kgn_timeout + fudge; + + LCONSOLE_INFO("Enabling LNet peer health for gnilnd, timeout %ds\n", + ni->ni_peertimeout); + } + + atomic_set(&net->gnn_refcount, 1); + + /* if we have multiple devices, spread the nets around */ + net->gnn_netnum = LNET_NETNUM(LNET_NIDNET(ni->ni_nid)); + + devno = LNET_NIDNET(ni->ni_nid) % GNILND_MAXDEVS; + net->gnn_dev = &kgnilnd_data.kgn_devices[devno]; + + /* allocate a 'dummy' cdm for datagram use. We can only have a single + * datagram between a nid:inst_id and nid2:inst_id. The fake cdm + * give us additional inst_id to use, allowing the datagrams to flow + * like rivers of honey and beer */ + + /* the instance id for the cdm is the NETNUM offset by MAXDEVS - + * ensuring we'll have a unique id */ + + + ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), net->gnn_dev->gnd_nid); + CDEBUG(D_NET, "adding net %p nid=%s on dev %d \n", + net, libcfs_nid2str(ni->ni_nid), net->gnn_dev->gnd_id); + /* until the gnn_list is set, we need to cleanup ourselves as + * kgnilnd_shutdown is just gonna get confused */ + + down_write(&kgnilnd_data.kgn_net_rw_sem); + list_add_tail(&net->gnn_list, kgnilnd_netnum2netlist(net->gnn_netnum)); + up_write(&kgnilnd_data.kgn_net_rw_sem); + + /* we need a separate thread to call probe_wait_by_id until + * we get a function callback notifier from kgni */ + up(&kgnilnd_data.kgn_quiesce_sem); + RETURN(0); + failed: + up(&kgnilnd_data.kgn_quiesce_sem); + kgnilnd_shutdown(ni); + RETURN(rc); +} + +void +kgnilnd_shutdown(lnet_ni_t *ni) +{ + kgn_net_t *net = ni->ni_data; + int i; + int rc; + ENTRY; + + CFS_RACE(CFS_FAIL_GNI_SR_DOWN_RACE); + + LASSERTF(kgnilnd_data.kgn_init == GNILND_INIT_ALL, + "init %d\n", kgnilnd_data.kgn_init); + + /* Serialize with startup. 
*/ + down(&kgnilnd_data.kgn_quiesce_sem); + CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n", + atomic_read(&libcfs_kmemory)); + + if (net == NULL) { + CERROR("got NULL net for ni %p\n", ni); + rc = -EINVAL; + GOTO(out, rc); + } + + LASSERTF(ni == net->gnn_ni, + "ni %p gnn_ni %p\n", net, net->gnn_ni); + + ni->ni_data = NULL; + + LASSERT(!net->gnn_shutdown); + LASSERTF(atomic_read(&net->gnn_refcount) != 0, + "net %p refcount %d\n", + net, atomic_read(&net->gnn_refcount)); + + if (!list_empty(&net->gnn_list)) { + /* serialize with peer creation */ + down_write(&kgnilnd_data.kgn_net_rw_sem); + net->gnn_shutdown = 1; + up_write(&kgnilnd_data.kgn_net_rw_sem); + + kgnilnd_cancel_net_dgrams(net); + + kgnilnd_del_conn_or_peer(net, LNET_NID_ANY, GNILND_DEL_PEER, -ESHUTDOWN); + + /* if we are quiesced, need to wake up - we need those threads + * alive to release peers, etc */ + if (GNILND_IS_QUIESCED) { + set_mb(kgnilnd_data.kgn_quiesce_trigger, GNILND_QUIESCE_IDLE); + kgnilnd_quiesce_wait("shutdown"); + } + + kgnilnd_wait_for_canceled_dgrams(net->gnn_dev); + + /* We wait until the nets ref's are 1, we will release final ref which is ours + * this allows us to make sure everything else is done before we free the + * net. + */ + i = 4; + while (atomic_read(&net->gnn_refcount) != 1) { + i++; + CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, + "Waiting for %d references to clear on net %d\n", + atomic_read(&net->gnn_refcount), + net->gnn_netnum); + cfs_pause(cfs_time_seconds(1)); + } + + /* release ref from kgnilnd_startup */ + kgnilnd_net_decref(net); + /* serialize with reaper and conn_task looping */ + down_write(&kgnilnd_data.kgn_net_rw_sem); + list_del_init(&net->gnn_list); + up_write(&kgnilnd_data.kgn_net_rw_sem); + + } + + /* not locking, this can't race with writers */ + LASSERTF(atomic_read(&net->gnn_refcount) == 0, + "net %p refcount %d\n", + net, atomic_read(&net->gnn_refcount)); + LIBCFS_FREE(net, sizeof(*net)); + +out: + down_read(&kgnilnd_data.kgn_net_rw_sem); + for (i = 0; i < *kgnilnd_tunables.kgn_net_hash_size; i++) { + if (!list_empty(&kgnilnd_data.kgn_nets[i])) { + up_read(&kgnilnd_data.kgn_net_rw_sem); + break; + } + + if (i == *kgnilnd_tunables.kgn_net_hash_size - 1) { + up_read(&kgnilnd_data.kgn_net_rw_sem); + kgnilnd_base_shutdown(); + } + } + CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n", + atomic_read(&libcfs_kmemory)); + + up(&kgnilnd_data.kgn_quiesce_sem); + EXIT; + return; +} + +void __exit +kgnilnd_module_fini(void) +{ + lnet_unregister_lnd(&the_kgnilnd); + kgnilnd_proc_fini(); + kgnilnd_remove_sysctl(); + kgnilnd_tunables_fini(); +} + +int __init +kgnilnd_module_init(void) +{ + int rc; + + rc = kgnilnd_tunables_init(); + if (rc != 0) + return rc; + + printk(KERN_INFO "Lustre: kgnilnd build version: "KGNILND_BUILD_REV"\n"); + + kgnilnd_insert_sysctl(); + kgnilnd_proc_init(); + + lnet_register_lnd(&the_kgnilnd); + + return 0; +} + +MODULE_AUTHOR("Cray, Inc. "); +MODULE_DESCRIPTION("Kernel Gemini LND v"KGNILND_BUILD_REV); +MODULE_LICENSE("GPL"); + +module_init(kgnilnd_module_init); +module_exit(kgnilnd_module_fini); diff --git a/lnet/klnds/gnilnd/gnilnd.h b/lnet/klnds/gnilnd/gnilnd.h new file mode 100644 index 0000000..de43728 --- /dev/null +++ b/lnet/klnds/gnilnd/gnilnd.h @@ -0,0 +1,1790 @@ +/* + * Copyright (C) 2004 Cluster File Systems, Inc. + * + * Copyright (C) 2009-2012 Cray, Inc. + * + * Derived from work by: Eric Barton + * Author: Nic Henke + * Author: James Shimek + * + * This file is part of Lustre, http://www.lustre.org. 
+ * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ +#ifndef _GNILND_GNILND_H_ +#define _GNILND_GNILND_H_ + +#define DEBUG_SUBSYSTEM S_LND + +#include +#include +#include +#include + +#include +#include "gnilnd_version.h" +#include "gnilnd_hss_ops.h" + +/* tunables determined at compile time */ +#define GNILND_MIN_TIMEOUT 5 /* minimum timeout interval (seconds) */ +#define GNILND_BASE_TIMEOUT 60 /* default sane timeout */ +#define GNILND_TO2KA(t) (((t)-1)/2) /* timeout -> keepalive interval */ +#define GNILND_MIN_RECONNECT_TO (GNILND_BASE_TIMEOUT/4) +#define GNILND_MAX_RECONNECT_TO GNILND_BASE_TIMEOUT +#define GNILND_HARDWARE_TIMEOUT 15 /* maximum time for data to travel between nodes */ +#define GNILND_MDD_TIMEOUT 15 /* MDD hold timeout in minutes */ + +/* reaper thread wakup interval */ +#define GNILND_REAPER_THREAD_WAKE 1 +/* reaper thread checks each conn NCHECKS time every kgnilnd_data.kgn_new_min_timeout */ +#define GNILND_REAPER_NCHECKS 4 + +/* fixed constants */ +#define GNILND_MAXDEVS 1 /* max # of GNI devices currently supported */ +#define GNILND_MBOX_CREDITS 256 /* number of credits per mailbox */ +#define GNILND_COOKIE 0xa3579 /* cookie used by along with ptag by GNI */ + +/* checksum values */ +#define GNILND_CHECKSUM_OFF 0 /* checksum turned off */ +#define GNILND_CHECKSUM_SMSG_HEADER 1 /* Only checksum SMSG header */ +#define GNILND_CHECKSUM_SMSG 2 /* checksum entire SMSG packet */ +#define GNILND_CHECKSUM_SMSG_BTE 3 /* Full checksum support */ + +/* tune down some COMPUTE options as they won't see the same number of connections and + * don't need the throughput of multiple threads by default */ +#if defined(CONFIG_CRAY_COMPUTE) +#define GNILND_SCHED_THREADS 1 /* default # of kgnilnd_scheduler threads */ +#define GNILND_FMABLK 64 /* default number of mboxes per fmablk */ +#else +#define GNILND_SCHED_THREADS 3 /* default # of kgnilnd_scheduler threads */ +#define GNILND_FMABLK 1024 /* default number of mboxes per fmablk */ +#endif + +/* EXTRA_BITS are there to allow us to hide NOOP/CLOSE and anything else out of band */ +#define GNILND_EXTRA_BITS 1 +/* maximum number of conns & bits for cqid in the SMSG event data */ +#define GNILND_CQID_NBITS (21 - GNILND_EXTRA_BITS) +#define GNILND_MSGID_TX_NBITS (32 - GNILND_CQID_NBITS) +#define GNILND_MAX_CQID (1 << GNILND_CQID_NBITS) +#define GNILND_MAX_MSG_ID (1 << GNILND_MSGID_TX_NBITS) +#define GNILND_MAX_MSG_SIZE (*kgnilnd_tunables.kgn_max_immediate + sizeof(kgn_msg_t)) + +/* need sane upper bound to limit copy overhead */ +#define GNILND_MAX_IMMEDIATE (64<<10) + +/* payload size to add to the base mailbox size + * This is subtracting 2 from the concurrent_sends as 4 messages are included in the size + * gni_smsg_buff_size_needed calculates, the MAX_PAYLOAD is added to + * the calculation return from that function.*/ +#define GNILND_MBOX_PAYLOAD \ + (GNILND_MAX_MSG_SIZE * \ + ((*kgnilnd_tunables.kgn_concurrent_sends - 2) * 2)); + +/* timeout -> deadman 
timer for kgni mdd holds */ +#define GNILND_TIMEOUT2DEADMAN ((*kgnilnd_tunables.kgn_mdd_timeout) * 1000 * 60) + +/* timeout for failing sends in t is in jiffies*/ +#define GNILND_TIMEOUTRX(t) (t + cfs_time_seconds(*kgnilnd_tunables.kgn_hardware_timeout)) + +/* time when to release from purgatory in the reaper thread in jiffies */ +#define GNILND_PURG_RELEASE(t) (GNILND_TIMEOUTRX(t) * 3) + +/* Macro for finding last_rx 2 datapoints are compared + * and the most recent one in jiffies is returned. + */ +#define GNILND_LASTRX(conn) (time_after(conn->gnc_last_rx, conn->gnc_last_rx_cq) \ + ? conn->gnc_last_rx : conn->gnc_last_rx_cq) + +/************************************************************************ + * Enum, flag and tag data + */ +#define GNILND_INIT_NOTHING 0 +#define GNILND_INIT_DATA 1 +#define GNILND_INIT_ALL 2 + +/* If you change the ordering away from MAPPED = UNMAPPED + 1, things break */ +#define GNILND_BUF_NONE 0 /* buffer type not set */ +#define GNILND_BUF_IMMEDIATE 1 /* immediate data */ +#define GNILND_BUF_IMMEDIATE_KIOV 2 /* immediate data */ +#define GNILND_BUF_PHYS_UNMAPPED 3 /* physical: not mapped yet */ +#define GNILND_BUF_PHYS_MAPPED 4 /* physical: mapped already */ +#define GNILND_BUF_VIRT_UNMAPPED 5 /* virtual: not mapped yet */ +#define GNILND_BUF_VIRT_MAPPED 6 /* virtual: mapped already */ + +#define GNILND_TX_WAITING_REPLY (1<<1) /* expecting to receive reply */ +#define GNILND_TX_WAITING_COMPLETION (1<<2) /* waiting for smsg_send to complete */ +#define GNILND_TX_PENDING_RDMA (1<<3) /* RDMA transaction pending until we get prev. completion */ +#define GNILND_TX_QUIET_ERROR (1<<4) /* don't print error on tx_done */ +#define GNILND_TX_FAIL_SMSG (1<<5) /* pass down error injection for SMSG fail */ + +/* stash above max CQID to avoid any collision */ +#define GNILND_MSGID_NOOP (GNILND_MAX_CQID + 128) +#define GNILND_MSGID_CLOSE (GNILND_MSGID_NOOP + 1) + +/* kgn_msg_t::gnm_type */ +#define GNILND_MSG_NONE 0x00 /* illegal message */ +#define GNILND_MSG_NOOP 0x01 /* empty gnm_u (keepalive) */ +#define GNILND_MSG_IMMEDIATE 0x02 /* gnm_u.immediate */ +#define GNILND_MSG_PUT_REQ 0x03 /* gnm_u.putreq (src->sink) */ +#define GNILND_MSG_PUT_NAK 0x04 /* gnm_u.completion (no PUT match: sink->src) */ +#define GNILND_MSG_PUT_ACK 0x05 /* gnm_u.putack (PUT matched: sink->src) */ +#define GNILND_MSG_PUT_DONE 0x06 /* gnm_u.completion (src->sink) */ +#define GNILND_MSG_GET_REQ 0x07 /* gnm_u.get (sink->src) */ +#define GNILND_MSG_GET_NAK 0x08 /* gnm_u.completion (no GET match: src->sink) */ +#define GNILND_MSG_GET_DONE 0x09 /* gnm_u.completion (src->sink) */ +#define GNILND_MSG_CLOSE 0x0a /* empty gnm_u */ + +/* defines for gnc_*scheduled states */ +#define GNILND_CONN_IDLE 0 +#define GNILND_CONN_SCHED 1 +#define GNILND_CONN_WANTS_SCHED 2 +#define GNILND_CONN_PROCESS 3 + +#define GNILND_DEV_IDLE 0 +#define GNILND_DEV_IRQ 1 +#define GNILND_DEV_LOOP 2 + +#define GNILND_DGRAM_IDLE 0 +#define GNILND_DGRAM_SCHED 1 +#define GNILND_DGRAM_PROCESS 2 + +#define GNILND_PEER_IDLE 0 +#define GNILND_PEER_CONNECT 1 +#define GNILND_PEER_POSTING 2 +#define GNILND_PEER_POSTED 3 +#define GNILND_PEER_NEEDS_DEATH 4 +#define GNILND_PEER_KILL 5 + +/* for gnc_close_recvd */ +#define GNILND_CLOSE_RX 1 +#define GNILND_CLOSE_INJECT1 2 +#define GNILND_CLOSE_INJECT2 3 +#define GNILND_CLOSE_EARLY 4 + +/* defines for why quiesce trigger set */ +#define GNILND_QUIESCE_IDLE 0 +#define GNILND_QUIESCE_ADMIN 1 +#define GNILND_QUIESCE_RESET 2 +#define GNILND_QUIESCE_HW_QUIESCE 3 + +#define GNILND_PEER_CLEAN 0 +#define 
GNILND_PEER_PERSISTING 1 + +#define GNILND_DEL_CONN 0 +#define GNILND_DEL_PEER 1 +#define GNILND_CLEAR_PURGATORY 2 + +typedef enum kgn_fmablk_state { + GNILND_FMABLK_IDLE = 0, /* is allocated or ready to be freed */ + GNILND_FMABLK_PHYS, /* allocated out of slab of physical memory */ + GNILND_FMABLK_VIRT, /* 'standard' vmalloc hunk */ + GNILND_FMABLK_FREED, /* after free */ +} kgn_fmablk_state_t; + +typedef enum kgn_tx_list_state { + GNILND_TX_IDLE = 0, /* TX is on the idle list, kgn_idle_txs */ + GNILND_TX_ALLOCD, /* TX has been alloced (off of idle), could be in any state transition */ + GNILND_TX_PEERQ, /* TX on peer->gnp_tx_queue (no live conn) */ + GNILND_TX_MAPQ, /* TX on dev:gnd_map_tx for buffer mapping */ + GNILND_TX_FMAQ, /* TX waiting to be send on conn FMA */ + GNILND_TX_LIVE_FMAQ, /* TX live on the FMA wire, waiting for completion or reply */ + GNILND_TX_RDMAQ, /* TX waiting to send FMA confirmation to auth RDMA PUT */ + GNILND_TX_LIVE_RDMAQ, /* TX live on the RDMA wire, waiting for completion */ + GNILND_TX_DYING, /* TX got caught on MAPQ or RDMAQ while conn was closing, needs someone to call tx_done */ + GNILND_TX_FREED /* TX is free! */ +} kgn_tx_list_state_t; + +typedef enum kgn_conn_state { + /* don't start @ 0 - prevent memset(0) badness */ + GNILND_CONN_DUMMY = 0, + GNILND_CONN_LISTEN, + GNILND_CONN_CONNECTING, + GNILND_CONN_ESTABLISHED, + GNILND_CONN_CLOSING, + GNILND_CONN_CLOSED, + GNILND_CONN_DONE, + GNILND_CONN_DESTROY_EP +} kgn_conn_state_t; + +/* changing these requires a change to GNILND_CONNREQ_VERSION and + * will result in dropped packets instead of NAKs. Adding to this is + * acceptable without changing the CONNREQ_VERSION, but code should + * be ready to handle NAKs on version mismatch */ +typedef enum kgn_connreq_type { + GNILND_CONNREQ_REQ = 1, /* how YOU doin' ? */ + GNILND_CONNREQ_NAK, /* NO soup for you! */ + GNILND_CONNREQ_CLOSE, /* we should see other people */ +} kgn_connreq_type_t; + +typedef enum kgn_dgram_state { + /* don't use 0 to avoid thinking a memset of zero is valid data */ + GNILND_DGRAM_USED = 1, + GNILND_DGRAM_POSTING, + GNILND_DGRAM_POSTED, + GNILND_DGRAM_PROCESSING, + GNILND_DGRAM_CANCELED, + GNILND_DGRAM_DONE, +} kgn_dgram_state_t; + +typedef enum kgn_dgram_type { + GNILND_DGRAM_REQ = 1, /* how YOU doin' ? */ + GNILND_DGRAM_WC_REQ, /* you talkin' to ME? */ + GNILND_DGRAM_NAK, /* NO soup for you! */ + GNILND_DGRAM_CLOSE, /* we should see other people */ +} kgn_dgram_type_t; + +/************************************************************************ + * Wire message structs. These are sent in sender's byte order + * (i.e. receiver checks magic and flips if required). + */ + +#define GNILND_MSG_MAGIC LNET_PROTO_GNI_MAGIC /* unique magic */ +#define GNILND_DGRAM_MAGIC 0x0DDBA11 + +/* kgn_msg_t - FMA/SMSG wire struct + v2: + * - added checksum to FMA + * moved seq before paylod + * WIRE_ATTR added for alignment + v3: + * added gnm_payload_len for FMA payload size + v4: + * added gncm_retval to completion, allowing return code transmission + on RDMA NAKs + v5: + * changed how CQID and TX ids are assigned + v6: + * added retval on CLOSE + v7: + * added payload checksumming + v8: + * reworked checksumming a bit, changed payload checksums +*/ +#define GNILND_MSG_VERSION 8 +/* kgn_connreq_t connection request datagram wire struct + v2: + * added NAKs +*/ + +#define GNILND_CONNREQ_VERSION 2 + +typedef struct kgn_gniparams { + __u32 gnpr_host_id; /* ph. 
host ID of the NIC */ + __u32 gnpr_cqid; /* cqid I want peer to use when sending events to me */ + gni_smsg_attr_t gnpr_smsg_attr; /* my short msg. attributes */ +} WIRE_ATTR kgn_gniparams_t; + +typedef struct kgn_nak_data { + __s32 gnnd_errno; /* errno reason for NAK */ + +} WIRE_ATTR kgn_nak_data_t; + +/* the first bits of the connreq struct CANNOT CHANGE FORM EVER + * without breaking the ability for us to properly NAK someone */ +typedef struct kgn_connreq { /* connection request/response */ + __u32 gncr_magic; /* I'm an gnilnd connreq */ + __u32 gncr_cksum; /* checksum (0 == disabled) */ + __u16 gncr_type; /* REQ, NAK, etc */ + __u16 gncr_version; /* this is my version number */ + __u32 gncr_timeout; /* sender's timeout */ + __u64 gncr_srcnid; /* sender's NID */ + __u64 gncr_dstnid; /* who sender expects to listen */ + __u64 gncr_peerstamp; /* sender's instance stamp */ + __u64 gncr_connstamp; /* sender's connection stamp */ + + /* everything before this needs to stay static, adding after should + * result in a change to GNILND_CONNREQ_VERSION */ + + union { + kgn_gniparams_t gncr_gnparams; /* sender's endpoint info */ + kgn_nak_data_t gncr_nakdata; /* data (rc, etc) for NAK */ + }; +} WIRE_ATTR kgn_connreq_t; + +typedef struct { + gni_mem_handle_t gnrd_key; + __u64 gnrd_addr; + __u32 gnrd_nob; +} WIRE_ATTR kgn_rdma_desc_t; + +typedef struct { + lnet_hdr_t gnim_hdr; /* LNet header */ + /* LNet payload is in FMA "Message Data" */ +} WIRE_ATTR kgn_immediate_msg_t; + +typedef struct { + lnet_hdr_t gnprm_hdr; /* LNet header */ + __u64 gnprm_cookie; /* opaque completion cookie */ +} WIRE_ATTR kgn_putreq_msg_t; + +typedef struct { + __u64 gnpam_src_cookie; /* reflected completion cookie */ + __u64 gnpam_dst_cookie; /* opaque completion cookie */ + kgn_rdma_desc_t gnpam_desc; /* sender's sink buffer */ +} WIRE_ATTR kgn_putack_msg_t; + +typedef struct { + lnet_hdr_t gngm_hdr; /* LNet header */ + __u64 gngm_cookie; /* opaque completion cookie */ + kgn_rdma_desc_t gngm_desc; /* sender's sink buffer */ +} WIRE_ATTR kgn_get_msg_t; + +typedef struct { + int gncm_retval; /* error on NAK, size on REQ */ + __u64 gncm_cookie; /* reflected completion cookie */ +} WIRE_ATTR kgn_completion_msg_t; + +typedef struct { /* NB must fit in FMA "Prefix" */ + __u32 gnm_magic; /* I'm an gni message */ + __u16 gnm_version; /* this is my version number */ + __u16 gnm_type; /* msg type */ + __u64 gnm_srcnid; /* sender's NID */ + __u64 gnm_connstamp; /* sender's connection stamp */ + __u32 gnm_seq; /* incrementing sequence number */ + __u16 gnm_cksum; /* checksum (0 == no checksum ) */ + __u16 gnm_payload_cksum; /* payload checksum (0 == no checksum ) */ + __u32 gnm_payload_len; /* size of the FMA payload sent */ + union { + kgn_immediate_msg_t immediate; + kgn_putreq_msg_t putreq; + kgn_putack_msg_t putack; + kgn_get_msg_t get; + kgn_completion_msg_t completion; + } gnm_u; +} WIRE_ATTR kgn_msg_t; + +/************************************************************************ + * runtime tunable data + */ + +typedef struct kgn_tunables { + int *kgn_min_reconnect_interval; /* connreq starting timeout & retransmit interval */ + int *kgn_max_reconnect_interval; /* ...exponentially increasing to this */ + int *kgn_credits; /* # concurrent sends */ + int *kgn_fma_cq_size; /* # entries in receive CQ */ + int *kgn_peer_credits; /* # LNet peer credits */ + int *kgn_concurrent_sends; /* max # of max_immediate in mbox */ + int *kgn_timeout; /* comms timeout (seconds) */ + int *kgn_max_immediate; /* immediate payload breakpoint */ 
+ int *kgn_checksum; /* checksum data */ + int *kgn_checksum_dump; /* dump raw data to D_INFO log when checksumming */ + int *kgn_bte_hash; /* hashing on BTE transfers */ + int *kgn_bte_adapt; /* adaptive routing on BTE transfers */ + int *kgn_bte_relaxed_ordering; /* relaxed ordering (PASSPW) on BTE transfers */ + int *kgn_ptag; /* PTAG for cdm_create */ + int *kgn_max_retransmits; /* max number of FMA retransmits */ + int *kgn_nwildcard; /* # wildcard per net to post */ + int *kgn_nice; /* nice value for kgnilnd threads */ + int *kgn_rdmaq_intervals; /* # intervals per second for rdmaq throttle */ + int *kgn_loops; /* # of loops sched does before flush/heartbeat tickle */ + int *kgn_peer_hash_size; /* size of kgn_peers */ + int *kgn_peer_health; /* enable/disable peer health */ + int *kgn_vmap_cksum; /* enable/disable vmap of kiov checksums */ + int *kgn_mbox_per_block; /* mailboxes per fmablk */ + int *kgn_nphys_mbox; /* # mailboxes to preallocate with physical memory */ + int *kgn_mbox_credits; /* max credits per fma */ + int *kgn_sched_threads; /* number of kgnilnd_scheduler threads */ + int *kgn_net_hash_size; /* size of kgn_net_ht */ + int *kgn_hardware_timeout; /* max time for a message to get across the network */ + int *kgn_mdd_timeout; /* max time for ghal to hold an mdd in minutes */ +#if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM + cfs_sysctl_table_header_t *kgn_sysctl; /* sysctl interface */ +#endif +} kgn_tunables_t; + +typedef struct kgn_mbox_info { + lnet_nid_t mbx_prev_nid; + unsigned long mbx_create_conn_memset; + unsigned long mbx_add_purgatory; + unsigned long mbx_detach_of_purgatory; + unsigned long mbx_release_from_purgatory; + unsigned long mbx_release_purg_active_dgram; +} kgn_mbox_info_t; + +typedef struct kgn_fma_memblock { + struct list_head gnm_bufflist; /* memblock is part of device's gnd_fma_buffs */ + kgn_fmablk_state_t gnm_state; /* how this memory allocated & state of it */ + int gnm_hold_timeout; /* hold_timeout if used at unmap time */ + int gnm_num_mboxs; /* total mboxes allocated */ + int gnm_avail_mboxs; /* number of available mailboxes in the block */ + int gnm_held_mboxs; /* number of purgatory held mailboxes */ + int gnm_mbox_size; /* size of the single mailbox */ + int gnm_next_avail_mbox; /* next available mailbox */ + long gnm_max_timeout; /* max timeout for possible purgatory hold */ + unsigned int gnm_blk_size; /* how big is our hunk o memory ? */ + void *gnm_block; /* pointer to mem. block */ + gni_mem_handle_t gnm_hndl; /* mem. handle of the block */ + unsigned long *gnm_bit_array; /* bit array tracking allocation of mailboxes */ + kgn_mbox_info_t *gnm_mbox_info; /* array of mbox_information about each mbox */ +} kgn_fma_memblock_t; + +typedef struct kgn_device { + gni_nic_handle_t gnd_handle; /* device handle */ + gni_cdm_handle_t gnd_domain; /* GNI communication domain */ + gni_err_handle_t gnd_err_handle; /* device error handle */ + unsigned long gnd_sched_alive; /* scheduler thread alive stamp */ + gni_cq_handle_t gnd_rcv_fma_cqh; /* FMA rcv. completion queue handle */ + gni_cq_handle_t gnd_snd_rdma_cqh; /* rdma send completion queue handle */ + gni_cq_handle_t gnd_snd_fma_cqh; /* rdma send completion queue handle */ + struct mutex gnd_cq_mutex; /* CQ access serialization */ + __u32 gnd_host_id; /* ph. 
host ID of the NIC */ + int gnd_id; /* device id, also index in kgn_devices */ + __u32 gnd_nid; /* ph host ID translated to NID */ + struct list_head gnd_fma_buffs; /* list of FMA memory blocks */ + struct semaphore gnd_fmablk_sem; /* semaphore for FMA block memory alloc/free */ + spinlock_t gnd_fmablk_lock; /* lock for mbox alloc/release */ + atomic_t gnd_nfmablk; /* # of fmablk live */ + atomic_t gnd_fmablk_vers; /* gnd_fma_bufs stamp */ + atomic_t gnd_neps; /* # EP allocated to conns */ + short gnd_ready; /* stuff to do in scheduler thread */ + struct list_head gnd_ready_conns; /* connections ready to tx/rx */ + struct list_head gnd_map_tx; /* TX: needing buffer mapping */ + wait_queue_head_t gnd_waitq; /* scheduler wakeup */ + spinlock_t gnd_lock; /* serialise gnd_ready_conns */ + struct list_head gnd_connd_peers; /* peers waiting for a connection */ + spinlock_t gnd_connd_lock; /* serialise connd_peers */ + wait_queue_head_t gnd_dgram_waitq; /* dgram_mover thread wakeup */ + wait_queue_head_t gnd_dgping_waitq; /* dgram thread ping-pong */ + int gnd_dgram_ready; /* dgrams need movin' */ + struct list_head *gnd_dgrams; /* nid hash to dgrams */ + atomic_t gnd_ndgrams; /* # dgrams extant */ + spinlock_t gnd_dgram_lock; /* serialize gnd_dgrams */ + struct list_head gnd_map_list; /* list of all mapped regions */ + int gnd_map_version; /* version flag for map list */ + atomic_t gnd_n_mdd; /* number of total MDD - fma, tx, etc */ + atomic_t gnd_n_mdd_held; /* number of total MDD held - fma, tx, etc */ + atomic_t gnd_nq_map; /* # queued waiting for mapping (MDD/GART) */ + atomic64_t gnd_nbytes_map; /* bytes of total GART maps - fma, tx, etc */ + __u32 gnd_map_nphys; /* # TX phys mappings */ + __u32 gnd_map_physnop; /* # TX phys pages mapped */ + __u32 gnd_map_nvirt; /* # TX virt mappings */ + __u64 gnd_map_virtnob; /* # TX virt bytes mapped */ + spinlock_t gnd_map_lock; /* serialize gnd_map_XXX */ + struct list_head gnd_rdmaq; /* RDMA to be sent */ + spinlock_t gnd_rdmaq_lock; /* play nice with others */ + atomic64_t gnd_rdmaq_bytes_out; /* # bytes authorized */ + atomic64_t gnd_rdmaq_bytes_ok; /* # bytes allowed until deadline */ + atomic_t gnd_rdmaq_nstalls; /* # stalls due to throttle */ + unsigned long gnd_rdmaq_deadline; /* when does bucket roll over ? 
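+ * (presumably rolled over when gnd_rdmaq_timer just below, armed in
+ * kgnilnd_base_startup() with kgnilnd_schedule_device_timer, kicks
+ * the scheduler; kgn_rdmaq_intervals sets how many buckets per second)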
*/ + struct timer_list gnd_rdmaq_timer; /* wakey-wakey */ + atomic_t gnd_short_ntx; /* TX stats: short messages */ + atomic64_t gnd_short_txbytes; /* TX stats: short message payload*/ + atomic_t gnd_rdma_ntx; /* TX stats: rdma messages */ + atomic64_t gnd_rdma_txbytes; /* TX stats: rdma message payload*/ + atomic_t gnd_short_nrx; /* RX stats: short messages */ + atomic64_t gnd_short_rxbytes; /* RX stats: short message payload*/ + atomic_t gnd_rdma_nrx; /* RX stats: rdma messages */ + atomic64_t gnd_rdma_rxbytes; /* RX stats: rdma message payload*/ + atomic_t gnd_fast_try; /* # of times fast send tried */ + atomic_t gnd_fast_ok; /* # of times fast send ok */ + atomic_t gnd_fast_block; /* # of times fast send blocked */ + unsigned long gnd_mutex_delay; + atomic_t gnd_n_yield; + atomic_t gnd_n_schedule; + atomic_t gnd_canceled_dgrams; /* # of outstanding cancels */ +} kgn_device_t; + +typedef struct kgn_net { + struct list_head gnn_list; /* chain on kgni_data::kgn_nets */ + kgn_device_t *gnn_dev; /* device for this net */ + lnet_ni_t *gnn_ni; /* network interface instance */ + atomic_t gnn_refcount; /* # current references */ + int gnn_shutdown; /* lnd_shutdown set */ + __u16 gnn_netnum; /* stash netnum for quicker lookup */ +} kgn_net_t; + +static inline lnet_nid_t +kgnilnd_lnd2lnetnid(lnet_nid_t ni_nid, lnet_nid_t kgnilnd_nid) +{ + return LNET_MKNID(LNET_NIDNET(ni_nid), LNET_NIDADDR(kgnilnd_nid)); +} + +static inline lnet_nid_t +kgnilnd_lnet2lndnid(lnet_nid_t lnet_nid, lnet_nid_t kgnilnd_nid) +{ + return LNET_MKNID(LNET_NIDNET(kgnilnd_nid), LNET_NIDADDR(lnet_nid)); +} + +/* The code for this is a bit ugly - but really this just boils down to a __u64 + * that can have various parts accessed separately. + * + * The lower 32 bits is the ID + * we give to SMSG for our completion event - it needs to be globally unique across + * all TX currently in flight. We separate that out into the CQID so that we can + * reference the connection (kgnilnd_cqid2conn_locked) and then the msg_id to pull + * the actual TX out of the per-connection gnc_tx_ref_table. + * + * The upper 32 bits are just extra stuff we put into the cookie to ensure this TX + * has a unique value we can send with RDMA setup messages to ensure the completion for + * those is unique across the wire. The extra 32 bits are there to ensure that TX id + * reuse is separated. + */ + +typedef struct kgn_tx_ev_id { + union { + __u64 txe_cookie; /* are you my mommy ? 
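+ * With GNILND_EXTRA_BITS at 1 this packs a 20 bit txe_cqid (up to
+ * GNILND_MAX_CQID = 1048576 connections) and a 12 bit txe_idx
+ * (GNILND_MAX_MSG_ID = 4096 ids per conn) into the SMSG half of the
+ * cookie, while txe_chips holds the reuse disambiguation bits
+ * described above.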
*/ + struct { + __u32 txe_chips; /* extra bits to ensure ID unique across reuse */ + union { + __u32 txe_smsg_id; /* ID for SMSG CQ event */ + /* N.B: Never ever ever ever use the bit shifts directly, + * you are just asking for a world of pain and are at the + * mercy of the compiler layouts */ + struct { + __u32 txe_cqid :GNILND_CQID_NBITS; + __u32 txe_idx :GNILND_MSGID_TX_NBITS; + }; + }; + }; + }; +} kgn_tx_ev_id_t; + +typedef struct kgn_dgram { + struct list_head gndg_list; /* on hash dev::gnd_dgrams */ + kgn_dgram_state_t gndg_state; /* state of this dgram */ + kgn_dgram_type_t gndg_type; /* REQ, NAK, etc */ + __u32 gndg_magic; /* saftey word */ + unsigned long gndg_post_time; /* time when we posted */ + struct kgn_conn *gndg_conn; /* unbound conn with ep & smsg */ + kgn_connreq_t gndg_conn_out; /* connreq from local node */ + kgn_connreq_t gndg_conn_in; /* connreq from remote node */ +} kgn_dgram_t; + +typedef struct kgn_tx { /* message descriptor */ + struct list_head tx_list; /* TX queues - peer, conn, rdma */ + kgn_tx_list_state_t tx_list_state;/* where in state machine is this TX ? */ + struct list_head *tx_list_p; /* pointer to current list */ + struct kgn_conn *tx_conn; /* owning conn */ + lnet_msg_t *tx_lntmsg[2]; /* ptl msgs to finalize on completion */ + unsigned long tx_qtime; /* when tx started to wait for something (jiffies) */ + unsigned long tx_cred_wait; /* time spend waiting for smsg creds */ + struct list_head tx_map_list; /* list entry on device map list */ + unsigned int tx_nob; /* # bytes of payload */ + int tx_buftype; /* payload buffer type */ + int tx_phys_npages; /* # physical pages */ + gni_mem_handle_t tx_map_key; /* mapping key */ + gni_mem_segment_t *tx_phys; /* page descriptors */ + kgn_msg_t tx_msg; /* FMA message buffer */ + kgn_tx_ev_id_t tx_id; /* who are you, who ? who ? 
*/ + __u8 tx_state; /* state of the descriptor */ + int tx_retrans; /* retrans count of RDMA */ + int tx_rc; /* if we need to stash the ret code until we see completion */ + void *tx_buffer; /* source/sink buffer */ + union { + gni_post_descriptor_t tx_rdma_desc; /* rdma descriptor */ + struct page *tx_imm_pages[GNILND_MAX_IMMEDIATE/PAGE_SIZE]; /* page array to map kiov for immediate send */ + }; + + /* we only use one or the other */ + union { + kgn_putack_msg_t tx_putinfo; /* data for differed rdma & re-try */ + kgn_get_msg_t tx_getinfo; /* data for rdma re-try*/ + }; +} kgn_tx_t; + +typedef struct kgn_conn { + kgn_device_t *gnc_device; /* which device */ + struct kgn_peer *gnc_peer; /* owning peer */ + struct list_head gnc_list; /* stash on peer's conn list - or pending purgatory lists as we clear them */ + struct list_head gnc_hashlist; /* stash in connection hash table */ + struct list_head gnc_schedlist; /* schedule (on gnd_?_conns) for attention */ + struct list_head gnc_fmaq; /* txs queued for FMA */ + struct list_head gnc_mdd_list; /* hold list for MDD on hard conn reset */ + __u64 gnc_peerstamp; /* peer's unique stamp */ + __u64 gnc_peer_connstamp; /* peer's unique connection stamp */ + __u64 gnc_my_connstamp; /* my unique connection stamp */ + unsigned long gnc_first_rx; /* when I first received an FMA message (jiffies) */ + unsigned long gnc_last_tx; /* when I last sent an FMA message (jiffies) */ + unsigned long gnc_last_rx; /* when I last sent an FMA message (jiffies) */ + unsigned long gnc_last_tx_cq; /* when I last received an FMA CQ (jiffies) */ + unsigned long gnc_last_rx_cq; /* when I last received an FMA CQ (jiffies) */ + unsigned long gnc_last_noop_want; /* time I wanted to send NOOP */ + unsigned long gnc_last_noop_sent; /* time I did gni_smsg_send on NOOP */ + unsigned long gnc_last_noop_cq; /* time when NOOP completed */ + unsigned long gnc_last_sched_ask; /* time when conn added to ready_conns */ + unsigned long gnc_last_sched_do; /* time when conn processed from ready_conns */ + atomic_t gnc_reaper_noop; /* # reaper triggered NOOP */ + atomic_t gnc_sched_noop; /* # sched triggered NOOP */ + unsigned int gnc_timeout; /* infer peer death if no rx for this many seconds */ + __u32 gnc_cqid; /* my completion callback id (non-unique) */ + __u32 gnc_tx_seq; /* tx msg sequence number */ + __u32 gnc_rx_seq; /* rx msg sequence number */ + __u64 gnc_tx_retrans; /* # retrans on SMSG */ + atomic_t gnc_nlive_fma; /* # live FMA */ + atomic_t gnc_nq_rdma; /* # queued (on device) RDMA */ + atomic_t gnc_nlive_rdma; /* # live RDMA */ + short gnc_close_sent; /* I've sent CLOSE */ + short gnc_close_recvd; /* I've received CLOSE */ + short gnc_in_purgatory; /* in the sin bin */ + int gnc_error; /* errno when conn being closed due to error */ + int gnc_peer_error; /* errno peer sent us on CLOSE */ + kgn_conn_state_t gnc_state; /* connection state */ + int gnc_scheduled; /* being attented to */ + atomic_t gnc_refcount; /* # users */ + spinlock_t gnc_list_lock; /* serialise tx lists, max_rx_age */ + gni_ep_handle_t gnc_ephandle; /* GNI endpoint */ + kgn_fma_memblock_t *gnc_fma_blk; /* pointer to fma block for our mailbox */ + gni_smsg_attr_t gnpr_smsg_attr; /* my short msg. 
attributes */ + spinlock_t gnc_tx_lock; /* protect tx alloc/free */ + __u8 gnc_tx_bits[GNILND_MAX_MSG_ID/8]; /* bit table for tx id */ + int gnc_next_tx; /* next tx to use in tx_ref_table */ + kgn_tx_t **gnc_tx_ref_table; /* table of TX descriptors for this conn */ + int gnc_mbox_id; /* id of mbox in fma_blk */ + short gnc_needs_detach; /* flag set in detach_purgatory_all_locked so reaper will clear out purgatory */ + short gnc_needs_closing; /* flag set in del_conns when called from kgnilnd_del_peer_or_conn */ +} kgn_conn_t; + +typedef struct kgn_mdd_purgatory { + gni_mem_handle_t gmp_map_key; /* mapping key */ + struct list_head gmp_list; /* entry point for purgatory list */ +} kgn_mdd_purgatory_t; + +typedef struct kgn_peer { + struct list_head gnp_list; /* stash on global peer list */ + struct list_head gnp_connd_list; /* schedule on kgn_connd_peers */ + struct list_head gnp_conns; /* all active connections and all conns in purgatory for the peer */ + struct list_head gnp_tx_queue; /* msgs waiting for a conn */ + kgn_net_t *gnp_net; /* net instance for this peer */ + lnet_nid_t gnp_nid; /* who's on the other end(s) */ + atomic_t gnp_refcount; /* # users */ + __u32 gnp_host_id; /* ph. host ID of the peer */ + short gnp_connecting; /* connection forming */ + short gnp_pending_unlink; /* need last conn close to trigger unlink */ + int gnp_last_errno; /* last error conn saw */ + unsigned long gnp_last_alive; /* last time I had valid comms */ + int gnp_last_dgram_errno; /* last error dgrams saw */ + unsigned long gnp_last_dgram_time; /* last time I tried to connect */ + unsigned long gnp_reconnect_time; /* CURRENT_SECONDS when reconnect OK */ + unsigned long gnp_reconnect_interval; /* exponential backoff */ + atomic_t gnp_dirty_eps; /* # of old but yet to be destroyed EPs from conns */ +} kgn_peer_t; + +/* the kgn_rx_t is a struct for handing to LNET as the private pointer for things + * like lnet_parse. It allows a single pointer to let us get enough + * information in _recv and friends */ +typedef struct kgn_rx { + kgn_conn_t *grx_conn; /* connection */ + kgn_msg_t *grx_msg; /* message */ + lnet_msg_t *grx_lntmsg; /* lnet msg for this rx (eager only) */ + int grx_eager; /* if eager, we copied msg to somewhere */ + struct timespec grx_received; /* time this msg received */ +} kgn_rx_t; + +typedef struct kgn_data { + int kgn_init; /* initialisation state */ + int kgn_shutdown; /* shut down? */ + int kgn_wc_kill; /* Should I repost the WC */ + atomic_t kgn_nthreads; /* # live threads */ + int kgn_nresets; /* number of stack resets */ + int kgn_in_reset; /* are we in stack reset ? */ + + kgn_device_t kgn_devices[GNILND_MAXDEVS]; /* device/ptag/cq etc */ + int kgn_ndevs; /* # devices */ + + int kgn_ruhroh_running; /* ruhroh thread is running */ + int kgn_ruhroh_shutdown; /* ruhroh thread should or is shut down */ + wait_queue_head_t kgn_ruhroh_waitq; /* ruhroh thread wakeup */ + int kgn_quiesce_trigger; /* should we quiesce ? */ + atomic_t kgn_nquiesce; /* how many quiesced ? */ + struct semaphore kgn_quiesce_sem; /* serialize ruhroh task, startup and shutdown */ + int kgn_needs_reset; /* we need stack reset */ + + /* These next three members implement communication from gnilnd into + * the ruhroh task. To ensure correct operation of the task, code that + * writes into them must use memory barriers to ensure that the changes + * are visible to other cores in the order the members appear below. 
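+ * A minimal illustrative sketch of a conforming writer, assuming the
+ * usual kernel barrier helpers (the real update path lives elsewhere
+ * in gnilnd, and secs is just a stand-in value):
+ *
+ *     kgnilnd_data.kgn_quiesce_secs  = secs;
+ *     smp_wmb();
+ *     kgnilnd_data.kgn_bump_info_rdy = 1;
+ *     smp_wmb();
+ *     kgnilnd_data.kgn_needs_pause   = 1;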
*/ + __u32 kgn_quiesce_secs; /* seconds to bump timeouts */ + int kgn_bump_info_rdy; /* we have info needed to bump */ + int kgn_needs_pause; /* we need to pause for network quiesce */ + + struct list_head *kgn_nets; /* hashtable of kgn_net instances */ + struct rw_semaphore kgn_net_rw_sem; /* serialise gnn_shutdown, kgn_nets */ + + rwlock_t kgn_peer_conn_lock; /* stabilize peer/conn ops */ + struct list_head *kgn_peers; /* hash table of all my known peers */ + atomic_t kgn_npeers; /* # peers extant */ + int kgn_peer_version; /* version flag for peer tables */ + + struct list_head *kgn_conns; /* conns hashed by cqid */ + atomic_t kgn_nconns; /* # connections extant */ + __u64 kgn_peerstamp; /* when I started up */ + __u64 kgn_connstamp; /* conn stamp generator */ + int kgn_conn_version; /* version flag for conn tables */ + int kgn_next_cqid; /* cqid generator */ + + long kgn_new_min_timeout; /* minimum timeout on any new conn */ + wait_queue_head_t kgn_reaper_waitq; /* reaper sleeps here */ + spinlock_t kgn_reaper_lock; /* serialise */ + + cfs_mem_cache_t *kgn_rx_cache; /* rx descriptor space */ + cfs_mem_cache_t *kgn_tx_cache; /* tx descriptor memory */ + cfs_mem_cache_t *kgn_tx_phys_cache; /* tx phys descriptor memory */ + atomic_t kgn_ntx; /* # tx in use */ + cfs_mem_cache_t *kgn_dgram_cache; /* outgoing datagrams */ + + struct page ***kgn_cksum_map_pages; /* page arrays for mapping pages on checksum */ + __u64 kgn_cksum_npages; /* Number of pages allocated for checksumming */ + atomic_t kgn_nvmap_cksum; /* # times we vmapped for checksums */ + atomic_t kgn_nvmap_short; /* # times we vmapped for short kiov */ + + atomic_t kgn_nkmap_short; /* # time we kmapped for a short kiov */ + long kgn_rdmaq_override; /* bytes per second override */ + + struct kmem_cache *kgn_mbox_cache; /* mailboxes from not-GART */ + + atomic_t kgn_npending_unlink; /* # of peers pending unlink */ + atomic_t kgn_npending_conns; /* # of conns with pending closes */ + atomic_t kgn_npending_detach; /* # of conns with a pending detach */ + +} kgn_data_t; + +extern kgn_data_t kgnilnd_data; +extern kgn_tunables_t kgnilnd_tunables; + +extern void kgnilnd_destroy_peer(kgn_peer_t *peer); +extern void kgnilnd_destroy_conn(kgn_conn_t *conn); +extern void kgnilnd_schedule_conn(kgn_conn_t *conn); + +static inline int +kgnilnd_thread_start(int(*fn)(void *arg), void *arg, char *name, int id) +{ + struct task_struct *thrd = kthread_run(fn, arg, "%s_%02d", name, id); + if (IS_ERR(thrd)) + return PTR_ERR(thrd); + + atomic_inc(&kgnilnd_data.kgn_nthreads); + return 0; +} + +static inline void +kgnilnd_thread_fini(void) +{ + atomic_dec(&kgnilnd_data.kgn_nthreads); +} + +/* like mutex_trylock but with a jiffies spinner. This is to allow certain + * parts of the code to avoid a scheduler trip when the mutex is held + * + * Try to acquire the mutex atomically for 1 jiffie. Returns 1 if the mutex + * has been acquired successfully, and 0 on contention. + * + * NOTE: this function follows the spin_trylock() convention, so + * it is negated to the down_trylock() return values! Be careful + * about this when converting semaphore users to mutexes. + * + * This function must not be used in interrupt context. The + * mutex must be released by the same task that acquired it. 
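+ *
+ * Illustrative use (hypothetical caller and lock, not part of this patch):
+ *
+ *	struct mutex *lock = ...;
+ *
+ *	if (kgnilnd_mutex_trylock(lock)) {
+ *		... short critical section ...
+ *		mutex_unlock(lock);
+ *	} else {
+ *		... contended: back off and retry on a later pass ...
+ *	}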
+ */ +static inline int kgnilnd_mutex_trylock(struct mutex *lock) +{ + int ret; + unsigned long timeout; + + LASSERT(!in_interrupt()); + + for (timeout = jiffies + 1; time_before(jiffies, timeout);) { + + ret = mutex_trylock(lock); + if (ret) + return ret; + } + return 0; +} + +/* Copied from DEBUG_REQ in Lustre - the dance is needed to save stack space */ + +extern void +_kgnilnd_debug_msg(kgn_msg_t *msg, + struct libcfs_debug_msg_data *data, const char *fmt, ... ); + +#define kgnilnd_debug_msg(msgdata, mask, cdls, msg, fmt, a...) \ +do { \ + CFS_CHECK_STACK(msgdata, mask, cdls); \ + \ + if (((mask) & D_CANTMASK) != 0 || \ + ((libcfs_debug & (mask)) != 0 && \ + (libcfs_subsystem_debug & DEBUG_SUBSYSTEM) != 0)) \ + _kgnilnd_debug_msg((msg), msgdata, fmt, ##a); \ +} while(0) + +/* for most callers (level is a constant) this is resolved at compile time */ +#define GNIDBG_MSG(level, msg, fmt, args...) \ +do { \ + if ((level) & (D_ERROR | D_WARNING | D_NETERROR)) { \ + static cfs_debug_limit_state_t cdls; \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, level, &cdls); \ + kgnilnd_debug_msg(&msgdata, level, &cdls, msg, \ + "$$ "fmt" from %s ", ## args, \ + libcfs_nid2str((msg)->gnm_srcnid)); \ + } else { \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, level, NULL); \ + kgnilnd_debug_msg(&msgdata, level, NULL, msg, \ + "$$ "fmt" from %s ", ## args, \ + libcfs_nid2str((msg)->gnm_srcnid)); \ + } \ +} while (0) + +/* user puts 'to nid' in msg for us */ +#define GNIDBG_TOMSG(level, msg, fmt, args...) \ +do { \ + if ((level) & (D_ERROR | D_WARNING | D_NETERROR)) { \ + static cfs_debug_limit_state_t cdls; \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, level, &cdls); \ + kgnilnd_debug_msg(&msgdata, level, &cdls, msg, \ + "$$ "fmt" ", ## args); \ + } else { \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, level, NULL); \ + kgnilnd_debug_msg(&msgdata, level, NULL, msg, \ + "$$ "fmt" ", ## args); \ + } \ +} while (0) + +extern void +_kgnilnd_debug_conn(kgn_conn_t *conn, + struct libcfs_debug_msg_data *data, const char *fmt, ... ); + +#define kgnilnd_debug_conn(msgdata, mask, cdls, conn, fmt, a...) \ +do { \ + CFS_CHECK_STACK(msgdata, mask, cdls); \ + \ + if (((mask) & D_CANTMASK) != 0 || \ + ((libcfs_debug & (mask)) != 0 && \ + (libcfs_subsystem_debug & DEBUG_SUBSYSTEM) != 0)) \ + _kgnilnd_debug_conn((conn), msgdata, fmt, ##a); \ +} while(0) + +/* for most callers (level is a constant) this is resolved at compile time */ +#define GNIDBG_CONN(level, conn, fmt, args...) \ +do { \ + if ((level) & (D_ERROR | D_WARNING | D_NETERROR)) { \ + static cfs_debug_limit_state_t cdls; \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, level, &cdls); \ + kgnilnd_debug_conn(&msgdata, level, &cdls, conn, \ + "$$ "fmt" ", ## args); \ + } else { \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, level, NULL); \ + kgnilnd_debug_conn(&msgdata, level, NULL, conn, \ + "$$ "fmt" ", ## args); \ + } \ +} while (0) + +extern void +_kgnilnd_debug_tx(kgn_tx_t *tx, + struct libcfs_debug_msg_data *data, const char *fmt, ... ); + +#define kgnilnd_debug_tx(msgdata, mask, cdls, tx, fmt, a...) \ +do { \ + CFS_CHECK_STACK(msgdata, mask, cdls); \ + \ + if (((mask) & D_CANTMASK) != 0 || \ + ((libcfs_debug & (mask)) != 0 && \ + (libcfs_subsystem_debug & DEBUG_SUBSYSTEM) != 0)) \ + _kgnilnd_debug_tx((tx), msgdata, fmt, ##a); \ +} while(0) + +/* for most callers (level is a constant) this is resolved at compile time */ +#define GNIDBG_TX(level, tx, fmt, args...) 
\ +do { \ + if ((level) & (D_ERROR | D_WARNING | D_NETERROR)) { \ + static cfs_debug_limit_state_t cdls; \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, level, &cdls); \ + kgnilnd_debug_tx(&msgdata, level, &cdls, tx, \ + "$$ "fmt" ", ## args); \ + } else { \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, level, NULL); \ + kgnilnd_debug_tx(&msgdata, level, NULL, tx, \ + "$$ "fmt" ", ## args); \ + } \ +} while (0) + +#define GNITX_ASSERTF(tx, cond, fmt, a...) \ +({ \ + if (unlikely(!(cond))) { \ + GNIDBG_TX(D_EMERG, tx, "ASSERTION(" #cond ") failed:" fmt, a); \ + LBUG(); \ + } \ +}) + +#define GNILND_IS_QUIESCED \ + (atomic_read(&kgnilnd_data.kgn_nquiesce) == \ + atomic_read(&kgnilnd_data.kgn_nthreads)) + +#define KGNILND_SPIN_QUIESCE \ +do { \ + /* E.T phone home */ \ + atomic_inc(&kgnilnd_data.kgn_nquiesce); \ + CDEBUG(D_NET, "Waiting for thread pause to be over...\n"); \ + while (kgnilnd_data.kgn_quiesce_trigger) { \ + set_current_state(TASK_INTERRUPTIBLE); \ + cfs_schedule_timeout_and_set_state(TASK_INTERRUPTIBLE, \ + cfs_time_seconds(1)); \ + } \ + /* Mom, my homework is done */ \ + CDEBUG(D_NET, "Waking up from thread pause\n"); \ + atomic_dec(&kgnilnd_data.kgn_nquiesce); \ +} while(0) + +/* use macros for addref/decref to get the calling function name in the CDEBUG */ +#ifndef LIBCFS_DEBUG +#error "this code uses actions inside LASSERT for ref counting" +#endif + +#define kgnilnd_admin_addref(atomic) \ +do { \ + int val = atomic_inc_return(&atomic); \ + LASSERTF(val > 0, #atomic " refcount %d\n", val); \ + CDEBUG(D_NETTRACE, #atomic " refcount %d\n", val); \ +} while (0) + +#define kgnilnd_admin_decref(atomic) \ +do { \ + int val = atomic_dec_return(&atomic); \ + LASSERTF(val >=0, #atomic " refcount %d\n", val); \ + CDEBUG(D_NETTRACE, #atomic " refcount %d\n", val); \ +}while (0) + +#define kgnilnd_net_addref(net) \ +do { \ + int val = atomic_inc_return(&net->gnn_refcount); \ + LASSERTF(val > 1, "net %p refcount %d\n", net, val); \ + CDEBUG(D_NETTRACE, "net %p->%s++ (%d)\n", net, \ + libcfs_nid2str(net->gnn_ni->ni_nid), val); \ +} while (0) + +#define kgnilnd_net_decref(net) \ +do { \ + int val = atomic_dec_return(&net->gnn_refcount); \ + LASSERTF(val >= 0, "net %p refcount %d\n", net, val); \ + CDEBUG(D_NETTRACE, "net %p->%s-- (%d)\n", net, \ + libcfs_nid2str(net->gnn_ni->ni_nid), val); \ +} while (0) + +#define kgnilnd_peer_addref(peer) \ +do { \ + int val = atomic_inc_return(&peer->gnp_refcount); \ + LASSERTF(val > 1, "peer %p refcount %d\n", peer, val); \ + CDEBUG(D_NETTRACE, "peer %p->%s++ (%d)\n", peer, \ + libcfs_nid2str(peer->gnp_nid), val); \ +} while (0) + +#define kgnilnd_peer_decref(peer) \ +do { \ + int val = atomic_dec_return(&peer->gnp_refcount); \ + LASSERTF(val >= 0, "peer %p refcount %d\n", peer, val); \ + CDEBUG(D_NETTRACE, "peer %p->%s--(%d)\n", peer, \ + libcfs_nid2str(peer->gnp_nid), val); \ + if (atomic_read(&peer->gnp_refcount) == 0) \ + kgnilnd_destroy_peer(peer); \ +} while(0) + +#define kgnilnd_conn_addref(conn) \ +do { \ + int val; \ + \ + smp_wmb(); \ + val = atomic_inc_return(&conn->gnc_refcount); \ + LASSERTF(val >= 0, "conn %p refc %d to %s\n", \ + conn, val, \ + conn->gnc_peer \ + ? libcfs_nid2str(conn->gnc_peer->gnp_nid) \ + : ""); \ + CDEBUG(D_NETTRACE, "conn %p->%s++ (%d)\n", conn, \ + conn->gnc_peer \ + ? libcfs_nid2str(conn->gnc_peer->gnp_nid) \ + : "", \ + val); \ +} while (0) + +/* we hijack conn_decref && gnc_refcount = 1 to allow us to push the conn + * through the scheduler thread to get the EP destroyed. 
This avoids some + * messy semaphore business and allows us to reuse the connd_list and existing + * linkage and avoid creating extra lists just for destroying EPs */ + +/* Safety Disclaimer: + * Q: If we decrement the refcount and then check it again, is it possible that + * another caller could have passed through this macro concurrently? If so, + * then it is possible that both will attempt to call kgnilnd_destroy_conn(). + * + * A: Yes, entirely possible in most cases, but we can't get concurrent users + * once we are refcount <= 2. It hinges around gnc_state and membership of + * gnc_hashlist. There are two ways to find a connection - either ask for + * it from the peer, kgnilnd_find_conn_locked(peer) or from the CQ id, + * kgnilnd_cqid2conn_locked(id). While a conn is live, we'll have at least + * 4 refcounts + * + * - #1 from create (kgnilnd_create_conn) + * - #2 for EP (kgnilnd_create_conn) + * - #3 - living on peer (gnc_list, kgnilnd_finish_connect) + * - #4 living in global hash (gnc_hashlist, kgnilnd_finish_connect). + * + * Actually, only 3 live, as at the end of kgnilnd_finish_connect, we drop: + * - #1 - the ref the dgram inherited from kgnilnd_create_conn. + * + * There could be more from TX descriptors during the lifetime of a live + * conn. + * + * If we nuke the conn before finish_connect, we won't have parallel paths + * because nobody besides the dgram handler for the single outstanding + * dgram can find the connection as it isn't in any searchable tables yet. + * + * This leaves connection close, we'll drop 2 refs (#4 and #3) but only + * after calling kgnilnd_schedule_conn, which would add a new ref (#5). At + * this point gnc_refcount=2 (#2, #5). We have a 'maybe' send of the CLOSE + * now on the next scheduler loop, this could be #6 (schedule_conn again) + * and #7 (TX on gnc_fmaq). Both would be cleared quickly as that TX is + * sent. Now the gnc_state == CLOSED, so we hit + * kgnilnd_complete_closed_conn. At this point, nobody can 'find' this conn + * - we've nuked them from the peer and CQ id tables, so we own them and + * are guaranteed serial access - hence the complete lack of conn list + * locking in kgnilnd_complete_closed_conn. We are free then to mark the + * conn DESTROY_EP (add #6 for schedule_conn), then lose #5 in + * kgnilnd_process_conns. Then the next scheduler loop would call + * kgnilnd_destroy_conn_ep (drop #2 for EP) and lose #6 (refcount=0) in + * kgnilnd_process_conns. + * + * Clearly, we are totally safe. Clearly. + */ + +#define kgnilnd_conn_decref(conn) \ +do { \ + int val; \ + \ + smp_wmb(); \ + val = atomic_dec_return(&conn->gnc_refcount); \ + LASSERTF(val >= 0, "conn %p refc %d to %s\n", \ + conn, val, \ + conn->gnc_peer \ + ? libcfs_nid2str(conn->gnc_peer->gnp_nid) \ + : ""); \ + CDEBUG(D_NETTRACE, "conn %p->%s-- (%d)\n", conn, \ + conn->gnc_peer \ + ? 
libcfs_nid2str(conn->gnc_peer->gnp_nid) \ + : "", \ + val); \ + smp_rmb(); \ + if ((atomic_read(&conn->gnc_refcount) == 1) && \ + (conn->gnc_ephandle != NULL) && \ + (conn->gnc_state != GNILND_CONN_DESTROY_EP)) { \ + set_mb(conn->gnc_state, GNILND_CONN_DESTROY_EP); \ + kgnilnd_schedule_conn(conn); \ + } else if (atomic_read(&conn->gnc_refcount) == 0) { \ + kgnilnd_destroy_conn(conn); \ + } \ +} while (0) + +static inline struct list_head * +kgnilnd_nid2peerlist(lnet_nid_t nid) +{ + unsigned int hash = ((unsigned int)LNET_NIDADDR(nid)) % *kgnilnd_tunables.kgn_peer_hash_size; + + RETURN(&kgnilnd_data.kgn_peers[hash]); +} + +static inline struct list_head * +kgnilnd_netnum2netlist(__u16 netnum) +{ + unsigned int hash = ((unsigned int) netnum) % *kgnilnd_tunables.kgn_net_hash_size; + + RETURN(&kgnilnd_data.kgn_nets[hash]); +} + +static inline int +kgnilnd_peer_active(kgn_peer_t *peer) +{ + /* Am I in the peer hash table? */ + return (!list_empty(&peer->gnp_list)); +} + +/* need write_lock on kgn_peer_conn_lock */ +static inline int +kgnilnd_can_unlink_peer_locked(kgn_peer_t *peer) +{ + CDEBUG(D_NET, "peer 0x%p->%s conns? %d tx? %d\n", + peer, libcfs_nid2str(peer->gnp_nid), + !list_empty(&peer->gnp_conns), + !list_empty(&peer->gnp_tx_queue)); + + /* kgn_peer_conn_lock protects us from conflict with + * kgnilnd_peer_notify and gnp_persistent */ + RETURN ((list_empty(&peer->gnp_conns)) && + (list_empty(&peer->gnp_tx_queue))); +} + +/* returns positive if error was for a clean shutdown of conn */ +static inline int +kgnilnd_conn_clean_errno(int errno) +{ + /* - ESHUTDOWN - LND is unloading + * - EUCLEAN - admin requested via "lctl del_peer" + * - ENETRESET - admin requested via "lctl disconnect" + * - ENOTRECOVERABLE - stack reset + * - EISCONN - cleared via "lctl push" + * not doing ESTALE - that isn't clean */ + RETURN ((errno == 0) || + (errno == -ESHUTDOWN) || + (errno == -EUCLEAN) || + (errno == -ENETRESET) || + (errno == -EISCONN) || + (errno == -ENOTRECOVERABLE)); +} + +/* returns positive if error results in purgatory hold */ +static inline int +kgnilnd_check_purgatory_errno(int errno) +{ + /* We don't want to save the purgatory lists these cases: + * - EUCLEAN - admin requested via "lctl del_peer" + * - ESHUTDOWN - LND is unloading + */ + RETURN ((errno != -ESHUTDOWN) && + (errno != -EUCLEAN)); + +} + +/* returns positive if a purgatory hold is needed */ +static inline int +kgnilnd_check_purgatory_conn(kgn_conn_t *conn) +{ + int loopback = 0; + + if (conn->gnc_peer) { + loopback = conn->gnc_peer->gnp_nid == + conn->gnc_peer->gnp_net->gnn_ni->ni_nid; + } else { + /* short circuit - a conn that didn't complete + * setup never needs a purgatory hold */ + RETURN(0); + } + CDEBUG(D_NETTRACE, "conn 0x%p->%s loopback %d close_recvd %d\n", + conn, conn->gnc_peer ? + libcfs_nid2str(conn->gnc_peer->gnp_nid) : + "", + loopback, conn->gnc_close_recvd); + + /* we only use a purgatory hold if we've not received the CLOSE msg + * from our peer - without that message, we can't know the state of + * the other end of this connection and must put it into purgatory + * to prevent reuse and corruption. 
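+ * (e.g. a peer node that dies outright never sends CLOSE, so its conn is
+ *  held here; an admin "lctl del_peer" sets -EUCLEAN and, per
+ *  kgnilnd_check_purgatory_errno() above, skips the hold.)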
+ * The theory is that a TX error can be communicated in all other cases + */ + RETURN(likely(!loopback) && !conn->gnc_close_recvd && + kgnilnd_check_purgatory_errno(conn->gnc_error)); +} + +static inline const char * +kgnilnd_tx_state2str(kgn_tx_list_state_t state); + +static inline struct list_head * +kgnilnd_tx_state2list(kgn_peer_t *peer, kgn_conn_t *conn, + kgn_tx_list_state_t to_state) +{ + switch (to_state) { + case GNILND_TX_PEERQ: + return &peer->gnp_tx_queue; + case GNILND_TX_FMAQ: + return &conn->gnc_fmaq; + case GNILND_TX_LIVE_FMAQ: + case GNILND_TX_LIVE_RDMAQ: + case GNILND_TX_DYING: + return NULL; + case GNILND_TX_MAPQ: + return &conn->gnc_device->gnd_map_tx; + case GNILND_TX_RDMAQ: + return &conn->gnc_device->gnd_rdmaq; + default: + /* IDLE, FREED or ALLOCD is not valid "on list" state */ + CERROR("invalid state requested: %s\n", + kgnilnd_tx_state2str(to_state)); + LBUG(); + break; + } +} + +/* should hold tx, conn or peer lock when calling */ +static inline void +kgnilnd_tx_add_state_locked(kgn_tx_t *tx, kgn_peer_t *peer, + kgn_conn_t *conn, kgn_tx_list_state_t state, + int add_tail) +{ + struct list_head *list = NULL; + + /* make sure we have a sane TX state to start */ + GNITX_ASSERTF(tx, (tx->tx_list_p == NULL && + tx->tx_list_state == GNILND_TX_ALLOCD) && + list_empty(&tx->tx_list), + "bad state with tx_list %s", + list_empty(&tx->tx_list) ? "empty" : "not empty"); + + /* WTF - you are already on that state buttmunch */ + GNITX_ASSERTF(tx, state != tx->tx_list_state, + "already at %s", kgnilnd_tx_state2str(state)); + + /* get proper list from the state requested */ + list = kgnilnd_tx_state2list(peer, conn, state); + + /* add refcount */ + switch (state) { + case GNILND_TX_PEERQ: + kgnilnd_peer_addref(peer); + break; + case GNILND_TX_ALLOCD: + /* no refs needed */ + break; + case GNILND_TX_FMAQ: + kgnilnd_conn_addref(conn); + break; + case GNILND_TX_MAPQ: + atomic_inc(&conn->gnc_device->gnd_nq_map); + kgnilnd_conn_addref(conn); + break; + case GNILND_TX_LIVE_FMAQ: + atomic_inc(&conn->gnc_nlive_fma); + kgnilnd_conn_addref(conn); + break; + case GNILND_TX_LIVE_RDMAQ: + atomic_inc(&conn->gnc_nlive_rdma); + kgnilnd_conn_addref(conn); + break; + case GNILND_TX_RDMAQ: + atomic_inc(&conn->gnc_nq_rdma); + kgnilnd_conn_addref(conn); + break; + case GNILND_TX_DYING: + kgnilnd_conn_addref(conn); + break; + default: + CERROR("invalid state requested: %s\n", + kgnilnd_tx_state2str(state)); + LBUG(); + break;; + } + + /* if this changes, change kgnilnd_alloc_tx */ + tx->tx_list_state = state; + + /* some states don't have lists - we track them in the per conn + * TX table instead. Waste not, want not! 
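+ * (GNILND_TX_LIVE_FMAQ, GNILND_TX_LIVE_RDMAQ and GNILND_TX_DYING map to
+ *  NULL in kgnilnd_tx_state2list() above, so they take the dummy
+ *  tx_list_p branch below.)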
*/ + if (list != NULL) { + tx->tx_list_p = list; + if (add_tail) + list_add_tail(&tx->tx_list, list); + else + list_add(&tx->tx_list, list); + } else { + /* set dummy list_p to make book keeping happy and let debugging + * be a hair easier */ + tx->tx_list_p = (void *)state; + } + + GNIDBG_TX(D_NET, tx, "onto %s->0x%p", + kgnilnd_tx_state2str(state), list); +} + +static inline void +kgnilnd_tx_del_state_locked(kgn_tx_t *tx, kgn_peer_t *peer, + kgn_conn_t *conn, kgn_tx_list_state_t new_state) +{ + /* These is only 1 "off-list" state */ + GNITX_ASSERTF(tx, new_state == GNILND_TX_ALLOCD, + "invalid new_state %s", kgnilnd_tx_state2str(new_state)); + + /* new_state == ALLOCD means we are deallocating this tx, + * so make sure it was on a valid list to start with */ + GNITX_ASSERTF(tx, (tx->tx_list_p != NULL) && + (((tx->tx_list_state == GNILND_TX_LIVE_FMAQ) || + (tx->tx_list_state == GNILND_TX_LIVE_RDMAQ) || + (tx->tx_list_state == GNILND_TX_DYING)) == list_empty(&tx->tx_list)), + "bad state", NULL); + + GNIDBG_TX(D_NET, tx, "off %p", tx->tx_list_p); + + /* drop refcount */ + switch (tx->tx_list_state) { + case GNILND_TX_PEERQ: + kgnilnd_peer_decref(peer); + break; + case GNILND_TX_FREED: + case GNILND_TX_IDLE: + case GNILND_TX_ALLOCD: + /* no refs needed */ + break; + case GNILND_TX_DYING: + kgnilnd_conn_decref(conn); + break; + case GNILND_TX_FMAQ: + kgnilnd_conn_decref(conn); + break; + case GNILND_TX_MAPQ: + atomic_dec(&conn->gnc_device->gnd_nq_map); + kgnilnd_conn_decref(conn); + break; + case GNILND_TX_LIVE_FMAQ: + atomic_dec(&conn->gnc_nlive_fma); + kgnilnd_conn_decref(conn); + break; + case GNILND_TX_LIVE_RDMAQ: + atomic_dec(&conn->gnc_nlive_rdma); + kgnilnd_conn_decref(conn); + break; + case GNILND_TX_RDMAQ: + atomic_dec(&conn->gnc_nq_rdma); + kgnilnd_conn_decref(conn); + /* don't need to assert on default, already did in set */ + } + + /* for ALLOCD, this might already be true, but no harm doing it again */ + list_del_init(&tx->tx_list); + tx->tx_list_p = NULL; + tx->tx_list_state = new_state; +} + +static inline int +kgnilnd_tx_mapped(kgn_tx_t *tx) +{ + return (tx->tx_buftype == GNILND_BUF_VIRT_MAPPED || + tx->tx_buftype == GNILND_BUF_PHYS_MAPPED); +} + +static inline struct list_head * +kgnilnd_cqid2connlist(__u32 cqid) +{ + unsigned int hash = cqid % *kgnilnd_tunables.kgn_peer_hash_size; + + return (&kgnilnd_data.kgn_conns [hash]); +} + +static inline kgn_conn_t * +kgnilnd_cqid2conn_locked(__u32 cqid) +{ + struct list_head *conns = kgnilnd_cqid2connlist(cqid); + struct list_head *tmp; + kgn_conn_t *conn; + + list_for_each(tmp, conns) { + conn = list_entry(tmp, kgn_conn_t, gnc_hashlist); + + if (conn->gnc_cqid == cqid) + return conn; + } + + return NULL; +} + +/* returns 1..GNILND_MAX_CQID on success, 0 on failure */ +static inline __u32 +kgnilnd_get_cqid_locked(void) +{ + int looped = 0; + __u32 cqid; + + do { + cqid = kgnilnd_data.kgn_next_cqid++; + if (kgnilnd_data.kgn_next_cqid >= GNILND_MAX_CQID) { + if (looped) { + return 0; + } + kgnilnd_data.kgn_next_cqid = 1; + looped = 1; + } + } while (kgnilnd_cqid2conn_locked(cqid) != NULL); + + return cqid; +} + +static inline void +kgnilnd_validate_tx_ev_id(kgn_tx_ev_id_t *ev_id, kgn_tx_t **txp, kgn_conn_t **connp) +{ + kgn_tx_t *tx = NULL; + kgn_conn_t *conn = NULL; + + /* set to NULL so any early return is an error */ + *txp = NULL; + *connp = NULL; + + LASSERTF((ev_id->txe_idx > 0) && + (ev_id->txe_idx < GNILND_MAX_MSG_ID), + "bogus txe_idx %d >= %d\n", + ev_id->txe_idx, GNILND_MAX_MSG_ID); + + LASSERTF((ev_id->txe_cqid > 0) && + 
(ev_id->txe_cqid < GNILND_MAX_CQID), + "bogus txe_cqid %d >= %d\n", + ev_id->txe_cqid, GNILND_MAX_CQID); + + read_lock(&kgnilnd_data.kgn_peer_conn_lock); + conn = kgnilnd_cqid2conn_locked(ev_id->txe_cqid); + + if (conn == NULL) { + /* Conn was destroyed? */ + read_unlock(&kgnilnd_data.kgn_peer_conn_lock); + CDEBUG(D_NET, "CQID %d lookup failed\n", ev_id->txe_cqid); + return; + } + /* just insurance */ + kgnilnd_conn_addref(conn); + read_unlock(&kgnilnd_data.kgn_peer_conn_lock); + + /* we know this is safe - as the TX won't be reused until AFTER + * the conn is unlinked from the cqid hash, so we can use the TX + * (serializing to avoid any cache oddness) freely from the conn tx ref table */ + + spin_lock(&conn->gnc_tx_lock); + tx = conn->gnc_tx_ref_table[ev_id->txe_idx]; + spin_unlock(&conn->gnc_tx_lock); + + /* We could have a tx that was cleared out by other forces + * lctl disconnect or del_peer. */ + if (tx == NULL) { + CNETERR("txe_idx %d is gone, ignoring event\n", ev_id->txe_idx); + kgnilnd_conn_decref(conn); + return; + } + + /* check tx->tx_msg magic to make sure kgni didn't eat it */ + GNITX_ASSERTF(tx, tx->tx_msg.gnm_magic == GNILND_MSG_MAGIC, + "came back from kgni with bad magic %x", tx->tx_msg.gnm_magic); + + GNITX_ASSERTF(tx, tx->tx_id.txe_idx == ev_id->txe_idx, + "conn 0x%p->%s tx_ref_table hosed: wanted txe_idx %d " + "found tx %p txe_idx %d", + conn, libcfs_nid2str(conn->gnc_peer->gnp_nid), + ev_id->txe_idx, tx, tx->tx_id.txe_idx); + + GNITX_ASSERTF(tx, tx->tx_conn != NULL, "tx with NULL connection", NULL); + + GNITX_ASSERTF(tx, tx->tx_conn == conn, "tx conn does not equal conn", NULL); + + *txp = tx; + *connp = conn; + + GNIDBG_TX(D_NET, tx, "validated to 0x%p", conn); +} + +/* set_normalized_timepsec isn't exported from the kernel, so + * we need to do the same thing inline */ +static inline struct timespec +kgnilnd_ts_sub(struct timespec lhs, struct timespec rhs) +{ + time_t sec; + long nsec; + struct timespec ts; + + sec = lhs.tv_sec - rhs.tv_sec; + nsec = lhs.tv_nsec - rhs.tv_nsec; + + while (nsec >= NSEC_PER_SEC) { + nsec -= NSEC_PER_SEC; + ++sec; + } + while (nsec < 0) { + nsec += NSEC_PER_SEC; + --sec; + } + ts.tv_sec = sec; + ts.tv_nsec = nsec; + return ts; +} + +static inline int +kgnilnd_count_list(struct list_head *q) +{ + struct list_head *e; + int n = 0; + + list_for_each(e, q) { + n++; + } + + return n; +} + +/* kgnilnd_find_net adds a reference to the net it finds + * this is so the net will not be removed before the calling function + * has time to use the data returned. 
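+ * (A sketch of the expected calling pattern - illustrative only, not part
+ *  of this patch; 'nid' is whatever NID the caller is resolving:
+ *
+ *	kgn_net_t *net;
+ *
+ *	if (kgnilnd_find_net(nid, &net) == 0) {
+ *		... use net ...
+ *		kgnilnd_net_decref(net);
+ *	}
+ * )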
This reference needs to be released + * by the calling function once it has finished using the returned net + */ + +static inline int +kgnilnd_find_net(lnet_nid_t nid, kgn_net_t **netp) +{ + kgn_net_t *net; + int rc; + + rc = down_read_trylock(&kgnilnd_data.kgn_net_rw_sem); + + if (!rc) { + return -ESHUTDOWN; + } + + list_for_each_entry(net, kgnilnd_netnum2netlist(LNET_NETNUM(LNET_NIDNET(nid))), gnn_list) { + if (!net->gnn_shutdown && LNET_NIDNET(net->gnn_ni->ni_nid) == LNET_NIDNET(nid)) { + kgnilnd_net_addref(net); + up_read(&kgnilnd_data.kgn_net_rw_sem); + *netp = net; + return 0; + } + } + + up_read(&kgnilnd_data.kgn_net_rw_sem); + + return -ENONET; +} + +#ifdef CONFIG_DEBUG_SLAB +#define KGNILND_POISON(ptr, c, s) do {} while(0) +#else +#define KGNILND_POISON(ptr, c, s) memset(ptr, c, s) +#endif + +int kgnilnd_dev_init(kgn_device_t *dev); +void kgnilnd_dev_fini(kgn_device_t *dev); +int kgnilnd_startup(lnet_ni_t *ni); +void kgnilnd_shutdown(lnet_ni_t *ni); +int kgnilnd_base_startup(void); +void kgnilnd_base_shutdown(void); + +int kgnilnd_allocate_phys_fmablk(kgn_device_t *device); +int kgnilnd_map_phys_fmablk(kgn_device_t *device); +void kgnilnd_unmap_phys_fmablk(kgn_device_t *device); +void kgnilnd_free_phys_fmablk(kgn_device_t *device); + +int kgnilnd_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg); +void kgnilnd_query(lnet_ni_t *ni, lnet_nid_t nid, cfs_time_t *when); +int kgnilnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg); +int kgnilnd_eager_recv(lnet_ni_t *ni, void *private, + lnet_msg_t *lntmsg, void **new_private); +int kgnilnd_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, + int delayed, unsigned int niov, + struct iovec *iov, lnet_kiov_t *kiov, + unsigned int offset, unsigned int mlen, unsigned int rlen); + +__u16 kgnilnd_cksum_kiov(unsigned int nkiov, lnet_kiov_t *kiov, unsigned int offset, unsigned int nob, int dump_blob); + +/* purgatory functions */ +void kgnilnd_add_purgatory_locked(kgn_conn_t *conn, kgn_peer_t *peer); +void kgnilnd_mark_for_detach_purgatory_all_locked(kgn_peer_t *peer); +void kgnilnd_detach_purgatory_locked(kgn_conn_t *conn, struct list_head *conn_list); +void kgnilnd_release_purgatory_list(struct list_head *conn_list); + +void kgnilnd_update_reaper_timeout(long timeout); +void kgnilnd_unmap_buffer(kgn_tx_t *tx, int error); +kgn_tx_t *kgnilnd_new_tx_msg(int type, lnet_nid_t source); +void kgnilnd_tx_done(kgn_tx_t *tx, int completion); +void kgnilnd_txlist_done(struct list_head *txlist, int error); +void kgnilnd_unlink_peer_locked(kgn_peer_t *peer); +void kgnilnd_schedule_conn(kgn_conn_t *conn); +void kgnilnd_schedule_process_conn(kgn_conn_t *conn, int sched_intent); + +void kgnilnd_schedule_dgram(kgn_device_t *dev); +int kgnilnd_create_peer_safe(kgn_peer_t **peerp, lnet_nid_t nid, kgn_net_t *net); +void kgnilnd_add_peer_locked(lnet_nid_t nid, kgn_peer_t *new_stub_peer, kgn_peer_t **peerp); +int kgnilnd_add_peer(kgn_net_t *net, lnet_nid_t nid, kgn_peer_t **peerp); + +kgn_peer_t *kgnilnd_find_peer_locked(lnet_nid_t nid); +int kgnilnd_del_conn_or_peer(kgn_net_t *net, lnet_nid_t nid, int command, int error); +void kgnilnd_peer_increase_reconnect_locked(kgn_peer_t *peer); +void kgnilnd_queue_reply(kgn_conn_t *conn, kgn_tx_t *tx); +void kgnilnd_queue_tx(kgn_conn_t *conn, kgn_tx_t *tx); +void kgnilnd_launch_tx(kgn_tx_t *tx, kgn_net_t *net, lnet_process_id_t *target); +int kgnilnd_send_mapped_tx(kgn_tx_t *tx, int try_map_if_full); +void kgnilnd_consume_rx(kgn_rx_t *rx); + +void kgnilnd_schedule_device(kgn_device_t *dev); +void 
kgnilnd_device_callback(__u32 devid, __u64 arg); +void kgnilnd_schedule_device_timer(unsigned long arg); + +int kgnilnd_reaper(void *arg); +int kgnilnd_scheduler(void *arg); +int kgnilnd_dgram_mover(void *arg); + +int kgnilnd_create_conn(kgn_conn_t **connp, kgn_device_t *dev); +int kgnilnd_conn_isdup_locked(kgn_peer_t *peer, kgn_conn_t *newconn); +kgn_conn_t *kgnilnd_find_conn_locked(kgn_peer_t *peer); +int kgnilnd_get_conn(kgn_conn_t **connp, kgn_peer_t); +kgn_conn_t *kgnilnd_find_or_create_conn_locked(kgn_peer_t *peer); +void kgnilnd_peer_cancel_tx_queue(kgn_peer_t *peer); +void kgnilnd_cancel_peer_connect_locked(kgn_peer_t *peer, struct list_head *zombies); +int kgnilnd_close_stale_conns_locked(kgn_peer_t *peer, kgn_conn_t *newconn); +void kgnilnd_peer_alive(kgn_peer_t *peer); +void kgnilnd_peer_notify(kgn_peer_t *peer, int error); +void kgnilnd_close_conn_locked(kgn_conn_t *conn, int error); +void kgnilnd_close_conn(kgn_conn_t *conn, int error); +void kgnilnd_complete_closed_conn(kgn_conn_t *conn); +void kgnilnd_destroy_conn_ep(kgn_conn_t *conn); + +int kgnilnd_close_peer_conns_locked(kgn_peer_t *peer, int why); + +int kgnilnd_tunables_init(void); +void kgnilnd_tunables_fini(void); +void kgnilnd_init_msg(kgn_msg_t *msg, int type, lnet_nid_t source); + +void kgnilnd_bump_timeouts(__u32 nap_time, char *reason); +void kgnilnd_pause_threads(void); +int kgnilnd_hw_in_quiesce(void); +int kgnilnd_check_hw_quiesce(void); +void kgnilnd_quiesce_wait(char *reason); +void kgnilnd_quiesce_end_callback(gni_nic_handle_t nic_handle, uint64_t msecs); +int kgnilnd_ruhroh_thread(void *arg); +void kgnilnd_reset_stack(void); +void kgnilnd_critical_error(gni_err_handle_t err_handle); + +void kgnilnd_insert_sysctl(void); +void kgnilnd_remove_sysctl(void); +void kgnilnd_proc_init(void); +void kgnilnd_proc_fini(void); + +/* gnilnd_conn.c */ +void kgnilnd_release_mbox(kgn_conn_t *conn, int purgatory_hold); + +int kgnilnd_find_and_cancel_dgram(kgn_device_t *dev, lnet_nid_t dst_nid); +void kgnilnd_cancel_dgram_locked(kgn_dgram_t *dgram); +void kgnilnd_release_dgram(kgn_device_t *dev, kgn_dgram_t *dgram); + +int kgnilnd_setup_wildcard_dgram(kgn_device_t *dev); +int kgnilnd_cancel_net_dgrams(kgn_net_t *net); +int kgnilnd_cancel_wc_dgrams(kgn_device_t *dev); +void kgnilnd_wait_for_canceled_dgrams(kgn_device_t *dev); + +int kgnilnd_dgram_waitq(void *arg); + +int kgnilnd_set_conn_params(kgn_dgram_t *dgram); + +/* struct2str functions - we don't use a default: case to cause the compile + * to fail if there is a missing case. 
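+ * (The assumption is a -Wswitch style warning promoted to an error by the
+ *  build flags: adding, say, a new GNILND_MSG_* value without a matching
+ *  DO_TYPE() line in kgnilnd_msgtype2str() below then breaks the build
+ *  instead of silently falling through to the catch-all return at runtime.)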
This allows us to hide these down here + * out of the way but ensure we'll catch any updates to the enum/types + * above */ + +#define DO_TYPE(x) case x: return #x; +static inline const char * +kgnilnd_fmablk_state2str(kgn_fmablk_state_t state) +{ + /* Only want single char string for this */ + switch (state) { + case GNILND_FMABLK_IDLE: + return "I"; + case GNILND_FMABLK_PHYS: + return "P"; + case GNILND_FMABLK_VIRT: + return "V"; + case GNILND_FMABLK_FREED: + return "F"; + } + return ""; +} + +static inline const char * +kgnilnd_msgtype2str(int type) +{ + switch (type) { + DO_TYPE(GNILND_MSG_NONE); + DO_TYPE(GNILND_MSG_NOOP); + DO_TYPE(GNILND_MSG_IMMEDIATE); + DO_TYPE(GNILND_MSG_PUT_REQ); + DO_TYPE(GNILND_MSG_PUT_NAK); + DO_TYPE(GNILND_MSG_PUT_ACK); + DO_TYPE(GNILND_MSG_PUT_DONE); + DO_TYPE(GNILND_MSG_GET_REQ); + DO_TYPE(GNILND_MSG_GET_NAK); + DO_TYPE(GNILND_MSG_GET_DONE); + DO_TYPE(GNILND_MSG_CLOSE); + } + return ""; +} + +static inline const char * +kgnilnd_tx_state2str(kgn_tx_list_state_t state) +{ + switch (state) { + DO_TYPE(GNILND_TX_IDLE); + DO_TYPE(GNILND_TX_ALLOCD); + DO_TYPE(GNILND_TX_PEERQ); + DO_TYPE(GNILND_TX_MAPQ); + DO_TYPE(GNILND_TX_FMAQ); + DO_TYPE(GNILND_TX_LIVE_FMAQ); + DO_TYPE(GNILND_TX_RDMAQ); + DO_TYPE(GNILND_TX_LIVE_RDMAQ); + DO_TYPE(GNILND_TX_DYING); + DO_TYPE(GNILND_TX_FREED); + } + return ""; +} + +static inline const char * +kgnilnd_conn_state2str(kgn_conn_t *conn) +{ + kgn_conn_state_t state = conn->gnc_state; + switch (state) { + DO_TYPE(GNILND_CONN_DUMMY); + DO_TYPE(GNILND_CONN_LISTEN); + DO_TYPE(GNILND_CONN_CONNECTING); + DO_TYPE(GNILND_CONN_ESTABLISHED); + DO_TYPE(GNILND_CONN_CLOSING); + DO_TYPE(GNILND_CONN_CLOSED); + DO_TYPE(GNILND_CONN_DONE); + DO_TYPE(GNILND_CONN_DESTROY_EP); + } + return ""; +} + +static inline const char * +kgnilnd_connreq_type2str(kgn_connreq_t *connreq) +{ + kgn_connreq_type_t type = connreq->gncr_type; + + switch (type) { + DO_TYPE(GNILND_CONNREQ_REQ); + DO_TYPE(GNILND_CONNREQ_NAK); + DO_TYPE(GNILND_CONNREQ_CLOSE); + } + return ""; +} + +static inline const char * +kgnilnd_dgram_state2str(kgn_dgram_t *dgram) +{ + kgn_dgram_state_t state = dgram->gndg_state; + + switch (state) { + DO_TYPE(GNILND_DGRAM_USED); + DO_TYPE(GNILND_DGRAM_POSTING); + DO_TYPE(GNILND_DGRAM_POSTED); + DO_TYPE(GNILND_DGRAM_PROCESSING); + DO_TYPE(GNILND_DGRAM_DONE); + DO_TYPE(GNILND_DGRAM_CANCELED); + } + return ""; +} + +static inline const char * +kgnilnd_dgram_type2str(kgn_dgram_t *dgram) +{ + kgn_dgram_type_t type = dgram->gndg_type; + + switch (type) { + DO_TYPE(GNILND_DGRAM_REQ); + DO_TYPE(GNILND_DGRAM_WC_REQ); + DO_TYPE(GNILND_DGRAM_NAK); + DO_TYPE(GNILND_DGRAM_CLOSE); + } + return ""; +} + + +#undef DO_TYPE + +/* API wrapper functions - include late to pick up all of the other defines */ +#include "gnilnd_api_wrap.h" + +#endif /* _GNILND_GNILND_H_ */ diff --git a/lnet/klnds/gnilnd/gnilnd_api_wrap.h b/lnet/klnds/gnilnd/gnilnd_api_wrap.h new file mode 100644 index 0000000..e7ba9ab --- /dev/null +++ b/lnet/klnds/gnilnd/gnilnd_api_wrap.h @@ -0,0 +1,1505 @@ +/* + * Copyright (C) 2009-2012 Cray, Inc. + * + * Author: Nic Henke + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ +#ifndef _GNILND_API_WRAP_H +#define _GNILND_API_WRAP_H + +/* LNet is allocated failure locations 0xe000 to 0xffff */ + +/* GNILND has 0xf0XX */ +#define CFS_FAIL_GNI 0xf000 +#define CFS_FAIL_GNI_PHYS_MAP 0xf001 +#define CFS_FAIL_GNI_VIRT_MAP 0xf002 +#define CFS_FAIL_GNI_GET_UNMAP 0xf003 +#define CFS_FAIL_GNI_PUT_UNMAP 0xf004 +#define CFS_FAIL_GNI_MAP_TX 0xf005 +#define CFS_FAIL_GNI_SMSG_SEND 0xf006 +#define CFS_FAIL_GNI_CLOSE_SEND 0xf007 +#define CFS_FAIL_GNI_CDM_CREATE 0xf008 +#define CFS_FAIL_GNI_CDM_DESTROY 0xf009 +#define CFS_FAIL_GNI_CDM_ATTACH 0xf00a +#define CFS_FAIL_GNI_CQ_CREATE 0xf00b +#define CFS_FAIL_GNI_CQ_DESTROY 0xf00c +#define CFS_FAIL_GNI_EP_BIND 0xf00d +#define CFS_FAIL_GNI_EP_UNBIND 0xf00e +#define CFS_FAIL_GNI_EP_SET_EVDATA 0xf00f +#define CFS_FAIL_GNI_SMSG_INIT 0xf010 +#define CFS_FAIL_GNI_SMSG_RELEASE 0xf011 +#define CFS_FAIL_GNI_POST_RDMA 0xf012 +#define CFS_FAIL_GNI_GET_COMPLETED 0xf013 +#define CFS_FAIL_GNI_EP_DESTROY 0xf015 +#define CFS_FAIL_GNI_VIRT_UNMAP 0xf016 +#define CFS_FAIL_GNI_MDD_RELEASE 0xf017 +#define CFS_FAIL_GNI_NOOP_SEND 0xf018 +#define CFS_FAIL_GNI_ERR_SUBSCRIBE 0xf01a +#define CFS_FAIL_GNI_QUIESCE_RACE 0xf01b +#define CFS_FAIL_GNI_DG_TERMINATE 0xf01c +#define CFS_FAIL_GNI_REG_QUIESCE 0xf01d +#define CFS_FAIL_GNI_IN_QUIESCE 0xf01e +#define CFS_FAIL_GNI_DELAY_RDMA 0xf01f +#define CFS_FAIL_GNI_SR_DOWN_RACE 0xf020 +#define CFS_FAIL_GNI_ALLOC_TX 0xf021 +#define CFS_FAIL_GNI_FMABLK_AVAIL 0xf022 +#define CFS_FAIL_GNI_EP_CREATE 0xf023 +#define CFS_FAIL_GNI_CQ_GET_EVENT 0xf024 +#define CFS_FAIL_GNI_PROBE 0xf025 +#define CFS_FAIL_GNI_EP_TEST 0xf026 +#define CFS_FAIL_GNI_CONNREQ_DROP 0xf027 +#define CFS_FAIL_GNI_CONNREQ_PROTO 0xf028 +#define CFS_FAIL_GNI_CONND_PILEUP 0xf029 +#define CFS_FAIL_GNI_PHYS_SETUP 0xf02a +#define CFS_FAIL_GNI_FIND_TARGET 0xf02b +#define CFS_FAIL_GNI_WC_DGRAM_FREE 0xf02c +#define CFS_FAIL_GNI_DROP_CLOSING 0xf02d +#define CFS_FAIL_GNI_RX_CLOSE_CLOSING 0xf02e +#define CFS_FAIL_GNI_RX_CLOSE_CLOSED 0xf02f +#define CFS_FAIL_GNI_EP_POST 0xf030 +#define CFS_FAIL_GNI_PACK_SRCNID 0xf031 +#define CFS_FAIL_GNI_PACK_DSTNID 0xf032 +#define CFS_FAIL_GNI_PROBE_WAIT 0xf033 +#define CFS_FAIL_GNI_SMSG_CKSUM1 0xf034 +#define CFS_FAIL_GNI_SMSG_CKSUM2 0xf035 +#define CFS_FAIL_GNI_SMSG_CKSUM3 0xf036 +#define CFS_FAIL_GNI_DROP_DESTROY_EP 0xf037 +#define CFS_FAIL_GNI_SMSG_GETNEXT 0xf038 +#define CFS_FAIL_GNI_FINISH_PURG 0xf039 +#define CFS_FAIL_GNI_PURG_REL_DELAY 0xf03a +#define CFS_FAIL_GNI_DONT_NOTIFY 0xf03b +#define CFS_FAIL_GNI_VIRT_SMALL_MAP 0xf03c +#define CFS_FAIL_GNI_DELAY_RDMAQ 0xf03d +#define CFS_FAIL_GNI_PAUSE_SHUTDOWN 0xf03e +#define CFS_FAIL_GNI_PAUSE_DGRAM_COMP 0xf03f +#define CFS_FAIL_GNI_NET_LOOKUP 0xf040 +#define CFS_FAIL_GNI_RECV_TIMEOUT 0xf041 +#define CFS_FAIL_GNI_SEND_TIMEOUT 0xf042 +#define CFS_FAIL_GNI_ONLY_NOOP 0xf043 +#define CFS_FAIL_GNI_FINISH_PURG2 0xf044 +#define CFS_FAIL_GNI_RACE_RESET 0xf045 +#define CFS_FAIL_GNI_GNP_CONNECTING1 0xf046 +#define CFS_FAIL_GNI_GNP_CONNECTING2 0xf047 +#define CFS_FAIL_GNI_GNP_CONNECTING3 0xf048 +#define CFS_FAIL_GNI_PUT_ACK_AGAIN 0xf050 +#define CFS_FAIL_GNI_GET_REQ_AGAIN 0xf051 + +/* helper macros */ +extern void +_kgnilnd_api_rc_lbug(const char *rcstr, int rc, struct libcfs_debug_msg_data *data, + const char *fmt, ...) 
+ __attribute__ ((format (printf, 4, 5))); + +#define kgnilnd_api_rc_lbug(msgdata, rc, fmt, a...) \ +do { \ + CFS_CHECK_STACK(msgdata, D_ERROR, NULL); \ + /* we don't mask this - it is always at D_ERROR */ \ + _kgnilnd_api_rc_lbug(kgnilnd_api_rc2str(rc), (rc), msgdata, fmt, ##a); \ +} while (0) + +#define DO_RETCODE(x) case x: return #x; +static inline const char * +kgnilnd_api_rc2str(gni_return_t rrc) +{ + + switch (rrc) { + DO_RETCODE(GNI_RC_SUCCESS) + DO_RETCODE(GNI_RC_NOT_DONE); + DO_RETCODE(GNI_RC_INVALID_PARAM); + DO_RETCODE(GNI_RC_ERROR_RESOURCE); + DO_RETCODE(GNI_RC_TIMEOUT); + DO_RETCODE(GNI_RC_PERMISSION_ERROR); + DO_RETCODE(GNI_RC_DESCRIPTOR_ERROR); + DO_RETCODE(GNI_RC_ALIGNMENT_ERROR); + DO_RETCODE(GNI_RC_INVALID_STATE); + DO_RETCODE(GNI_RC_NO_MATCH); + DO_RETCODE(GNI_RC_SIZE_ERROR); + DO_RETCODE(GNI_RC_TRANSACTION_ERROR); + DO_RETCODE(GNI_RC_ILLEGAL_OP); + DO_RETCODE(GNI_RC_ERROR_NOMEM); + } + LBUG(); +} +#undef DO_RETCODE + +/* log an error and LBUG for unhandled rc from gni api function + * the fmt should be something like: + * gni_api_call(arg1, arg2, arg3) + */ + +/* apick_fn and apick_fmt should be defined for each site */ +#undef apick_fn +#undef apick_fmt + +#define GNILND_API_RC_LBUG(args...) \ +do { \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, D_ERROR, NULL); \ + kgnilnd_api_rc_lbug(&msgdata, rrc, apick_fn"("apick_fmt")", ##args); \ +} while (0) + +#define GNILND_API_SWBUG(args...) \ +do { \ + CERROR("likely SOFTWARE BUG "apick_fn"("apick_fmt") rc %s\n", \ + ##args, kgnilnd_api_rc2str(rrc)); \ +} while (0) + +#define GNILND_API_EINVAL(args...) \ +do { \ + CERROR("invalid parameter to "apick_fn"("apick_fmt") rc %s\n", \ + ##args, kgnilnd_api_rc2str(rrc)); \ +} while (0) + +#define GNILND_API_RESOURCE(args...) \ +do { \ + CERROR("no resources for "apick_fn"("apick_fmt") rc %s\n", \ + ##args, kgnilnd_api_rc2str(rrc)); \ +} while (0) + +#define GNILND_API_BUSY(args...) \ +do { \ + CERROR("resources busy for "apick_fn"("apick_fmt") rc %s\n", \ + ##args, kgnilnd_api_rc2str(rrc)); \ +} while (0) + +#undef DEBUG_SMSG_CREDITS +#ifdef DEBUG_SMSG_CREDITS +#define CRAY_CONFIG_GHAL_GEMINI +#include +#define GNIDBG_SMSG_CREDS(level, conn) \ +do { \ + gni_ep_smsg_mbox_t *smsg = conn->gnc_ephandle->smsg; \ + CDEBUG(level, "SMSGDBG: conn %p mcred %d/%d bcred %d/%d " \ + "s_seq %d/%d/%d r_seq %d/%d/%d retr %d\n", \ + conn, smsg->mbox_credits, smsg->back_mbox_credits, \ + smsg->buffer_credits, smsg->back_buffer_credits, \ + smsg->s_seqno, smsg->s_seqno_back_mbox_credits, \ + smsg->s_seqno_back_buffer_credits, smsg->r_seqno, \ + smsg->r_seqno_back_mbox_credits, \ + smsg->r_seqno_back_buffer_credits, smsg->retransmit_count); \ +} while (0) +#else +#define GNIDBG_SMSG_CREDS(level, conn) do {} while(0) +#endif + +/* these are all wrappers around gni_XXX functions. + * This allows us to handle all the return codes and api checks without + * dirtying up the logic code */ + +/* TODO: RETURN wrapper that translates integer to GNI API RC string */ + +#define apick_fn "kgnilnd_cdm_create" +#define apick_fmt "%u, %u, %u, %u, 0x%p" +static inline gni_return_t kgnilnd_cdm_create( + IN uint32_t inst_id, + IN uint8_t ptag, + IN uint32_t cookie, + IN uint32_t modes, + OUT gni_cdm_handle_t *cdm_hndl + ) +{ + gni_return_t rrc; + + /* Error injection */ + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CDM_CREATE)) { + rrc = cfs_fail_val ? 
cfs_fail_val : GNI_RC_INVALID_PARAM; + } else { + rrc = gni_cdm_create(inst_id, ptag, cookie, modes, cdm_hndl); + } + + switch (rrc) { + case GNI_RC_SUCCESS: + break; + case GNI_RC_ERROR_RESOURCE: + case GNI_RC_INVALID_PARAM: + /* Try to bail gracefully */ + GNILND_API_SWBUG( + inst_id, ptag, cookie, modes, cdm_hndl); + break; + default: + GNILND_API_RC_LBUG( + inst_id, ptag, cookie, modes, cdm_hndl); + + /* LBUG never returns, but just for style and consistency */ + break; + } + RETURN(rrc); +} + +#undef apick_fn +#undef apick_fmt + +#define apick_fn "kgnilnd_cdm_attach" +#define apick_fmt "0x%p, %u, 0x%p, 0x%p" +static inline gni_return_t kgnilnd_cdm_attach( + IN gni_cdm_handle_t cdm_hndl, + IN uint32_t device_id, + OUT uint32_t *local_addr, + OUT gni_nic_handle_t *nic_hndl + ) +{ + gni_return_t rrc; + + /* Error injection */ + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CDM_ATTACH)) { + rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_INVALID_PARAM; + } else { + rrc = gni_cdm_attach(cdm_hndl, device_id, local_addr, nic_hndl); + } + + switch (rrc) { + case GNI_RC_SUCCESS: + break; + case GNI_RC_NO_MATCH: + case GNI_RC_INVALID_PARAM: + GNILND_API_SWBUG( + cdm_hndl, device_id, local_addr, nic_hndl); + break; + case GNI_RC_ERROR_RESOURCE: + case GNI_RC_INVALID_STATE: + GNILND_API_RESOURCE( + cdm_hndl, device_id, local_addr, nic_hndl); + break; + default: + GNILND_API_RC_LBUG( + cdm_hndl, device_id, local_addr, nic_hndl); + + /* LBUG never returns, but just for style and consistency */ + break; + } + RETURN(rrc); +} +#undef apick_fmt +#undef apick_fn + +#define apick_fn "kgnilnd_cdm_destroy" +#define apick_fmt "0x%p" +static inline gni_return_t kgnilnd_cdm_destroy( + IN gni_cdm_handle_t cdm_hndl + ) +{ + gni_return_t rrc; + + /* Error injection */ + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CQ_DESTROY)) { + rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_INVALID_PARAM; + } else { + rrc = gni_cdm_destroy( + cdm_hndl); + } + + switch (rrc) { + case GNI_RC_SUCCESS: + break; + case GNI_RC_INVALID_PARAM: + GNILND_API_SWBUG( + cdm_hndl); + break; + default: + GNILND_API_RC_LBUG( + cdm_hndl); + + /* LBUG never returns, but just for style and consistency */ + break; + } + RETURN(rrc); +} +#undef apick_fn +#undef apick_fmt + +#define apick_fn "kgnilnd_subscribe_errors" +#define apick_fmt "0x%p,%x,%u,0x%p,0x%p,0x%p" +static inline gni_return_t kgnilnd_subscribe_errors( + IN gni_nic_handle_t nic_handle, + IN gni_error_mask_t mask, + IN uint32_t EEQ_size, + IN void (*EQ_new_event)(gni_err_handle_t), + IN void (*app_crit_err)(gni_err_handle_t), + OUT gni_err_handle_t *err_handle + ) +{ + gni_return_t rrc; + + /* Error injection */ + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_ERR_SUBSCRIBE)) { + rrc = cfs_fail_val ? 
cfs_fail_val : GNI_RC_INVALID_PARAM; + } else { + rrc = gni_subscribe_errors( + nic_handle, mask, EEQ_size, EQ_new_event, app_crit_err, + err_handle); + } + + switch (rrc) { + case GNI_RC_SUCCESS: + break; + case GNI_RC_INVALID_PARAM: + GNILND_API_SWBUG( + nic_handle, mask, EEQ_size, EQ_new_event, app_crit_err, + err_handle); + break; + case GNI_RC_ERROR_RESOURCE: + GNILND_API_RESOURCE( + nic_handle, mask, EEQ_size, EQ_new_event, app_crit_err, + err_handle); + break; + default: + GNILND_API_RC_LBUG( + nic_handle, mask, EEQ_size, EQ_new_event, app_crit_err, + err_handle); + /* LBUG never returns, but just for style and consistency */ + break; + } + RETURN(rrc); +} +#undef apick_fn +#undef apick_fmt + +#define apick_fn "kgnilnd_release_errors" +#define apick_fmt "0x%p" +static inline gni_return_t kgnilnd_release_errors( + IN gni_err_handle_t err_handle + ) +{ + gni_return_t rrc; + + rrc = gni_release_errors( + err_handle); + + switch (rrc) { + case GNI_RC_SUCCESS: + break; + case GNI_RC_INVALID_PARAM: + case GNI_RC_NOT_DONE: + GNILND_API_SWBUG( + err_handle); + break; + default: + GNILND_API_RC_LBUG( + err_handle); + /* LBUG never returns, but just for style and consistency */ + break; + } + RETURN(rrc); +} +#undef apick_fn +#undef apick_fmt + +#define apick_fn "kgnilnd_set_quiesce_callback" +#define apick_fmt "0x%p,0x%p" +static inline gni_return_t kgnilnd_set_quiesce_callback( + IN gni_nic_handle_t nic_handle, + IN void (*qsce_func)(gni_nic_handle_t, uint64_t msecs) + ) +{ + gni_return_t rrc; + + /* Error injection */ + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_REG_QUIESCE)) { + rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_INVALID_PARAM; + } else { + rrc = gni_set_quiesce_callback( + nic_handle, qsce_func); + } + + switch (rrc) { + case GNI_RC_SUCCESS: + break; + case GNI_RC_INVALID_STATE: + case GNI_RC_INVALID_PARAM: + GNILND_API_SWBUG( + nic_handle, qsce_func); + break; + default: + GNILND_API_RC_LBUG( + nic_handle, qsce_func); + /* LBUG never returns, but just for style and consistency */ + break; + } + RETURN(rrc); +} +#undef apick_fn +#undef apick_fmt + +#define apick_fn "kgnilnd_get_quiesce_status" +#define apick_fmt "0x%p" +static inline gni_return_t kgnilnd_get_quiesce_status( + IN gni_nic_handle_t nic_handle + ) +{ + uint32_t rrc; + + /* this has weird RC - + * 0 - quiesce not in progress + * 1 - quiesce is turned on + */ + + /* Error injection */ + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_IN_QUIESCE)) { + rrc = 1; + } else { + rrc = gni_get_quiesce_status( + nic_handle); + } + + switch (rrc) { + case 1: + case 0: + break; + default: + GNILND_API_RC_LBUG( + nic_handle); + /* LBUG never returns, but just for style and consistency */ + break; + } + RETURN(rrc); +} +#undef apick_fn +#undef apick_fmt + +#define apick_fn "kgnilnd_cq_create" +#define apick_fmt "0x%p, %u, %u, 0x%p, "LPX64", 0x%p" +static inline gni_return_t kgnilnd_cq_create( + IN gni_nic_handle_t nic_hndl, + IN uint32_t entry_count, + IN uint32_t delay_index, + IN gni_cq_event_hndlr_f *event_handler, + IN uint64_t usr_event_data, + OUT gni_cq_handle_t *cq_hndl + ) +{ + gni_return_t rrc; + + /* Error injection */ + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CQ_CREATE)) { + rrc = cfs_fail_val ? 
cfs_fail_val : GNI_RC_INVALID_PARAM; + } else { + rrc = gni_cq_create( + nic_hndl, entry_count, delay_index, event_handler, + usr_event_data, cq_hndl); + } + + switch (rrc) { + case GNI_RC_SUCCESS: + break; + case GNI_RC_INVALID_PARAM: + GNILND_API_SWBUG( + nic_hndl, entry_count, delay_index, event_handler, + usr_event_data, cq_hndl); + break; + case GNI_RC_ERROR_RESOURCE: + GNILND_API_RESOURCE( + nic_hndl, entry_count, delay_index, event_handler, + usr_event_data, cq_hndl); + break; + default: + GNILND_API_RC_LBUG( + nic_hndl, entry_count, delay_index, event_handler, + usr_event_data, cq_hndl); + + /* LBUG never returns, but just for style and consistency */ + break; + } + RETURN(rrc); +} +#undef apick_fn +#undef apick_fmt + +#define apick_fn "kgnilnd_cq_destroy" +#define apick_fmt "0x%p" +static inline gni_return_t kgnilnd_cq_destroy( + IN gni_cq_handle_t cq_hndl + ) +{ + gni_return_t rrc; + + /* Error injection */ + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CQ_DESTROY)) { + rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_INVALID_PARAM; + } else { + + rrc = gni_cq_destroy( + cq_hndl); + } + + switch (rrc) { + case GNI_RC_SUCCESS: + break; + case GNI_RC_INVALID_PARAM: + GNILND_API_SWBUG( + cq_hndl); + break; + case GNI_RC_ERROR_RESOURCE: + GNILND_API_BUSY( + cq_hndl); + break; + default: + GNILND_API_RC_LBUG( + cq_hndl); + + /* LBUG never returns, but just for style and consistency */ + break; + } + RETURN(rrc); +} +#undef apick_fn +#undef apick_fmt + +#define apick_fn "kgnilnd_cq_get_event" +#define apick_fmt "0x%p, 0x%p" +static inline gni_return_t kgnilnd_cq_get_event( + IN gni_cq_handle_t cq_hndl, + OUT gni_cq_entry_t *event_data + ) +{ + gni_return_t rrc; + + /* no error injection - CQs are touchy about the data. + * where appropriate, we'll do this on the CQs that should be able to + * handle the various errors */ + rrc = gni_cq_get_event( + cq_hndl, event_data); + + switch (rrc) { + case GNI_RC_SUCCESS: + case GNI_RC_NOT_DONE: + case GNI_RC_TRANSACTION_ERROR: + break; + case GNI_RC_ERROR_RESOURCE: + LASSERTF(GNI_CQ_OVERRUN(*event_data), + "kgni returned ERROR_RESOURCE but cq_hndl 0x%p is not " + "overrun\n", cq_hndl); + break; + case GNI_RC_INVALID_PARAM: + GNILND_API_SWBUG( + cq_hndl, event_data); + break; + default: + GNILND_API_RC_LBUG( + cq_hndl, event_data); + + /* LBUG never returns, but just for style and consistency */ + break; + } + return rrc; +} +#undef apick_fn +#undef apick_fmt + +#define apick_fn "kgnilnd_smsg_init" +#define apick_fmt "0x%p, 0x%p, 0x%p" +static inline gni_return_t kgnilnd_smsg_init( + IN gni_ep_handle_t ep_hndl, + IN gni_smsg_attr_t *local_smsg_attr, + IN gni_smsg_attr_t *remote_smsg_attr + ) +{ + gni_return_t rrc; + + /* Error injection */ + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_SMSG_INIT)) { + rrc = cfs_fail_val ? 
cfs_fail_val : GNI_RC_ERROR_RESOURCE; + } else { + rrc = gni_smsg_init( + ep_hndl, local_smsg_attr, remote_smsg_attr); + } + + switch (rrc) { + /* both of these are OK, upper SW needs to handle */ + case GNI_RC_SUCCESS: + case GNI_RC_NOT_DONE: + break; + case GNI_RC_INVALID_PARAM: + case GNI_RC_INVALID_STATE: + GNILND_API_SWBUG( + ep_hndl, local_smsg_attr, remote_smsg_attr); + break; + case GNI_RC_ERROR_RESOURCE: + GNILND_API_RESOURCE( + ep_hndl, local_smsg_attr, remote_smsg_attr); + break; + default: + GNILND_API_RC_LBUG( + ep_hndl, local_smsg_attr, remote_smsg_attr); + + /* LBUG never returns, but just for style and consistency */ + break; + } + RETURN(rrc); +} +#undef apick_fn +#undef apick_fmt + +#define apick_fn "kgnilnd_smsg_send" +#define apick_fmt "0x%p, 0x%p, %d, 0x%p, %u %u" +static inline gni_return_t kgnilnd_smsg_send( + IN gni_ep_handle_t ep_hndl, + IN void *header, + IN uint32_t header_length, + IN void *data, + IN uint32_t data_length, + IN uint32_t msg_id + ) +{ + gni_return_t rrc; + + /* Error injection */ + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_SMSG_SEND)) { + if (cfs_fail_loc & CFS_FAIL_RAND) { + rrc = GNI_RC_NOT_DONE; + } else { + rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_NOT_DONE; + } + } else { + rrc = gni_smsg_send( + ep_hndl, header, header_length, data, data_length, msg_id); + } + + switch (rrc) { + /* both of these are OK, upper SW needs to handle */ + case GNI_RC_SUCCESS: + case GNI_RC_NOT_DONE: + break; + case GNI_RC_INVALID_PARAM: + GNILND_API_SWBUG( + ep_hndl, header, header_length, data, data_length, msg_id); + break; + case GNI_RC_ERROR_RESOURCE: + GNILND_API_RESOURCE( + ep_hndl, header, header_length, data, data_length, msg_id); + break; + default: + GNILND_API_RC_LBUG( + ep_hndl, header, header_length, data, data_length, msg_id); + + /* LBUG never returns, but just for style and consistency */ + break; + } + RETURN(rrc); +} +#undef apick_fn +#undef apick_fmt + +#define apick_fn "kgnilnd_smsg_getnext" +#define apick_fmt "0x%p,0x%p" +static inline gni_return_t kgnilnd_smsg_getnext( + IN gni_ep_handle_t ep_hndl, + OUT void **header + ) +{ + gni_return_t rrc; + + /* Error injection */ + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_SMSG_RELEASE)) { + rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_NOT_DONE; + } else { + rrc = gni_smsg_getnext( + ep_hndl, header); + } + + switch (rrc) { + /* both of these are OK, upper SW needs to handle */ + case GNI_RC_SUCCESS: + case GNI_RC_NOT_DONE: + case GNI_RC_INVALID_STATE: + break; + case GNI_RC_INVALID_PARAM: + GNILND_API_SWBUG( + ep_hndl, header); + break; + default: + GNILND_API_RC_LBUG( + ep_hndl, header); + + /* LBUG never returns, but just for style and consistency */ + break; + } + RETURN(rrc); +} +#undef apick_fn +#undef apick_fmt + +#define apick_fn "kgnilnd_smsg_release" +#define apick_fmt "0x%p" +static inline gni_return_t kgnilnd_smsg_release( + IN gni_ep_handle_t ep_hndl + ) +{ + gni_return_t rrc; + + /* Error injection */ + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_SMSG_RELEASE)) { + rrc = cfs_fail_val ? 
cfs_fail_val : GNI_RC_INVALID_PARAM; + } else { + rrc = gni_smsg_release( + ep_hndl); + } + + switch (rrc) { + /* both of these are OK, upper SW needs to handle */ + case GNI_RC_SUCCESS: + case GNI_RC_NOT_DONE: + break; + case GNI_RC_INVALID_PARAM: + GNILND_API_SWBUG( + ep_hndl); + break; + default: + GNILND_API_RC_LBUG( + ep_hndl); + + /* LBUG never returns, but just for style and consistency */ + break; + } + RETURN(rrc); +} +#undef apick_fn +#undef apick_fmt + +#define apick_fn "kgnilnd_ep_create" +#define apick_fmt "0x%p, 0x%p, 0x%p" +static inline gni_return_t kgnilnd_ep_create( + IN gni_nic_handle_t nic_hndl, + IN gni_cq_handle_t src_cq_hndl, + OUT gni_ep_handle_t *ep_hndl + ) +{ + gni_return_t rrc; + + /* error injection */ + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_EP_CREATE)) { + rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_ERROR_NOMEM; + } else { + rrc = gni_ep_create( + nic_hndl, src_cq_hndl, ep_hndl); + } + + switch (rrc) { + case GNI_RC_SUCCESS: + break; + case GNI_RC_INVALID_PARAM: + GNILND_API_SWBUG( + nic_hndl, src_cq_hndl, ep_hndl); + break; + case GNI_RC_ERROR_NOMEM: + GNILND_API_RESOURCE( + nic_hndl, src_cq_hndl, ep_hndl); + break; + default: + GNILND_API_RC_LBUG( + nic_hndl, src_cq_hndl, ep_hndl); + + /* lbug never returns, but just for style and consistency */ + break; + } + RETURN(rrc); +} +#undef apick_fn +#undef apick_fmt + +#define apick_fn "kgnilnd_ep_bind" +#define apick_fmt "0x%p, %x, %x" +static inline gni_return_t kgnilnd_ep_bind( + IN gni_ep_handle_t ep_hndl, + IN uint32_t remote_addr, + IN uint32_t remote_id + ) +{ + gni_return_t rrc; + + /* error injection */ + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_EP_BIND)) { + rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_NOT_DONE; + } else { + rrc = gni_ep_bind( + ep_hndl, remote_addr, remote_id); + } + + switch (rrc) { + /* both of these are ok, upper sw needs to handle */ + case GNI_RC_SUCCESS: + case GNI_RC_NOT_DONE: + break; + case GNI_RC_INVALID_PARAM: + GNILND_API_SWBUG( + ep_hndl, remote_addr, remote_id); + break; + default: + GNILND_API_RC_LBUG( + ep_hndl, remote_addr, remote_id); + + /* lbug never returns, but just for style and consistency */ + break; + } + RETURN(rrc); +} +#undef apick_fn +#undef apick_fmt + +#define apick_fn "kgnilnd_ep_set_eventdata" +#define apick_fmt "0x%p, %x, %x" +static inline gni_return_t kgnilnd_ep_set_eventdata( + IN gni_ep_handle_t ep_hndl, + IN uint32_t local_event, + IN uint32_t remote_event + ) +{ + gni_return_t rrc; + + /* Error injection */ + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_EP_SET_EVDATA)) { + rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_INVALID_PARAM; + } else { + rrc = gni_ep_set_eventdata( + ep_hndl, local_event, remote_event); + } + + switch (rrc) { + case GNI_RC_SUCCESS: + break; + case GNI_RC_INVALID_PARAM: + GNILND_API_SWBUG( + ep_hndl, local_event, remote_event); + break; + default: + GNILND_API_RC_LBUG( + ep_hndl, local_event, remote_event); + + /* LBUG never returns, but just for style and consistency */ + break; + } + RETURN(rrc); +} +#undef apick_fn +#undef apick_fmt + +#define apick_fn "kgnilnd_ep_unbind" +#define apick_fmt "0x%p" +static inline gni_return_t kgnilnd_ep_unbind( + IN gni_ep_handle_t ep_hndl + ) +{ + gni_return_t rrc; + + /* Error injection */ + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_EP_UNBIND)) { + rrc = cfs_fail_val ? 
cfs_fail_val : GNI_RC_NOT_DONE; + } else { + rrc = gni_ep_unbind( + ep_hndl); + } + + switch (rrc) { + /* both of these are OK, upper SW needs to handle */ + case GNI_RC_NOT_DONE: + case GNI_RC_SUCCESS: + break; + case GNI_RC_INVALID_PARAM: + GNILND_API_SWBUG( + ep_hndl); + break; + default: + GNILND_API_RC_LBUG( + ep_hndl); + + /* LBUG never returns, but just for style and consistency */ + break; + } + RETURN(rrc); +} +#undef apick_fn +#undef apick_fmt + +#define apick_fn "kgnilnd_ep_destroy" +#define apick_fmt "0x%p" +static inline gni_return_t kgnilnd_ep_destroy( + IN gni_ep_handle_t ep_hndl + ) +{ + gni_return_t rrc; + + /* Error injection */ + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_EP_DESTROY)) { + rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_NOT_DONE; + } else { + rrc = gni_ep_destroy( + ep_hndl); + } + + switch (rrc) { + case GNI_RC_SUCCESS: + break; + case GNI_RC_INVALID_PARAM: + GNILND_API_SWBUG( + ep_hndl); + break; + default: + GNILND_API_RC_LBUG( + ep_hndl); + + /* LBUG never returns, but just for style and consistency */ + break; + } + RETURN(rrc); +} +#undef apick_fn +#undef apick_fmt + +#define apick_fn "kgnilnd_ep_postdata_w_id" +#define apick_fmt "0x%p, 0x%p, %d, 0x%p, %d, "LPU64"" +static inline gni_return_t kgnilnd_ep_postdata_w_id( + IN gni_ep_handle_t ep_hndl, + IN void *in_data, + IN uint16_t data_len, + IN void *out_buf, + IN uint16_t buf_size, + IN uint64_t datagram_id + ) +{ + gni_return_t rrc; + + /* Error injection */ + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_EP_POST)) { + rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_SIZE_ERROR; + } else { + rrc = gni_ep_postdata_w_id( + ep_hndl, in_data, data_len, out_buf, buf_size, + datagram_id); + } + + switch (rrc) { + case GNI_RC_SUCCESS: + case GNI_RC_ERROR_NOMEM: + case GNI_RC_ERROR_RESOURCE: + break; + case GNI_RC_INVALID_PARAM: + case GNI_RC_SIZE_ERROR: + GNILND_API_SWBUG( + ep_hndl, in_data, data_len, out_buf, buf_size, + datagram_id); + break; + default: + GNILND_API_RC_LBUG( + ep_hndl, in_data, data_len, out_buf, buf_size, + datagram_id); + + /* LBUG never returns, but just for style and consistency */ + break; + } + RETURN(rrc); +} +#undef apick_fn +#undef apick_fmt + +#define apick_fn "kgnilnd_ep_postdata_test_by_id" +#define apick_fmt "0x%p, "LPU64", 0x%p, 0x%p, 0x%p" +static inline gni_return_t kgnilnd_ep_postdata_test_by_id( + IN gni_ep_handle_t ep_hndl, + IN uint64_t datagram_id, + OUT gni_post_state_t *post_state, + OUT uint32_t *remote_addr, + OUT uint32_t *remote_id + ) +{ + gni_return_t rrc; + + /* Error injection */ + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_EP_TEST)) { + rrc = cfs_fail_val ? 
cfs_fail_val : GNI_RC_ERROR_NOMEM; + } else { + rrc = gni_ep_postdata_test_by_id( + ep_hndl, datagram_id, post_state, remote_addr, + remote_id); + + /* we want to lie, but we need to do the actual work first + * so we don't keep getting the event saying a dgram is ready */ + if (rrc == GNI_RC_SUCCESS && CFS_FAIL_CHECK(CFS_FAIL_GNI_DG_TERMINATE)) { + /* don't use fail_val, allows us to do FAIL_SOME */ + *post_state = GNI_POST_TERMINATED; + } + } + + switch (rrc) { + case GNI_RC_SUCCESS: + case GNI_RC_NO_MATCH: + break; + case GNI_RC_SIZE_ERROR: + case GNI_RC_INVALID_PARAM: + GNILND_API_SWBUG( + ep_hndl, datagram_id, post_state, remote_addr, + remote_id); + break; + case GNI_RC_ERROR_NOMEM: + GNILND_API_RESOURCE( + ep_hndl, datagram_id, post_state, remote_addr, + remote_id); + break; + default: + GNILND_API_RC_LBUG( + ep_hndl, datagram_id, post_state, remote_addr, + remote_id); + + /* LBUG never returns, but just for style and consistency */ + break; + } + RETURN(rrc); +} +#undef apick_fn +#undef apick_fmt + +#define apick_fn "kgnilnd_ep_postdata_cancel_by_id" +#define apick_fmt "0x%p, "LPU64"" +static inline gni_return_t kgnilnd_ep_postdata_cancel_by_id( + IN gni_ep_handle_t ep_hndl, + IN uint64_t datagram_id + ) +{ + gni_return_t rrc; + + /* no error injection as the only thing we'd do is LBUG */ + + rrc = gni_ep_postdata_cancel_by_id( + ep_hndl, datagram_id); + + switch (rrc) { + case GNI_RC_SUCCESS: + case GNI_RC_NO_MATCH: + break; + case GNI_RC_INVALID_PARAM: + GNILND_API_SWBUG( + ep_hndl, datagram_id); + break; + default: + GNILND_API_RC_LBUG( + ep_hndl, datagram_id); + + /* LBUG never returns, but just for style and consistency */ + break; + } + RETURN(rrc); +} +#undef apick_fn +#undef apick_fmt + +#define apick_fn "kgnilnd_postdata_probe_by_id" +#define apick_fmt "0x%p, 0x%p" +static inline gni_return_t kgnilnd_postdata_probe_by_id( + IN gni_nic_handle_t nic_hndl, + OUT uint64_t *datagram_id + ) +{ + gni_return_t rrc; + + /* Error injection */ + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_PROBE)) { + rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_NO_MATCH; + } else { + rrc = gni_postdata_probe_by_id( + nic_hndl, datagram_id); + } + + switch (rrc) { + case GNI_RC_SUCCESS: + case GNI_RC_NO_MATCH: + break; + case GNI_RC_INVALID_PARAM: + GNILND_API_SWBUG( + nic_hndl, datagram_id); + break; + default: + GNILND_API_RC_LBUG( + nic_hndl, datagram_id); + + /* LBUG never returns, but just for style and consistency */ + break; + } + RETURN(rrc); +} +#undef apick_fn +#undef apick_fmt + +#define apick_fn "kgnilnd_postdata_probe_wait_by_id" +#define apick_fmt "0x%p, %d, 0x%p" +static inline gni_return_t kgnilnd_postdata_probe_wait_by_id( + IN gni_nic_handle_t nic_hndl, + IN uint32_t timeout, + OUT uint64_t *datagram_id + ) +{ + gni_return_t rrc; + + /* Error injection */ + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_PROBE_WAIT)) { + rrc = cfs_fail_val ? 
cfs_fail_val : GNI_RC_TIMEOUT; + } else { + rrc = gni_postdata_probe_wait_by_id( + nic_hndl, timeout, datagram_id); + } + + switch (rrc) { + case GNI_RC_SUCCESS: + case GNI_RC_TIMEOUT: + break; + case GNI_RC_INVALID_PARAM: + GNILND_API_SWBUG( + nic_hndl, timeout, datagram_id); + break; + default: + GNILND_API_RC_LBUG( + nic_hndl, timeout, datagram_id); + + /* LBUG never returns, but just for style and consistency */ + break; + } + RETURN(rrc); +} +#undef apick_fn +#undef apick_fmt + +#define apick_fn "kgnilnd_post_rdma" +#define apick_fmt "0x%p, 0x%p" +static inline gni_return_t kgnilnd_post_rdma( + IN gni_ep_handle_t ep_hndl, + IN gni_post_descriptor_t *post_descr + ) +{ + gni_return_t rrc; + + /* Error injection */ + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_POST_RDMA)) { + rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_INVALID_PARAM; + } else { + rrc = gni_post_rdma( + ep_hndl, post_descr); + } + + switch (rrc) { + case GNI_RC_SUCCESS: + break; + case GNI_RC_ALIGNMENT_ERROR: + case GNI_RC_INVALID_PARAM: + GNILND_API_SWBUG( + ep_hndl, post_descr); + break; + case GNI_RC_ERROR_RESOURCE: + GNILND_API_RESOURCE( + ep_hndl, post_descr); + break; + default: + GNILND_API_RC_LBUG( + ep_hndl, post_descr); + + /* LBUG never returns, but just for style and consistency */ + break; + } + RETURN(rrc); +} +#undef apick_fn +#undef apick_fmt + +#define apick_fn "kgnilnd_get_completed" +#define apick_fmt "0x%p,"LPX64",0x%p" +static inline gni_return_t kgnilnd_get_completed( + IN gni_cq_handle_t cq_hndl, + IN gni_cq_entry_t event_data, + OUT gni_post_descriptor_t **post_descr + ) +{ + gni_return_t rrc; + + + rrc = gni_get_completed(cq_hndl, event_data, post_descr); + + switch (rrc) { + case GNI_RC_TRANSACTION_ERROR: + case GNI_RC_SUCCESS: + break; + case GNI_RC_DESCRIPTOR_ERROR: + case GNI_RC_INVALID_PARAM: + GNILND_API_SWBUG(cq_hndl, event_data, post_descr); + break; + default: + GNILND_API_RC_LBUG(cq_hndl, event_data, post_descr); + /* LBUG never returns, but just for style and consistency */ + break; + } + + /* Error injection - we need a valid desc, so let kgni give us one + * - then we lie */ + if (rrc == GNI_RC_SUCCESS && + (CFS_FAIL_CHECK(CFS_FAIL_GNI_GET_COMPLETED))) { + /* We only trigger TRANSACTION_ERROR for now */ + gni_post_descriptor_t *desc; + rrc = GNI_RC_TRANSACTION_ERROR; + desc = *post_descr; + desc->status = rrc; + /* recoverable decision made from cfs_fail_val in + * kgnilnd_cq_error_str and + * kgnilnd_cq_error_recoverable */ + } + RETURN(rrc); +} +#undef apick_fn +#undef apick_fmt + +#define apick_fn "kgnilnd_cq_error_str" +#define apick_fmt LPX64",0x%p,%d" +static inline gni_return_t kgnilnd_cq_error_str( + IN gni_cq_entry_t entry, + IN void *buffer, + IN uint32_t len + ) +{ + gni_return_t rrc; + + /* Error injection - set string if we injected a + * TRANSACTION_ERROR earlier */ + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_GET_COMPLETED)) { + /* if we just set persistent error, we can't ever + * break in via ssh to clear, so use a count > 10 to indicate fatal */ + sprintf(buffer, "INJECT:%s", cfs_fail_val > 10 ? 
+ "FATAL" : "RECOVERABLE"); + rrc = GNI_RC_SUCCESS; + } else { + rrc = gni_cq_error_str( + entry, buffer, len); + } + + switch (rrc) { + case GNI_RC_SUCCESS: + break; + case GNI_RC_SIZE_ERROR: + case GNI_RC_INVALID_PARAM: + GNILND_API_SWBUG( + entry, buffer, len); + /* give them something to use */ + snprintf(buffer, len, "UNDEF:UNDEF"); + break; + default: + GNILND_API_RC_LBUG( + entry, buffer, len); + + /* LBUG never returns, but just for style and consistency */ + break; + } + RETURN(rrc); +} +#undef apick_fn +#undef apick_fmt + +#define apick_fn "kgnilnd_cq_error_recoverable" +#define apick_fmt LPX64",0x%p" +static inline gni_return_t kgnilnd_cq_error_recoverable( + IN gni_cq_entry_t entry, + IN uint32_t *recoverable + ) +{ + gni_return_t rrc; + + /* Error injection - set string if we injected a + * TRANSACTION_ERROR earlier */ + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_GET_COMPLETED)) { + *recoverable = cfs_fail_val > 10 ? 0 : 1; + rrc = GNI_RC_SUCCESS; + } else { + rrc = gni_cq_error_recoverable( + entry, recoverable); + } + + switch (rrc) { + case GNI_RC_SUCCESS: + break; + case GNI_RC_INVALID_STATE: + case GNI_RC_INVALID_PARAM: + GNILND_API_SWBUG( + entry, recoverable); + *recoverable = 0; + break; + default: + GNILND_API_RC_LBUG( + entry, recoverable); + + /* LBUG never returns, but just for style and consistency */ + break; + } + RETURN(rrc); +} +#undef apick_fn +#undef apick_fmt + +#define apick_fn "kgnilnd_mem_register_segments" +#define apick_fmt "0x%p,0x%p,%u,0x%p,%x,0x%p" +static inline gni_return_t +kgnilnd_mem_register_segments( + IN gni_nic_handle_t nic_hndl, + IN gni_mem_segment_t *mem_segments, + IN uint32_t segments_cnt, + IN gni_cq_handle_t dst_cq_hndl, + IN uint32_t flags, + OUT gni_mem_handle_t *mem_hndl + ) +{ + gni_return_t rrc; + + /* Error injection */ + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_PHYS_MAP)) { + rrc = GNI_RC_ERROR_RESOURCE; + } else { + rrc = gni_mem_register_segments( + nic_hndl, mem_segments, segments_cnt, + dst_cq_hndl, flags, mem_hndl); + } + + switch (rrc) { + case GNI_RC_SUCCESS: + case GNI_RC_ERROR_RESOURCE: + break; + case GNI_RC_INVALID_PARAM: + GNILND_API_SWBUG( + nic_hndl, mem_segments, segments_cnt, + dst_cq_hndl, flags, mem_hndl); + break; + default: + GNILND_API_RC_LBUG( + nic_hndl, mem_segments, segments_cnt, + dst_cq_hndl, flags, mem_hndl); + + /* LBUG never returns, but just for style and consistency */ + break; + } + RETURN(rrc); +} +#undef apick_fn +#undef apick_fmt + +#define apick_fn "kgnilnd_mem_register" +#define apick_fmt "0x%p,"LPX64","LPX64"0x%p,%u,0x%p" +static inline gni_return_t kgnilnd_mem_register( + IN gni_nic_handle_t nic_hndl, + IN uint64_t address, + IN uint64_t length, + IN gni_cq_handle_t dst_cq_hndl, + IN uint32_t flags, + OUT gni_mem_handle_t *mem_hndl + ) +{ + gni_return_t rrc; + + /* Error injection */ + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_VIRT_MAP)) { + rrc = GNI_RC_ERROR_RESOURCE; + } else if (CFS_FAIL_CHECK(CFS_FAIL_GNI_VIRT_SMALL_MAP) && + length <= *kgnilnd_tunables.kgn_max_immediate) { + rrc = GNI_RC_INVALID_PARAM; + } else { + rrc = gni_mem_register( + nic_hndl, address, length, + dst_cq_hndl, flags, mem_hndl); + } + + switch (rrc) { + case GNI_RC_SUCCESS: + case GNI_RC_ERROR_RESOURCE: + break; + case GNI_RC_INVALID_PARAM: + GNILND_API_SWBUG( + nic_hndl, address, length, + dst_cq_hndl, flags, mem_hndl); + break; + default: + GNILND_API_RC_LBUG( + nic_hndl, address, length, + dst_cq_hndl, flags, mem_hndl); + + /* LBUG never returns, but just for style and consistency */ + break; + } + RETURN(rrc); +} +#undef apick_fn 
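/*
 * The wrappers above all share one shape: an optional fault-injection hook
 * (CFS_FAIL_CHECK) in front of the real gni_* call, followed by a switch that
 * buckets the return code into "caller handles it", "software bug"
 * (GNILND_API_SWBUG), "resource exhaustion" (GNILND_API_RESOURCE), or
 * "unexpected -> LBUG".  The stand-alone sketch below only illustrates that
 * pattern; the names in it (wrap_rc_t, sample_api_call, fail_inject) are
 * invented for the example and are not kgni or gnilnd symbols.
 */
#include <stdio.h>
#include <stdlib.h>

typedef enum {
	RC_SUCCESS,
	RC_NOT_DONE,		/* transient: caller retries */
	RC_INVALID_PARAM,	/* our bug: bad arguments */
	RC_ERROR_RESOURCE,	/* out of resources: caller backs off */
	RC_UNEXPECTED
} wrap_rc_t;

static int fail_inject;		/* plays the role of CFS_FAIL_CHECK() */

static wrap_rc_t sample_api_call(void)
{
	return RC_SUCCESS;	/* plays the role of the real gni_* call */
}

static wrap_rc_t wrapped_api_call(void)
{
	wrap_rc_t rc;

	/* 1. error injection first, so failure paths can be exercised
	 *    without real hardware faults */
	if (fail_inject)
		rc = RC_ERROR_RESOURCE;
	else
		rc = sample_api_call();

	/* 2. classify the return code once, in one place */
	switch (rc) {
	case RC_SUCCESS:
	case RC_NOT_DONE:
		break;			/* upper layers handle these */
	case RC_INVALID_PARAM:
		fprintf(stderr, "software bug: bad parameters\n");
		break;
	case RC_ERROR_RESOURCE:
		fprintf(stderr, "resource exhaustion, backing off\n");
		break;
	default:
		abort();		/* unknown rc is fatal, like LBUG() */
	}
	return rc;
}

int main(void)
{
	fail_inject = 1;		/* exercise the injected failure path */
	return wrapped_api_call() == RC_SUCCESS ? 0 : 1;
}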
+#undef apick_fmt + +#define apick_fn "kgnilnd_mem_deregister" +#define apick_fmt "0x%p,0x%p,%d" +static inline gni_return_t kgnilnd_mem_deregister( + IN gni_nic_handle_t nic_hndl, + IN gni_mem_handle_t *mem_hndl, + IN int hold_timeout + ) +{ + gni_return_t rrc; + + /* Error injection */ + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_VIRT_UNMAP)) { + rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_INVALID_PARAM; + } else { + rrc = gni_mem_deregister( + nic_hndl, mem_hndl, hold_timeout); + } + + switch (rrc) { + case GNI_RC_SUCCESS: + break; + case GNI_RC_INVALID_PARAM: + GNILND_API_SWBUG( + nic_hndl, mem_hndl, hold_timeout); + break; + default: + GNILND_API_RC_LBUG( + nic_hndl, mem_hndl, hold_timeout); + + /* LBUG never returns, but just for style and consistency */ + break; + } + RETURN(rrc); +} +#undef apick_fn +#undef apick_fmt + +#define apick_fn "kgnilnd_mem_mdd_release" +#define apick_fmt "0x%p,0x%p" +static inline gni_return_t kgnilnd_mem_mdd_release( + IN gni_nic_handle_t nic_hndl, + IN gni_mem_handle_t *mem_hndl + ) +{ + gni_return_t rrc; + + /* Error injection */ + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_MDD_RELEASE)) { + rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_NO_MATCH; + } else { + rrc = gni_mem_mdd_release( + nic_hndl, mem_hndl); + } + + switch (rrc) { + case GNI_RC_SUCCESS: + case GNI_RC_NO_MATCH: + break; + default: + GNILND_API_RC_LBUG( + nic_hndl, mem_hndl); + + /* LBUG never returns, but just for style and consistency */ + break; + } + RETURN(rrc); +} +#undef apick_fn +#undef apick_fmt + +#endif /* _GNILND_API_WRAP_H */ diff --git a/lnet/klnds/gnilnd/gnilnd_cb.c b/lnet/klnds/gnilnd/gnilnd_cb.c new file mode 100644 index 0000000..56be88a --- /dev/null +++ b/lnet/klnds/gnilnd/gnilnd_cb.c @@ -0,0 +1,4366 @@ +/* + * Copyright (C) 2004 Cluster File Systems, Inc. + * + * Copyright (C) 2009-2012 Cray, Inc. + * + * Derived from work by Eric Barton + * Author: Nic Henke + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +#include +#include "gnilnd.h" + +/* this is useful when needed to debug wire corruption. 
*/ +static void +kgnilnd_dump_blob(int level, char *prefix, void *buf, int len) { + __u64 *ptr; + + ptr = (__u64 *) buf; + + while (len > 0) { + if (len >= 32) { + CDEBUG(level, + "%s 0x%p: 0x%16.16llx 0x%16.16llx 0x%16.16llx 0x%16.16llx\n", + prefix, ptr, *(ptr), *(ptr + 1), *(ptr + 2), *(ptr + 3)); + ptr += 4; + len -= 32; + } else if (len >= 16) { + CDEBUG(level, + "%s 0x%p: 0x%16.16llx 0x%16.16llx\n", + prefix, ptr, *(ptr), *(ptr + 1)); + ptr += 2; + len -= 16; + } else { + CDEBUG(level, "%s 0x%p: 0x%16.16llx\n", + prefix, ptr, *(ptr)); + ptr++; + len -= 8; + } + } +} + +static void +kgnilnd_dump_msg(int mask, kgn_msg_t *msg) +{ + CDEBUG(mask, "0x%8.8x 0x%4.4x 0x%4.4x 0x%16.16llx" + " 0x%16.16llx 0x%8.8x 0x%4.4x 0x%4.4x 0x%8.8x\n", + msg->gnm_magic, msg->gnm_version, + msg->gnm_type, msg->gnm_srcnid, + msg->gnm_connstamp, msg->gnm_seq, + msg->gnm_cksum, msg->gnm_payload_cksum, + msg->gnm_payload_len); +} + +void +kgnilnd_schedule_device(kgn_device_t *dev) +{ + short already_live = 0; + + /* we'll only want to wake if the scheduler thread + * has come around and set ready to zero */ + already_live = cmpxchg(&dev->gnd_ready, GNILND_DEV_IDLE, GNILND_DEV_IRQ); + + if (!already_live) { + wake_up_all(&dev->gnd_waitq); + } + return; +} + +void kgnilnd_schedule_device_timer(unsigned long arg) +{ + kgn_device_t *dev = (kgn_device_t *) arg; + + kgnilnd_schedule_device(dev); +} + +void +kgnilnd_device_callback(__u32 devid, __u64 arg) +{ + kgn_device_t *dev; + int index = (int) arg; + + if (index >= kgnilnd_data.kgn_ndevs) { + /* use _EMERG instead of an LBUG to prevent LBUG'ing in + * interrupt context. */ + LCONSOLE_EMERG("callback for unknown device %d->%d\n", + devid, index); + return; + } + + dev = &kgnilnd_data.kgn_devices[index]; + /* just basic sanity */ + if (dev->gnd_id == devid) { + kgnilnd_schedule_device(dev); + } else { + LCONSOLE_EMERG("callback for bad device %d devid %d\n", + dev->gnd_id, devid); + } +} + +/* sched_intent values: + * < 0 : do not reschedule under any circumstances + * == 0: reschedule if someone marked him WANTS_SCHED + * > 0 : force a reschedule */ + +void +kgnilnd_schedule_process_conn(kgn_conn_t *conn, int sched_intent) +{ + int conn_sched; + + /* move back to IDLE but save previous state. + * if we see WANTS_SCHED, we'll call kgnilnd_schedule_conn and + * let the xchg there handle any racing callers to get it + * onto gnd_ready_conns */ + + conn_sched = xchg(&conn->gnc_scheduled, GNILND_CONN_IDLE); + LASSERTF(conn_sched == GNILND_CONN_WANTS_SCHED || + conn_sched == GNILND_CONN_PROCESS, + "conn %p after process in bad state: %d\n", + conn, conn_sched); + + if (sched_intent >= 0) { + if ((sched_intent > 0 || (conn_sched == GNILND_CONN_WANTS_SCHED))) { + kgnilnd_schedule_conn(conn); + } + } +} + +void +kgnilnd_schedule_conn(kgn_conn_t *conn) +{ + kgn_device_t *dev = conn->gnc_device; + int sched; + + sched = xchg(&conn->gnc_scheduled, GNILND_CONN_WANTS_SCHED); + + /* if we are IDLE, add to list - only one guy sees IDLE and "wins" + * the chance to put it onto gnd_ready_conns. 
+ * otherwise, leave marked as WANTS_SCHED and the thread that "owns" + * the conn in process_conns will take care of moving it back to + * SCHED when it is done processing */ + + if (sched == GNILND_CONN_IDLE) { + /* if the conn is already scheduled, we've already requested + * the scheduler thread wakeup */ + kgnilnd_conn_addref(conn); /* +1 ref for scheduler */ + + LASSERTF(list_empty(&conn->gnc_schedlist), "conn %p already sched state %d\n", + conn, sched); + + CDEBUG(D_INFO, "scheduling conn 0x%p\n", conn); + + spin_lock(&dev->gnd_lock); + list_add_tail(&conn->gnc_schedlist, &dev->gnd_ready_conns); + spin_unlock(&dev->gnd_lock); + set_mb(conn->gnc_last_sched_ask, jiffies); + + } else { + CDEBUG(D_INFO, "not scheduling conn 0x%p: %d\n", conn, sched); + } + + /* make sure thread(s) going to process conns - but let it make + * separate decision from conn schedule */ + kgnilnd_schedule_device(dev); +} + +void +kgnilnd_schedule_dgram(kgn_device_t *dev) +{ + int wake; + + wake = xchg(&dev->gnd_dgram_ready, GNILND_DGRAM_SCHED); + if (wake != GNILND_DGRAM_SCHED) { + wake_up(&dev->gnd_dgram_waitq); + } else { + CDEBUG(D_NETTRACE, "not waking: %d\n", wake); + } +} + +void +kgnilnd_free_tx(kgn_tx_t *tx) +{ + /* taken from kgnilnd_tx_add_state_locked */ + + LASSERTF((tx->tx_list_p == NULL && + tx->tx_list_state == GNILND_TX_ALLOCD) && + list_empty(&tx->tx_list), + "tx %p with bad state %s (list_p %p) tx_list %s\n", + tx, kgnilnd_tx_state2str(tx->tx_list_state), tx->tx_list_p, + list_empty(&tx->tx_list) ? "empty" : "not empty"); + + atomic_dec(&kgnilnd_data.kgn_ntx); + + /* we only allocate this if we need to */ + if (tx->tx_phys != NULL) { + cfs_mem_cache_free(kgnilnd_data.kgn_tx_phys_cache, tx->tx_phys); + CDEBUG(D_MALLOC, "slab-freed 'tx_phys': %lu at %p.\n", + LNET_MAX_IOV * sizeof(gni_mem_segment_t), tx->tx_phys); + } +#if 0 + KGNILND_POISON(tx, 0x5a, sizeof(kgn_tx_t)); +#endif + cfs_mem_cache_free(kgnilnd_data.kgn_tx_cache, tx); + CDEBUG(D_MALLOC, "slab-freed 'tx': %lu at %p.\n", + sizeof(*tx), tx); +} + +kgn_tx_t * +kgnilnd_alloc_tx(void) +{ + kgn_tx_t *tx = NULL; + + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_ALLOC_TX)) + return tx; + + tx = cfs_mem_cache_alloc(kgnilnd_data.kgn_tx_cache, CFS_ALLOC_ATOMIC); + if (tx == NULL) { + CERROR("failed to allocate tx\n"); + return NULL; + } + CDEBUG(D_MALLOC, "slab-alloced 'tx': %lu at %p.\n", + sizeof(*tx), tx); + + /* need this memset, cache alloc'd memory is not cleared */ + memset(tx, 0, sizeof(*tx)); + + /* setup everything here to minimize time under the lock */ + tx->tx_buftype = GNILND_BUF_NONE; + tx->tx_msg.gnm_type = GNILND_MSG_NONE; + INIT_LIST_HEAD(&tx->tx_list); + INIT_LIST_HEAD(&tx->tx_map_list); + tx->tx_list_state = GNILND_TX_ALLOCD; + + atomic_inc(&kgnilnd_data.kgn_ntx); + + return tx; +} + +/* csum_fold needs to be run on the return value before shipping over the wire */ +#define _kgnilnd_cksum(seed, ptr, nob) csum_partial(ptr, nob, seed) + +/* we don't use offset as every one is passing a buffer reference that already + * includes the offset into the base address - + * see kgnilnd_setup_virt_buffer and kgnilnd_setup_immediate_buffer */ +static inline __u16 +kgnilnd_cksum(void *ptr, size_t nob) +{ + __u16 sum; + + sum = csum_fold(_kgnilnd_cksum(0, ptr, nob)); + + /* don't use magic 'no checksum' value */ + if (sum == 0) + sum = 1; + + CDEBUG(D_INFO, "cksum 0x%x for ptr 0x%p sz %zu\n", + sum, ptr, nob); + + return sum; +} + +inline __u16 +kgnilnd_cksum_kiov(unsigned int nkiov, lnet_kiov_t *kiov, + unsigned int offset, unsigned int nob, int 
dump_blob) +{ + __wsum cksum = 0; + __wsum tmpck; + __u16 retsum; + void *addr; + unsigned int fraglen; + int i, odd; + + LASSERT(nkiov > 0); + LASSERT(nob > 0); + + CDEBUG(D_BUFFS, "calc cksum for kiov 0x%p nkiov %u offset %u nob %u, dump %d\n", + kiov, nkiov, offset, nob, dump_blob); + + /* if loops changes, please change kgnilnd_setup_phys_buffer */ + + while (offset >= kiov->kiov_len) { + offset -= kiov->kiov_len; + nkiov--; + kiov++; + LASSERT(nkiov > 0); + } + + /* ignore nob here, if nob < (kiov_len - offset), kiov == 1 */ + odd = (unsigned long) (kiov[0].kiov_len - offset) & 1; + + if ((odd || *kgnilnd_tunables.kgn_vmap_cksum) && nkiov > 1) { + struct page **pages = kgnilnd_data.kgn_cksum_map_pages[get_cpu()]; + + LASSERTF(pages != NULL, "NULL pages for cpu %d map_pages 0x%p\n", + get_cpu(), kgnilnd_data.kgn_cksum_map_pages); + + CDEBUG(D_BUFFS, "odd %d len %u offset %u nob %u\n", + odd, kiov[0].kiov_len, offset, nob); + + for (i = 0; i < nkiov; i++) { + pages[i] = kiov[i].kiov_page; + } + + addr = vmap(pages, nkiov, VM_MAP, PAGE_KERNEL); + if (addr == NULL) { + CNETERR("Couldn't vmap %d frags on %d bytes to avoid odd length fragment in cksum\n", + nkiov, nob); + /* return zero to avoid killing tx - we'll just get warning on console + * when remote end sees zero checksum */ + RETURN(0); + } + atomic_inc(&kgnilnd_data.kgn_nvmap_cksum); + + tmpck = _kgnilnd_cksum(0, (void *) addr + kiov[0].kiov_offset + offset, nob); + cksum = tmpck; + + if (dump_blob) { + kgnilnd_dump_blob(D_BUFFS, "flat kiov RDMA payload", + (void *)addr + kiov[0].kiov_offset + offset, nob); + } + CDEBUG(D_BUFFS, "cksum 0x%x (+0x%x) for addr 0x%p+%u len %u offset %u\n", + cksum, tmpck, addr, kiov[0].kiov_offset, nob, offset); + vunmap(addr); + } else { + do { + fraglen = min(kiov->kiov_len - offset, nob); + + /* make dang sure we don't send a bogus checksum if somehow we get + * an odd length fragment on anything but the last entry in a kiov - + * we know from kgnilnd_setup_rdma_buffer that we can't have non + * PAGE_SIZE pages in the middle, so if nob < PAGE_SIZE, it is the last one */ + LASSERTF(!(fraglen&1) || (nob < PAGE_SIZE), + "odd fraglen %u on nkiov %d, nob %u kiov_len %u offset %u kiov 0x%p\n", + fraglen, nkiov, nob, kiov->kiov_len, offset, kiov); + + addr = (void *)kmap(kiov->kiov_page) + kiov->kiov_offset + offset; + tmpck = _kgnilnd_cksum(cksum, addr, fraglen); + + CDEBUG(D_BUFFS, + "cksum 0x%x (+0x%x) for page 0x%p+%u (0x%p) len %u offset %u\n", + cksum, tmpck, kiov->kiov_page, kiov->kiov_offset, addr, + fraglen, offset); + + cksum = tmpck; + + if (dump_blob) + kgnilnd_dump_blob(D_BUFFS, "kiov cksum", addr, fraglen); + + kunmap(kiov->kiov_page); + + kiov++; + nkiov--; + nob -= fraglen; + offset = 0; + + /* iov must not run out before end of data */ + LASSERTF(nob == 0 || nkiov > 0, "nob %u nkiov %u\n", nob, nkiov); + + } while (nob > 0); + } + + retsum = csum_fold(cksum); + + /* don't use magic 'no checksum' value */ + if (retsum == 0) + retsum = 1; + + CDEBUG(D_BUFFS, "retsum 0x%x from cksum 0x%x\n", retsum, cksum); + + return retsum; +} + +void +kgnilnd_init_msg(kgn_msg_t *msg, int type, lnet_nid_t source) +{ + msg->gnm_magic = GNILND_MSG_MAGIC; + msg->gnm_version = GNILND_MSG_VERSION; + msg->gnm_type = type; + msg->gnm_payload_len = 0; + msg->gnm_srcnid = source; + /* gnm_connstamp gets set when FMA is sent */ + /* gnm_srcnid is set on creation via function argument + * The right interface/net and nid is passed in when the message + * is created. 
+ */ +} + +kgn_tx_t * +kgnilnd_new_tx_msg(int type, lnet_nid_t source) +{ + kgn_tx_t *tx = kgnilnd_alloc_tx(); + + if (tx != NULL) { + kgnilnd_init_msg(&tx->tx_msg, type, source); + } else { + CERROR("couldn't allocate new tx type %s!\n", + kgnilnd_msgtype2str(type)); + } + + return tx; +} + +static void +kgnilnd_nak_rdma(kgn_conn_t *conn, int type, int error, __u64 cookie, lnet_nid_t source) { + kgn_tx_t *tx; + + /* only allow NAK on error and truncate to zero */ + LASSERTF(error <= 0, "error %d conn 0x%p, cookie "LPU64"\n", + error, conn, cookie); + + tx = kgnilnd_new_tx_msg(type, source); + if (tx == NULL) { + CNETERR("can't get TX to NAK RDMA to %s\n", + libcfs_nid2str(conn->gnc_peer->gnp_nid)); + return; + } + + tx->tx_msg.gnm_u.completion.gncm_retval = error; + tx->tx_msg.gnm_u.completion.gncm_cookie = cookie; + kgnilnd_queue_tx(conn, tx); +} + +int +kgnilnd_setup_immediate_buffer(kgn_tx_t *tx, unsigned int niov, struct iovec *iov, + lnet_kiov_t *kiov, unsigned int offset, unsigned int nob) + +{ + kgn_msg_t *msg = &tx->tx_msg; + int i; + + /* To help save on MDDs for short messages, we'll vmap a kiov to allow + * gni_smsg_send to send that as the payload */ + + LASSERT(tx->tx_buftype == GNILND_BUF_NONE); + LASSERT(nob >= 0); + + if (nob == 0) { + tx->tx_buffer = NULL; + } else if (kiov != NULL) { + LASSERTF(niov > 0 && niov < GNILND_MAX_IMMEDIATE/PAGE_SIZE, + "bad niov %d\n", niov); + + while (offset >= kiov->kiov_len) { + offset -= kiov->kiov_len; + niov--; + kiov++; + LASSERT(niov > 0); + } + for (i = 0; i < niov; i++) { + /* We can't have a kiov_offset on anything but the first entry, + * otherwise we'll have a hole at the end of the mapping as we only map + * whole pages. + * Also, if we have a kiov_len < PAGE_SIZE but we need to map more + * than kiov_len, we will also have a hole at the end of that page + * which isn't allowed */ + if ((kiov[i].kiov_offset != 0 && i > 0) || + (kiov[i].kiov_offset + kiov[i].kiov_len != CFS_PAGE_SIZE && i < niov - 1)) { + CNETERR("Can't make payload contiguous in I/O VM:" + "page %d, offset %u, nob %u, kiov_offset %u kiov_len %u \n", + i, offset, nob, kiov->kiov_offset, kiov->kiov_len); + RETURN(-EINVAL); + } + tx->tx_imm_pages[i] = kiov[i].kiov_page; + } + + /* hijack tx_phys for the later unmap */ + if (niov == 1) { + /* tx->tx_phys being equal to NULL is the signal for unmap to discern between kmap and vmap */ + tx->tx_phys = NULL; + tx->tx_buffer = (void *)kmap(tx->tx_imm_pages[0]) + kiov[0].kiov_offset + offset; + atomic_inc(&kgnilnd_data.kgn_nkmap_short); + GNIDBG_TX(D_NET, tx, "kmapped page for %d bytes for kiov 0x%p, buffer 0x%p", + nob, kiov, tx->tx_buffer); + } else { + tx->tx_phys = vmap(tx->tx_imm_pages, niov, VM_MAP, PAGE_KERNEL); + if (tx->tx_phys == NULL) { + CNETERR("Couldn't vmap %d frags on %d bytes\n", niov, nob); + RETURN(-ENOMEM); + + } + atomic_inc(&kgnilnd_data.kgn_nvmap_short); + /* make sure we take into account the kiov offset as the start of the buffer */ + tx->tx_buffer = (void *)tx->tx_phys + kiov[0].kiov_offset + offset; + GNIDBG_TX(D_NET, tx, "mapped %d pages for %d bytes from kiov 0x%p to 0x%p, buffer 0x%p", + niov, nob, kiov, tx->tx_phys, tx->tx_buffer); + } + tx->tx_buftype = GNILND_BUF_IMMEDIATE_KIOV; + tx->tx_nob = nob; + + } else { + /* For now this is almost identical to kgnilnd_setup_virt_buffer, but we + * could "flatten" the payload into a single contiguous buffer ready + * for sending direct over an FMA if we ever needed to. 
*/ + + LASSERT(niov > 0); + + while (offset >= iov->iov_len) { + offset -= iov->iov_len; + niov--; + iov++; + LASSERT(niov > 0); + } + + if (nob > iov->iov_len - offset) { + CERROR("Can't handle multiple vaddr fragments\n"); + return -EMSGSIZE; + } + + tx->tx_buffer = (void *)(((unsigned long)iov->iov_base) + offset); + + tx->tx_buftype = GNILND_BUF_IMMEDIATE; + tx->tx_nob = nob; + } + + /* checksum payload early - it shouldn't be changing after lnd_send */ + if (*kgnilnd_tunables.kgn_checksum >= 2) { + msg->gnm_payload_cksum = kgnilnd_cksum(tx->tx_buffer, nob); + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_SMSG_CKSUM2)) { + msg->gnm_payload_cksum += 0xe00e; + } + if (*kgnilnd_tunables.kgn_checksum_dump > 1) { + kgnilnd_dump_blob(D_BUFFS, "payload checksum", + tx->tx_buffer, nob); + } + } else { + msg->gnm_payload_cksum = 0; + } + + return 0; +} + +int +kgnilnd_setup_virt_buffer(kgn_tx_t *tx, + unsigned int niov, struct iovec *iov, + unsigned int offset, unsigned int nob) + +{ + LASSERT(nob > 0); + LASSERT(niov > 0); + LASSERT(tx->tx_buftype == GNILND_BUF_NONE); + + while (offset >= iov->iov_len) { + offset -= iov->iov_len; + niov--; + iov++; + LASSERT(niov > 0); + } + + if (nob > iov->iov_len - offset) { + CERROR("Can't handle multiple vaddr fragments\n"); + return -EMSGSIZE; + } + + tx->tx_buftype = GNILND_BUF_VIRT_UNMAPPED; + tx->tx_nob = nob; + tx->tx_buffer = (void *)(((unsigned long)iov->iov_base) + offset); + return 0; +} + +int +kgnilnd_setup_phys_buffer(kgn_tx_t *tx, int nkiov, lnet_kiov_t *kiov, + unsigned int offset, unsigned int nob) +{ + gni_mem_segment_t *phys; + int rc = 0; + unsigned int fraglen; + + GNIDBG_TX(D_NET, tx, "niov %d kiov 0x%p offset %u nob %u", nkiov, kiov, offset, nob); + + LASSERT(nob > 0); + LASSERT(nkiov > 0); + LASSERT(tx->tx_buftype == GNILND_BUF_NONE); + + /* only allocate this if we are going to use it */ + tx->tx_phys = cfs_mem_cache_alloc(kgnilnd_data.kgn_tx_phys_cache, + CFS_ALLOC_ATOMIC); + if (tx->tx_phys == NULL) { + CERROR("failed to allocate tx_phys\n"); + rc = -ENOMEM; + GOTO(error, rc); + } + + CDEBUG(D_MALLOC, "slab-alloced 'tx->tx_phys': %lu at %p.\n", + LNET_MAX_IOV * sizeof(gni_mem_segment_t), tx->tx_phys); + + /* if loops changes, please change kgnilnd_cksum_kiov + * and kgnilnd_setup_immediate_buffer */ + + while (offset >= kiov->kiov_len) { + offset -= kiov->kiov_len; + nkiov--; + kiov++; + LASSERT(nkiov > 0); + } + + /* at this point, kiov points to the first page that we'll actually map + * now that we've seeked into the koiv for offset and dropped any + * leading pages that fall entirely within the offset */ + tx->tx_buftype = GNILND_BUF_PHYS_UNMAPPED; + tx->tx_nob = nob; + + /* kiov_offset is start of 'valid' buffer, so index offset past that */ + tx->tx_buffer = (void *)((unsigned long)(kiov->kiov_offset + offset)); + phys = tx->tx_phys; + + CDEBUG(D_NET, "tx 0x%p buffer 0x%p map start kiov 0x%p+%u niov %d offset %u\n", + tx, tx->tx_buffer, kiov, kiov->kiov_offset, nkiov, offset); + + do { + fraglen = min(kiov->kiov_len - offset, nob); + + /* We can't have a kiov_offset on anything but the first entry, + * otherwise we'll have a hole at the end of the mapping as we only map + * whole pages. Only the first page is allowed to have an offset - + * we'll add that into tx->tx_buffer and that will get used when we + * map in the segments (see kgnilnd_map_buffer). 
+ * Also, if we have a kiov_len < PAGE_SIZE but we need to map more + * than kiov_len, we will also have a whole at the end of that page + * which isn't allowed */ + if ((phys != tx->tx_phys) && + ((kiov->kiov_offset != 0) || + ((kiov->kiov_len < PAGE_SIZE) && (nob > kiov->kiov_len)))) { + CERROR("Can't make payload contiguous in I/O VM:" + "page %d, offset %u, nob %u, kiov_offset %u kiov_len %u \n", + (int)(phys - tx->tx_phys), + offset, nob, kiov->kiov_offset, kiov->kiov_len); + rc = -EINVAL; + GOTO(error, rc); + } + + if ((phys - tx->tx_phys) == LNET_MAX_IOV) { + CERROR ("payload too big (%d)\n", (int)(phys - tx->tx_phys)); + rc = -EMSGSIZE; + GOTO(error, rc); + } + + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_PHYS_SETUP)) { + rc = -EINVAL; + GOTO(error, rc); + } + + CDEBUG(D_BUFFS, "page 0x%p kiov_offset %u kiov_len %u nob %u " + "nkiov %u offset %u\n", + kiov->kiov_page, kiov->kiov_offset, kiov->kiov_len, nob, nkiov, offset); + + phys->address = lnet_page2phys(kiov->kiov_page); + phys++; + kiov++; + nkiov--; + nob -= fraglen; + offset = 0; + + /* iov must not run out before end of data */ + LASSERTF(nob == 0 || nkiov > 0, "nob %u nkiov %u\n", nob, nkiov); + + } while (nob > 0); + + tx->tx_phys_npages = phys - tx->tx_phys; + + return 0; + +error: + if (tx->tx_phys != NULL) { + cfs_mem_cache_free(kgnilnd_data.kgn_tx_phys_cache, tx->tx_phys); + CDEBUG(D_MALLOC, "slab-freed 'tx_phys': %lu at %p.\n", + sizeof(*tx->tx_phys), tx->tx_phys); + tx->tx_phys = NULL; + } + return rc; +} + +static inline int +kgnilnd_setup_rdma_buffer(kgn_tx_t *tx, unsigned int niov, + struct iovec *iov, lnet_kiov_t *kiov, + unsigned int offset, unsigned int nob) +{ + int rc; + + LASSERT((iov == NULL) != (kiov == NULL)); + + if (kiov != NULL) { + rc = kgnilnd_setup_phys_buffer(tx, niov, kiov, offset, nob); + } else { + rc = kgnilnd_setup_virt_buffer(tx, niov, iov, offset, nob); + } + return rc; +} + +static void +kgnilnd_parse_lnet_rdma(lnet_msg_t *lntmsg, unsigned int *niov, unsigned int *offset, + unsigned int *nob, lnet_kiov_t **kiov) +{ + /* GETs are weird, see kgnilnd_send */ + if (lntmsg->msg_type == LNET_MSG_GET) { + if ((lntmsg->msg_md->md_options & LNET_MD_KIOV) == 0) { + *kiov = NULL; + } else { + *kiov = lntmsg->msg_md->md_iov.kiov; + } + *niov = lntmsg->msg_md->md_niov; + *nob = lntmsg->msg_md->md_length; + *offset = 0; + } else { + *kiov = lntmsg->msg_kiov; + *niov = lntmsg->msg_niov; + *nob = lntmsg->msg_len; + *offset = lntmsg->msg_offset; + } +} + +static inline void +kgnilnd_compute_rdma_cksum(kgn_tx_t *tx) +{ + unsigned int niov, offset, nob; + lnet_kiov_t *kiov; + lnet_msg_t *lntmsg = tx->tx_lntmsg[0]; + int dump_cksum = (*kgnilnd_tunables.kgn_checksum_dump > 1); + + GNITX_ASSERTF(tx, ((tx->tx_msg.gnm_type == GNILND_MSG_PUT_DONE) || + (tx->tx_msg.gnm_type == GNILND_MSG_GET_DONE)), + "bad type %s", kgnilnd_msgtype2str(tx->tx_msg.gnm_type)); + + + if (*kgnilnd_tunables.kgn_checksum < 3) { + tx->tx_msg.gnm_payload_cksum = 0; + return; + } + + GNITX_ASSERTF(tx, lntmsg, "no LNet message!", NULL); + + kgnilnd_parse_lnet_rdma(lntmsg, &niov, &offset, &nob, &kiov); + + if (kiov != NULL) { + tx->tx_msg.gnm_payload_cksum = kgnilnd_cksum_kiov(niov, kiov, offset, nob, dump_cksum); + } else { + tx->tx_msg.gnm_payload_cksum = kgnilnd_cksum(tx->tx_buffer, nob); + if (dump_cksum) { + kgnilnd_dump_blob(D_BUFFS, "peer RDMA payload", tx->tx_buffer, nob); + } + } + + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_SMSG_CKSUM3)) { + tx->tx_msg.gnm_payload_cksum += 0xd00d; + } +} + +static inline int +kgnilnd_verify_rdma_cksum(kgn_tx_t *tx, 
__u16 rx_cksum) +{ + int rc = 0; + __u16 cksum; + unsigned int niov, offset, nob; + lnet_kiov_t *kiov; + lnet_msg_t *lntmsg = tx->tx_lntmsg[0]; + int dump_on_err = *kgnilnd_tunables.kgn_checksum_dump; + + /* we can only match certain requests */ + GNITX_ASSERTF(tx, ((tx->tx_msg.gnm_type == GNILND_MSG_GET_REQ) || + (tx->tx_msg.gnm_type == GNILND_MSG_PUT_ACK)), + "bad type %s", kgnilnd_msgtype2str(tx->tx_msg.gnm_type)); + + if (rx_cksum == 0) { + if (*kgnilnd_tunables.kgn_checksum >= 3) { + GNIDBG_MSG(D_WARNING, &tx->tx_msg, + "no RDMA payload checksum when enabled"); + } + return 0; + } + + GNITX_ASSERTF(tx, lntmsg, "no LNet message!", NULL); + + kgnilnd_parse_lnet_rdma(lntmsg, &niov, &offset, &nob, &kiov); + + if (kiov != NULL) { + cksum = kgnilnd_cksum_kiov(niov, kiov, offset, nob, 0); + } else { + cksum = kgnilnd_cksum(tx->tx_buffer, nob); + } + + if (cksum != rx_cksum) { + GNIDBG_MSG(D_NETERROR, &tx->tx_msg, + "Bad RDMA payload checksum (%x expected %x); " + "kiov 0x%p niov %d nob %u offset %u", + cksum, rx_cksum, kiov, niov, nob, offset); + switch (dump_on_err) { + case 2: + if (kiov != NULL) { + kgnilnd_cksum_kiov(niov, kiov, offset, nob, 1); + } else { + kgnilnd_dump_blob(D_BUFFS, "RDMA payload", + tx->tx_buffer, nob); + } + /* fall through to dump log */ + case 1: + libcfs_debug_dumplog(); + break; + default: + break; + } + rc = -ENOKEY; + /* kgnilnd_check_fma_rx will close conn, kill tx with error */ + } + return rc; +} + +void +kgnilnd_mem_add_map_list(kgn_device_t *dev, kgn_tx_t *tx) +{ + int bytes; + + GNITX_ASSERTF(tx, list_empty(&tx->tx_map_list), + "already mapped!", NULL); + + spin_lock(&dev->gnd_map_lock); + switch (tx->tx_buftype) { + default: + GNIDBG_TX(D_EMERG, tx, + "SOFTWARE BUG: invalid mapping %d", tx->tx_buftype); + spin_unlock(&dev->gnd_map_lock); + LBUG(); + break; + + case GNILND_BUF_PHYS_MAPPED: + bytes = tx->tx_phys_npages * PAGE_SIZE; + dev->gnd_map_nphys++; + dev->gnd_map_physnop += tx->tx_phys_npages; + break; + + case GNILND_BUF_VIRT_MAPPED: + bytes = tx->tx_nob; + dev->gnd_map_nvirt++; + dev->gnd_map_virtnob += tx->tx_nob; + break; + } + + if (tx->tx_msg.gnm_type == GNILND_MSG_PUT_ACK || + tx->tx_msg.gnm_type == GNILND_MSG_GET_REQ) { + atomic64_add(bytes, &dev->gnd_rdmaq_bytes_out); + GNIDBG_TX(D_NETTRACE, tx, "rdma ++ %d to "LPD64"", + bytes, atomic64_read(&dev->gnd_rdmaq_bytes_out)); + } + + atomic_inc(&dev->gnd_n_mdd); + atomic64_add(bytes, &dev->gnd_nbytes_map); + + /* clear retrans to prevent any SMSG goofiness as that code uses the same counter */ + tx->tx_retrans = 0; + + /* we only get here in the valid cases */ + list_add_tail(&tx->tx_map_list, &dev->gnd_map_list); + dev->gnd_map_version++; + spin_unlock(&dev->gnd_map_lock); +} + +void +kgnilnd_mem_del_map_list(kgn_device_t *dev, kgn_tx_t *tx) +{ + int bytes; + + GNITX_ASSERTF(tx, !list_empty(&tx->tx_map_list), + "not mapped!", NULL); + spin_lock(&dev->gnd_map_lock); + + switch (tx->tx_buftype) { + default: + GNIDBG_TX(D_EMERG, tx, + "SOFTWARE BUG: invalid mapping %d", tx->tx_buftype); + spin_unlock(&dev->gnd_map_lock); + LBUG(); + break; + + case GNILND_BUF_PHYS_UNMAPPED: + bytes = tx->tx_phys_npages * PAGE_SIZE; + dev->gnd_map_nphys--; + dev->gnd_map_physnop -= tx->tx_phys_npages; + break; + + case GNILND_BUF_VIRT_UNMAPPED: + bytes = tx->tx_nob; + dev->gnd_map_nvirt--; + dev->gnd_map_virtnob -= tx->tx_nob; + break; + } + + if (tx->tx_msg.gnm_type == GNILND_MSG_PUT_ACK || + tx->tx_msg.gnm_type == GNILND_MSG_GET_REQ) { + atomic64_sub(bytes, &dev->gnd_rdmaq_bytes_out); + 
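/* gnd_rdmaq_bytes_out was bumped by this same amount when the buffer was
 * mapped in kgnilnd_mem_add_map_list(); giving the bytes back on unmap means
 * the counter can never legitimately go negative, which is what the assert
 * below checks */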
LASSERTF(atomic64_read(&dev->gnd_rdmaq_bytes_out) >= 0, + "bytes_out negative! %ld\n", atomic64_read(&dev->gnd_rdmaq_bytes_out)); + GNIDBG_TX(D_NETTRACE, tx, "rdma -- %d to "LPD64"", + bytes, atomic64_read(&dev->gnd_rdmaq_bytes_out)); + } + + atomic_dec(&dev->gnd_n_mdd); + atomic64_sub(bytes, &dev->gnd_nbytes_map); + + /* we only get here in the valid cases */ + list_del_init(&tx->tx_map_list); + dev->gnd_map_version++; + spin_unlock(&dev->gnd_map_lock); +} + +int +kgnilnd_map_buffer(kgn_tx_t *tx) +{ + kgn_conn_t *conn = tx->tx_conn; + kgn_device_t *dev = conn->gnc_device; + __u32 flags = GNI_MEM_READWRITE; + gni_return_t rrc; + + /* The kgnilnd_mem_register(_segments) Gemini Driver functions can + * be called concurrently as there are internal locks that protect + * any data structures or HW resources. We just need to ensure + * that our concurrency doesn't result in the kgn_device_t + * getting nuked while we are in here */ + + LASSERTF(conn != NULL, "tx %p with NULL conn, someone forgot" + " to set tx_conn before calling %s\n", tx, __FUNCTION__); + + if (unlikely(CFS_FAIL_CHECK(CFS_FAIL_GNI_MAP_TX))) + RETURN(-ENOMEM); + + if (*kgnilnd_tunables.kgn_bte_relaxed_ordering) { + flags |= GNI_MEM_RELAXED_PI_ORDERING; + } + + switch (tx->tx_buftype) { + default: + LBUG(); + + case GNILND_BUF_NONE: + case GNILND_BUF_IMMEDIATE: + case GNILND_BUF_IMMEDIATE_KIOV: + case GNILND_BUF_PHYS_MAPPED: + case GNILND_BUF_VIRT_MAPPED: + return 0; + + case GNILND_BUF_PHYS_UNMAPPED: + GNITX_ASSERTF(tx, tx->tx_phys != NULL, "physical buffer not there!", NULL); + rrc = kgnilnd_mem_register_segments(dev->gnd_handle, + tx->tx_phys, tx->tx_phys_npages, NULL, + GNI_MEM_PHYS_SEGMENTS | flags, + &tx->tx_map_key); + /* could race with other uses of the map counts, but this is ok + * - this needs to turn into a non-fatal error soon to allow + * GART resource, etc starvation handling */ + if (rrc != GNI_RC_SUCCESS) { + GNIDBG_TX(D_NET, tx, "Can't map %d pages: dev %d " + "phys %u pp %u, virt %u nob "LPU64"", + tx->tx_phys_npages, dev->gnd_id, + dev->gnd_map_nphys, dev->gnd_map_physnop, + dev->gnd_map_nvirt, dev->gnd_map_virtnob); + RETURN(rrc == GNI_RC_ERROR_RESOURCE ? -ENOMEM : -EINVAL); + } + + tx->tx_buftype = GNILND_BUF_PHYS_MAPPED; + kgnilnd_mem_add_map_list(dev, tx); + return 0; + + case GNILND_BUF_VIRT_UNMAPPED: + rrc = kgnilnd_mem_register(dev->gnd_handle, + (__u64)tx->tx_buffer, tx->tx_nob, + NULL, flags, &tx->tx_map_key); + if (rrc != GNI_RC_SUCCESS) { + GNIDBG_TX(D_NET, tx, "Can't map %u bytes: dev %d " + "phys %u pp %u, virt %u nob "LPU64"", + tx->tx_nob, dev->gnd_id, + dev->gnd_map_nphys, dev->gnd_map_physnop, + dev->gnd_map_nvirt, dev->gnd_map_virtnob); + RETURN(rrc == GNI_RC_ERROR_RESOURCE ? 
-ENOMEM : -EINVAL); + } + + tx->tx_buftype = GNILND_BUF_VIRT_MAPPED; + kgnilnd_mem_add_map_list(dev, tx); + if (tx->tx_msg.gnm_type == GNILND_MSG_PUT_ACK || + tx->tx_msg.gnm_type == GNILND_MSG_GET_REQ) { + atomic64_add(tx->tx_nob, &dev->gnd_rdmaq_bytes_out); + GNIDBG_TX(D_NETTRACE, tx, "rdma ++ %d to %ld\n", + tx->tx_nob, atomic64_read(&dev->gnd_rdmaq_bytes_out)); + } + + return 0; + } +} + +void +kgnilnd_add_purgatory_tx(kgn_tx_t *tx) +{ + kgn_conn_t *conn = tx->tx_conn; + kgn_mdd_purgatory_t *gmp; + + LIBCFS_ALLOC(gmp, sizeof(*gmp)); + LASSERTF(gmp != NULL, "couldn't allocate MDD purgatory member;" + " asserting to avoid data corruption\n"); + + gmp->gmp_map_key = tx->tx_map_key; + atomic_inc(&conn->gnc_device->gnd_n_mdd_held); + + /* ensure that we don't have a blank purgatory - indicating the + * conn is not already on purgatory lists - we'd never recover these + * MDD if that were the case */ + GNITX_ASSERTF(tx, conn->gnc_in_purgatory, + "conn 0x%p->%s with NULL purgatory", + conn, libcfs_nid2str(conn->gnc_peer->gnp_nid)); + + /* link 'er up! - only place we really need to lock for + * concurrent access */ + spin_lock(&conn->gnc_list_lock); + list_add_tail(&gmp->gmp_list, &conn->gnc_mdd_list); + spin_unlock(&conn->gnc_list_lock); +} + +void +kgnilnd_unmap_buffer(kgn_tx_t *tx, int error) +{ + kgn_device_t *dev; + gni_return_t rrc; + int hold_timeout = 0; + + /* code below relies on +1 relationship ... */ + CLASSERT(GNILND_BUF_PHYS_MAPPED == (GNILND_BUF_PHYS_UNMAPPED + 1)); + CLASSERT(GNILND_BUF_VIRT_MAPPED == (GNILND_BUF_VIRT_UNMAPPED + 1)); + + switch (tx->tx_buftype) { + default: + LBUG(); + + case GNILND_BUF_NONE: + case GNILND_BUF_IMMEDIATE: + case GNILND_BUF_PHYS_UNMAPPED: + case GNILND_BUF_VIRT_UNMAPPED: + break; + case GNILND_BUF_IMMEDIATE_KIOV: + if (tx->tx_phys != NULL) { + vunmap(tx->tx_phys); + } else if (tx->tx_phys == NULL && tx->tx_buffer != NULL) { + kunmap(tx->tx_imm_pages[0]); + } + /* clear to prevent kgnilnd_free_tx from thinking + * this is a RDMA descriptor */ + tx->tx_phys = NULL; + break; + + case GNILND_BUF_PHYS_MAPPED: + case GNILND_BUF_VIRT_MAPPED: + LASSERT(tx->tx_conn != NULL); + + dev = tx->tx_conn->gnc_device; + + /* only want to hold if we are closing conn without + * verified peer notification - the theory is that + * a TX error can be communicated in all other cases */ + if (tx->tx_conn->gnc_state != GNILND_CONN_ESTABLISHED && + kgnilnd_check_purgatory_conn(tx->tx_conn)) { + kgnilnd_add_purgatory_tx(tx); + + /* The timeout we give to kgni is a deadman stop only. 
+ * we are setting high to ensure we don't have the kgni timer + * fire before ours fires _and_ is handled */ + hold_timeout = GNILND_TIMEOUT2DEADMAN; + + GNIDBG_TX(D_NET, tx, + "dev %p delaying MDD release for %dms key "LPX64"."LPX64"", + tx->tx_conn->gnc_device, hold_timeout, + tx->tx_map_key.qword1, tx->tx_map_key.qword2); + } + + rrc = kgnilnd_mem_deregister(dev->gnd_handle, &tx->tx_map_key, hold_timeout); + + LASSERTF(rrc == GNI_RC_SUCCESS, "rrc %d\n", rrc); + + tx->tx_buftype--; + kgnilnd_mem_del_map_list(dev, tx); + break; + } +} + +void +kgnilnd_tx_done(kgn_tx_t *tx, int completion) +{ + lnet_msg_t *lntmsg0, *lntmsg1; + int status0, status1; + lnet_ni_t *ni = NULL; + kgn_conn_t *conn = tx->tx_conn; + + LASSERT(!in_interrupt()); + + lntmsg0 = tx->tx_lntmsg[0]; tx->tx_lntmsg[0] = NULL; + lntmsg1 = tx->tx_lntmsg[1]; tx->tx_lntmsg[1] = NULL; + + if (completion && + !(tx->tx_state & GNILND_TX_QUIET_ERROR) && + !kgnilnd_conn_clean_errno(completion)) { + GNIDBG_TOMSG(D_NETERROR, &tx->tx_msg, + "error %d on tx 0x%p->%s id %u/%d state %s age %ds", + completion, tx, conn ? + libcfs_nid2str(conn->gnc_peer->gnp_nid) : "", + tx->tx_id.txe_smsg_id, tx->tx_id.txe_idx, + kgnilnd_tx_state2str(tx->tx_list_state), + cfs_duration_sec((long)jiffies - tx->tx_qtime)); + } + + /* The error codes determine if we hold onto the MDD */ + kgnilnd_unmap_buffer(tx, completion); + + /* we have to deliver a reply on lntmsg[1] for the GET, so make sure + * we play nice with the error codes to avoid delivering a failed + * REQUEST and then a REPLY event as well */ + + /* return -EIO to lnet - it is the magic value for failed sends */ + if (tx->tx_msg.gnm_type == GNILND_MSG_GET_REQ) { + status0 = 0; + status1 = completion; + } else { + status0 = status1 = completion; + } + + tx->tx_buftype = GNILND_BUF_NONE; + tx->tx_msg.gnm_type = GNILND_MSG_NONE; + + /* lnet_finalize doesn't do anything with the *ni, so ok for us to + * set NULL when we are a tx without a conn */ + if (conn != NULL) { + ni = conn->gnc_peer->gnp_net->gnn_ni; + + spin_lock(&conn->gnc_tx_lock); + + LASSERTF(test_and_clear_bit(tx->tx_id.txe_idx, + (volatile unsigned long *)&conn->gnc_tx_bits), + "conn %p tx %p bit %d already cleared\n", + conn, tx, tx->tx_id.txe_idx); + + LASSERTF(conn->gnc_tx_ref_table[tx->tx_id.txe_idx] != NULL, + "msg_id %d already NULL\n", tx->tx_id.txe_idx); + + conn->gnc_tx_ref_table[tx->tx_id.txe_idx] = NULL; + spin_unlock(&conn->gnc_tx_lock); + } + + kgnilnd_free_tx(tx); + + /* finalize AFTER freeing lnet msgs */ + + /* warning - we should hold no locks here - calling lnet_finalize + * could free up lnet credits, resulting in a call chain back into + * the LND via kgnilnd_send and friends */ + lnet_finalize(ni, lntmsg0, status0); + + if (lntmsg1 != NULL) { + lnet_finalize(ni, lntmsg1, status1); + } +} + +void +kgnilnd_txlist_done(struct list_head *txlist, int error) +{ + kgn_tx_t *tx, *txn; + int err_printed = 0; + + if (list_empty(txlist)) + return; + + list_for_each_entry_safe(tx, txn, txlist, tx_list) { + /* only print the first error */ + if (err_printed) + tx->tx_state |= GNILND_TX_QUIET_ERROR; + list_del_init(&tx->tx_list); + kgnilnd_tx_done(tx, error); + err_printed++; + } +} +int +kgnilnd_set_tx_id(kgn_tx_t *tx, kgn_conn_t *conn) +{ + int id; + + spin_lock(&conn->gnc_tx_lock); + + /* ID zero is NOT ALLOWED!!! 
*/ + +search_again: + id = find_next_zero_bit((unsigned long *)&conn->gnc_tx_bits, + GNILND_MAX_MSG_ID, conn->gnc_next_tx); + if (id == GNILND_MAX_MSG_ID) { + if (conn->gnc_next_tx != 1) { + /* we only searched from next_tx to end and didn't find + * one, so search again from start */ + conn->gnc_next_tx = 1; + goto search_again; + } + /* couldn't find one! */ + spin_unlock(&conn->gnc_tx_lock); + return -E2BIG; + } + + /* bump next_tx to prevent immediate reuse */ + conn->gnc_next_tx = id + 1; + + set_bit(id, (volatile unsigned long *)&conn->gnc_tx_bits); + LASSERTF(conn->gnc_tx_ref_table[id] == NULL, + "tx 0x%p already at id %d\n", + conn->gnc_tx_ref_table[id], id); + + /* delay these until we have a valid ID - prevents bad clear of the bit + * in kgnilnd_tx_done */ + tx->tx_conn = conn; + tx->tx_id.txe_cqid = conn->gnc_cqid; + + tx->tx_id.txe_idx = id; + conn->gnc_tx_ref_table[id] = tx; + + /* Using jiffies to help differentiate against TX reuse - with + * the usual minimum of a 250HZ clock, we wrap jiffies on the same TX + * if we are sending to the same node faster than 256000/sec. + * To help guard against this, we OR in the tx_seq - that is 32 bits */ + + tx->tx_id.txe_chips = (__u32)(jiffies | conn->gnc_tx_seq); + + GNIDBG_TX(D_NET, tx, "set cookie/id/bits", NULL); + + spin_unlock(&conn->gnc_tx_lock); + return 0; +} + +static inline int +kgnilnd_tx_should_retry(kgn_conn_t *conn, kgn_tx_t *tx) +{ + int max_retrans = *kgnilnd_tunables.kgn_max_retransmits; + int log_retrans; + int log_retrans_level; + + /* I need kgni credits to send this. Replace tx at the head of the + * fmaq and I'll get rescheduled when credits appear */ + tx->tx_state = 0; + tx->tx_retrans++; + conn->gnc_tx_retrans++; + log_retrans = ((tx->tx_retrans < 25) || ((tx->tx_retrans % 25) == 0) || + (tx->tx_retrans > (max_retrans / 2))); + log_retrans_level = tx->tx_retrans < (max_retrans / 2) ? D_NET : D_NETERROR; + + /* Decision time - either error, warn or just retransmit */ + + /* we don't care about TX timeout - it could be that the network is slower + * or throttled. We'll keep retransmitting - so if the network is so slow + * that we fill up our mailbox, we'll keep trying to resend that msg + * until we exceed the max_retrans _or_ gnc_last_rx expires, indicating + * that he hasn't sent us any traffic in return */ + + if (tx->tx_retrans > max_retrans) { + /* this means we are not backing off the retransmits + * in a healthy manner and are likely chewing up the + * CPU cycles quite badly */ + GNIDBG_TOMSG(D_ERROR, &tx->tx_msg, + "SOFTWARE BUG: too many retransmits (%d) for tx id %x " + "conn 0x%p->%s\n", + tx->tx_retrans, tx->tx_id, conn, + libcfs_nid2str(conn->gnc_peer->gnp_nid)); + + /* yes - double errors to help debug this condition */ + GNIDBG_TOMSG(D_NETERROR, &tx->tx_msg, "connection dead. " + "unable to send to %s for %lu secs (%d tries)", + libcfs_nid2str(tx->tx_conn->gnc_peer->gnp_nid), + cfs_duration_sec(jiffies - tx->tx_cred_wait), + tx->tx_retrans); + + kgnilnd_close_conn(conn, -ETIMEDOUT); + + /* caller should terminate */ + RETURN(0); + } else { + /* some reasonable throttling of the debug message */ + if (log_retrans) { + unsigned long now = jiffies; + /* XXX Nic: Mystical TX debug here... 
*/ + GNIDBG_SMSG_CREDS(log_retrans_level, conn); + GNIDBG_TOMSG(log_retrans_level, &tx->tx_msg, + "NOT_DONE on conn 0x%p->%s id %x retrans %d wait %dus" + " last_msg %uus/%uus last_cq %uus/%uus", + conn, libcfs_nid2str(conn->gnc_peer->gnp_nid), + tx->tx_id, tx->tx_retrans, + jiffies_to_usecs(now - tx->tx_cred_wait), + jiffies_to_usecs(now - conn->gnc_last_tx), + jiffies_to_usecs(now - conn->gnc_last_rx), + jiffies_to_usecs(now - conn->gnc_last_tx_cq), + jiffies_to_usecs(now - conn->gnc_last_rx_cq)); + } + /* caller should retry */ + RETURN(1); + } +} + +/* caller must be holding gnd_cq_mutex and not unlock it afterwards, as we need to drop it + * to avoid bad ordering with state_lock */ + +static inline int +kgnilnd_sendmsg_nolock(kgn_tx_t *tx, void *immediate, unsigned int immediatenob, + spinlock_t *state_lock, kgn_tx_list_state_t state) +{ + kgn_conn_t *conn = tx->tx_conn; + kgn_msg_t *msg = &tx->tx_msg; + int retry_send; + gni_return_t rrc; + unsigned long newest_last_rx, timeout; + unsigned long now; + + LASSERTF((msg->gnm_type == GNILND_MSG_IMMEDIATE) ? + immediatenob <= *kgnilnd_tunables.kgn_max_immediate : + immediatenob == 0, + "msg 0x%p type %d wrong payload size %d\n", + msg, msg->gnm_type, immediatenob); + + /* make sure we catch all the cases where we'd send on a dirty old mbox + * but allow case for sending CLOSE. Since this check is within the CQ + * mutex barrier and the close message is only sent through + * kgnilnd_send_conn_close the last message out the door will be the + * close message. + */ + if (atomic_read(&conn->gnc_peer->gnp_dirty_eps) != 0 && msg->gnm_type != GNILND_MSG_CLOSE) { + mutex_unlock(&conn->gnc_device->gnd_cq_mutex); + /* Return -ETIME, we are closing the connection already so we dont want to + * have this tx hit the wire. The tx will be killed by the calling function. + * Once the EP is marked dirty the close message will be the last + * thing to hit the wire */ + return -ETIME; + } + + now = jiffies; + timeout = cfs_time_seconds(conn->gnc_timeout); + + newest_last_rx = GNILND_LASTRX(conn); + + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_SEND_TIMEOUT)) { + now = now + (GNILND_TIMEOUTRX(timeout) * 2); + } + + if (time_after_eq(now, newest_last_rx + GNILND_TIMEOUTRX(timeout))) { + GNIDBG_CONN(D_NETERROR|D_CONSOLE, conn, "Cant send to %s after timeout lapse of %lu; TO %lu", + libcfs_nid2str(conn->gnc_peer->gnp_nid), + cfs_duration_sec(now - newest_last_rx), + cfs_duration_sec(GNILND_TIMEOUTRX(timeout))); + mutex_unlock(&conn->gnc_device->gnd_cq_mutex); + return -ETIME; + } + + GNITX_ASSERTF(tx, (conn != NULL) && (tx->tx_id.txe_idx != 0), "tx id unset!", NULL); + /* msg->gnm_srcnid is set when the message is initialized by whatever function is + * creating the message this allows the message to contain the correct LNET NID/NET needed + * instead of the one that the peer/conn uses for sending the data. 
+ */ + msg->gnm_connstamp = conn->gnc_my_connstamp; + msg->gnm_payload_len = immediatenob; + msg->gnm_seq = conn->gnc_tx_seq; + + /* always init here - kgn_checksum is a /sys module tunable + * and can be flipped at any point, even between msg init and sending */ + msg->gnm_cksum = 0; + if (*kgnilnd_tunables.kgn_checksum) { + /* We must set here and not in kgnilnd_init_msg, + * we could resend this msg many times + * (NOT_DONE from gni_smsg_send below) and wouldn't pass + * through init_msg again */ + msg->gnm_cksum = kgnilnd_cksum(msg, sizeof(kgn_msg_t)); + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_SMSG_CKSUM1)) { + msg->gnm_cksum += 0xf00f; + } + } + + GNIDBG_TOMSG(D_NET, msg, "tx 0x%p conn 0x%p->%s sending SMSG sz %u id %x/%d [%p for %u]", + tx, conn, libcfs_nid2str(conn->gnc_peer->gnp_nid), + sizeof(kgn_msg_t), tx->tx_id.txe_smsg_id, + tx->tx_id.txe_idx, immediate, immediatenob); + + if (unlikely(tx->tx_state & GNILND_TX_FAIL_SMSG)) { + rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_NOT_DONE; + } else { + rrc = kgnilnd_smsg_send(conn->gnc_ephandle, + msg, sizeof(*msg), immediate, immediatenob, + tx->tx_id.txe_smsg_id); + } + + switch (rrc) { + case GNI_RC_SUCCESS: + conn->gnc_tx_seq++; + conn->gnc_last_tx = jiffies; + /* no locking here as LIVE isn't a list */ + kgnilnd_tx_add_state_locked(tx, NULL, conn, GNILND_TX_LIVE_FMAQ, 1); + + /* this needs to be checked under lock as it might be freed from a completion + * event. + */ + if (msg->gnm_type == GNILND_MSG_NOOP) { + set_mb(conn->gnc_last_noop_sent, jiffies); + } + + /* serialize with seeing CQ events for completion on this, as well as + * tx_seq */ + mutex_unlock(&conn->gnc_device->gnd_cq_mutex); + + atomic_inc(&conn->gnc_device->gnd_short_ntx); + atomic64_add(immediatenob, &conn->gnc_device->gnd_short_txbytes); + kgnilnd_peer_alive(conn->gnc_peer); + GNIDBG_SMSG_CREDS(D_NET, conn); + return 0; + + case GNI_RC_NOT_DONE: + /* XXX Nic: We need to figure out how to track this + * - there are bound to be good reasons for it, + * but we want to know when it happens */ + + mutex_unlock(&conn->gnc_device->gnd_cq_mutex); + /* We'll handle this error inline - makes the calling logic much more + * clean */ + + /* If no lock, caller doesn't want us to retry */ + if (state_lock == NULL) { + return -EAGAIN; + } + + retry_send = kgnilnd_tx_should_retry(conn, tx); + if (retry_send) { + /* add to head of list for the state and retries */ + spin_lock(state_lock); + kgnilnd_tx_add_state_locked(tx, conn->gnc_peer, conn, state, 0); + spin_unlock(state_lock); + + /* We only reschedule for a certain number of retries, then + * we will wait for the CQ events indicating a release of SMSG + * credits */ + if (tx->tx_retrans < (*kgnilnd_tunables.kgn_max_retransmits/4)) { + kgnilnd_schedule_conn(conn); + return 0; + } else { + /* CQ event coming in signifies either TX completed or + * RX receive. 
Either of these *could* free up credits + * in the SMSG mbox and we should try sending again */ + GNIDBG_TX(D_NET, tx, "waiting for CQID %u event to resend", + tx->tx_conn->gnc_cqid); + /* use +ve return code to let upper layers know they + * should stop looping on sends */ + return EAGAIN; + } + } else { + return -EAGAIN; + } + default: + /* handle bad retcode gracefully */ + mutex_unlock(&conn->gnc_device->gnd_cq_mutex); + return -EIO; + } +} + +/* kgnilnd_sendmsg has hard wait on gnd_cq_mutex */ +static inline int +kgnilnd_sendmsg(kgn_tx_t *tx, void *immediate, unsigned int immediatenob, + spinlock_t *state_lock, kgn_tx_list_state_t state) +{ + kgn_device_t *dev = tx->tx_conn->gnc_device; + unsigned long timestamp; + int rc; + + timestamp = jiffies; + mutex_lock(&dev->gnd_cq_mutex); + /* delay in jiffies - we are really concerned only with things that + * result in a schedule() or really holding this off for long times . + * NB - mutex_lock could spin for 2 jiffies before going to sleep to wait */ + dev->gnd_mutex_delay += (long) jiffies - timestamp; + + rc = kgnilnd_sendmsg_nolock(tx, immediate, immediatenob, state_lock, state); + + RETURN(rc); +} + + +/* returns -EAGAIN for lock miss, anything else < 0 is hard error, >=0 for success */ +static inline int +kgnilnd_sendmsg_trylock(kgn_tx_t *tx, void *immediate, unsigned int immediatenob, + spinlock_t *state_lock, kgn_tx_list_state_t state) +{ + kgn_conn_t *conn = tx->tx_conn; + kgn_device_t *dev = conn->gnc_device; + unsigned long timestamp; + int rc; + + timestamp = jiffies; + + /* technically we are doing bad things with the read_lock on the peer_conn + * table, but we shouldn't be sleeping inside here - and we don't sleep/block + * for the mutex. I bet lockdep is gonna flag this one though... */ + + /* there are a few cases where we don't want the immediate send - like + * when we are in the scheduler thread and it'd harm the latency of + * getting messages up to LNet */ + + /* rmb for gnd_ready */ + smp_rmb(); + if (conn->gnc_device->gnd_ready == GNILND_DEV_LOOP) { + rc = 0; + atomic_inc(&conn->gnc_device->gnd_fast_block); + } else if (unlikely(kgnilnd_data.kgn_quiesce_trigger)) { + /* dont hit HW during quiesce */ + rc = 0; + } else if (unlikely(atomic_read(&conn->gnc_peer->gnp_dirty_eps))) { + /* dont hit HW if stale EPs and conns left to close */ + rc = 0; + } else { + atomic_inc(&conn->gnc_device->gnd_fast_try); + rc = mutex_trylock(&conn->gnc_device->gnd_cq_mutex); + } + if (!rc) { + rc = -EAGAIN; + } else { + /* we got the mutex and weren't blocked */ + + /* delay in jiffies - we are really concerned only with things that + * result in a schedule() or really holding this off for long times . 
+ * NB - mutex_lock could spin for 2 jiffies before going to sleep to wait */ + dev->gnd_mutex_delay += (long) jiffies - timestamp; + + atomic_inc(&conn->gnc_device->gnd_fast_ok); + tx->tx_qtime = jiffies; + tx->tx_state = GNILND_TX_WAITING_COMPLETION; + rc = kgnilnd_sendmsg_nolock(tx, tx->tx_buffer, tx->tx_nob, &conn->gnc_list_lock, GNILND_TX_FMAQ); + /* _nolock unlocks the mutex for us */ + } + + RETURN(rc); +} + +/* lets us know if we can push this RDMA through now */ +inline int +kgnilnd_auth_rdma_bytes(kgn_device_t *dev, kgn_tx_t *tx) +{ + long bytes_left; + + bytes_left = atomic64_sub_return(tx->tx_nob, &dev->gnd_rdmaq_bytes_ok); + + if (bytes_left < 0) { + atomic64_add(tx->tx_nob, &dev->gnd_rdmaq_bytes_ok); + atomic_inc(&dev->gnd_rdmaq_nstalls); + smp_wmb(); + + CDEBUG(D_NET, "no bytes to send, turning on timer for %lu\n", + dev->gnd_rdmaq_deadline); + mod_timer(&dev->gnd_rdmaq_timer, dev->gnd_rdmaq_deadline); + /* we never del this timer - at worst it schedules us.. */ + return -EAGAIN; + } else { + return 0; + } +} + +/* this adds a TX to the queue pending throttling authorization before + * we allow our remote peer to launch a PUT at us */ +void +kgnilnd_queue_rdma(kgn_conn_t *conn, kgn_tx_t *tx) +{ + int rc; + + /* we cannot go into send_mapped_tx from here as we are holding locks + * and mem registration might end up allocating memory in kgni. + * That said, we'll push this as far as we can into the queue process */ + rc = kgnilnd_auth_rdma_bytes(conn->gnc_device, tx); + + if (rc < 0) { + spin_lock(&conn->gnc_device->gnd_rdmaq_lock); + kgnilnd_tx_add_state_locked(tx, NULL, conn, GNILND_TX_RDMAQ, 0); + /* lets us know how delayed RDMA is */ + tx->tx_qtime = jiffies; + spin_unlock(&conn->gnc_device->gnd_rdmaq_lock); + } else { + /* we have RDMA authorized, now it just needs a MDD and to hit the wire */ + spin_lock(&tx->tx_conn->gnc_device->gnd_lock); + kgnilnd_tx_add_state_locked(tx, NULL, tx->tx_conn, GNILND_TX_MAPQ, 0); + /* lets us know how delayed mapping is */ + tx->tx_qtime = jiffies; + spin_unlock(&tx->tx_conn->gnc_device->gnd_lock); + } + + /* make sure we wake up sched to run this */ + kgnilnd_schedule_device(tx->tx_conn->gnc_device); +} + +/* push TX through state machine */ +void +kgnilnd_queue_tx(kgn_conn_t *conn, kgn_tx_t *tx) +{ + int rc; + int add_tail = 1; + + /* set the tx_id here, we delay it until we have an actual conn + * to fiddle with + * in some cases, the tx_id is already set to provide for things + * like RDMA completion cookies, etc */ + if (tx->tx_id.txe_idx == 0) { + rc = kgnilnd_set_tx_id(tx, conn); + if (rc != 0) { + kgnilnd_tx_done(tx, rc); + return; + } + } + + CDEBUG(D_NET, "%s to conn %p for %s\n", kgnilnd_msgtype2str(tx->tx_msg.gnm_type), + conn, libcfs_nid2str(conn->gnc_peer->gnp_nid)); + + /* Only let NOOPs to be sent while fail loc is set, otherwise kill the tx. + */ + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_ONLY_NOOP) && (tx->tx_msg.gnm_type != GNILND_MSG_NOOP)) { + kgnilnd_tx_done(tx, rc); + return; + } + + switch (tx->tx_msg.gnm_type) { + case GNILND_MSG_PUT_ACK: + case GNILND_MSG_GET_REQ: + /* hijacking time! 
If this messages will authorize our peer to + * send his dirty little bytes in an RDMA, we need to get permission */ + kgnilnd_queue_rdma(conn, tx); + break; + case GNILND_MSG_IMMEDIATE: + /* try to send right now, can help reduce latency */ + rc = kgnilnd_sendmsg_trylock(tx, tx->tx_buffer, tx->tx_nob, &conn->gnc_list_lock, GNILND_TX_FMAQ); + + if (rc >= 0) { + /* it was sent, break out of switch to avoid default case of queueing */ + break; + } else if (rc == -EAGAIN) { + /* needs to queue to try again, so fall through to default case */ + } else { + /* bail: it wasnt sent and we didn't get EAGAIN indicating + * we should retrans - We do not close the conn due to locking + * we let the reaper thread take care of it. There are no hard + * errors from send_msg that would require close to be called + */ + kgnilnd_tx_done(tx, rc); + break; + } + case GNILND_MSG_NOOP: + /* Just make sure this goes out first for this conn */ + add_tail = 0; + /* fall through... */ + default: + spin_lock(&conn->gnc_list_lock); + kgnilnd_tx_add_state_locked(tx, conn->gnc_peer, conn, GNILND_TX_FMAQ, add_tail); + tx->tx_qtime = jiffies; + spin_unlock(&conn->gnc_list_lock); + kgnilnd_schedule_conn(conn); + } +} + +void +kgnilnd_launch_tx(kgn_tx_t *tx, kgn_net_t *net, lnet_process_id_t *target) +{ + kgn_peer_t *peer; + kgn_peer_t *new_peer = NULL; + kgn_conn_t *conn = NULL; + int rc; + + ENTRY; + + /* If I get here, I've committed to send, so I complete the tx with + * failure on any problems */ + + GNITX_ASSERTF(tx, tx->tx_conn == NULL, + "tx already has connection %p", tx->tx_conn); + + /* do all of the peer & conn searching in one swoop - this avoids + * nastiness when dropping locks and needing to maintain a sane state + * in the face of stack reset or something else nuking peers & conns */ + + /* I expect to find him, so only take a read lock */ + read_lock(&kgnilnd_data.kgn_peer_conn_lock); + + peer = kgnilnd_find_peer_locked(target->nid); + if (peer != NULL) { + conn = kgnilnd_find_conn_locked(peer); + /* this could be NULL during quiesce */ + if (conn != NULL) { + /* Connection exists; queue message on it */ + kgnilnd_queue_tx(conn, tx); + read_unlock(&kgnilnd_data.kgn_peer_conn_lock); + RETURN_EXIT; + } + } + + /* creating peer or conn; I'll need a write lock... */ + read_unlock(&kgnilnd_data.kgn_peer_conn_lock); + + CFS_RACE(CFS_FAIL_GNI_FIND_TARGET); + + /* NB - this will not block during normal operations - + * the only writer of this is in the startup/shutdown path. */ + rc = down_read_trylock(&kgnilnd_data.kgn_net_rw_sem); + if (!rc) { + rc = -ESHUTDOWN; + GOTO(no_peer, rc); + } + + /* ignore previous peer entirely - we cycled the lock, so we + * will create new peer and at worst drop it if peer is still + * in the tables */ + rc = kgnilnd_create_peer_safe(&new_peer, target->nid, net); + if (rc != 0) { + up_read(&kgnilnd_data.kgn_net_rw_sem); + GOTO(no_peer, rc); + } + + write_lock(&kgnilnd_data.kgn_peer_conn_lock); + up_read(&kgnilnd_data.kgn_net_rw_sem); + + /* search for peer again now that we have the lock + * if we don't find it, add our new one to the list */ + kgnilnd_add_peer_locked(target->nid, new_peer, &peer); + + conn = kgnilnd_find_or_create_conn_locked(peer); + if (conn != NULL) { + /* oh hey, found a conn now... 
magical */ + kgnilnd_queue_tx(conn, tx); + } else { + /* no conn, must be trying to connect - so we queue for now */ + tx->tx_qtime = jiffies; + kgnilnd_tx_add_state_locked(tx, peer, NULL, GNILND_TX_PEERQ, 1); + } + write_unlock(&kgnilnd_data.kgn_peer_conn_lock); + RETURN_EXIT; +no_peer: + kgnilnd_tx_done(tx, rc); + RETURN_EXIT; +} + +void +kgnilnd_rdma(kgn_tx_t *tx, int type, + kgn_rdma_desc_t *sink, unsigned int nob, __u64 cookie) +{ + kgn_conn_t *conn = tx->tx_conn; + unsigned long timestamp; + gni_return_t rrc; + + LASSERTF(kgnilnd_tx_mapped(tx), + "unmapped tx %p\n", tx); + LASSERTF(conn != NULL, + "NULL conn on tx %p, naughty, naughty\n", tx); + LASSERTF(nob <= sink->gnrd_nob, + "nob %u > sink->gnrd_nob %d (%p)\n", + nob, sink->gnrd_nob, sink); + LASSERTF(nob <= tx->tx_nob, + "nob %d > tx(%p)->tx_nob %d\n", + nob, tx, tx->tx_nob); + + memset(&tx->tx_rdma_desc, 0, sizeof(tx->tx_rdma_desc)); + tx->tx_rdma_desc.post_id = tx->tx_id.txe_cookie; + tx->tx_rdma_desc.type = GNI_POST_RDMA_PUT; + tx->tx_rdma_desc.cq_mode = GNI_CQMODE_GLOBAL_EVENT; + tx->tx_rdma_desc.local_addr = (__u64)((unsigned long)tx->tx_buffer); + tx->tx_rdma_desc.local_mem_hndl = tx->tx_map_key; + tx->tx_rdma_desc.remote_addr = sink->gnrd_addr; + tx->tx_rdma_desc.remote_mem_hndl = sink->gnrd_key; + tx->tx_rdma_desc.length = nob; + if (!*kgnilnd_tunables.kgn_bte_hash) + tx->tx_rdma_desc.dlvr_mode |= GNI_DLVMODE_NO_HASH; + if (!*kgnilnd_tunables.kgn_bte_adapt) + tx->tx_rdma_desc.dlvr_mode |= (GNI_DLVMODE_NO_ADAPT | GNI_DLVMODE_NO_RADAPT); + + /* prep final completion message */ + kgnilnd_init_msg(&tx->tx_msg, type, tx->tx_msg.gnm_srcnid); + tx->tx_msg.gnm_u.completion.gncm_cookie = cookie; + /* send actual size RDMA'd in retval */ + tx->tx_msg.gnm_u.completion.gncm_retval = nob; + + kgnilnd_compute_rdma_cksum(tx); + + if (nob == 0) { + kgnilnd_queue_tx(conn, tx); + return; + } + + /* Don't lie (CLOSE == RDMA idle) */ + LASSERTF(!conn->gnc_close_sent, "tx %p on conn %p after close sent %d\n", + tx, conn, conn->gnc_close_sent); + + GNIDBG_TX(D_NET, tx, "Post RDMA type 0x%02x dlvr_mode 0x%x", + type, tx->tx_rdma_desc.dlvr_mode); + + /* set CQ dedicated for RDMA */ + tx->tx_rdma_desc.src_cq_hndl = conn->gnc_device->gnd_snd_rdma_cqh; + + timestamp = jiffies; + mutex_lock(&conn->gnc_device->gnd_cq_mutex); + /* delay in jiffies - we are really concerned only with things that + * result in a schedule() or really holding this off for long times . 
+ * NB - mutex_lock could spin for 2 jiffies before going to sleep to wait */ + conn->gnc_device->gnd_mutex_delay += (long) jiffies - timestamp; + + rrc = kgnilnd_post_rdma(conn->gnc_ephandle, &tx->tx_rdma_desc); + + spin_lock(&conn->gnc_list_lock); + kgnilnd_tx_add_state_locked(tx, conn->gnc_peer, conn, GNILND_TX_LIVE_RDMAQ, 1); + tx->tx_qtime = jiffies; + spin_unlock(&conn->gnc_list_lock); + + mutex_unlock(&conn->gnc_device->gnd_cq_mutex); + + /* XXX Nic: is this a place we should handle more errors for + * robustness sake */ + LASSERT(rrc == GNI_RC_SUCCESS); + +} + +kgn_rx_t * +kgnilnd_alloc_rx(void) +{ + kgn_rx_t *rx; + + rx = cfs_mem_cache_alloc(kgnilnd_data.kgn_rx_cache, CFS_ALLOC_ATOMIC); + if (rx == NULL) { + CERROR("failed to allocate rx\n"); + return NULL; + } + CDEBUG(D_MALLOC, "slab-alloced 'rx': %lu at %p.\n", + sizeof(*rx), rx); + + /* no memset to zero, we'll always fill all members */ + return rx; +} + +/* release is to just free connection resources + * we use this for the eager path after copying */ +void +kgnilnd_release_msg(kgn_conn_t *conn) +{ + gni_return_t rrc; + unsigned long timestamp; + + CDEBUG(D_NET, "consuming %p\n", conn); + + timestamp = jiffies; + mutex_lock(&conn->gnc_device->gnd_cq_mutex); + /* delay in jiffies - we are really concerned only with things that + * result in a schedule() or really holding this off for long times . + * NB - mutex_lock could spin for 2 jiffies before going to sleep to wait */ + conn->gnc_device->gnd_mutex_delay += (long) jiffies - timestamp; + + rrc = kgnilnd_smsg_release(conn->gnc_ephandle); + mutex_unlock(&conn->gnc_device->gnd_cq_mutex); + + LASSERTF(rrc == GNI_RC_SUCCESS, "bad rrc %d\n", rrc); + GNIDBG_SMSG_CREDS(D_NET, conn); + + return; +} + +void +kgnilnd_consume_rx(kgn_rx_t *rx) +{ + kgn_conn_t *conn = rx->grx_conn; + kgn_msg_t *rxmsg = rx->grx_msg; + + /* if we are eager, free the cache alloc'd msg */ + if (unlikely(rx->grx_eager)) { + LIBCFS_FREE(rxmsg, sizeof(*rxmsg) + *kgnilnd_tunables.kgn_max_immediate); + + /* release ref from eager_recv */ + kgnilnd_conn_decref(conn); + } else { + GNIDBG_MSG(D_NET, rxmsg, "rx %p processed", rx); + kgnilnd_release_msg(conn); + } + + cfs_mem_cache_free(kgnilnd_data.kgn_rx_cache, rx); + CDEBUG(D_MALLOC, "slab-freed 'rx': %lu at %p.\n", + sizeof(*rx), rx); + + return; +} + +int +kgnilnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg) +{ + lnet_hdr_t *hdr = &lntmsg->msg_hdr; + int type = lntmsg->msg_type; + lnet_process_id_t target = lntmsg->msg_target; + int target_is_router = lntmsg->msg_target_is_router; + int routing = lntmsg->msg_routing; + unsigned int niov = lntmsg->msg_niov; + struct iovec *iov = lntmsg->msg_iov; + lnet_kiov_t *kiov = lntmsg->msg_kiov; + unsigned int offset = lntmsg->msg_offset; + unsigned int nob = lntmsg->msg_len; + unsigned int msg_vmflush = lntmsg->msg_vmflush; + kgn_net_t *net = ni->ni_data; + kgn_tx_t *tx; + int rc = 0; + int mpflag = 0; + + /* NB 'private' is different depending on what we're sending.... 
*/ + LASSERT(!in_interrupt()); + + CDEBUG(D_NET, "sending msg type %d with %d bytes in %d frags to %s\n", + type, nob, niov, libcfs_id2str(target)); + + LASSERTF(nob == 0 || niov > 0, + "lntmsg %p nob %d niov %d\n", lntmsg, nob, niov); + LASSERTF(niov <= LNET_MAX_IOV, + "lntmsg %p niov %d\n", lntmsg, niov); + + /* payload is either all vaddrs or all pages */ + LASSERTF(!(kiov != NULL && iov != NULL), + "lntmsg %p kiov %p iov %p\n", lntmsg, kiov, iov); + + if (msg_vmflush) + mpflag = cfs_memory_pressure_get_and_set(); + + switch (type) { + default: + CERROR("lntmsg %p with unexpected type %d\n", + lntmsg, type); + LBUG(); + + case LNET_MSG_ACK: + LASSERTF(nob == 0, "lntmsg %p nob %d\n", + lntmsg, nob); + break; + + case LNET_MSG_GET: + LASSERT(niov == 0); + LASSERT(nob == 0); + + if (routing || target_is_router) + break; /* send IMMEDIATE */ + + /* it is safe to do direct GET with out mapping buffer for RDMA as we + * check the eventual sink buffer here - if small enough, remote + * end is perfectly capable of returning data in short message - + * The magic is that we call lnet_parse in kgnilnd_recv with rdma_req=0 + * for IMMEDIATE messages which will have it send a real reply instead + * of doing kgnilnd_recv to have the RDMA continued */ + if (lntmsg->msg_md->md_length <= *kgnilnd_tunables.kgn_max_immediate) + break; + + tx = kgnilnd_new_tx_msg(GNILND_MSG_GET_REQ, ni->ni_nid); + if (tx == NULL) { + rc = -ENOMEM; + goto out; + } + + /* slightly different options as we might actually have a GET with a + * MD_KIOV set but a non-NULL md_iov.iov */ + if ((lntmsg->msg_md->md_options & LNET_MD_KIOV) == 0) + rc = kgnilnd_setup_rdma_buffer(tx, lntmsg->msg_md->md_niov, + lntmsg->msg_md->md_iov.iov, NULL, + 0, lntmsg->msg_md->md_length); + else + rc = kgnilnd_setup_rdma_buffer(tx, lntmsg->msg_md->md_niov, + NULL, lntmsg->msg_md->md_iov.kiov, + 0, lntmsg->msg_md->md_length); + if (rc != 0) { + CERROR("unable to setup buffer: %d\n", rc); + kgnilnd_tx_done(tx, rc); + rc = -EIO; + goto out; + } + + tx->tx_lntmsg[1] = lnet_create_reply_msg(ni, lntmsg); + if (tx->tx_lntmsg[1] == NULL) { + CERROR("Can't create reply for GET to %s\n", + libcfs_nid2str(target.nid)); + kgnilnd_tx_done(tx, rc); + rc = -EIO; + goto out; + } + + tx->tx_lntmsg[0] = lntmsg; + tx->tx_msg.gnm_u.get.gngm_hdr = *hdr; + /* rest of tx_msg is setup just before it is sent */ + kgnilnd_launch_tx(tx, net, &target); + goto out; + + case LNET_MSG_REPLY: + case LNET_MSG_PUT: + /* to save on MDDs, we'll handle short kiov by vmap'ing + * and sending via SMSG */ + if (nob <= *kgnilnd_tunables.kgn_max_immediate) + break; + + tx = kgnilnd_new_tx_msg(GNILND_MSG_PUT_REQ, ni->ni_nid); + if (tx == NULL) { + rc = -ENOMEM; + goto out; + } + + rc = kgnilnd_setup_rdma_buffer(tx, niov, iov, kiov, offset, nob); + if (rc != 0) { + kgnilnd_tx_done(tx, rc); + rc = -EIO; + goto out; + } + + tx->tx_lntmsg[0] = lntmsg; + tx->tx_msg.gnm_u.putreq.gnprm_hdr = *hdr; + /* rest of tx_msg is setup just before it is sent */ + kgnilnd_launch_tx(tx, net, &target); + goto out; + } + + /* send IMMEDIATE */ + + LASSERTF(nob <= *kgnilnd_tunables.kgn_max_immediate, + "lntmsg 0x%p too large %d\n", lntmsg, nob); + + tx = kgnilnd_new_tx_msg(GNILND_MSG_IMMEDIATE, ni->ni_nid); + if (tx == NULL) { + rc = -ENOMEM; + goto out; + } + + rc = kgnilnd_setup_immediate_buffer(tx, niov, iov, kiov, offset, nob); + if (rc != 0) { + kgnilnd_tx_done(tx, rc); + goto out; + } + + tx->tx_msg.gnm_u.immediate.gnim_hdr = *hdr; + tx->tx_lntmsg[0] = lntmsg; + kgnilnd_launch_tx(tx, net, &target); + +out: 
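+	/* All of the send cases above converge here: GET, PUT and REPLY payloads
+	 * larger than kgn_max_immediate were handed to kgnilnd_launch_tx() as RDMA
+	 * requests, while ACKs and small payloads fell through to the IMMEDIATE
+	 * SMSG send just before this label. */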
+ /* use stored value as we could have already finalized lntmsg here from a failed launch */ + if (msg_vmflush) + cfs_memory_pressure_restore(mpflag); + return rc; +} + +void +kgnilnd_reply(lnet_ni_t *ni, kgn_rx_t *rx, lnet_msg_t *lntmsg) +{ + kgn_conn_t *conn = rx->grx_conn; + kgn_msg_t *rxmsg = rx->grx_msg; + unsigned int niov = lntmsg->msg_niov; + struct iovec *iov = lntmsg->msg_iov; + lnet_kiov_t *kiov = lntmsg->msg_kiov; + unsigned int offset = lntmsg->msg_offset; + unsigned int nob = lntmsg->msg_len; + kgn_tx_t *tx; + int rc = 0; + + tx = kgnilnd_new_tx_msg(GNILND_MSG_GET_DONE, ni->ni_nid); + if (tx == NULL) + goto failed_0; + + rc = kgnilnd_set_tx_id(tx, conn); + if (rc != 0) + goto failed_1; + + rc = kgnilnd_setup_rdma_buffer(tx, niov, iov, kiov, offset, nob); + if (rc != 0) + goto failed_1; + + tx->tx_lntmsg[0] = lntmsg; + tx->tx_getinfo = rxmsg->gnm_u.get; + + /* we only queue from kgnilnd_recv - we might get called from other contexts + * and we don't want to block the mutex in those cases */ + + spin_lock(&tx->tx_conn->gnc_device->gnd_lock); + kgnilnd_tx_add_state_locked(tx, NULL, tx->tx_conn, GNILND_TX_MAPQ, 1); + spin_unlock(&tx->tx_conn->gnc_device->gnd_lock); + kgnilnd_schedule_device(tx->tx_conn->gnc_device); + + return; + + failed_1: + kgnilnd_tx_done(tx, rc); + kgnilnd_nak_rdma(conn, GNILND_MSG_GET_NAK, rc, rxmsg->gnm_u.get.gngm_cookie, ni->ni_nid); + failed_0: + lnet_finalize(ni, lntmsg, rc); +} + +int +kgnilnd_eager_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, + void **new_private) +{ + kgn_rx_t *rx = private; + kgn_conn_t *conn = rx->grx_conn; + kgn_msg_t *rxmsg = rx->grx_msg; + kgn_msg_t *eagermsg = NULL; + + GNIDBG_MSG(D_NET, rxmsg, "eager recv for conn %p, rxmsg %p, lntmsg %p", + conn, rxmsg, lntmsg); + + if (rxmsg->gnm_payload_len > *kgnilnd_tunables.kgn_max_immediate) { + GNIDBG_MSG(D_ERROR, rxmsg, "payload too large %d", + rxmsg->gnm_payload_len); + return -EPROTO; + } + + /* we have no credits or buffers for this message, so copy it + * somewhere for a later kgnilnd_recv */ + LIBCFS_ALLOC(eagermsg, sizeof(*eagermsg) + *kgnilnd_tunables.kgn_max_immediate); + if (eagermsg == NULL) { + CERROR("couldn't allocate eager rx message for conn %p to %s\n", + conn, libcfs_nid2str(conn->gnc_peer->gnp_nid)); + return -ENOMEM; + } + + /* copy msg and payload */ + memcpy(eagermsg, rxmsg, sizeof(*rxmsg) + rxmsg->gnm_payload_len); + rx->grx_msg = eagermsg; + rx->grx_eager = 1; + + /* stash this for lnet_finalize on cancel-on-conn-close */ + rx->grx_lntmsg = lntmsg; + + /* add conn ref to ensure it doesn't go away until all eager messages processed */ + kgnilnd_conn_addref(conn); + + /* keep the same rx_t, it just has a new grx_msg now */ + *new_private = private; + + /* release SMSG buffer */ + kgnilnd_release_msg(conn); + + return 0; +} + +int +kgnilnd_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, + int delayed, unsigned int niov, + struct iovec *iov, lnet_kiov_t *kiov, + unsigned int offset, unsigned int mlen, unsigned int rlen) +{ + kgn_rx_t *rx = private; + kgn_conn_t *conn = rx->grx_conn; + kgn_msg_t *rxmsg = rx->grx_msg; + kgn_tx_t *tx; + int rc = 0; + __u32 pload_cksum; + ENTRY; + + LASSERT(!in_interrupt()); + LASSERTF(mlen <= rlen, "%d <= %d\n", mlen, rlen); + /* Either all pages or all vaddrs */ + LASSERTF(!(kiov != NULL && iov != NULL), "kiov %p iov %p\n", + kiov, iov); + + GNIDBG_MSG(D_NET, rxmsg, "conn %p, rxmsg %p, lntmsg %p" + " niov=%d kiov=%p iov=%p offset=%d mlen=%d rlen=%d", + conn, rxmsg, lntmsg, + niov, kiov, iov, offset, mlen, rlen); + + 
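+	/* The switch below dispatches on the SMSG type: IMMEDIATE payloads are
+	 * copied straight out of the mailbox into the LNet iov/kiov, PUT_REQ is
+	 * answered with a PUT_ACK describing our sink buffer so the peer can RDMA
+	 * into it, and GET_REQ either triggers a GET_DONE reply or a NAK when LNet
+	 * has no matching buffer. */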
/* we need to lock here as recv can be called from any context */ + read_lock(&kgnilnd_data.kgn_peer_conn_lock); + if (rx->grx_eager && conn->gnc_state != GNILND_CONN_ESTABLISHED) { + read_unlock(&kgnilnd_data.kgn_peer_conn_lock); + + /* someone closed the conn after we copied this out, nuke it */ + kgnilnd_consume_rx(rx); + lnet_finalize(ni, lntmsg, conn->gnc_error); + RETURN(0); + } + read_unlock(&kgnilnd_data.kgn_peer_conn_lock); + + switch (rxmsg->gnm_type) { + default: + LBUG(); + + case GNILND_MSG_IMMEDIATE: + if (mlen > rxmsg->gnm_payload_len) { + GNIDBG_MSG(D_ERROR, rxmsg, + "Immediate message from %s too big: %d > %d", + libcfs_nid2str(conn->gnc_peer->gnp_nid), mlen, + rxmsg->gnm_payload_len); + rc = -EINVAL; + kgnilnd_consume_rx(rx); + RETURN(rc); + } + + /* rxmsg[1] is a pointer to the payload, sitting in the buffer + * right after the kgn_msg_t header - so just 'cute' way of saying + * rxmsg + sizeof(kgn_msg_t) */ + + /* check payload checksum if sent */ + + if (*kgnilnd_tunables.kgn_checksum >= 2 && + !rxmsg->gnm_payload_cksum && + rxmsg->gnm_payload_len != 0) + GNIDBG_MSG(D_WARNING, rxmsg, "no msg payload checksum when enabled"); + + if (rxmsg->gnm_payload_cksum != 0) { + /* gnm_payload_len set in kgnilnd_sendmsg from tx->tx_nob, + * which is what is used to calculate the cksum on the TX side */ + pload_cksum = kgnilnd_cksum(&rxmsg[1], rxmsg->gnm_payload_len); + + if (rxmsg->gnm_payload_cksum != pload_cksum) { + GNIDBG_MSG(D_NETERROR, rxmsg, + "Bad payload checksum (%x expected %x)", + pload_cksum, rxmsg->gnm_payload_cksum); + switch (*kgnilnd_tunables.kgn_checksum_dump) { + case 2: + kgnilnd_dump_blob(D_BUFFS, "bad payload checksum", + &rxmsg[1], rxmsg->gnm_payload_len); + /* fall through to dump */ + case 1: + libcfs_debug_dumplog(); + break; + default: + break; + } + rc = -ENOKEY; + /* checksum problems are fatal, kill the conn */ + kgnilnd_consume_rx(rx); + kgnilnd_close_conn(conn, rc); + RETURN(rc); + } + } + + if (kiov != NULL) + lnet_copy_flat2kiov( + niov, kiov, offset, + *kgnilnd_tunables.kgn_max_immediate, + &rxmsg[1], 0, mlen); + else + lnet_copy_flat2iov( + niov, iov, offset, + *kgnilnd_tunables.kgn_max_immediate, + &rxmsg[1], 0, mlen); + + kgnilnd_consume_rx(rx); + lnet_finalize(ni, lntmsg, 0); + RETURN(0); + + case GNILND_MSG_PUT_REQ: + /* LNET wants to truncate or drop transaction, sending NAK */ + if (mlen == 0) { + kgnilnd_consume_rx(rx); + lnet_finalize(ni, lntmsg, 0); + + /* only error if lntmsg == NULL, otherwise we are just + * short circuiting the rdma process of 0 bytes */ + kgnilnd_nak_rdma(conn, GNILND_MSG_PUT_NAK, + lntmsg == NULL ? -ENOENT : 0, + rxmsg->gnm_u.get.gngm_cookie, + ni->ni_nid); + RETURN(0); + } + /* sending ACK with sink buff. 
info */ + tx = kgnilnd_new_tx_msg(GNILND_MSG_PUT_ACK, ni->ni_nid); + if (tx == NULL) { + kgnilnd_consume_rx(rx); + RETURN(-ENOMEM); + } + + rc = kgnilnd_set_tx_id(tx, conn); + if (rc != 0) { + GOTO(nak_put_req, rc); + } + + rc = kgnilnd_setup_rdma_buffer(tx, niov, iov, kiov, offset, mlen); + if (rc != 0) { + GOTO(nak_put_req, rc); + } + + tx->tx_msg.gnm_u.putack.gnpam_src_cookie = + rxmsg->gnm_u.putreq.gnprm_cookie; + tx->tx_msg.gnm_u.putack.gnpam_dst_cookie = tx->tx_id.txe_cookie; + tx->tx_msg.gnm_u.putack.gnpam_desc.gnrd_addr = + (__u64)((unsigned long)tx->tx_buffer); + tx->tx_msg.gnm_u.putack.gnpam_desc.gnrd_nob = mlen; + + tx->tx_lntmsg[0] = lntmsg; /* finalize this on RDMA_DONE */ + + /* we only queue from kgnilnd_recv - we might get called from other contexts + * and we don't want to block the mutex in those cases */ + + spin_lock(&tx->tx_conn->gnc_device->gnd_lock); + kgnilnd_tx_add_state_locked(tx, NULL, tx->tx_conn, GNILND_TX_MAPQ, 1); + spin_unlock(&tx->tx_conn->gnc_device->gnd_lock); + kgnilnd_schedule_device(tx->tx_conn->gnc_device); + + kgnilnd_consume_rx(rx); + RETURN(0); + +nak_put_req: + /* make sure we send an error back when the PUT fails */ + kgnilnd_nak_rdma(conn, GNILND_MSG_PUT_NAK, rc, rxmsg->gnm_u.get.gngm_cookie, ni->ni_nid); + kgnilnd_tx_done(tx, rc); + kgnilnd_consume_rx(rx); + + /* return magic LNet network error */ + RETURN(-EIO); + + case GNILND_MSG_GET_REQ: + if (lntmsg != NULL) { + /* Matched! */ + kgnilnd_reply(ni, rx, lntmsg); + } else { + /* No match */ + kgnilnd_nak_rdma(conn, GNILND_MSG_GET_NAK, + -ENOENT, + rxmsg->gnm_u.get.gngm_cookie, + ni->ni_nid); + } + kgnilnd_consume_rx(rx); + RETURN(0); + } + RETURN(0); +} + +/* needs write_lock on kgn_peer_conn_lock held */ +int +kgnilnd_check_conn_timeouts_locked(kgn_conn_t *conn) +{ + unsigned long timeout, keepalive; + unsigned long now = jiffies; + unsigned long newest_last_rx; + kgn_tx_t *tx; + + /* given that we found this conn hanging off a peer, it better damned + * well be connected */ + LASSERTF(conn->gnc_state == GNILND_CONN_ESTABLISHED, + "conn 0x%p->%s with bad state%s\n", conn, + conn->gnc_peer ? libcfs_nid2str(conn->gnc_peer->gnp_nid) + : "", + kgnilnd_conn_state2str(conn)); + + CDEBUG(D_NET, "checking conn %p->%s timeout %d keepalive %d " + "rx_diff %lu tx_diff %lu\n", + conn, libcfs_nid2str(conn->gnc_peer->gnp_nid), + conn->gnc_timeout, GNILND_TO2KA(conn->gnc_timeout), + cfs_duration_sec(now - conn->gnc_last_rx_cq), + cfs_duration_sec(now - conn->gnc_last_tx)); + + timeout = cfs_time_seconds(conn->gnc_timeout); + keepalive = cfs_time_seconds(GNILND_TO2KA(conn->gnc_timeout)); + + /* just in case our lack of RX msg processing is gumming up the works - give the + * remove an extra chance */ + + newest_last_rx = GNILND_LASTRX(conn); + + if (time_after_eq(now, newest_last_rx + timeout)) { + GNIDBG_CONN(D_CONSOLE|D_NETERROR, conn, "No gnilnd traffic received from %s for %lu " + "seconds, terminating connection. Is node down? ", + libcfs_nid2str(conn->gnc_peer->gnp_nid), + cfs_duration_sec(now - newest_last_rx)); + return -ETIMEDOUT; + } + + /* we don't timeout on last_tx stalls - we are going to trust the + * underlying network to let us know when sends are failing. + * At worst, the peer will timeout our RX stamp and drop the connection + * at that point. 
We'll then see his CLOSE or at worst his RX + * stamp stop and drop the connection on our end */ + + if (time_after_eq(now, conn->gnc_last_tx + keepalive)) { + CDEBUG(D_NET, "sending NOOP -> %s (%p idle %lu(%lu)) " + "last %lu/%lu/%lu %lus/%lus/%lus\n", + libcfs_nid2str(conn->gnc_peer->gnp_nid), conn, + cfs_duration_sec(jiffies - conn->gnc_last_tx), + keepalive, + conn->gnc_last_noop_want, conn->gnc_last_noop_sent, + conn->gnc_last_noop_cq, + cfs_duration_sec(jiffies - conn->gnc_last_noop_want), + cfs_duration_sec(jiffies - conn->gnc_last_noop_sent), + cfs_duration_sec(jiffies - conn->gnc_last_noop_cq)); + set_mb(conn->gnc_last_noop_want, jiffies); + atomic_inc(&conn->gnc_reaper_noop); + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_NOOP_SEND)) + return 0; + + tx = kgnilnd_new_tx_msg(GNILND_MSG_NOOP, conn->gnc_peer->gnp_net->gnn_ni->ni_nid); + if (tx == NULL) + return 0; + kgnilnd_queue_tx(conn, tx); + } + + return 0; +} + +/* needs write_lock on kgn_peer_conn_lock held */ +void +kgnilnd_check_peer_timeouts_locked(kgn_peer_t *peer, struct list_head *todie, + struct list_head *souls) +{ + unsigned long timeout; + kgn_conn_t *conn, *connN = NULL; + kgn_tx_t *tx, *txN; + int rc = 0; + int count = 0; + int reconnect; + short releaseconn = 0; + unsigned long first_rx = 0; + + CDEBUG(D_NET, "checking peer 0x%p->%s for timeouts; interval %lus\n", + peer, libcfs_nid2str(peer->gnp_nid), + peer->gnp_reconnect_interval); + + timeout = cfs_time_seconds(MAX(*kgnilnd_tunables.kgn_timeout, + GNILND_MIN_TIMEOUT)); + + conn = kgnilnd_find_conn_locked(peer); + if (conn) { + /* if there is a valid conn, check the queues for timeouts */ + rc = kgnilnd_check_conn_timeouts_locked(conn); + if (rc) { + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_RX_CLOSE_CLOSING)) { + /* simulate a RX CLOSE after the timeout but before + * the scheduler thread gets it */ + conn->gnc_close_recvd = GNILND_CLOSE_INJECT1; + conn->gnc_peer_error = -ETIMEDOUT; + } + /* Once we mark closed, any of the scheduler threads could + * get it and move through before we hit the fail loc code */ + kgnilnd_close_conn_locked(conn, rc); + } else { + /* first_rx is used to decide when to release a conn from purgatory. + */ + first_rx = conn->gnc_first_rx; + } + } + + /* now regardless of starting new conn, find tx on peer queue that + * are old and smell bad - do this first so we don't trigger + * reconnect on empty queue if we timeout all */ + list_for_each_entry_safe(tx, txN, &peer->gnp_tx_queue, tx_list) { + if (time_after_eq(jiffies, tx->tx_qtime + timeout)) { + if (count == 0) { + LCONSOLE_INFO("could not send to %s due to connection" + " setup failure after %lu seconds\n", + libcfs_nid2str(peer->gnp_nid), + cfs_duration_sec(jiffies - tx->tx_qtime)); + } + kgnilnd_tx_del_state_locked(tx, peer, NULL, + GNILND_TX_ALLOCD); + list_add_tail(&tx->tx_list, todie); + count++; + } + } + + if (count || peer->gnp_connecting == GNILND_PEER_KILL) { + CDEBUG(D_NET, "canceling %d tx for peer 0x%p->%s\n", + count, peer, libcfs_nid2str(peer->gnp_nid)); + /* if we nuked all the TX, stop peer connection attempt (if there is one..) */ + if (list_empty(&peer->gnp_tx_queue) || + peer->gnp_connecting == GNILND_PEER_KILL) { + /* we pass down todie to use a common function - but we know there are + * no TX to add */ + kgnilnd_cancel_peer_connect_locked(peer, todie); + } + } + + /* Don't reconnect if we are still trying to clear out old conns. 
+ * This prevents us sending traffic on the new mbox before ensuring we are done + * with the old one */ + reconnect = (atomic_read(&peer->gnp_dirty_eps) == 0); + + /* if we are not connected and there are tx on the gnp_tx_queue waiting + * to be sent, we'll check the reconnect interval and fire up a new + * connection request */ + + if ((peer->gnp_connecting == GNILND_PEER_IDLE) && + (time_after_eq(jiffies, peer->gnp_reconnect_time)) && + !list_empty(&peer->gnp_tx_queue) && reconnect) { + + CDEBUG(D_NET, "starting connect to %s\n", + libcfs_nid2str(peer->gnp_nid)); + LASSERTF(peer->gnp_connecting == GNILND_PEER_IDLE, "Peer was idle and we" + "have a write_lock, state issue %d\n", peer->gnp_connecting); + + peer->gnp_connecting = GNILND_PEER_CONNECT; + kgnilnd_peer_addref(peer); /* extra ref for connd */ + + spin_lock(&peer->gnp_net->gnn_dev->gnd_connd_lock); + list_add_tail(&peer->gnp_connd_list, + &peer->gnp_net->gnn_dev->gnd_connd_peers); + spin_unlock(&peer->gnp_net->gnn_dev->gnd_connd_lock); + + kgnilnd_schedule_dgram(peer->gnp_net->gnn_dev); + } + + /* fail_loc to allow us to delay release of purgatory */ + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_PURG_REL_DELAY)) + return; + + /* This check allows us to verify that the new conn is actually being used. This allows us to + * pull the old conns out of purgatory if they have actually seen traffic. + * We only release a conn from purgatory during stack reset, admin command, or when a peer reconnects + */ + if (first_rx && + time_after(jiffies, first_rx + cfs_time_seconds(*kgnilnd_tunables.kgn_hardware_timeout))) { + CDEBUG(D_NET,"We can release conn %p from purgatory %lu\n", + conn, first_rx + cfs_time_seconds(*kgnilnd_tunables.kgn_hardware_timeout)); + releaseconn = 1; + } + + list_for_each_entry_safe (conn, connN, &peer->gnp_conns, gnc_list) { + /* check for purgatory timeouts */ + if (conn->gnc_in_purgatory) { + /* We cannot detach this conn from purgatory if it has not been closed so we reschedule it + * that way the next time we check it we can detach it from purgatory + */ + + if (conn->gnc_state != GNILND_CONN_DONE) { + /* Skip over conns that are currently not DONE. If they arent already scheduled + * for completion something in the state machine is broken. + */ + continue; + } + + /* We only detach a conn that is in purgatory if we have received a close message, + * we have a new valid connection that has successfully received data, or an admin + * command tells us we need to detach. 
+ */ + + if (conn->gnc_close_recvd || releaseconn || conn->gnc_needs_detach) { + unsigned long waiting; + + waiting = (long) jiffies - conn->gnc_last_rx_cq; + + /* C.E: The remote peer is expected to close the + * connection (see kgnilnd_check_conn_timeouts) + * via the reaper thread and nuke out the MDD and + * FMA resources after conn->gnc_timeout has expired + * without an FMA RX */ + CDEBUG(D_NET, "Reconnected to %s in %lds or admin forced detach, dropping " + " held resources\n", + libcfs_nid2str(conn->gnc_peer->gnp_nid), + cfs_duration_sec(waiting)); + + kgnilnd_detach_purgatory_locked(conn, souls); + } + } + } + + return; +} + +void +kgnilnd_reaper_check(int idx) +{ + struct list_head *peers = &kgnilnd_data.kgn_peers[idx]; + struct list_head *ctmp, *ctmpN; + struct list_head geriatrics; + struct list_head souls; + + INIT_LIST_HEAD(&geriatrics); + INIT_LIST_HEAD(&souls); + + write_lock(&kgnilnd_data.kgn_peer_conn_lock); + + list_for_each_safe(ctmp, ctmpN, peers) { + kgn_peer_t *peer = NULL; + + /* don't timeout stuff if the network is mucked or shutting down */ + if (kgnilnd_check_hw_quiesce()) { + break; + } + peer = list_entry(ctmp, kgn_peer_t, gnp_list); + + kgnilnd_check_peer_timeouts_locked(peer, &geriatrics, &souls); + } + + write_unlock(&kgnilnd_data.kgn_peer_conn_lock); + + kgnilnd_txlist_done(&geriatrics, -EHOSTUNREACH); + kgnilnd_release_purgatory_list(&souls); +} + +void +kgnilnd_update_reaper_timeout(long timeout) +{ + LASSERT(timeout > 0); + + spin_lock(&kgnilnd_data.kgn_reaper_lock); + + if (timeout < kgnilnd_data.kgn_new_min_timeout) + kgnilnd_data.kgn_new_min_timeout = timeout; + + spin_unlock(&kgnilnd_data.kgn_reaper_lock); +} + +static void +kgnilnd_reaper_poke_with_stick(unsigned long arg) +{ + wake_up(&kgnilnd_data.kgn_reaper_waitq); +} + +int +kgnilnd_reaper(void *arg) +{ + long timeout; + int i; + int hash_index = 0; + unsigned long next_check_time = jiffies; + long current_min_timeout = MAX_SCHEDULE_TIMEOUT; + struct timer_list timer; + DEFINE_WAIT(wait); + + cfs_daemonize("kgnilnd_rpr"); + cfs_block_allsigs(); + + /* all gnilnd threads need to run fairly urgently */ + set_user_nice(current, *kgnilnd_tunables.kgn_nice); + spin_lock(&kgnilnd_data.kgn_reaper_lock); + + while (!kgnilnd_data.kgn_shutdown) { + /* I wake up every 'p' seconds to check for timeouts on some + * more peers. I try to check every connection 'n' times + * within the global minimum of all keepalive and timeout + * intervals, to ensure I attend to every connection within + * (n+1)/n times its timeout intervals. */ + const int p = GNILND_REAPER_THREAD_WAKE; + const int n = GNILND_REAPER_NCHECKS; + int chunk; + /* to quiesce or to not quiesce, that is the question */ + if (unlikely(kgnilnd_data.kgn_quiesce_trigger)) { + spin_unlock(&kgnilnd_data.kgn_reaper_lock); + KGNILND_SPIN_QUIESCE; + spin_lock(&kgnilnd_data.kgn_reaper_lock); + } + + /* careful with the jiffy wrap... 
*/ + timeout = (long)(next_check_time - jiffies); + + if (timeout > 0) { + prepare_to_wait(&kgnilnd_data.kgn_reaper_waitq, &wait, + TASK_INTERRUPTIBLE); + spin_unlock(&kgnilnd_data.kgn_reaper_lock); + setup_timer(&timer, kgnilnd_reaper_poke_with_stick, + next_check_time); + mod_timer(&timer, (long) jiffies + timeout); + + /* check flag variables before comitting */ + if (!kgnilnd_data.kgn_shutdown && + !kgnilnd_data.kgn_quiesce_trigger) { + CDEBUG(D_INFO, "schedule timeout %ld (%lu sec)\n", + timeout, cfs_duration_sec(timeout)); + schedule(); + CDEBUG(D_INFO, "awake after schedule\n"); + } + + del_singleshot_timer_sync(&timer); + spin_lock(&kgnilnd_data.kgn_reaper_lock); + finish_wait(&kgnilnd_data.kgn_reaper_waitq, &wait); + continue; + } + + /* new_min_timeout is set from the conn timeouts and keepalive + * this should end up with a min timeout of + * GNILND_TIMEOUT2KEEPALIVE(t) or roughly LND_TIMEOUT/2 */ + if (kgnilnd_data.kgn_new_min_timeout < current_min_timeout) { + current_min_timeout = kgnilnd_data.kgn_new_min_timeout; + CDEBUG(D_NET, "Set new min timeout %ld\n", + current_min_timeout); + } + + spin_unlock(&kgnilnd_data.kgn_reaper_lock); + + /* Compute how many table entries to check now so I get round + * the whole table fast enough given that I do this at fixed + * intervals of 'p' seconds) */ + chunk = *kgnilnd_tunables.kgn_peer_hash_size; + if (kgnilnd_data.kgn_new_min_timeout > n * p) + chunk = (chunk * n * p) / + kgnilnd_data.kgn_new_min_timeout; + if (chunk == 0) + chunk = 1; + for (i = 0; i < chunk; i++) { + kgnilnd_reaper_check(hash_index); + hash_index = (hash_index + 1) % + *kgnilnd_tunables.kgn_peer_hash_size; + } + next_check_time = (long) jiffies + cfs_time_seconds(p); + CDEBUG(D_INFO, "next check at %lu or in %d sec\n", next_check_time, p); + + spin_lock(&kgnilnd_data.kgn_reaper_lock); + } + + spin_unlock(&kgnilnd_data.kgn_reaper_lock); + + kgnilnd_thread_fini(); + return 0; +} + +int +kgnilnd_check_rdma_cq(kgn_device_t *dev) +{ + gni_return_t rrc; + gni_post_descriptor_t *desc; + __u64 event_data; + kgn_tx_ev_id_t ev_id; + char err_str[256]; + int should_retry, rc; + long num_processed = 0; + kgn_conn_t *conn = NULL; + kgn_tx_t *tx = NULL; + + for (;;) { + /* make sure we don't keep looping if we need to reset */ + if (unlikely(kgnilnd_data.kgn_quiesce_trigger)) { + return num_processed; + } + rc = kgnilnd_mutex_trylock(&dev->gnd_cq_mutex); + if (!rc) { + /* we didn't get the mutex, so return that there is still work + * to be done */ + return 1; + } + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_DELAY_RDMA)) { + /* a bit gross - but we need a good way to test for + * delayed RDMA completions and the easiest way to do + * that is to delay the RDMA CQ events */ + rrc = GNI_RC_NOT_DONE; + } else { + rrc = kgnilnd_cq_get_event(dev->gnd_snd_rdma_cqh, &event_data); + } + + if (rrc == GNI_RC_NOT_DONE) { + mutex_unlock(&dev->gnd_cq_mutex); + CDEBUG(D_INFO, "SEND RDMA CQ %d empty processed %ld\n", + dev->gnd_id, num_processed); + return num_processed; + } + dev->gnd_sched_alive = jiffies; + num_processed++; + + LASSERTF(!GNI_CQ_OVERRUN(event_data), + "this is bad, somehow our credits didn't protect us" + " from CQ overrun\n"); + LASSERTF(GNI_CQ_GET_TYPE(event_data) == GNI_CQ_EVENT_TYPE_POST, + "rrc %d, GNI_CQ_GET_TYPE("LPX64") = "LPX64"\n", rrc, + event_data, GNI_CQ_GET_TYPE(event_data)); + + rrc = kgnilnd_get_completed(dev->gnd_snd_rdma_cqh, event_data, + &desc); + mutex_unlock(&dev->gnd_cq_mutex); + + /* XXX Nic: Need better error handling here... 
*/ + LASSERTF((rrc == GNI_RC_SUCCESS) || + (rrc == GNI_RC_TRANSACTION_ERROR), + "rrc %d\n", rrc); + + ev_id.txe_cookie = desc->post_id; + + kgnilnd_validate_tx_ev_id(&ev_id, &tx, &conn); + + if (conn == NULL || tx == NULL) { + /* either conn or tx was already nuked and this is a "late" + * completion, so drop it */ + continue; + } + + GNITX_ASSERTF(tx, tx->tx_msg.gnm_type == GNILND_MSG_PUT_DONE || + tx->tx_msg.gnm_type == GNILND_MSG_GET_DONE, + "tx %p with type %d\n", tx, tx->tx_msg.gnm_type); + + GNIDBG_TX(D_NET, tx, "RDMA completion for %d bytes", tx->tx_nob); + + /* remove from rdmaq */ + spin_lock(&conn->gnc_list_lock); + kgnilnd_tx_del_state_locked(tx, NULL, conn, GNILND_TX_ALLOCD); + spin_unlock(&conn->gnc_list_lock); + + if (likely(desc->status == GNI_RC_SUCCESS)) { + atomic_inc(&dev->gnd_rdma_ntx); + atomic64_add(tx->tx_nob, &dev->gnd_rdma_txbytes); + /* transaction succeeded, add into fmaq */ + kgnilnd_queue_tx(conn, tx); + kgnilnd_peer_alive(conn->gnc_peer); + + /* drop ref from kgnilnd_validate_tx_ev_id */ + kgnilnd_conn_decref(conn); + continue; + } + + /* fall through to the TRANSACTION_ERROR case */ + tx->tx_retrans++; + + /* get stringified version for log messages */ + kgnilnd_cq_error_str(event_data, &err_str, 256); + kgnilnd_cq_error_recoverable(event_data, &should_retry); + + /* make sure we are not off in the weeds with this tx */ + if (tx->tx_retrans > + *kgnilnd_tunables.kgn_max_retransmits) { + GNIDBG_TX(D_NETERROR, tx, + "giving up on TX, too many retries", NULL); + should_retry = 0; + } + + GNIDBG_TX(D_NETERROR, tx, "RDMA %s error (%s)", + should_retry ? "transient" : "unrecoverable", err_str); + + if (tx->tx_msg.gnm_type == GNILND_MSG_PUT_DONE) { + if (should_retry) { + kgnilnd_rdma(tx, GNILND_MSG_PUT_DONE, + &tx->tx_putinfo.gnpam_desc, + tx->tx_putinfo.gnpam_desc.gnrd_nob, + tx->tx_putinfo.gnpam_dst_cookie); + } else { + kgnilnd_nak_rdma(conn, GNILND_MSG_PUT_NAK, + -EFAULT, + tx->tx_putinfo.gnpam_dst_cookie, + tx->tx_msg.gnm_srcnid); + kgnilnd_tx_done(tx, -EFAULT); + } + } else { + if (should_retry) { + kgnilnd_rdma(tx, GNILND_MSG_GET_DONE, + &tx->tx_getinfo.gngm_desc, + tx->tx_lntmsg[0]->msg_len, + tx->tx_getinfo.gngm_cookie); + } else { + kgnilnd_nak_rdma(conn, GNILND_MSG_GET_NAK, + -EFAULT, + tx->tx_getinfo.gngm_cookie, + tx->tx_msg.gnm_srcnid); + kgnilnd_tx_done(tx, -EFAULT); + } + } + + /* drop ref from kgnilnd_validate_tx_ev_id */ + kgnilnd_conn_decref(conn); + } +} + +int +kgnilnd_check_fma_send_cq(kgn_device_t *dev) +{ + gni_return_t rrc; + __u64 event_data; + kgn_tx_ev_id_t ev_id; + kgn_tx_t *tx = NULL; + kgn_conn_t *conn = NULL; + int queued_fma, saw_reply, rc; + long num_processed = 0; + + for (;;) { + /* make sure we don't keep looping if we need to reset */ + if (unlikely(kgnilnd_data.kgn_quiesce_trigger)) { + return num_processed; + } + + rc = kgnilnd_mutex_trylock(&dev->gnd_cq_mutex); + if (!rc) { + /* we didn't get the mutex, so return that there is still work + * to be done */ + return 1; + } + + rrc = kgnilnd_cq_get_event(dev->gnd_snd_fma_cqh, &event_data); + mutex_unlock(&dev->gnd_cq_mutex); + + if (rrc == GNI_RC_NOT_DONE) { + CDEBUG(D_INFO, + "SMSG send CQ %d not ready (data "LPX64") " + "processed %ld\n", dev->gnd_id, event_data, + num_processed); + return num_processed; + } + + dev->gnd_sched_alive = jiffies; + num_processed++; + + LASSERTF(!GNI_CQ_OVERRUN(event_data), + "this is bad, somehow our credits didn't " + "protect us from CQ overrun\n"); + LASSERTF(GNI_CQ_GET_TYPE(event_data) == GNI_CQ_EVENT_TYPE_SMSG, + "rrc %d, 
GNI_CQ_GET_TYPE("LPX64") = "LPX64"\n", rrc, + event_data, GNI_CQ_GET_TYPE(event_data)); + + /* if SMSG couldn't handle an error, time for conn to die */ + if (unlikely(rrc == GNI_RC_TRANSACTION_ERROR)) { + char err_str[256]; + + /* need to take the write_lock to ensure atomicity + * on the conn state if we need to close it */ + write_lock(&kgnilnd_data.kgn_peer_conn_lock); + conn = kgnilnd_cqid2conn_locked(GNI_CQ_GET_INST_ID(event_data)); + if (conn == NULL) { + /* Conn was destroyed? */ + CDEBUG(D_NET, + "SMSG CQID lookup "LPX64" failed\n", + GNI_CQ_GET_INST_ID(event_data)); + write_unlock(&kgnilnd_data.kgn_peer_conn_lock); + continue; + } + + kgnilnd_cq_error_str(event_data, &err_str, 256); + CNETERR("SMSG send error to %s: rc %d (%s)\n", + libcfs_nid2str(conn->gnc_peer->gnp_nid), + rrc, err_str); + kgnilnd_close_conn_locked(conn, -ECOMM); + + write_unlock(&kgnilnd_data.kgn_peer_conn_lock); + + /* no need to process rest of this tx - + * it is getting canceled */ + continue; + } + + /* fall through to GNI_RC_SUCCESS case */ + ev_id.txe_smsg_id = GNI_CQ_GET_MSG_ID(event_data); + + kgnilnd_validate_tx_ev_id(&ev_id, &tx, &conn); + if (conn == NULL || tx == NULL) { + /* either conn or tx was already nuked and this is a "late" + * completion, so drop it */ + continue; + } + + tx->tx_conn->gnc_last_tx_cq = jiffies; + if (tx->tx_msg.gnm_type == GNILND_MSG_NOOP) { + set_mb(conn->gnc_last_noop_cq, jiffies); + } + + /* lock tx_list_state and tx_state */ + spin_lock(&tx->tx_conn->gnc_list_lock); + + GNITX_ASSERTF(tx, tx->tx_list_state == GNILND_TX_LIVE_FMAQ, + "state not GNILND_TX_LIVE_FMAQ", NULL); + GNITX_ASSERTF(tx, tx->tx_state & GNILND_TX_WAITING_COMPLETION, + "not waiting for completion", NULL); + + GNIDBG_TX(D_NET, tx, "SMSG complete tx_state %x rc %d", + tx->tx_state, rrc); + + tx->tx_state &= ~GNILND_TX_WAITING_COMPLETION; + + /* This will trigger other FMA sends that were + * pending this completion */ + queued_fma = !list_empty(&tx->tx_conn->gnc_fmaq); + + /* we either did not expect reply or we already got it */ + saw_reply = !(tx->tx_state & GNILND_TX_WAITING_REPLY); + + spin_unlock(&tx->tx_conn->gnc_list_lock); + + if (queued_fma) { + CDEBUG(D_NET, "scheduling conn 0x%p->%s for fmaq\n", + conn, + libcfs_nid2str(conn->gnc_peer->gnp_nid)); + kgnilnd_schedule_conn(conn); + } + + /* If saw_reply is false as soon as gnc_list_lock is dropped the tx could be nuked + * If saw_reply is true we know that the tx is safe to use as the other thread + * is already finished with it. 
+ */ + + if (saw_reply) { + /* no longer need to track on the live_fmaq */ + kgnilnd_tx_del_state_locked(tx, NULL, tx->tx_conn, GNILND_TX_ALLOCD); + + if (tx->tx_state & GNILND_TX_PENDING_RDMA) { + /* we already got reply & were waiting for + * completion of initial send */ + /* to initiate RDMA transaction */ + GNIDBG_TX(D_NET, tx, + "Pending RDMA 0x%p type 0x%02x", + tx->tx_msg.gnm_type); + tx->tx_state &= ~GNILND_TX_PENDING_RDMA; + rc = kgnilnd_send_mapped_tx(tx, 0); + GNITX_ASSERTF(tx, rc == 0, "RDMA send failed: %d\n", rc); + } else { + /* we are done with this tx */ + GNIDBG_TX(D_NET, tx, + "Done with tx type 0x%02x", + tx->tx_msg.gnm_type); + kgnilnd_tx_done(tx, tx->tx_rc); + } + } + + /* drop ref from kgnilnd_validate_tx_ev_id */ + kgnilnd_conn_decref(conn); + + /* if we are waiting for a REPLY, we'll handle the tx then */ + } /* end for loop */ +} + +int +kgnilnd_check_fma_rcv_cq(kgn_device_t *dev) +{ + kgn_conn_t *conn; + gni_return_t rrc; + __u64 event_data; + long num_processed = 0; + struct list_head *conns; + struct list_head *tmp; + int rc; + + for (;;) { + /* make sure we don't keep looping if we need to reset */ + if (unlikely(kgnilnd_data.kgn_quiesce_trigger)) { + return num_processed; + } + + rc = kgnilnd_mutex_trylock(&dev->gnd_cq_mutex); + if (!rc) { + /* we didn't get the mutex, so return that there is still work + * to be done */ + return 1; + } + rrc = kgnilnd_cq_get_event(dev->gnd_rcv_fma_cqh, &event_data); + mutex_unlock(&dev->gnd_cq_mutex); + + if (rrc == GNI_RC_NOT_DONE) { + CDEBUG(D_INFO, "SMSG RX CQ %d empty data "LPX64" " + "processed %ld\n", + dev->gnd_id, event_data, num_processed); + return num_processed; + } + dev->gnd_sched_alive = jiffies; + num_processed++; + + /* this is the only CQ that can really handle transient + * CQ errors */ + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CQ_GET_EVENT)) { + rrc = cfs_fail_val ? cfs_fail_val + : GNI_RC_ERROR_RESOURCE; + if (rrc == GNI_RC_ERROR_RESOURCE) { + /* set overrun too */ + event_data |= (1UL << 63); + LASSERTF(GNI_CQ_OVERRUN(event_data), + "(1UL << 63) is no longer the bit to" + "set to indicate CQ_OVERRUN\n"); + } + } + /* sender should get error event too and take care + of failed transaction by re-transmitting */ + if (rrc == GNI_RC_TRANSACTION_ERROR) { + CDEBUG(D_NET, "SMSG RX CQ error "LPX64"\n", event_data); + continue; + } + + if (likely(!GNI_CQ_OVERRUN(event_data))) { + read_lock(&kgnilnd_data.kgn_peer_conn_lock); + conn = kgnilnd_cqid2conn_locked( + GNI_CQ_GET_INST_ID(event_data)); + if (conn == NULL) { + CDEBUG(D_NET, "SMSG RX CQID lookup "LPU64" " + "failed, dropping event "LPX64"\n", + GNI_CQ_GET_INST_ID(event_data), + event_data); + } else { + CDEBUG(D_NET, "SMSG RX: CQID "LPU64" " + "conn %p->%s\n", + GNI_CQ_GET_INST_ID(event_data), + conn, conn->gnc_peer ? + libcfs_nid2str(conn->gnc_peer->gnp_nid) : + ""); + + conn->gnc_last_rx_cq = jiffies; + + /* stash first rx so we can clear out purgatory. 
+ */ + if (conn->gnc_first_rx == 0) { + conn->gnc_first_rx = jiffies; + } + kgnilnd_peer_alive(conn->gnc_peer); + kgnilnd_schedule_conn(conn); + } + read_unlock(&kgnilnd_data.kgn_peer_conn_lock); + continue; + } + + /* FMA CQ has overflowed: check ALL conns */ + CNETERR("SMSG RX CQ overflow: scheduling ALL " + "conns on device %d\n", dev->gnd_id); + + for (rc = 0; rc < *kgnilnd_tunables.kgn_peer_hash_size; rc++) { + + read_lock(&kgnilnd_data.kgn_peer_conn_lock); + conns = &kgnilnd_data.kgn_conns[rc]; + + list_for_each(tmp, conns) { + conn = list_entry(tmp, kgn_conn_t, + gnc_hashlist); + + if (conn->gnc_device == dev) { + kgnilnd_schedule_conn(conn); + conn->gnc_last_rx_cq = jiffies; + } + } + + /* don't block write lockers for too long... */ + read_unlock(&kgnilnd_data.kgn_peer_conn_lock); + } + } +} + +/* try_map_if_full should only be used when processing TX from list of + * backlog TX waiting on mappings to free up + * + * Return Codes: + * try_map_if_full = 0: 0 (sent or queued), (-|+)errno failure of kgnilnd_sendmsg + * try_map_if_full = 1: 0 (sent), -ENOMEM for caller to requeue, (-|+)errno failure of kgnilnd_sendmsg */ + +int +kgnilnd_send_mapped_tx(kgn_tx_t *tx, int try_map_if_full) +{ + /* slight bit of race if multiple people calling, but at worst we'll have + * order altered just a bit... which would not be determenistic anyways */ + int rc = atomic_read(&tx->tx_conn->gnc_device->gnd_nq_map); + + GNIDBG_TX(D_NET, tx, "try %d nq_map %d", try_map_if_full, rc); + + /* We know that we have a GART reservation that should guarantee forward progress. + * This means we don't need to take any extraordinary efforts if we are failing + * mappings here - even if we are holding a very small number of these. */ + + if (try_map_if_full || (rc == 0)) { + rc = kgnilnd_map_buffer(tx); + } + + /* rc should be 0 if we mapped succesfully here, if non-zero we are queueing */ + if (rc != 0) { + /* if try_map_if_full set, they handle requeuing */ + if (unlikely(try_map_if_full)) { + RETURN(rc); + } else { + spin_lock(&tx->tx_conn->gnc_device->gnd_lock); + kgnilnd_tx_add_state_locked(tx, NULL, tx->tx_conn, GNILND_TX_MAPQ, 1); + spin_unlock(&tx->tx_conn->gnc_device->gnd_lock); + /* make sure we wake up sched to run this */ + kgnilnd_schedule_device(tx->tx_conn->gnc_device); + /* return 0 as this is now queued for later sending */ + RETURN(0); + } + } + + switch (tx->tx_msg.gnm_type) { + default: + LBUG(); + break; + /* GET_REQ and PUT_ACK are outbound messages sending our mapping key to + * remote node where the RDMA will be started + * Special case -EAGAIN logic - this should just queued as if the mapping couldn't + * be satisified. 
The rest of the errors are "hard" errors that require + * upper layers to handle themselves */ + case GNILND_MSG_GET_REQ: + tx->tx_msg.gnm_u.get.gngm_desc.gnrd_key = tx->tx_map_key; + tx->tx_msg.gnm_u.get.gngm_cookie = tx->tx_id.txe_cookie; + tx->tx_msg.gnm_u.get.gngm_desc.gnrd_addr = (__u64)((unsigned long)tx->tx_buffer); + tx->tx_msg.gnm_u.get.gngm_desc.gnrd_nob = tx->tx_nob; + tx->tx_state = GNILND_TX_WAITING_COMPLETION | GNILND_TX_WAITING_REPLY; + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_GET_REQ_AGAIN)) { + tx->tx_state |= GNILND_TX_FAIL_SMSG; + } + /* redirect to FMAQ on failure, no need to infinite loop here in MAPQ */ + rc = kgnilnd_sendmsg(tx, NULL, 0, &tx->tx_conn->gnc_list_lock, GNILND_TX_FMAQ); + break; + case GNILND_MSG_PUT_ACK: + tx->tx_msg.gnm_u.putack.gnpam_desc.gnrd_key = tx->tx_map_key; + tx->tx_state = GNILND_TX_WAITING_COMPLETION | GNILND_TX_WAITING_REPLY; + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_PUT_ACK_AGAIN)) { + tx->tx_state |= GNILND_TX_FAIL_SMSG; + } + /* redirect to FMAQ on failure, no need to infinite loop here in MAPQ */ + rc = kgnilnd_sendmsg(tx, NULL, 0, &tx->tx_conn->gnc_list_lock, GNILND_TX_FMAQ); + break; + + /* PUT_REQ and GET_DONE are where we do the actual RDMA */ + case GNILND_MSG_PUT_REQ: + kgnilnd_rdma(tx, GNILND_MSG_PUT_DONE, + &tx->tx_putinfo.gnpam_desc, + tx->tx_putinfo.gnpam_desc.gnrd_nob, + tx->tx_putinfo.gnpam_dst_cookie); + break; + case GNILND_MSG_GET_DONE: + kgnilnd_rdma(tx, GNILND_MSG_GET_DONE, + &tx->tx_getinfo.gngm_desc, + tx->tx_lntmsg[0]->msg_len, + tx->tx_getinfo.gngm_cookie); + + break; + } + + RETURN(rc); +} + +void +kgnilnd_process_fmaq(kgn_conn_t *conn) +{ + int more_to_do = 0; + kgn_tx_t *tx = NULL; + void *buffer = NULL; + unsigned int nob = 0; + int rc; + + /* NB 1. kgnilnd_sendmsg() may fail if I'm out of credits right now. + * However I will be rescheduled by an FMA completion event + * when I eventually get some. + * NB 2. Sampling gnc_state here races with setting it elsewhere. + * But it doesn't matter if I try to send a "real" message just + * as I start closing because I'll get scheduled to send the + * close anyway. */ + + /* Short circuit if the ep_handle is null we cant send anyway. 
*/ + if (conn->gnc_ephandle == NULL) + return; + + LASSERTF(!conn->gnc_close_sent, "Conn %p close was sent\n", conn); + + spin_lock(&conn->gnc_list_lock); + + if (list_empty(&conn->gnc_fmaq)) { + int keepalive = GNILND_TO2KA(conn->gnc_timeout); + + spin_unlock(&conn->gnc_list_lock); + + if (time_after_eq(jiffies, conn->gnc_last_tx + cfs_time_seconds(keepalive))) { + CDEBUG(D_NET, "sending NOOP -> %s (%p idle %lu(%d)) " + "last %lu/%lu/%lu %lus/%lus/%lus\n", + libcfs_nid2str(conn->gnc_peer->gnp_nid), conn, + cfs_duration_sec(jiffies - conn->gnc_last_tx), + keepalive, + conn->gnc_last_noop_want, conn->gnc_last_noop_sent, + conn->gnc_last_noop_cq, + cfs_duration_sec(jiffies - conn->gnc_last_noop_want), + cfs_duration_sec(jiffies - conn->gnc_last_noop_sent), + cfs_duration_sec(jiffies - conn->gnc_last_noop_cq)); + atomic_inc(&conn->gnc_sched_noop); + set_mb(conn->gnc_last_noop_want, jiffies); + + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_NOOP_SEND)) + return; + + tx = kgnilnd_new_tx_msg(GNILND_MSG_NOOP, conn->gnc_peer->gnp_net->gnn_ni->ni_nid); + if (tx != NULL) { + int rc; + + rc = kgnilnd_set_tx_id(tx, conn); + if (rc != 0) { + kgnilnd_tx_done(tx, rc); + return; + } + } + } + } else { + tx = list_first_entry(&conn->gnc_fmaq, kgn_tx_t, tx_list); + /* move from fmaq to allocd, kgnilnd_sendmsg will move to live_fmaq */ + kgnilnd_tx_del_state_locked(tx, NULL, conn, GNILND_TX_ALLOCD); + more_to_do = !list_empty(&conn->gnc_fmaq); + spin_unlock(&conn->gnc_list_lock); + } + + /* if there is no real TX or no NOOP to send, bail */ + if (tx == NULL) { + return; + } + + if (!tx->tx_retrans) + tx->tx_cred_wait = jiffies; + + GNITX_ASSERTF(tx, tx->tx_id.txe_smsg_id != 0, + "tx with zero id", NULL); + + CDEBUG(D_NET, "sending regular msg: %p, type %s(0x%02x), cookie "LPX64"\n", + tx, kgnilnd_msgtype2str(tx->tx_msg.gnm_type), + tx->tx_msg.gnm_type, tx->tx_id.txe_cookie); + + rc = 0; + + switch (tx->tx_msg.gnm_type) { + default: + LBUG(); + + case GNILND_MSG_NOOP: + case GNILND_MSG_CLOSE: + case GNILND_MSG_IMMEDIATE: + tx->tx_state = GNILND_TX_WAITING_COMPLETION; + buffer = tx->tx_buffer; + nob = tx->tx_nob; + break; + + case GNILND_MSG_GET_DONE: + case GNILND_MSG_PUT_DONE: + case GNILND_MSG_PUT_NAK: + case GNILND_MSG_GET_NAK: + tx->tx_state = GNILND_TX_WAITING_COMPLETION; + break; + + case GNILND_MSG_PUT_REQ: + tx->tx_msg.gnm_u.putreq.gnprm_cookie = tx->tx_id.txe_cookie; + + case GNILND_MSG_PUT_ACK: + case GNILND_MSG_GET_REQ: + /* This is really only to handle the retransmit of SMSG once these + * two messages are setup in send_mapped_tx */ + tx->tx_state = GNILND_TX_WAITING_COMPLETION | GNILND_TX_WAITING_REPLY; + break; + } + + if (likely(rc == 0)) { + rc = kgnilnd_sendmsg(tx, buffer, nob, &conn->gnc_list_lock, GNILND_TX_FMAQ); + } + + if (rc > 0) { + /* don't explicitly reschedule here - we are short credits and will rely on + * kgnilnd_sendmsg to resched the conn if need be */ + more_to_do = 0; + } else if (rc < 0) { + /* bail: it wasn't sent and we didn't get EAGAIN indicating we should retrans + * almost certainly a software bug, but lets play nice with the other kids */ + kgnilnd_tx_done(tx, rc); + /* just for fun, kick peer in arse - resetting conn might help to correct + * this almost certainly buggy software caused return code */ + kgnilnd_close_conn(conn, rc); + } + + if (more_to_do) { + CDEBUG(D_NET, "Rescheduling %p (more to do)\n", conn); + kgnilnd_schedule_conn(conn); + } +} + +int +kgnilnd_process_rdmaq(kgn_device_t *dev) +{ + int found_work = 0; + kgn_tx_t *tx; + + if 
(CFS_FAIL_CHECK(CFS_FAIL_GNI_DELAY_RDMAQ)) { + RETURN(found_work); + } + + if (time_after_eq(jiffies, dev->gnd_rdmaq_deadline)) { + unsigned long dead_bump; + long new_ok; + + /* if we think we need to adjust, take lock to serialize and recheck */ + spin_lock(&dev->gnd_rdmaq_lock); + if (time_after_eq(jiffies, dev->gnd_rdmaq_deadline)) { + del_singleshot_timer_sync(&dev->gnd_rdmaq_timer); + + dead_bump = cfs_time_seconds(1) / *kgnilnd_tunables.kgn_rdmaq_intervals; + + /* roll the bucket forward */ + dev->gnd_rdmaq_deadline = jiffies + dead_bump; + + if (kgnilnd_data.kgn_rdmaq_override && + (*kgnilnd_tunables.kgn_rdmaq_intervals != 0)) { + new_ok = kgnilnd_data.kgn_rdmaq_override / *kgnilnd_tunables.kgn_rdmaq_intervals; + } else { + new_ok = ~0UL >> 1; + } + + /* roll current outstanding forward to make sure we carry outstanding + * committment forward + * new_ok starts out as the whole interval value + * - first subtract bytes_out from last interval, as that would push us over + * strict limits for this interval + * - second, set bytes_ok to new_ok to ensure it doesn't exceed the current auth + * + * there is a small race here if someone is actively processing mappings and + * adding to rdmaq_bytes_out, but it should be small as the mappings are triggered + * quite quickly after kgnilnd_auth_rdma_bytes gives us the go-ahead + * - if this gives us problems in the future, we could use a read/write lock + * to protect the resetting of these values */ + new_ok -= atomic64_read(&dev->gnd_rdmaq_bytes_out); + atomic64_set(&dev->gnd_rdmaq_bytes_ok, new_ok); + + CDEBUG(D_NET, "resetting rdmaq bytes to %ld, deadline +%lu -> %lu, " + "current out %ld\n", + atomic64_read(&dev->gnd_rdmaq_bytes_ok), dead_bump, dev->gnd_rdmaq_deadline, + atomic64_read(&dev->gnd_rdmaq_bytes_out)); + } + spin_unlock(&dev->gnd_rdmaq_lock); + } + + spin_lock(&dev->gnd_rdmaq_lock); + while (!list_empty(&dev->gnd_rdmaq)) { + int rc; + + /* make sure we break out early on quiesce */ + if (unlikely(kgnilnd_data.kgn_quiesce_trigger)) { + /* always break with lock held - we unlock outside loop */ + break; + } + + tx = list_first_entry(&dev->gnd_rdmaq, kgn_tx_t, tx_list); + kgnilnd_tx_del_state_locked(tx, NULL, tx->tx_conn, GNILND_TX_ALLOCD); + found_work++; + + /* sample with lock held, serializing with kgnilnd_complete_closed_conn */ + if (tx->tx_conn->gnc_state != GNILND_CONN_ESTABLISHED) { + /* if conn is dying, mark tx in tx_ref_table for + * kgnilnd_complete_closed_conn to finish up */ + kgnilnd_tx_add_state_locked(tx, NULL, tx->tx_conn, GNILND_TX_DYING, 1); + + /* tx was moved to DYING, get next */ + continue; + } + spin_unlock(&dev->gnd_rdmaq_lock); + + rc = kgnilnd_auth_rdma_bytes(dev, tx); + spin_lock(&dev->gnd_rdmaq_lock); + + if (rc < 0) { + /* no ticket! 
add back to head */ + kgnilnd_tx_add_state_locked(tx, NULL, tx->tx_conn, GNILND_TX_RDMAQ, 0); + /* clear found_work so scheduler threads wait for timer */ + found_work = 0; + break; + } else { + /* TX is GO for launch */ + tx->tx_qtime = jiffies; + kgnilnd_send_mapped_tx(tx, 0); + found_work++; + } + } + spin_unlock(&dev->gnd_rdmaq_lock); + + RETURN(found_work); +} + +static inline void +kgnilnd_swab_rdma_desc(kgn_rdma_desc_t *d) +{ + __swab64s(&d->gnrd_key.qword1); + __swab64s(&d->gnrd_key.qword2); + __swab64s(&d->gnrd_addr); + __swab32s(&d->gnrd_nob); +} + +#define kgnilnd_match_reply_either(w, x, y, z) _kgnilnd_match_reply(w, x, y, z) +#define kgnilnd_match_reply(x, y, z) _kgnilnd_match_reply(x, y, GNILND_MSG_NONE, z) + +kgn_tx_t * +_kgnilnd_match_reply(kgn_conn_t *conn, int type1, int type2, __u64 cookie) +{ + kgn_tx_ev_id_t ev_id; + kgn_tx_t *tx; + + /* we use the cookie from the original TX, so we can find the match + * by parsing that and using the txe_idx */ + ev_id.txe_cookie = cookie; + + tx = conn->gnc_tx_ref_table[ev_id.txe_idx]; + + if (tx != NULL) { + /* check tx to make sure kgni didn't eat it */ + GNITX_ASSERTF(tx, tx->tx_msg.gnm_magic == GNILND_MSG_MAGIC, + "came back from kgni with bad magic %x\n", tx->tx_msg.gnm_magic); + + GNITX_ASSERTF(tx, ((tx->tx_id.txe_idx == ev_id.txe_idx) && + (tx->tx_id.txe_cookie = cookie)), + "conn 0x%p->%s tx_ref_table hosed: wanted " + "txe_cookie "LPX64" txe_idx %d " + "found tx %p cookie "LPX64" txe_idx %d\n", + conn, libcfs_nid2str(conn->gnc_peer->gnp_nid), + cookie, ev_id.txe_idx, + tx, tx->tx_id.txe_cookie, tx->tx_id.txe_idx); + + LASSERTF((((tx->tx_msg.gnm_type == type1) || (tx->tx_msg.gnm_type == type2)) && + (tx->tx_state & GNILND_TX_WAITING_REPLY)), + "Unexpected TX type (%x, %x or %x) " + "or state (%x, expected +%x) " + "matched reply from %s\n", + tx->tx_msg.gnm_type, type1, type2, + tx->tx_state, GNILND_TX_WAITING_REPLY, + libcfs_nid2str(conn->gnc_peer->gnp_nid)); + } else { + CWARN("Unmatched reply %02x, or %02x/"LPX64" from %s\n", + type1, type2, cookie, libcfs_nid2str(conn->gnc_peer->gnp_nid)); + } + return tx; +} + +static inline void +kgnilnd_complete_tx(kgn_tx_t *tx, int rc) +{ + int complete = 0; + kgn_conn_t *conn = tx->tx_conn; + + spin_lock(&conn->gnc_list_lock); + + GNITX_ASSERTF(tx, tx->tx_state & GNILND_TX_WAITING_REPLY, + "not waiting for reply", NULL); + + tx->tx_rc = rc; + tx->tx_state &= ~GNILND_TX_WAITING_REPLY; + + if (!(tx->tx_state & GNILND_TX_WAITING_COMPLETION)) { + kgnilnd_tx_del_state_locked(tx, NULL, conn, GNILND_TX_ALLOCD); + /* sample under lock as follow on steps require gnc_list_lock + * - or call kgnilnd_tx_done which requires no locks held over + * call to lnet_finalize */ + complete = 1; + } + spin_unlock(&conn->gnc_list_lock); + + if (complete) { + kgnilnd_tx_done(tx, tx->tx_rc); + } +} + +static inline void +kgnilnd_finalize_rx_done(kgn_tx_t *tx, kgn_msg_t *msg) +{ + int rc; + kgn_conn_t *conn = tx->tx_conn; + + atomic_inc(&conn->gnc_device->gnd_rdma_nrx); + atomic64_add(tx->tx_nob, &conn->gnc_device->gnd_rdma_rxbytes); + + rc = kgnilnd_verify_rdma_cksum(tx, msg->gnm_payload_cksum); + + kgnilnd_complete_tx(tx, rc); +} + +void +kgnilnd_check_fma_rx(kgn_conn_t *conn) +{ + __u32 seq; + kgn_tx_t *tx; + kgn_rx_t *rx; + kgn_msg_t *msg; + void *prefix; + gni_return_t rrc; + kgn_peer_t *peer = conn->gnc_peer; + kgn_net_t *net; + int rc = 0; + __u16 tmp_cksum = 0, msg_cksum = 0; + int repost = 1, saw_complete; + unsigned long timestamp, newest_last_rx, timeout; + int last_seq; + void *memory = NULL; + 
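+	/* Pull the next SMSG off this connection's mailbox: close the conn instead
+	 * if the RX side has been silent past its timeout, then validate the header
+	 * (checksum, magic/byte-swap, version, source NID, connstamp and sequence
+	 * number) before dispatching on the message type. */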
ENTRY; + + /* Short circuit if the ep_handle is null. + * It's likely that its about to be closed as stale. + */ + if (conn->gnc_ephandle == NULL) + RETURN_EXIT; + + timestamp = jiffies; + mutex_lock(&conn->gnc_device->gnd_cq_mutex); + /* delay in jiffies - we are really concerned only with things that + * result in a schedule() or really holding this off for long times . + * NB - mutex_lock could spin for 2 jiffies before going to sleep to wait */ + conn->gnc_device->gnd_mutex_delay += (long) jiffies - timestamp; + + /* Resample current time as we have no idea how long it took to get the mutex */ + timestamp = jiffies; + + /* We check here when the last time we received an rx, we do this before + * we call getnext in case the thread has been blocked for a while. If we + * havent received an rx since our timeout value we close the connection + * as we should assume the other side has closed the connection. This will + * stop us from sending replies to a mailbox that is already in purgatory. + */ + + timeout = cfs_time_seconds(conn->gnc_timeout); + newest_last_rx = GNILND_LASTRX(conn); + + /* Error injection to validate that timestamp checking works and closing the conn */ + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_RECV_TIMEOUT)) { + timestamp = timestamp + (GNILND_TIMEOUTRX(timeout) * 2); + } + + if (time_after_eq(timestamp, newest_last_rx + (GNILND_TIMEOUTRX(timeout)))) { + GNIDBG_CONN(D_NETERROR|D_CONSOLE, conn, "Cant receive from %s after timeout lapse of %lu; TO %lu", + libcfs_nid2str(conn->gnc_peer->gnp_nid), + cfs_duration_sec(timestamp - newest_last_rx), + cfs_duration_sec(GNILND_TIMEOUTRX(timeout))); + mutex_unlock(&conn->gnc_device->gnd_cq_mutex); + rc = -ETIME; + kgnilnd_close_conn(conn, rc); + RETURN_EXIT; + } + + rrc = kgnilnd_smsg_getnext(conn->gnc_ephandle, &prefix); + + if (rrc == GNI_RC_NOT_DONE) { + mutex_unlock(&conn->gnc_device->gnd_cq_mutex); + CDEBUG(D_INFO, "SMSG RX empty\n"); + RETURN_EXIT; + } + + if (rrc == GNI_RC_INVALID_STATE) { + LIBCFS_ALLOC(memory, conn->gnpr_smsg_attr.buff_size); + if (memory == NULL) { + memory = (void *)0xdeadbeef; + } else { + memcpy(memory, conn->gnpr_smsg_attr.msg_buffer + conn->gnpr_smsg_attr.mbox_offset, conn->gnpr_smsg_attr.buff_size); + } + } + + LASSERTF(rrc == GNI_RC_SUCCESS, + "bad rc %d on conn %p from peer %s mailbox copy %p\n", + rrc, conn, libcfs_nid2str(peer->gnp_nid), memory); + + msg = (kgn_msg_t *)prefix; + + rx = kgnilnd_alloc_rx(); + if (rx == NULL) { + mutex_unlock(&conn->gnc_device->gnd_cq_mutex); + kgnilnd_release_msg(conn); + GNIDBG_MSG(D_NETERROR, msg, "Dropping SMSG RX from 0x%p->%s, no RX memory", + conn, libcfs_nid2str(peer->gnp_nid)); + RETURN_EXIT; + } + + GNIDBG_MSG(D_INFO, msg, "SMSG RX on %p from %s", + conn, libcfs_nid2str(peer->gnp_nid)); + + timestamp = conn->gnc_last_rx; + last_seq = conn->gnc_rx_seq; + + conn->gnc_last_rx = jiffies; + /* stash first rx so we can clear out purgatory + */ + if (conn->gnc_first_rx == 0) + conn->gnc_first_rx = jiffies; + + seq = conn->gnc_rx_seq++; + + /* needs to linger to protect gnc_rx_seq like we do with gnc_tx_seq */ + mutex_unlock(&conn->gnc_device->gnd_cq_mutex); + kgnilnd_peer_alive(conn->gnc_peer); + + rx->grx_msg = msg; + rx->grx_conn = conn; + rx->grx_eager = 0; + rx->grx_received = current_kernel_time(); + + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_NET_LOOKUP)) { + rc = -ENONET; + } else { + rc = kgnilnd_find_net(msg->gnm_srcnid, &net); + } + + if (rc < 0) { + GOTO(out, rc); + } else { + kgnilnd_net_decref(net); + } + + if (*kgnilnd_tunables.kgn_checksum && !msg->gnm_cksum) + 
GNIDBG_MSG(D_WARNING, msg, "no msg header checksum when enabled"); + + /* XXX Nic: Do we need to swab cksum */ + if (msg->gnm_cksum != 0) { + msg_cksum = msg->gnm_cksum; + msg->gnm_cksum = 0; + tmp_cksum = kgnilnd_cksum(msg, sizeof(kgn_msg_t)); + + if (tmp_cksum != msg_cksum) { + GNIDBG_MSG(D_NETERROR, msg, "Bad hdr checksum (%x expected %x)", + tmp_cksum, msg_cksum); + kgnilnd_dump_msg(D_BUFFS, msg); + rc = -ENOKEY; + GOTO(out, rc); + } + } + /* restore checksum for future debug messages */ + msg->gnm_cksum = tmp_cksum; + + if (msg->gnm_magic != GNILND_MSG_MAGIC) { + if (__swab32(msg->gnm_magic) != GNILND_MSG_MAGIC) { + GNIDBG_MSG(D_NETERROR, msg, "Unexpected magic %08x from %s", + msg->gnm_magic, libcfs_nid2str(peer->gnp_nid)); + rc = -EPROTO; + GOTO(out, rc); + } + + __swab32s(&msg->gnm_magic); + __swab16s(&msg->gnm_version); + __swab16s(&msg->gnm_type); + __swab64s(&msg->gnm_srcnid); + __swab64s(&msg->gnm_connstamp); + __swab32s(&msg->gnm_seq); + + /* NB message type checked below; NOT here... */ + switch (msg->gnm_type) { + case GNILND_MSG_PUT_ACK: + kgnilnd_swab_rdma_desc(&msg->gnm_u.putack.gnpam_desc); + break; + + case GNILND_MSG_GET_REQ: + kgnilnd_swab_rdma_desc(&msg->gnm_u.get.gngm_desc); + break; + + default: + break; + } + } + + if (msg->gnm_version != GNILND_MSG_VERSION) { + GNIDBG_MSG(D_NETERROR, msg, "Unexpected protocol version %d from %s", + msg->gnm_version, libcfs_nid2str(peer->gnp_nid)); + rc = -EPROTO; + GOTO(out, rc); + } + + if (LNET_NIDADDR(msg->gnm_srcnid) != LNET_NIDADDR(peer->gnp_nid)) { + GNIDBG_MSG(D_NETERROR, msg, "Unexpected peer %s from %s", + libcfs_nid2str(msg->gnm_srcnid), + libcfs_nid2str(peer->gnp_nid)); + rc = -EPROTO; + GOTO(out, rc); + } + + if (msg->gnm_connstamp != conn->gnc_peer_connstamp) { + GNIDBG_MSG(D_NETERROR, msg, "Unexpected connstamp "LPX64"("LPX64 + " expected) from %s", + msg->gnm_connstamp, conn->gnc_peer_connstamp, + libcfs_nid2str(peer->gnp_nid)); + rc = -EPROTO; + GOTO(out, rc); + } + + if (msg->gnm_seq != seq) { + GNIDBG_MSG(D_NETERROR, msg, "Unexpected sequence number %d(%d expected) from %s", + msg->gnm_seq, seq, libcfs_nid2str(peer->gnp_nid)); + rc = -EPROTO; + GOTO(out, rc); + } + + atomic_inc(&conn->gnc_device->gnd_short_nrx); + + if (msg->gnm_type == GNILND_MSG_CLOSE) { + CDEBUG(D_NETTRACE, "%s sent us CLOSE msg\n", + libcfs_nid2str(conn->gnc_peer->gnp_nid)); + write_lock(&kgnilnd_data.kgn_peer_conn_lock); + conn->gnc_close_recvd = GNILND_CLOSE_RX; + conn->gnc_peer_error = msg->gnm_u.completion.gncm_retval; + /* double check state with lock held */ + if (conn->gnc_state == GNILND_CONN_ESTABLISHED) { + /* only error if we are not already closing */ + if (conn->gnc_peer_error == -ETIMEDOUT) { + unsigned long now = jiffies; + CNETERR("peer 0x%p->%s closed connection 0x%p due to timeout. " + "Is node down? 
" + "RX %d @ %lus/%lus; TX %d @ %lus/%lus; " + "NOOP %lus/%lus/%lus; sched %lus/%lus/%lus ago\n", + conn->gnc_peer, libcfs_nid2str(conn->gnc_peer->gnp_nid), + conn, last_seq, + cfs_duration_sec(now - timestamp), + cfs_duration_sec(now - conn->gnc_last_rx_cq), + conn->gnc_tx_seq, + cfs_duration_sec(now - conn->gnc_last_tx), + cfs_duration_sec(now - conn->gnc_last_tx_cq), + cfs_duration_sec(now - conn->gnc_last_noop_want), + cfs_duration_sec(now - conn->gnc_last_noop_sent), + cfs_duration_sec(now - conn->gnc_last_noop_cq), + cfs_duration_sec(now - conn->gnc_last_sched_ask), + cfs_duration_sec(now - conn->gnc_last_sched_do), + cfs_duration_sec(now - conn->gnc_device->gnd_sched_alive)); + } + kgnilnd_close_conn_locked(conn, -ECONNRESET); + } + write_unlock(&kgnilnd_data.kgn_peer_conn_lock); + GOTO(out, rc); + } + + if (conn->gnc_close_recvd) { + GNIDBG_MSG(D_NETERROR, msg, "Unexpected message %s(%d/%d) after CLOSE from %s", + kgnilnd_msgtype2str(msg->gnm_type), + msg->gnm_type, conn->gnc_close_recvd, + libcfs_nid2str(conn->gnc_peer->gnp_nid)); + rc = -EPROTO; + GOTO(out, rc); + } + + if (conn->gnc_state != GNILND_CONN_ESTABLISHED) { + /* XXX Nic: log message received on bad connection state */ + GOTO(out, rc); + } + + switch (msg->gnm_type) { + case GNILND_MSG_NOOP: + /* Nothing to do; just a keepalive */ + break; + + case GNILND_MSG_IMMEDIATE: + /* only get SMSG payload for IMMEDIATE */ + atomic64_add(msg->gnm_payload_len, &conn->gnc_device->gnd_short_rxbytes); + rc = lnet_parse(net->gnn_ni, &msg->gnm_u.immediate.gnim_hdr, + msg->gnm_srcnid, rx, 0); + repost = rc < 0; + break; + + case GNILND_MSG_PUT_REQ: + rc = lnet_parse(net->gnn_ni, &msg->gnm_u.putreq.gnprm_hdr, + msg->gnm_srcnid, rx, 1); + repost = rc < 0; + break; + + case GNILND_MSG_PUT_NAK: + tx = kgnilnd_match_reply_either(conn, GNILND_MSG_PUT_REQ, GNILND_MSG_PUT_ACK, + msg->gnm_u.completion.gncm_cookie); + if (tx == NULL) + break; + + kgnilnd_complete_tx(tx, msg->gnm_u.completion.gncm_retval); + break; + + case GNILND_MSG_PUT_ACK: + tx = kgnilnd_match_reply(conn, GNILND_MSG_PUT_REQ, + msg->gnm_u.putack.gnpam_src_cookie); + if (tx == NULL) + break; + + /* store putack data for later: deferred rdma or re-try */ + tx->tx_putinfo = msg->gnm_u.putack; + + saw_complete = 0; + spin_lock(&tx->tx_conn->gnc_list_lock); + + GNITX_ASSERTF(tx, tx->tx_state & GNILND_TX_WAITING_REPLY, + "not waiting for reply", NULL); + + tx->tx_state &= ~GNILND_TX_WAITING_REPLY; + + if (likely(!(tx->tx_state & GNILND_TX_WAITING_COMPLETION))) { + kgnilnd_tx_del_state_locked(tx, NULL, conn, GNILND_TX_ALLOCD); + /* sample under lock as follow on steps require gnc_list_lock + * - or call kgnilnd_tx_done which requires no locks held over + * call to lnet_finalize */ + saw_complete = 1; + } else { + /* cannot launch rdma if still waiting for fma-msg completion */ + CDEBUG(D_NET, "tx 0x%p type 0x%02x will need to " + "wait for SMSG completion\n", tx, tx->tx_msg.gnm_type); + tx->tx_state |= GNILND_TX_PENDING_RDMA; + } + spin_unlock(&tx->tx_conn->gnc_list_lock); + + if (saw_complete) { + rc = kgnilnd_send_mapped_tx(tx, 0); + if (rc < 0) + kgnilnd_tx_done(tx, rc); + } + break; + + case GNILND_MSG_PUT_DONE: + tx = kgnilnd_match_reply(conn, GNILND_MSG_PUT_ACK, + msg->gnm_u.completion.gncm_cookie); + if (tx == NULL) + break; + + GNITX_ASSERTF(tx, tx->tx_buftype == GNILND_BUF_PHYS_MAPPED || + tx->tx_buftype == GNILND_BUF_VIRT_MAPPED, + "bad tx buftype %d", tx->tx_buftype); + + kgnilnd_finalize_rx_done(tx, msg); + break; + + case GNILND_MSG_GET_REQ: + rc = 
lnet_parse(net->gnn_ni, &msg->gnm_u.get.gngm_hdr, + msg->gnm_srcnid, rx, 1); + repost = rc < 0; + break; + + case GNILND_MSG_GET_NAK: + tx = kgnilnd_match_reply(conn, GNILND_MSG_GET_REQ, + msg->gnm_u.completion.gncm_cookie); + if (tx == NULL) + break; + + GNITX_ASSERTF(tx, tx->tx_buftype == GNILND_BUF_PHYS_MAPPED || + tx->tx_buftype == GNILND_BUF_VIRT_MAPPED, + "bad tx buftype %d", tx->tx_buftype); + + kgnilnd_complete_tx(tx, msg->gnm_u.completion.gncm_retval); + break; + + case GNILND_MSG_GET_DONE: + tx = kgnilnd_match_reply(conn, GNILND_MSG_GET_REQ, + msg->gnm_u.completion.gncm_cookie); + if (tx == NULL) + break; + + GNITX_ASSERTF(tx, tx->tx_buftype == GNILND_BUF_PHYS_MAPPED || + tx->tx_buftype == GNILND_BUF_VIRT_MAPPED, + "bad tx buftype %d", tx->tx_buftype); + + lnet_set_reply_msg_len(net->gnn_ni, tx->tx_lntmsg[1], + msg->gnm_u.completion.gncm_retval); + + kgnilnd_finalize_rx_done(tx, msg); + break; + } + + out: + if (rc < 0) /* protocol/comms error */ + kgnilnd_close_conn(conn, rc); + + if (repost && rx != NULL) { + kgnilnd_consume_rx(rx); + } + + /* we got an event so assume more there and call for reschedule */ + if (rc >= 0) + kgnilnd_schedule_conn(conn); + EXIT; +} + +/* Do the failure injections that we need to affect conn processing in the following function. + * When writing tests that use this function make sure to use a fail_loc with a fail mask. + * If you dont you can cause the scheduler threads to spin on the conn without it leaving + * process_conns. + * + * intent is used to signal the calling function whether or not the conn needs to be rescheduled. + */ + +static inline int +kgnilnd_check_conn_fail_loc(kgn_device_t *dev, kgn_conn_t *conn, int *intent) +{ + int rc = 0; + + /* short circuit out when not set */ + if (likely(!cfs_fail_loc)) { + RETURN(rc); + } + + /* failure injection to test for stack reset clean ups */ + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_DROP_CLOSING)) { + /* we can't rely on busy loops being nice enough to get the + * stack reset triggered - it'd just spin on this conn */ + CFS_RACE(CFS_FAIL_GNI_DROP_CLOSING); + rc = 1; + *intent = 1; + GOTO(did_fail_loc, rc); + } + + if (conn->gnc_state == GNILND_CONN_DESTROY_EP) { + /* DESTROY_EP set in kgnilnd_conn_decref on gnc_refcount = 1 */ + + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_DROP_DESTROY_EP)) { + CFS_RACE(CFS_FAIL_GNI_DROP_DESTROY_EP); + rc = 1; + *intent = 1; + GOTO(did_fail_loc, rc); + } + } + + /* CFS_FAIL_GNI_FINISH_PURG2 is used to stop a connection from fully closing. This scheduler + * will spin on the CFS_FAIL_TIMEOUT until the fail_loc is cleared at which time the connection + * will be closed by kgnilnd_complete_closed_conn. 
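kgnilnd_check_conn_fail_loc shows the usual shape of fail_loc style fault injection: a single global knob that is almost free to test when unset, and that lets a test force rarely taken error paths on demand. A hedged standalone imitation of that shape (FAIL_CHECK and the id values below are stand-ins, not the libcfs implementation):

#include <stdint.h>
#include <stdio.h>

/* one global knob, normally 0; tests poke a site id into it */
static volatile uint32_t fail_loc;

/* cheap check: a single load and compare in the common (unset) case */
#define FAIL_CHECK(id)  (__builtin_expect(fail_loc != 0, 0) && \
			 fail_loc == (id))

#define FAIL_DROP_CLOSING  0x1401

static int process_conn(int conn_state)
{
	/* failure injection sits ahead of the real work so tests can
	 * force the rarely taken error/cleanup paths on demand */
	if (FAIL_CHECK(FAIL_DROP_CLOSING))
		return -1;              /* pretend the close was dropped */

	return conn_state;              /* normal processing */
}

int main(void)
{
	printf("normal:   %d\n", process_conn(3));
	fail_loc = FAIL_DROP_CLOSING;   /* what a test harness would set */
	printf("injected: %d\n", process_conn(3));
	return 0;
}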
+ */ + if ((conn->gnc_state == GNILND_CONN_CLOSED) && CFS_FAIL_CHECK(CFS_FAIL_GNI_FINISH_PURG2)) { + while (CFS_FAIL_TIMEOUT(CFS_FAIL_GNI_FINISH_PURG2, 1)) {}; + rc = 1; + *intent = 1; + GOTO(did_fail_loc, rc); + } + + /* this one is a bit gross - we can't hold the mutex from process_conns + * across a CFS_RACE here - it'd block the conn threads from doing an ep_bind + * and moving onto finish_connect + * so, we'll just set the rc - kgnilnd_process_conns will clear + * found_work on a fail_loc, getting the scheduler thread to call schedule() + * and effectively getting this thread to sleep */ + if ((conn->gnc_state == GNILND_CONN_CLOSED) && CFS_FAIL_CHECK(CFS_FAIL_GNI_FINISH_PURG)) { + rc = 1; + *intent = 1; + GOTO(did_fail_loc, rc); + } + +did_fail_loc: + RETURN(rc); +} + +static inline void +kgnilnd_send_conn_close(kgn_conn_t *conn) +{ + kgn_tx_t *tx; + + /* we are closing the conn - we will try to send the CLOSE msg + * but will not wait for anything else to flush */ + + /* send the close if not already done so or received one */ + if (!conn->gnc_close_sent && !conn->gnc_close_recvd) { + /* set close_sent regardless of the success of the + * CLOSE message. We are going to try once and then + * kick him out of the sandbox */ + conn->gnc_close_sent = 1; + mb(); + + /* EP might be null already if remote side initiated a new connection. + * kgnilnd_finish_connect destroys existing ep_handles before wiring up the new connection, + * so this check is here to make sure we dont attempt to send with a null ep_handle. + */ + if (conn->gnc_ephandle != NULL) { + int rc = 0; + + tx = kgnilnd_new_tx_msg(GNILND_MSG_CLOSE, conn->gnc_peer->gnp_net->gnn_ni->ni_nid); + if (tx != NULL) { + tx->tx_msg.gnm_u.completion.gncm_retval = conn->gnc_error; + tx->tx_state = GNILND_TX_WAITING_COMPLETION; + tx->tx_qtime = jiffies; + + if (tx->tx_id.txe_idx == 0) { + rc = kgnilnd_set_tx_id(tx, conn); + if (rc != 0) { + kgnilnd_tx_done(tx, rc); + } + } + + CDEBUG(D_NETTRACE, "sending close with errno %d\n", + conn->gnc_error); + + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CLOSE_SEND)) { + kgnilnd_tx_done(tx, -EAGAIN); + } else if (!rc) { + rc = kgnilnd_sendmsg(tx, NULL, 0, NULL, GNILND_TX_FMAQ); + if (rc) { + /* It wasnt sent and we dont care. 
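kgnilnd_send_conn_close is deliberately best effort: gnc_close_sent is latched before the attempt, the CLOSE is tried once, and a send failure is simply ignored. A standalone sketch of the same one-shot intent, here expressed with a C11 atomic exchange instead of the driver's flag plus mb() (names are hypothetical):

#include <stdatomic.h>
#include <stdio.h>

struct fake_conn {
	atomic_int close_sent;   /* latch: CLOSE attempted at most once */
	int        error;        /* errno to report to the peer */
};

/* pretend transmit that may fail; we do not retry or wait on it */
static int send_close_msg(struct fake_conn *c)
{
	printf("sending CLOSE with errno %d (best effort)\n", c->error);
	return 0;
}

static void conn_close(struct fake_conn *c, int error)
{
	c->error = error;

	/* the atomic exchange makes the latch race-free: only the first
	 * caller sees 0 and gets to try the CLOSE; everyone else skips */
	if (atomic_exchange(&c->close_sent, 1) == 0)
		(void)send_close_msg(c);   /* failure is ignored */
}

int main(void)
{
	struct fake_conn c = { .close_sent = 0 };

	conn_close(&c, -1);   /* pretend timeout error */
	conn_close(&c, -1);   /* latch already set, no second send */
	return 0;
}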
*/ + kgnilnd_tx_done(tx, rc); + } + } + + } + } + } + + conn->gnc_state = GNILND_CONN_CLOSED; + /* mark this conn as CLOSED now that we processed it + * do after TX, so we can use CLOSING in asserts */ + + mb(); + + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_RX_CLOSE_CLOSED)) { + /* simulate a RX CLOSE after the timeout but before + * the scheduler thread gets it */ + conn->gnc_close_recvd = GNILND_CLOSE_INJECT2; + conn->gnc_peer_error = -ETIMEDOUT; + } + /* schedule to allow potential CLOSE and get the complete phase run */ + kgnilnd_schedule_conn(conn); +} + +int +kgnilnd_process_mapped_tx(kgn_device_t *dev) +{ + int found_work = 0; + int rc = 0; + kgn_tx_t *tx; + int max_retrans = *kgnilnd_tunables.kgn_max_retransmits; + int log_retrans, log_retrans_level; + static int last_map_version; + ENTRY; + + spin_lock(&dev->gnd_lock); + if (list_empty(&dev->gnd_map_tx)) { + spin_unlock(&dev->gnd_lock); + RETURN(0); + } + + dev->gnd_sched_alive = jiffies; + + /* we'll retry as fast as possible up to 25% of the limit, then we start + * backing off until our map version changes - indicating we unmapped + * something */ + tx = list_first_entry(&dev->gnd_map_tx, kgn_tx_t, tx_list); + if ((tx->tx_retrans > (max_retrans / 4)) && + (last_map_version == dev->gnd_map_version)) { + GNIDBG_TX(D_NET, tx, "waiting for mapping event event to retry", NULL); + spin_unlock(&dev->gnd_lock); + RETURN(0); + } + + /* stash the last map version to let us know when a good one was seen */ + last_map_version = dev->gnd_map_version; + + /* we need to to take the lock and continually refresh the head of the list as + * kgnilnd_complete_closed_conn might be nuking stuff and we are cycling the lock + * allowing them to squeeze in */ + + while (!list_empty(&dev->gnd_map_tx)) { + /* make sure we break out early on quiesce */ + if (unlikely(kgnilnd_data.kgn_quiesce_trigger)) { + /* always break with lock held - we unlock outside loop */ + break; + } + + tx = list_first_entry(&dev->gnd_map_tx, kgn_tx_t, tx_list); + + kgnilnd_tx_del_state_locked(tx, NULL, tx->tx_conn, GNILND_TX_ALLOCD); + found_work++; + + /* sample with lock held, serializing with kgnilnd_complete_closed_conn */ + if (tx->tx_conn->gnc_state != GNILND_CONN_ESTABLISHED) { + /* if conn is dying, mark tx in tx_ref_table for + * kgnilnd_complete_closed_conn to finish up */ + kgnilnd_tx_add_state_locked(tx, NULL, tx->tx_conn, GNILND_TX_DYING, 1); + found_work++; + + /* tx was moved to DYING, get next */ + continue; + } + + spin_unlock(&dev->gnd_lock); + rc = kgnilnd_send_mapped_tx(tx, 1); + + /* We made it! skip error handling.. */ + if (rc >= 0) { + /* OK to continue on +ve errors as it won't get seen until + * this function is called again - we operate on a copy of the original + * list and not the live list */ + spin_lock(&dev->gnd_lock); + continue; + } else if (rc != -ENOMEM) { + /* carp, failure we can't handle */ + kgnilnd_tx_done(tx, rc); + spin_lock(&dev->gnd_lock); + continue; + } + + /* time to handle the retry cases.. */ + tx->tx_retrans++; + if (tx->tx_retrans == 1) + tx->tx_qtime = jiffies; + + /* only log occasionally once we've retried max / 2 */ + log_retrans = (tx->tx_retrans >= (max_retrans / 2)) && + ((tx->tx_retrans % 32) == 0); + log_retrans_level = log_retrans ? 
D_NETERROR : D_NET; + + /* make sure we are not off in the weeds with this tx */ + if (tx->tx_retrans > *kgnilnd_tunables.kgn_max_retransmits) { + GNIDBG_TX(D_NETERROR, tx, + "giving up on TX, too many retries", NULL); + kgnilnd_tx_done(tx, -ENOMEM); + GOTO(get_out_mapped, rc); + } else { + GNIDBG_TX(log_retrans_level, tx, + "transient map failure #%d %d pages/%d bytes phys %u@%u " + "virt %u@"LPU64" " + "nq_map %d mdd# %d/%d GART %ld", + tx->tx_retrans, tx->tx_phys_npages, tx->tx_nob, + dev->gnd_map_nphys, dev->gnd_map_physnop * PAGE_SIZE, + dev->gnd_map_nvirt, dev->gnd_map_virtnob, + atomic_read(&dev->gnd_nq_map), + atomic_read(&dev->gnd_n_mdd), atomic_read(&dev->gnd_n_mdd_held), + atomic64_read(&dev->gnd_nbytes_map)); + } + + /* we need to stop processing the rest of the list, so add it back in */ + spin_lock(&dev->gnd_lock); + kgnilnd_tx_add_state_locked(tx, NULL, tx->tx_conn, GNILND_TX_MAPQ, 0); + spin_unlock(&dev->gnd_lock); + GOTO(get_out_mapped, rc); + } + spin_unlock(&dev->gnd_lock); +get_out_mapped: + RETURN(found_work); +} + +int +kgnilnd_process_conns(kgn_device_t *dev) +{ + int found_work = 0; + int conn_sched; + int intent = 0; + kgn_conn_t *conn; + + spin_lock(&dev->gnd_lock); + while (!list_empty(&dev->gnd_ready_conns)) { + dev->gnd_sched_alive = jiffies; + + if (unlikely(kgnilnd_data.kgn_quiesce_trigger)) { + /* break with lock held */ + break; + } + + conn = list_first_entry(&dev->gnd_ready_conns, kgn_conn_t, gnc_schedlist); + list_del_init(&conn->gnc_schedlist); + spin_unlock(&dev->gnd_lock); + + conn_sched = xchg(&conn->gnc_scheduled, GNILND_CONN_PROCESS); + + LASSERTF(conn_sched != GNILND_CONN_IDLE && + conn_sched != GNILND_CONN_PROCESS, + "conn %p on ready list but in bad state: %d\n", + conn, conn_sched); + + CDEBUG(D_INFO, "conn %p@%s for processing\n", + conn, kgnilnd_conn_state2str(conn)); + + found_work++; + set_mb(conn->gnc_last_sched_do, jiffies); + + if (kgnilnd_check_conn_fail_loc(dev, conn, &intent)) { + + /* based on intent see if we should run again. */ + kgnilnd_schedule_process_conn(conn, intent); + + /* drop ref from gnd_ready_conns */ + kgnilnd_conn_decref(conn); + /* clear this so that scheduler thread doesn't spin */ + found_work = 0; + /* break with lock held... 
*/ + spin_lock(&dev->gnd_lock); + break; + } + + if (unlikely(conn->gnc_state == GNILND_CONN_CLOSED)) { + /* CONN_CLOSED set in procces_fmaq when CLOSE is sent */ + kgnilnd_complete_closed_conn(conn); + } else if (unlikely(conn->gnc_state == GNILND_CONN_DESTROY_EP)) { + /* DESTROY_EP set in kgnilnd_conn_decref on gnc_refcount = 1 */ + /* serialize SMSG CQs with ep_bind and smsg_release */ + kgnilnd_destroy_conn_ep(conn); + } else if (unlikely(conn->gnc_state == GNILND_CONN_CLOSING)) { + /* if we need to do some CLOSE sending, etc done here do it */ + kgnilnd_send_conn_close(conn); + kgnilnd_check_fma_rx(conn); + } else if (atomic_read(&conn->gnc_peer->gnp_dirty_eps) == 0) { + /* start moving traffic if the old conns are cleared out */ + kgnilnd_check_fma_rx(conn); + kgnilnd_process_fmaq(conn); + } + + kgnilnd_schedule_process_conn(conn, 0); + + /* drop ref from gnd_ready_conns */ + kgnilnd_conn_decref(conn); + + /* check list again with lock held */ + spin_lock(&dev->gnd_lock); + } + spin_unlock(&dev->gnd_lock); + + RETURN(found_work); +} + +int +kgnilnd_scheduler(void *arg) +{ + int threadno = (long)arg; + kgn_device_t *dev; + char name[16]; + int busy_loops = 0; + DEFINE_WAIT(wait); + + dev = &kgnilnd_data.kgn_devices[(threadno + 1) % kgnilnd_data.kgn_ndevs]; + + snprintf(name, sizeof(name), "kgnilnd_sd_%02d", threadno); + cfs_daemonize(name); + cfs_block_allsigs(); + + /* all gnilnd threads need to run fairly urgently */ + set_user_nice(current, *kgnilnd_tunables.kgn_nice); + + while (!kgnilnd_data.kgn_shutdown) { + int found_work = 0; + /* Safe: kgn_shutdown only set when quiescent */ + + /* to quiesce or to not quiesce, that is the question */ + + if (unlikely(kgnilnd_data.kgn_quiesce_trigger)) { + KGNILND_SPIN_QUIESCE; + } + + /* tracking for when thread goes AWOL */ + dev->gnd_sched_alive = jiffies; + + /* let folks know we are up and kicking + * - they can use this for latency savings, etc + * - only change if IRQ, if IDLE leave alone as that + * schedule_device calls to put us back to IRQ */ + (void)cmpxchg(&dev->gnd_ready, GNILND_DEV_IRQ, GNILND_DEV_LOOP); + + /* always check these - they are super low cost */ + found_work += kgnilnd_check_fma_send_cq(dev); + found_work += kgnilnd_check_fma_rcv_cq(dev); + + /* rdma CQ doesn't care about eps */ + found_work += kgnilnd_check_rdma_cq(dev); + + /* move some RDMA ? */ + found_work += kgnilnd_process_rdmaq(dev); + + /* map some pending RDMA requests ? */ + found_work += kgnilnd_process_mapped_tx(dev); + + /* the EP for a conn is not destroyed until all the references + * to it are gone, so these checks should be safe + * even if run in parallel with the CQ checking functions + * _AND_ a thread that processes the CLOSED->DONE + * transistion + * ...should.... */ + + /* process all conns ready now */ + found_work += kgnilnd_process_conns(dev); + + /* do an eager check to avoid the IRQ disabling in + * prepare_to_wait and friends */ + + if (found_work && busy_loops++ < *kgnilnd_tunables.kgn_loops) { + found_work = 0; + if ((busy_loops % 10) == 0) { + /* tickle heartbeat and watchdog to ensure our + * piggishness doesn't turn into heartbeat failure */ + touch_nmi_watchdog(); + if (kgnilnd_hssops.hb_to_l0 != NULL) { + kgnilnd_hssops.hb_to_l0(); + } + } + continue; + } + + /* if we got here, found_work was zero or busy_loops means we + * need to take a break. 
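The scheduler loop here follows a common poll-then-sleep structure: accumulate found_work from several cheap checks, keep spinning while work keeps arriving and a loop budget remains, and only then block. A compressed standalone illustration of that loop shape (LOOP_BUDGET and the check_source_* helpers are invented for the sketch):

#include <stdio.h>

#define LOOP_BUDGET 100   /* stand-in for the kgn_loops tunable */

/* pretend work sources; each returns how much work it found */
static int check_source_a(void) { static int n = 3; return n ? n-- : 0; }
static int check_source_b(void) { return 0; }

static void scheduler_loop(void)
{
	int busy_loops = 0;

	for (;;) {
		int found_work = 0;

		found_work += check_source_a();
		found_work += check_source_b();

		/* eager path: keep polling while work keeps arriving
		 * and the loop budget has not been burned through */
		if (found_work && busy_loops++ < LOOP_BUDGET)
			continue;

		if (busy_loops >= LOOP_BUDGET) {
			/* give other threads a turn, then poll again */
			busy_loops = 0;
			continue;
		}

		/* idle: the real driver blocks on a waitqueue here */
		printf("idle, going to sleep\n");
		break;
	}
}

int main(void)
{
	scheduler_loop();
	return 0;
}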
We'll clear gnd_ready but we'll check + * one last time if there is an IRQ that needs processing */ + + prepare_to_wait(&dev->gnd_waitq, &wait, TASK_INTERRUPTIBLE); + + /* the first time this will go LOOP -> IDLE and let us do one final check + * during which we might get an IRQ, then IDLE->IDLE and schedule() + * - this might allow other threads to block us for a bit if they + * try to get the mutex, but that is good as we'd need to wake + * up soon to handle the CQ or other processing anyways */ + + found_work += xchg(&dev->gnd_ready, GNILND_DEV_IDLE); + + if (busy_loops >= *kgnilnd_tunables.kgn_loops) { + CDEBUG(D_INFO, + "yeilding: found_work %d busy_loops %d\n", + found_work, busy_loops); + busy_loops = 0; + /* use yield if we are bailing due to busy_loops + * - this will ensure we wake up soonish. This closes + * a race with kgnilnd_device_callback - where it'd + * not call wake_up() because gnd_ready == 1, but then + * we come down and schedule() because of busy_loops. + * We'd not be woken up until something poked our waitq + * again. yield() ensures we wake up without another + * waitq poke in that case */ + atomic_inc(&dev->gnd_n_yield); + yield(); + CDEBUG(D_INFO, "awake after yeild\n"); + } else if (found_work == GNILND_DEV_IDLE) { + /* busy_loops is low and there is nothing to do, + * go to sleep and wait for a waitq poke */ + CDEBUG(D_INFO, + "scheduling: found_work %d busy_loops %d\n", + found_work, busy_loops); + atomic_inc(&dev->gnd_n_schedule); + schedule(); + CDEBUG(D_INFO, "awake after schedule\n"); + } + finish_wait(&dev->gnd_waitq, &wait); + } + + kgnilnd_thread_fini(); + return 0; +} diff --git a/lnet/klnds/gnilnd/gnilnd_conn.c b/lnet/klnds/gnilnd/gnilnd_conn.c new file mode 100644 index 0000000..38aee5b --- /dev/null +++ b/lnet/klnds/gnilnd/gnilnd_conn.c @@ -0,0 +1,2408 @@ +/* + * Copyright (C) 2012 Cray, Inc. + * + * Author: Igor Gorodetsky + * Author: Nic Henke + * Author: James Shimek + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ * + */ + +#include "gnilnd.h" + +void +kgnilnd_setup_smsg_attr(gni_smsg_attr_t *smsg_attr) +{ + smsg_attr->mbox_maxcredit = *kgnilnd_tunables.kgn_mbox_credits; + smsg_attr->msg_maxsize = GNILND_MAX_MSG_SIZE; + smsg_attr->msg_type = GNI_SMSG_TYPE_MBOX_AUTO_RETRANSMIT; +} + +int +kgnilnd_map_fmablk(kgn_device_t *device, kgn_fma_memblock_t *fma_blk) +{ + gni_return_t rrc; + __u32 flags = GNI_MEM_READWRITE; + + if (fma_blk->gnm_state == GNILND_FMABLK_PHYS) { + flags |= GNI_MEM_PHYS_CONT; + } + + /* make sure we are mapping a clean block */ + LASSERTF(fma_blk->gnm_hndl.qword1 == 0UL, "fma_blk %p dirty\n", fma_blk); + + rrc = kgnilnd_mem_register(device->gnd_handle, (__u64)fma_blk->gnm_block, + fma_blk->gnm_blk_size, device->gnd_rcv_fma_cqh, + flags, &fma_blk->gnm_hndl); + if (rrc != GNI_RC_SUCCESS) { + /* XXX Nic: need a way to silence this for runtime stuff that is ok to fail + * -- like when under MDD or GART pressure on big systems + */ + CNETERR("register fmablk failed 0x%p mbox_size %d flags %u\n", + fma_blk, fma_blk->gnm_mbox_size, flags); + RETURN(-ENOMEM); + } + + /* PHYS_CONT memory isn't really mapped, at least not in GART - + * but all mappings chew up a MDD + */ + if (fma_blk->gnm_state != GNILND_FMABLK_PHYS) { + atomic64_add(fma_blk->gnm_blk_size, &device->gnd_nbytes_map); + } + + atomic_inc(&device->gnd_n_mdd); + /* nfmablk is live (mapped) blocks */ + atomic_inc(&device->gnd_nfmablk); + + RETURN(0); +} + +int +kgnilnd_alloc_fmablk(kgn_device_t *device, int use_phys) +{ + int rc = 0; + int num_mbox; + kgn_fma_memblock_t *fma_blk; + gni_smsg_attr_t smsg_attr; + unsigned long fmablk_vers; + + /* we'll use fmablk_vers and the gnd_fmablk_sem to gate access + * to this allocation code. Everyone will sample the version + * before and after getting the semaphore. If it has changed, + * we'll bail out to check the lists again - this indicates that + * some sort of change was made to the lists and it is possible + * that there is a mailbox for us to find now. This should prevent + * a ton of spinning in the case where there are lots of threads + * that need a yet-to-be-allocated mailbox for a connection. */ + + fmablk_vers = atomic_read(&device->gnd_fmablk_vers); + down(&device->gnd_fmablk_sem); + + if (fmablk_vers != atomic_read(&device->gnd_fmablk_vers)) { + /* version changed while we were waiting for semaphore, + * we'll recheck the lists assuming something nice happened */ + up(&device->gnd_fmablk_sem); + return 0; + } + + LIBCFS_ALLOC(fma_blk, sizeof(kgn_fma_memblock_t)); + if (fma_blk == NULL) { + CNETERR("could not allocate fma block descriptor\n"); + rc = -ENOMEM; + GOTO(out, rc); + } + + INIT_LIST_HEAD(&fma_blk->gnm_bufflist); + + kgnilnd_setup_smsg_attr(&smsg_attr); + + gni_smsg_buff_size_needed(&smsg_attr, &fma_blk->gnm_mbox_size); + + LASSERTF(fma_blk->gnm_mbox_size, "mbox size %d\n", fma_blk->gnm_mbox_size); + + /* gni_smsg_buff_size_needed calculates the base mailbox size and since + * we want to hold kgn_peer_credits worth of messages in both directions, + * we add PAYLOAD to grow the mailbox size + */ + + fma_blk->gnm_mbox_size += GNILND_MBOX_PAYLOAD; + + /* we'll only use physical during preallocate at startup -- this keeps it nice and + * clean for runtime decisions. 
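kgnilnd_alloc_fmablk gates allocation on a version counter sampled before and after taking the semaphore, so a thread that slept while someone else changed the block lists goes back and rechecks instead of allocating redundantly. A standalone pthread sketch of that double-check pattern (alloc_block_gated and block_version are illustrative names):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t alloc_lock = PTHREAD_MUTEX_INITIALIZER;
static int block_version;     /* bumped whenever the block list changes */
static int blocks_allocated;

/* allocate a new block only if nobody else changed the list while we
 * were waiting for the lock; otherwise tell the caller to re-scan */
static int alloc_block_gated(void)
{
	int seen = __atomic_load_n(&block_version, __ATOMIC_ACQUIRE);

	pthread_mutex_lock(&alloc_lock);
	if (seen != __atomic_load_n(&block_version, __ATOMIC_ACQUIRE)) {
		/* someone allocated/freed while we slept on the lock:
		 * recheck the lists before allocating another block */
		pthread_mutex_unlock(&alloc_lock);
		return 1;
	}

	blocks_allocated++;
	__atomic_add_fetch(&block_version, 1, __ATOMIC_RELEASE);
	pthread_mutex_unlock(&alloc_lock);
	return 0;
}

int main(void)
{
	printf("first alloc rc=%d (allocated=%d)\n",
	       alloc_block_gated(), blocks_allocated);
	printf("second alloc rc=%d (allocated=%d)\n",
	       alloc_block_gated(), blocks_allocated);
	return 0;
}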
We'll keep the PHYS ones around until shutdown + * as reallocating them is tough if there is memory fragmentation */ + + if (use_phys) { + fma_blk->gnm_block = cfs_mem_cache_alloc(kgnilnd_data.kgn_mbox_cache, CFS_ALLOC_ATOMIC); + if (fma_blk->gnm_block == NULL) { + CNETERR("could not allocate physical SMSG mailbox memory\n"); + rc = -ENOMEM; + GOTO(free_desc, rc); + } + fma_blk->gnm_blk_size = KMALLOC_MAX_SIZE; + num_mbox = fma_blk->gnm_blk_size / fma_blk->gnm_mbox_size; + + LASSERTF(num_mbox >= 1, + "num_mbox %d blk_size %u mbox_size %d\n", + num_mbox, fma_blk->gnm_blk_size, fma_blk->gnm_mbox_size); + + fma_blk->gnm_state = GNILND_FMABLK_PHYS; + + } else { + num_mbox = *kgnilnd_tunables.kgn_mbox_per_block; + fma_blk->gnm_blk_size = num_mbox * fma_blk->gnm_mbox_size; + + LASSERTF(num_mbox >= 1 && num_mbox >= *kgnilnd_tunables.kgn_mbox_per_block, + "num_mbox %d blk_size %u mbox_size %d tunable %d\n", + num_mbox, fma_blk->gnm_blk_size, fma_blk->gnm_mbox_size, + *kgnilnd_tunables.kgn_mbox_per_block); + + LIBCFS_ALLOC(fma_blk->gnm_block, fma_blk->gnm_blk_size); + if (fma_blk->gnm_block == NULL) { + CNETERR("could not allocate virtual SMSG mailbox memory, %d bytes\n", fma_blk->gnm_blk_size); + rc = -ENOMEM; + GOTO(free_desc, rc); + } + + fma_blk->gnm_state = GNILND_FMABLK_VIRT; + } + + /* allocate just enough space for the bits to track the mailboxes */ + LIBCFS_ALLOC(fma_blk->gnm_bit_array, BITS_TO_LONGS(num_mbox) * sizeof(unsigned long)); + if (fma_blk->gnm_bit_array == NULL) { + CNETERR("could not allocate mailbox bitmask, %lu bytes for %d mbox\n", + sizeof(unsigned long) * BITS_TO_LONGS(num_mbox), num_mbox); + rc = -ENOMEM; + GOTO(free_blk, rc); + } + bitmap_zero(fma_blk->gnm_bit_array, num_mbox); + + /* now that the num_mbox is set based on allocation type, get debug info setup */ + LIBCFS_ALLOC(fma_blk->gnm_mbox_info, sizeof(kgn_mbox_info_t) * num_mbox); + if (fma_blk->gnm_mbox_info == NULL) { + CNETERR("could not allocate mailbox debug, %lu bytes for %d mbox\n", + sizeof(kgn_mbox_info_t) * num_mbox, num_mbox); + rc = -ENOMEM; + GOTO(free_bit, rc); + } + + rc = kgnilnd_map_fmablk(device, fma_blk); + if (rc) { + GOTO(free_info, rc); + } + + fma_blk->gnm_next_avail_mbox = 0; + fma_blk->gnm_avail_mboxs = fma_blk->gnm_num_mboxs = num_mbox; + + CDEBUG(D_MALLOC, "alloc fmablk 0x%p num %d msg_maxsize %d credits %d " + "mbox_size %d MDD "LPX64"."LPX64"\n", + fma_blk, num_mbox, smsg_attr.msg_maxsize, smsg_attr.mbox_maxcredit, + fma_blk->gnm_mbox_size, fma_blk->gnm_hndl.qword1, + fma_blk->gnm_hndl.qword2); + + /* lock Is protecting data structures, not semaphore */ + + spin_lock(&device->gnd_fmablk_lock); + list_add_tail(&fma_blk->gnm_bufflist, &device->gnd_fma_buffs); + + /* toggle under the lock so once they change the list is also + * ready for others to traverse */ + atomic_inc(&device->gnd_fmablk_vers); + + spin_unlock(&device->gnd_fmablk_lock); + + up(&device->gnd_fmablk_sem); + + return 0; + +free_info: + LIBCFS_FREE(fma_blk->gnm_mbox_info, sizeof(kgn_mbox_info_t)*num_mbox); +free_bit: + LIBCFS_FREE(fma_blk->gnm_bit_array, BITS_TO_LONGS(num_mbox) * sizeof (unsigned long)); +free_blk: + if (fma_blk->gnm_state == GNILND_FMABLK_VIRT) { + LIBCFS_FREE(fma_blk->gnm_block, fma_blk->gnm_blk_size); + } else { + cfs_mem_cache_free(kgnilnd_data.kgn_mbox_cache, fma_blk->gnm_block); + } +free_desc: + LIBCFS_FREE(fma_blk, sizeof(kgn_fma_memblock_t)); +out: + up(&device->gnd_fmablk_sem); + return rc; +} + +void +kgnilnd_unmap_fmablk(kgn_device_t *dev, kgn_fma_memblock_t *fma_blk) +{ + gni_return_t rrc; + + 
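The free_info/free_bit/free_blk/free_desc tail above is the standard kernel-style goto unwind: each acquisition gets a label, and a failure jumps to the label that releases exactly what was obtained before it. A compact standalone example of the same ladder (setup_block and struct block are invented for the sketch):

#include <stdio.h>
#include <stdlib.h>

struct block {
	char *mem;
	char *bitmap;
	char *info;
};

/* each allocation gets a cleanup label; a failure jumps to the label
 * that releases only what was acquired before it */
static int setup_block(struct block *b, size_t nbytes)
{
	int rc = -1;

	b->mem = malloc(nbytes);
	if (b->mem == NULL)
		goto out;

	b->bitmap = malloc(nbytes / 8 + 1);
	if (b->bitmap == NULL)
		goto free_mem;

	b->info = malloc(nbytes / 64 + 1);
	if (b->info == NULL)
		goto free_bitmap;

	return 0;          /* success: caller owns all three buffers */

free_bitmap:
	free(b->bitmap);
free_mem:
	free(b->mem);
out:
	return rc;
}

int main(void)
{
	struct block b;

	if (setup_block(&b, 4096) == 0) {
		printf("block ready\n");
		free(b.info);
		free(b.bitmap);
		free(b.mem);
	}
	return 0;
}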
/* if some held, set hold_timeout from conn timeouts used in this block + * but not during shutdown, then just nuke and pave */ + if (fma_blk->gnm_held_mboxs && (!kgnilnd_data.kgn_shutdown)) { + fma_blk->gnm_hold_timeout = GNILND_TIMEOUT2DEADMAN; + } + + /* we are changing the state of a block, tickle version to tell + * proc code list is stale now */ + atomic_inc(&dev->gnd_fmablk_vers); + + rrc = kgnilnd_mem_deregister(dev->gnd_handle, &fma_blk->gnm_hndl, fma_blk->gnm_hold_timeout); + + CDEBUG(rrc == GNI_RC_SUCCESS ? D_MALLOC : D_CONSOLE|D_NETERROR, + "unmap fmablk 0x%p@%s sz %u total %d avail %d held %d mbox_size %d " + "hold_timeout %d\n", + fma_blk, kgnilnd_fmablk_state2str(fma_blk->gnm_state), + fma_blk->gnm_blk_size, fma_blk->gnm_num_mboxs, + fma_blk->gnm_avail_mboxs, fma_blk->gnm_held_mboxs, + fma_blk->gnm_mbox_size, fma_blk->gnm_hold_timeout); + + LASSERTF(rrc == GNI_RC_SUCCESS, + "tried to double unmap or something bad, fma_blk %p (rrc %d)\n", + fma_blk, rrc); + + if (fma_blk->gnm_hold_timeout) { + atomic_inc(&dev->gnd_n_mdd_held); + } else { + atomic_dec(&dev->gnd_n_mdd); + } + + /* PHYS blocks don't get mapped */ + if (fma_blk->gnm_state != GNILND_FMABLK_PHYS) { + atomic64_sub(fma_blk->gnm_blk_size, &dev->gnd_nbytes_map); + } else if (kgnilnd_data.kgn_in_reset) { + /* in stack reset, clear MDD handle for PHYS blocks, as we'll + * re-use the fma_blk after reset so we don't have to drop/allocate + * all of those physical blocks */ + fma_blk->gnm_hndl.qword1 = fma_blk->gnm_hndl.qword2 = 0UL; + } + + /* Decrement here as this is the # of mapped blocks */ + atomic_dec(&dev->gnd_nfmablk); +} + + +/* needs lock on gnd_fmablk_lock to cover gnd_fma_buffs */ +void +kgnilnd_free_fmablk_locked(kgn_device_t *dev, kgn_fma_memblock_t *fma_blk) +{ + LASSERTF(fma_blk->gnm_avail_mboxs == fma_blk->gnm_num_mboxs, + "fma_blk %p@%d free in bad state (%d): blk total %d avail %d held %d\n", + fma_blk, fma_blk->gnm_state, fma_blk->gnm_hold_timeout, fma_blk->gnm_num_mboxs, + fma_blk->gnm_avail_mboxs, fma_blk->gnm_held_mboxs); + + atomic_inc(&dev->gnd_fmablk_vers); + + if (fma_blk->gnm_hold_timeout) { + CDEBUG(D_MALLOC, "mdd release fmablk 0x%p sz %u avail %d held %d " + "mbox_size %d\n", + fma_blk, fma_blk->gnm_blk_size, fma_blk->gnm_avail_mboxs, + fma_blk->gnm_held_mboxs, fma_blk->gnm_mbox_size); + + /* We leave MDD dangling over stack reset */ + if (!kgnilnd_data.kgn_in_reset) { + kgnilnd_mem_mdd_release(dev->gnd_handle, &fma_blk->gnm_hndl); + } + /* ignoring the return code - if kgni/ghal can't find it + * it must be released already */ + atomic_dec(&dev->gnd_n_mdd_held); + atomic_dec(&dev->gnd_n_mdd); + } + + /* we cant' free the gnm_block until all the conns have released their + * purgatory holds. While we have purgatory holds, we might check the conn + * RX mailbox during the CLOSING process. 
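The hold-timeout choice above boils down to one decision: if mailboxes are still parked for closing peers and this is not a full shutdown, ask for the memory registration to be kept alive for a grace period, otherwise drop it immediately. A trivial standalone sketch of that decision (DEADMAN_TIMEOUT is a stand-in value):

#include <stdio.h>

#define DEADMAN_TIMEOUT 300   /* seconds; stand-in for the deadman value */

/* pick the hold timeout for deregistering a mailbox block: keep the
 * registration alive while any mailbox is parked for a closing peer,
 * unless the whole stack is being torn down anyway */
static int dereg_hold_timeout(int held_mboxs, int shutting_down)
{
	if (held_mboxs && !shutting_down)
		return DEADMAN_TIMEOUT;
	return 0;
}

int main(void)
{
	printf("held, running:  hold %d s\n", dereg_hold_timeout(3, 0));
	printf("held, shutdown: hold %d s\n", dereg_hold_timeout(3, 1));
	printf("idle, running:  hold %d s\n", dereg_hold_timeout(0, 0));
	return 0;
}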
It is possible that kgni might + * try to look into the RX side for credits when sending the CLOSE msg too */ + CDEBUG(D_MALLOC, "fmablk %p free buffer %p mbox_size %d\n", + fma_blk, fma_blk->gnm_block, fma_blk->gnm_mbox_size); + + if (fma_blk->gnm_state == GNILND_FMABLK_PHYS) { + cfs_mem_cache_free(kgnilnd_data.kgn_mbox_cache, fma_blk->gnm_block); + } else { + LIBCFS_FREE(fma_blk->gnm_block, fma_blk->gnm_blk_size); + } + fma_blk->gnm_state = GNILND_FMABLK_FREED; + + list_del(&fma_blk->gnm_bufflist); + + LIBCFS_FREE(fma_blk->gnm_mbox_info, sizeof(kgn_mbox_info_t)*fma_blk->gnm_num_mboxs); + LIBCFS_FREE(fma_blk->gnm_bit_array, BITS_TO_LONGS(fma_blk->gnm_num_mboxs) * sizeof (unsigned long)); + LIBCFS_FREE(fma_blk, sizeof(kgn_fma_memblock_t)); +} + +void +kgnilnd_find_free_mbox(kgn_conn_t *conn) +{ + kgn_device_t *dev = conn->gnc_device; + gni_smsg_attr_t *smsg_attr = &conn->gnpr_smsg_attr; + kgn_fma_memblock_t *fma_blk; + kgn_mbox_info_t *mbox = NULL; + int id; + + spin_lock(&dev->gnd_fmablk_lock); + + list_for_each_entry(fma_blk, &conn->gnc_device->gnd_fma_buffs, + gnm_bufflist) { + if (fma_blk->gnm_avail_mboxs <= 0 || + fma_blk->gnm_state <= GNILND_FMABLK_IDLE) { + continue; + } + /* look in bitarray for available mailbox */ + do { + id = find_next_zero_bit( + fma_blk->gnm_bit_array, + fma_blk->gnm_num_mboxs, + fma_blk->gnm_next_avail_mbox); + if (id == fma_blk->gnm_num_mboxs && + fma_blk->gnm_next_avail_mbox != 0) { + /* wrap around */ + fma_blk->gnm_next_avail_mbox = 0; + } else { + break; + } + } while (1); + + LASSERTF(id < fma_blk->gnm_num_mboxs, "id %d max %d\n", + id, fma_blk->gnm_num_mboxs); + set_bit(id, (volatile unsigned long *)fma_blk->gnm_bit_array); + conn->gnc_mbox_id = id; + + fma_blk->gnm_next_avail_mbox = + (id == (fma_blk->gnm_num_mboxs - 1)) ? 0 : (id + 1); + fma_blk->gnm_avail_mboxs--; + conn->gnc_fma_blk = fma_blk; + + kgnilnd_setup_smsg_attr(smsg_attr); + + smsg_attr->msg_buffer = fma_blk->gnm_block; + smsg_attr->mbox_offset = fma_blk->gnm_mbox_size * id; + smsg_attr->mem_hndl = fma_blk->gnm_hndl; + smsg_attr->buff_size = fma_blk->gnm_mbox_size; + + /* We'll set the hndl to zero for PHYS blocks unmapped during stack + * reset and re-use the same fma_blk after stack reset. This ensures we've + * properly mapped it before we use it */ + LASSERTF(fma_blk->gnm_hndl.qword1 != 0UL, "unmapped fma_blk %p, state %d\n", + fma_blk, fma_blk->gnm_state); + + CDEBUG(D_NET, "conn %p smsg %p fmablk %p " + "allocating SMSG mbox %d buf %p " + "offset %u hndl "LPX64"."LPX64"\n", + conn, smsg_attr, fma_blk, id, + smsg_attr->msg_buffer, smsg_attr->mbox_offset, + fma_blk->gnm_hndl.qword1, + fma_blk->gnm_hndl.qword2); + + mbox = &fma_blk->gnm_mbox_info[id]; + mbox->mbx_create_conn_memset = jiffies; + + /* zero mbox to remove any old data from our last use. 
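kgnilnd_find_free_mbox allocates mailboxes out of a bitmap using a rotating next-free hint with a single wrap back to zero, which spreads reuse across the block instead of always handing out the lowest slot. A standalone sketch of that allocator (mbox_alloc/mbox_free and the 16-slot bitmap are illustrative):

#include <stdio.h>

#define NUM_MBOX 16

static unsigned int bitmap;          /* bit i set => mailbox i in use */
static int next_avail;               /* rotating search hint */

/* find a free mailbox starting at the hint, wrapping once to 0;
 * returns the slot id or -1 if every mailbox is taken */
static int mbox_alloc(void)
{
	int i, id;

	for (i = 0; i < NUM_MBOX; i++) {
		id = (next_avail + i) % NUM_MBOX;
		if (!(bitmap & (1u << id))) {
			bitmap |= 1u << id;
			next_avail = (id + 1) % NUM_MBOX;
			return id;
		}
	}
	return -1;
}

static void mbox_free(int id)
{
	bitmap &= ~(1u << id);
}

int main(void)
{
	int a = mbox_alloc(), b = mbox_alloc();

	printf("got %d and %d\n", a, b);
	mbox_free(a);
	printf("after freeing %d, next alloc gives %d\n", a, mbox_alloc());
	return 0;
}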
+ * this better be safe, if not our purgatory timers + * are too short or a peer really is misbehaving */ + memset(smsg_attr->msg_buffer + smsg_attr->mbox_offset, + 0, smsg_attr->buff_size); + break; + } + + spin_unlock(&dev->gnd_fmablk_lock); +} + +int +kgnilnd_setup_mbox(kgn_conn_t *conn) +{ + gni_smsg_attr_t *smsg_attr = &conn->gnpr_smsg_attr; + int err = 0; + + smsg_attr->msg_buffer = NULL; + /* Look for available mbox */ + do { + kgnilnd_find_free_mbox(conn); + + /* nothing in the existing buffers, make a new one */ + if (smsg_attr->msg_buffer == NULL) { + /* for runtime allocations, we only want vmalloc */ + err = kgnilnd_alloc_fmablk(conn->gnc_device, 0); + if (err) { + break; + } + } + } while (smsg_attr->msg_buffer == NULL); + + if (err) + CNETERR("couldn't allocate SMSG mbox for conn %p Error: %d\n", + conn, err); + return err; +} + +void +kgnilnd_release_mbox(kgn_conn_t *conn, int purgatory_hold) +{ + kgn_device_t *dev = conn->gnc_device; + gni_smsg_attr_t *smsg_attr = &conn->gnpr_smsg_attr; + kgn_fma_memblock_t *fma_blk = NULL; + kgn_mbox_info_t *mbox = NULL; + int found = 0; + int id; + + /* if we failed to setup mbox and now destroying conn */ + if (smsg_attr->msg_buffer == NULL) { + return; + } + + id = conn->gnc_mbox_id; + + spin_lock(&dev->gnd_fmablk_lock); + /* make sure our conn points at a valid fma_blk + * We use this instead of a mem block search out of smsg_attr + * because we could have freed a block for fma_blk #1 but the fma_blk + * is still in the list for a purgatory hold. This would induce a false + * match if that same block gets reallocated to fma_blk #2 */ + list_for_each_entry(fma_blk, &dev->gnd_fma_buffs, gnm_bufflist) { + if (fma_blk == conn->gnc_fma_blk) { + found = 1; + break; + } + } + LASSERTF(found, "unable to find conn 0x%p with gnc_fma_blk %p " + "anywhere in the world\n", conn, conn->gnc_fma_blk); + + LASSERTF(id < fma_blk->gnm_num_mboxs, + "bad id %d max %d\n", + id, fma_blk->gnm_num_mboxs); + + /* < 0 - was held, now free it + * == 0 - just free it + * > 0 - hold it for now */ + if (purgatory_hold == 0) { + CDEBUG(D_NET, "conn %p smsg %p fmablk %p freeing SMSG mbox %d " + "hndl "LPX64"."LPX64"\n", + conn, smsg_attr, fma_blk, id, + fma_blk->gnm_hndl.qword1, fma_blk->gnm_hndl.qword2); + fma_blk->gnm_avail_mboxs++; + + } else if (purgatory_hold > 0) { + CDEBUG(D_NET, "conn %p smsg %p fmablk %p holding SMSG mbox %d " + "hndl "LPX64"."LPX64"\n", + conn, smsg_attr, fma_blk, id, + fma_blk->gnm_hndl.qword1, fma_blk->gnm_hndl.qword2); + + fma_blk->gnm_held_mboxs++; + fma_blk->gnm_max_timeout = MAX(fma_blk->gnm_max_timeout, + conn->gnc_timeout); + } else { + CDEBUG(D_NET, "conn %p smsg %p fmablk %p release SMSG mbox %d " + "hndl "LPX64"."LPX64"\n", + conn, smsg_attr, fma_blk, id, + fma_blk->gnm_hndl.qword1, fma_blk->gnm_hndl.qword2); + + fma_blk->gnm_held_mboxs--; + fma_blk->gnm_avail_mboxs++; + } + + if (purgatory_hold <= 0) { + /* if kgni is retransmitting, freeing the smsg block before the EP + * is destroyed gets messy. Bug 768295. */ + LASSERTF(conn->gnc_ephandle == NULL, + "can't release mbox before EP is nuked. 
conn 0x%p\n", conn); + + mbox = &fma_blk->gnm_mbox_info[id]; + mbox->mbx_release_from_purgatory = jiffies; + + /* clear conn gnc_fmablk if it is gone - this allows us to + * not worry about state so much in kgnilnd_destroy_conn + * and makes the guaranteed cleanup of the resources easier */ + LASSERTF(test_and_clear_bit(id, fma_blk->gnm_bit_array), + "conn %p bit %d already cleared in fma_blk %p\n", + conn, id, fma_blk); + conn->gnc_fma_blk = NULL; + } + + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_FMABLK_AVAIL)) { + CERROR("LBUGs in your future: forcibly marking fma_blk %p " + "as mapped\n", fma_blk); + fma_blk->gnm_state = GNILND_FMABLK_VIRT; + } + + /* we don't release or unmap PHYS blocks as part of the normal cycle -- + * those are controlled manually from startup/shutdown */ + if (fma_blk->gnm_state != GNILND_FMABLK_PHYS) { + /* we can unmap once all are unused (held or avail) + * but check hold_timeout to make sure we are not trying to double + * unmap this buffer. If there was no hold_timeout set due to + * held_mboxs, we'll free the mobx here shortly and won't have to + * worry about catching a double free for a 'clean' fma_blk */ + if (((fma_blk->gnm_avail_mboxs + fma_blk->gnm_held_mboxs) == fma_blk->gnm_num_mboxs) && + (!fma_blk->gnm_hold_timeout)) { + kgnilnd_unmap_fmablk(dev, fma_blk); + } + + /* But we can only free once they are all avail */ + if (fma_blk->gnm_avail_mboxs == fma_blk->gnm_num_mboxs && + fma_blk->gnm_held_mboxs == 0) { + /* all mailboxes are released, free fma_blk */ + kgnilnd_free_fmablk_locked(dev, fma_blk); + } + } + + spin_unlock(&dev->gnd_fmablk_lock); +} + +int +kgnilnd_count_phys_mbox(kgn_device_t *device) +{ + int i = 0; + kgn_fma_memblock_t *fma_blk; + + spin_lock(&device->gnd_fmablk_lock); + + list_for_each_entry(fma_blk, &device->gnd_fma_buffs, gnm_bufflist) { + if (fma_blk->gnm_state == GNILND_FMABLK_PHYS) + i += fma_blk->gnm_num_mboxs; + } + spin_unlock(&device->gnd_fmablk_lock); + + RETURN(i); +} + +int +kgnilnd_allocate_phys_fmablk(kgn_device_t *device) +{ + int rc; + + while (kgnilnd_count_phys_mbox(device) < *kgnilnd_tunables.kgn_nphys_mbox) { + + rc = kgnilnd_alloc_fmablk(device, 1); + if (rc) { + CERROR("failed phys mbox allocation, stopping at %d, rc %d\n", + kgnilnd_count_phys_mbox(device), rc); + RETURN(rc); + } + } + RETURN(0); +} + +int +kgnilnd_map_phys_fmablk(kgn_device_t *device) +{ + + int rc = 0; + kgn_fma_memblock_t *fma_blk; + + /* use sem to gate access to single thread, just in case */ + down(&device->gnd_fmablk_sem); + + spin_lock(&device->gnd_fmablk_lock); + + list_for_each_entry(fma_blk, &device->gnd_fma_buffs, gnm_bufflist) { + if (fma_blk->gnm_state == GNILND_FMABLK_PHYS) + rc = kgnilnd_map_fmablk(device, fma_blk); + if (rc) + break; + } + spin_unlock(&device->gnd_fmablk_lock); + + up(&device->gnd_fmablk_sem); + + RETURN(rc); +} + +void +kgnilnd_unmap_phys_fmablk(kgn_device_t *device) +{ + + kgn_fma_memblock_t *fma_blk; + + /* use sem to gate access to single thread, just in case */ + down(&device->gnd_fmablk_sem); + + spin_lock(&device->gnd_fmablk_lock); + + list_for_each_entry(fma_blk, &device->gnd_fma_buffs, gnm_bufflist) { + if (fma_blk->gnm_state == GNILND_FMABLK_PHYS) + kgnilnd_unmap_fmablk(device, fma_blk); + } + spin_unlock(&device->gnd_fmablk_lock); + + up(&device->gnd_fmablk_sem); +} + +void +kgnilnd_free_phys_fmablk(kgn_device_t *device) +{ + + kgn_fma_memblock_t *fma_blk, *fma_blkN; + + /* use sem to gate access to single thread, just in case */ + down(&device->gnd_fmablk_sem); + + spin_lock(&device->gnd_fmablk_lock); + + 
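kgnilnd_release_mbox keys off a tri-state purgatory argument (greater than zero means hold, zero means plain release, less than zero means a former hold is now really free) and uses the avail/held counters to decide when a block may be unmapped and when it may finally be freed. A simplified standalone model of that accounting (struct blk and mbox_release are invented; the real code also tracks a deregistration hold timeout):

#include <stdio.h>

struct blk {
	int num;     /* total mailboxes in the block */
	int avail;   /* free for reuse */
	int held;    /* parked in "purgatory" for a closing peer */
};

/* purgatory argument mirrors the tri-state convention:
 *   > 0  keep the mailbox held for a while (peer may still write)
 *   == 0 straight release back to the free pool
 *   < 0  a previously held mailbox is now really free            */
static void mbox_release(struct blk *b, int purgatory)
{
	if (purgatory > 0) {
		b->held++;
	} else if (purgatory < 0) {
		b->held--;
		b->avail++;
	} else {
		b->avail++;
	}

	/* the block may be unmapped once nothing is outstanding, and
	 * freed only once even the held mailboxes have drained */
	if (b->avail + b->held == b->num)
		printf("all returned: safe to unmap (avail %d held %d)\n",
		       b->avail, b->held);
	if (b->avail == b->num)
		printf("all available: safe to free the block\n");
}

int main(void)
{
	struct blk b = { .num = 2 };

	mbox_release(&b, 1);    /* conn closed, mailbox goes to purgatory */
	mbox_release(&b, 0);    /* clean release of the other mailbox */
	mbox_release(&b, -1);   /* purgatory hold expires */
	return 0;
}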
list_for_each_entry_safe(fma_blk, fma_blkN, &device->gnd_fma_buffs, gnm_bufflist) { + if (fma_blk->gnm_state == GNILND_FMABLK_PHYS) + kgnilnd_free_fmablk_locked(device, fma_blk); + } + spin_unlock(&device->gnd_fmablk_lock); + + up(&device->gnd_fmablk_sem); +} + +/* kgnilnd dgram nid->struct managment */ + +static inline struct list_head * +kgnilnd_nid2dgramlist(kgn_device_t *dev, lnet_nid_t nid) +{ + unsigned int hash = ((unsigned int)nid) % *kgnilnd_tunables.kgn_peer_hash_size; + + RETURN(&dev->gnd_dgrams[hash]); +} + + +/* needs dev->gnd_dgram_lock held */ +kgn_dgram_t * +kgnilnd_find_dgram_locked(kgn_device_t *dev, lnet_nid_t dst_nid) +{ + struct list_head *dgram_list = kgnilnd_nid2dgramlist(dev, dst_nid); + kgn_dgram_t *dgram; + + list_for_each_entry(dgram, dgram_list, gndg_list) { + + /* if state > POSTED, we are already handling cancel/completion */ + if ((dgram->gndg_conn_out.gncr_dstnid != dst_nid) || + dgram->gndg_state > GNILND_DGRAM_POSTED) + continue; + + CDEBUG(D_NET, "got dgram [%p] -> %s\n", + dgram, libcfs_nid2str(dst_nid)); + return dgram; + } + return NULL; +} + +int +kgnilnd_find_and_cancel_dgram(kgn_device_t *dev, lnet_nid_t dst_nid) +{ + kgn_dgram_t *dgram; + + spin_lock(&dev->gnd_dgram_lock); + dgram = kgnilnd_find_dgram_locked(dev, dst_nid); + + if (dgram) { + kgnilnd_cancel_dgram_locked(dgram); + } + spin_unlock(&dev->gnd_dgram_lock); + + RETURN(!!(dgram == NULL)); +} + +int +kgnilnd_pack_connreq(kgn_connreq_t *connreq, kgn_conn_t *conn, + lnet_nid_t srcnid, lnet_nid_t dstnid, + kgn_connreq_type_t type) +{ + int err = 0; + + /* ensure we haven't violated max datagram size */ + CLASSERT(sizeof(kgn_connreq_t) <= GNI_DATAGRAM_MAXSIZE); + + /* no need to zero out, we do that when allocating dgram */ + connreq->gncr_magic = GNILND_MSG_MAGIC; + + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_PACK_SRCNID)) { + srcnid = 0xABADBABE; + } else if (CFS_FAIL_CHECK(CFS_FAIL_GNI_PACK_DSTNID)) { + dstnid = 0xDEFEC8ED; + } + + connreq->gncr_srcnid = srcnid; + connreq->gncr_dstnid = dstnid; + + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CONNREQ_PROTO)) { + connreq->gncr_version = 99; + } else { + connreq->gncr_version = GNILND_CONNREQ_VERSION; + } + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CONNREQ_PROTO)) { + connreq->gncr_type = 99; + } else { + connreq->gncr_type = type; + } + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CONNREQ_PROTO)) { + connreq->gncr_peerstamp = 0; + } else { + connreq->gncr_peerstamp = kgnilnd_data.kgn_peerstamp; + } + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CONNREQ_PROTO)) { + connreq->gncr_connstamp = 0; + } else { + connreq->gncr_connstamp = conn->gnc_my_connstamp; + } + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CONNREQ_PROTO)) { + connreq->gncr_timeout = 0; + } else { + connreq->gncr_timeout = conn->gnc_timeout; + } + + /* the rest pack the data into the payload in other places */ + if (type == GNILND_CONNREQ_REQ) { + kgn_gniparams_t *req_params = &connreq->gncr_gnparams; + req_params->gnpr_host_id = conn->gnc_device->gnd_host_id; + req_params->gnpr_cqid = conn->gnc_cqid; + + /* allocate mailbox for this connection */ + err = kgnilnd_setup_mbox(conn); + if (err != 0) { + CERROR("Failed to setup FMA mailbox (%d)\n", err); + } + req_params->gnpr_smsg_attr = conn->gnpr_smsg_attr; + } + + /* XXX Nic: TBD - checksum computation */ + + return err; +} + +int +kgnilnd_unpack_connreq(kgn_dgram_t *dgram) +{ + kgn_connreq_t *connreq = &dgram->gndg_conn_in; + int swab, rc = 0; + kgn_net_t *net; + + /* the following fields must be handled in a backwards compatible + * manner to ensure we can always send and interpret NAKs */ + + if 
(connreq->gncr_magic != GNILND_MSG_MAGIC && + connreq->gncr_magic != __swab32(GNILND_MSG_MAGIC)) { + /* Unexpected magic! */ + CERROR("Unexpected magic %08x\n", + connreq->gncr_magic); + return -EBADF; + } + + swab = (connreq->gncr_magic == __swab32(GNILND_MSG_MAGIC)); + if (swab) { + __swab32s(&connreq->gncr_magic); + __swab32s(&connreq->gncr_cksum); + __swab16s(&connreq->gncr_type); + __swab16s(&connreq->gncr_version); + __swab32s(&connreq->gncr_timeout); + __swab64s(&connreq->gncr_srcnid); + __swab64s(&connreq->gncr_dstnid); + __swab64s(&connreq->gncr_peerstamp); + __swab64s(&connreq->gncr_connstamp); + } + + /* Do NOT return anything but -EBADF before we munge + * connreq->gncr_srcnid - we need that to send the nak */ + + if (dgram->gndg_conn_out.gncr_dstnid != LNET_NID_ANY) { + lnet_nid_t incoming = connreq->gncr_srcnid; + + /* even if the incoming packet is hosed, we know who we sent + * the original and can set the srcnid so that we can properly + * look up our peer to close the loop on this connreq. We still use + * -EBADF to prevent a NAK - just in case there are issues with + * the payload coming from a random spot, etc. */ + connreq->gncr_srcnid = dgram->gndg_conn_out.gncr_dstnid; + + if (LNET_NIDADDR(dgram->gndg_conn_out.gncr_dstnid) != + LNET_NIDADDR(incoming)) { + /* we got a datagram match for the wrong nid... */ + CERROR("matched datagram 0x%p with srcnid %s " + "(%x), expecting %s (%x)\n", + dgram, + libcfs_nid2str(incoming), + LNET_NIDADDR(incoming), + libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid), + LNET_NIDADDR(dgram->gndg_conn_out.gncr_dstnid)); + return -EBADF; + } + } else { + /* if we have a wildcard datagram it should match an + * incoming "active" datagram that should have a fully formed + * srcnid and dstnid. If we couldn't unpack it, we drop as + * corrupted packet, otherwise we'll just verify that the dstnid + * matches the NID for the NET that the dgram was posted */ + + /* make sure their wildcard didn't match ours, that is unpossible */ + LASSERTF(connreq->gncr_dstnid != LNET_NID_ANY, + "dgram 0x%p from %s, connreq 0x%p; " + "wildcard matched wildcard \n", dgram, + libcfs_nid2str(connreq->gncr_srcnid), connreq); + + rc = kgnilnd_find_net(connreq->gncr_dstnid, &net); + + if (rc == -ESHUTDOWN) { + CERROR("Looking up network: device is in shutdown"); + return rc; + } else if (rc == -ENONET) { + CERROR("Connection data from %s: she sent " + "dst_nid %s, but net lookup failed on " + "dgram 0x%p@%s\n", + libcfs_nid2str(connreq->gncr_srcnid), + libcfs_nid2str(connreq->gncr_dstnid), + dgram, kgnilnd_dgram_type2str(dgram)); + return rc; + } + + if (net->gnn_ni->ni_nid != connreq->gncr_dstnid) { + CERROR("Bad connection data from %s: she sent " + "dst_nid %s, but I am %s with dgram 0x%p@%s\n", + libcfs_nid2str(connreq->gncr_srcnid), + libcfs_nid2str(connreq->gncr_dstnid), + libcfs_nid2str(net->gnn_ni->ni_nid), + dgram, kgnilnd_dgram_type2str(dgram)); + kgnilnd_net_decref(net); + return -EBADSLT; + } + + /* kgnilnd_find_net takes a ref on the net it finds, You need to decref it when not needed. 
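The magic handling in kgnilnd_unpack_connreq is the classic endianness dance: accept either the native magic or its byte-swapped form, and if the swapped form is seen, byte-swap every multi-byte header field before trusting it. A standalone sketch of that dance (REQ_MAGIC and struct wire_req are stand-ins, not the GNILND wire format):

#include <stdint.h>
#include <stdio.h>

#define REQ_MAGIC 0x0be91b91u   /* stand-in value, not the real constant */

struct wire_req {
	uint32_t magic;
	uint16_t type;
	uint16_t version;
	uint64_t srcnid;
};

/* decide whether the sender had the opposite endianness and, if so,
 * byte-swap every multi-byte field before interpreting the request */
static int unpack_req(struct wire_req *r)
{
	if (r->magic != REQ_MAGIC) {
		if (__builtin_bswap32(r->magic) != REQ_MAGIC)
			return -1;              /* not ours at all */
		r->magic   = __builtin_bswap32(r->magic);
		r->type    = __builtin_bswap16(r->type);
		r->version = __builtin_bswap16(r->version);
		r->srcnid  = __builtin_bswap64(r->srcnid);
	}
	return 0;
}

int main(void)
{
	struct wire_req swapped = {
		.magic   = __builtin_bswap32(REQ_MAGIC),
		.type    = __builtin_bswap16(1),
		.version = __builtin_bswap16(2),
		.srcnid  = __builtin_bswap64(42),
	};

	if (unpack_req(&swapped) == 0)
		printf("type %u version %u srcnid %llu\n",
		       (unsigned)swapped.type, (unsigned)swapped.version,
		       (unsigned long long)swapped.srcnid);
	return 0;
}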
*/ + kgnilnd_net_decref(net); + } + + if (connreq->gncr_version != GNILND_CONNREQ_VERSION) { + CERROR("Unexpected version %d\n", connreq->gncr_version); + return -EPROTO; + } + + /* XXX Nic: TBD - checksum validation */ + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CONNREQ_DROP)) { + return -EBADF; + } + + if (swab && connreq->gncr_type == GNILND_CONNREQ_REQ) { + __u64 msg_addr = (__u64) connreq->gncr_gnparams.gnpr_smsg_attr.msg_buffer; + + __swab32s(&connreq->gncr_gnparams.gnpr_host_id); + __swab32s(&connreq->gncr_gnparams.gnpr_cqid); + __swab32s(&connreq->gncr_gnparams.gnpr_smsg_attr.buff_size); + __swab16s(&connreq->gncr_gnparams.gnpr_smsg_attr.mbox_maxcredit); + __swab32s(&connreq->gncr_gnparams.gnpr_smsg_attr.mbox_offset); + __swab64s(&connreq->gncr_gnparams.gnpr_smsg_attr.mem_hndl.qword1); + __swab64s(&connreq->gncr_gnparams.gnpr_smsg_attr.mem_hndl.qword2); + __swab64s(&msg_addr); + __swab32s(&connreq->gncr_gnparams.gnpr_smsg_attr.msg_maxsize); + __swab32s(&connreq->gncr_gnparams.gnpr_smsg_attr.msg_type); + } else if (swab && connreq->gncr_type == GNILND_CONNREQ_NAK) { + __swab32s(&connreq->gncr_nakdata.gnnd_errno); + } + + /* since we use a unique instance ID for each network, the driver + * will take care of dropping datagrams if we don't have that network. + */ + + /* few more idiot software or configuration checks */ + + switch (connreq->gncr_type) { + case GNILND_CONNREQ_REQ: + /* wire up EP and SMSG block - this will check the incoming data + * and barf a NAK back if need to */ + rc = kgnilnd_set_conn_params(dgram); + if (rc) + return rc; + break; + case GNILND_CONNREQ_NAK: + case GNILND_CONNREQ_CLOSE: + break; + default: + CERROR("unknown connreq packet type %d\n", connreq->gncr_type); + return -EPROTO; + } + + if (connreq->gncr_peerstamp == 0 || connreq->gncr_connstamp == 0) { + CERROR("Recived bad timestamps peer "LPU64" conn "LPU64"\n", + connreq->gncr_peerstamp, connreq->gncr_connstamp); + return -EPROTO; + } + + if (connreq->gncr_timeout < GNILND_MIN_TIMEOUT) { + CERROR("Received timeout %d < MIN %d\n", + connreq->gncr_timeout, GNILND_MIN_TIMEOUT); + return -EPROTO; + } + + return 0; +} + +int +kgnilnd_alloc_dgram(kgn_dgram_t **dgramp, kgn_device_t *dev, kgn_dgram_type_t type) +{ + kgn_dgram_t *dgram; + + dgram = cfs_mem_cache_alloc(kgnilnd_data.kgn_dgram_cache, + CFS_ALLOC_ATOMIC); + if (dgram == NULL) + return -ENOMEM; + + /* cache alloc'd memory is not zeroed */ + memset((void *)dgram, 0, sizeof(*dgram)) ; + + INIT_LIST_HEAD(&dgram->gndg_list); + dgram->gndg_state = GNILND_DGRAM_USED; + dgram->gndg_type = type; + dgram->gndg_magic = GNILND_DGRAM_MAGIC; + + atomic_inc(&dev->gnd_ndgrams); + + CDEBUG(D_MALLOC|D_NETTRACE, "slab-alloced 'dgram': %lu at %p.\n", + sizeof(*dgram), dgram); + + *dgramp = dgram; + return 0; +} + +/* call this on a dgram that came back from kgnilnd_ep_postdata_test_by_id + * returns < 0 on dgram to be cleaned up + * > 0 on dgram that isn't done yet + * == 0 on dgram that is ok and needs connreq processing */ +int +kgnilnd_process_dgram(kgn_dgram_t *dgram, gni_post_state_t post_state) +{ + int rc = 0; + + switch (post_state) { + case GNI_POST_COMPLETED: + /* normal state for dgrams that need actual processing */ + /* GOTO to avoid processing dgram as canceled/done */ + GOTO(process_out, rc); + + case GNI_POST_PENDING: + /* we should only see this if we are testing a WC dgram after a + * cancel - it means that it needs a full cycle of waiting + * for kgni_sm_task to finish moving it to TERMINATED */ + LASSERTF((dgram->gndg_type == GNILND_DGRAM_WC_REQ) && + 
(dgram->gndg_state == GNILND_DGRAM_CANCELED), + "POST_PENDING dgram 0x%p with bad type %d(%s) or state %d(%s)\n", + dgram, dgram->gndg_type, kgnilnd_dgram_type2str(dgram), + dgram->gndg_state, kgnilnd_dgram_state2str(dgram)); + + /* positive RC as this dgram isn't done yet */ + rc = EINPROGRESS; + + /* GOTO as this isn't done yet */ + GOTO(process_out, rc); + break; + + case GNI_POST_TERMINATED: + /* we've called cancel and it is done or remote guy called cancel and + * we've receved it on a WC dgram */ +#if 0 + /* we are seeing weird terminations on non WC dgrams when we have not + * canceled them */ + + LASSERTF(dgram->gndg_state == GNILND_DGRAM_CANCELED || + dgram->gndg_conn_out.gncr_dstnid == LNET_NID_ANY, + "dgram 0x%p with bad state %d(%s) or dst nid %s\n", + dgram, dgram->gndg_state, kgnilnd_dgram_state2str(dgram), + libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid)); +#endif + + CDEBUG(D_NETTRACE, "dgram 0x%p saw %s, cleaning it up\n", dgram, + dgram->gndg_state == GNILND_DGRAM_CANCELED ? "canceled" : "terminated"); + + rc = -ECANCELED; + break; + + case GNI_POST_TIMEOUT: + /* we could have a timeout on a wildcard dgram too - if + * we got the incoming request but the remote node beefed + * before kgni could send the match data back. We'll just error + * on the active case and bail out gracefully */ + if (dgram->gndg_conn_out.gncr_dstnid != LNET_NID_ANY) { + CNETERR("hardware timeout for connect to " + "%s after %lu seconds. Is node dead?\n", + libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid), + cfs_duration_sec(jiffies - dgram->gndg_post_time)); + } + + rc = -ETIMEDOUT; + break; + + default: + CERROR("dgram 0x%p with bad post_state %d\n", dgram, post_state); + LBUG(); + } + + /* now finish cleaning up a dgram that is canceled/terminated and needs to + * go away */ + + /* If this was actively canceled, drop the count now that we are processing */ + if (dgram->gndg_state == GNILND_DGRAM_CANCELED) { + atomic_dec(&dgram->gndg_conn->gnc_device->gnd_canceled_dgrams); + /* caller responsible for gndg_list removal */ + } + +process_out: + + RETURN(rc); +} + +/* needs dev->gnd_dgram_lock held */ +void +kgnilnd_cancel_dgram_locked(kgn_dgram_t *dgram) +{ + gni_return_t grc; + + if (dgram->gndg_state != GNILND_DGRAM_POSTED) { + return; + } + + LASSERTF(dgram->gndg_conn != NULL, + "dgram 0x%p with NULL conn\n", dgram); + + /* C.E - WC dgrams could be canceled immediately but + * if there was some match pending, we need to call + * test_by_id to clear it out. If that test returns + * POST_PENDING, it is half done and needs to go along + * with the rest of dgrams and go through a kgni_sm_task cycle + * and deliver a GNI_POST_TERMINATED event before they + * are actually canceled */ + + dgram->gndg_state = GNILND_DGRAM_CANCELED; + + if (dgram->gndg_conn->gnc_state >= GNILND_CONN_ESTABLISHED) { + /* we don't need to cancel_by_id if the datagram was good */ + return; + } + + /* let folks know there are outstanding cancels */ + atomic_inc(&dgram->gndg_conn->gnc_device->gnd_canceled_dgrams); + /* leave on nid list until cancel is done for debugging fun */ + grc = kgnilnd_ep_postdata_cancel_by_id(dgram->gndg_conn->gnc_ephandle, (__u64) dgram); + + /* if we don't get success here, we have hosed up the dgram tracking + * code and need to bail out */ + LASSERTF(grc == GNI_RC_SUCCESS, + "postdata_cancel returned %d for conn 0x%p to %s\n", + grc, dgram->gndg_conn, + dgram->gndg_conn->gnc_peer ? 
+ libcfs_nid2str(dgram->gndg_conn->gnc_peer->gnp_nid) + : ""); + + CDEBUG(D_NETTRACE, + "canceled dgram 0x%p conn 0x%p ephandle 0x%p\n", + dgram, dgram->gndg_conn, + dgram->gndg_conn->gnc_ephandle); + + if (dgram->gndg_type == GNILND_DGRAM_WC_REQ) { + gni_post_state_t post_state; + int rc = 0; + __u32 remote_addr = 0, remote_id = 0; + + grc = kgnilnd_ep_postdata_test_by_id(dgram->gndg_conn->gnc_ephandle, + (__u64)dgram, &post_state, + &remote_addr, &remote_id); + + LASSERTF(grc == GNI_RC_NO_MATCH || grc == GNI_RC_SUCCESS, + "bad grc %d from test_by_id on dgram 0x%p\n", + grc, dgram); + + /* if WC was canceled immediately, we get NO_MATCH, if needs to go + * through full cycle, we get SUCCESS and need to parse post_state */ + + CDEBUG(D_NET, "grc %d dgram 0x%p type %s post_state %d " + "remote_addr %u remote_id %u\n", grc, dgram, + kgnilnd_dgram_type2str(dgram), + post_state, remote_addr, remote_id); + + if (grc == GNI_RC_NO_MATCH) { + /* she's gone, reduce count and move along */ + dgram->gndg_state = GNILND_DGRAM_DONE; + atomic_dec(&dgram->gndg_conn->gnc_device->gnd_canceled_dgrams); + RETURN_EXIT; + } + + rc = kgnilnd_process_dgram(dgram, post_state); + + if (rc <= 0) { + /* if for some weird reason we get a valid dgram back, just mark as done + * so we can drop it and move along. + * C.E - if it was completed, we'll just release the conn/mbox + * back into the pool and it'll get reused. That said, we should only + * be canceling a WC dgram on stack rest or shutdown, so that is moot */ + dgram->gndg_state = GNILND_DGRAM_DONE; + atomic_dec(&dgram->gndg_conn->gnc_device->gnd_canceled_dgrams); + + /* caller context responsible for calling kgnilnd_release_dgram() */ + } else { + /* still pending, let it simmer until golden brown and delicious */ + } + } + + /* for non WC dgrams, they are still on the nid list but marked canceled waiting + * for kgni to return their ID to us via probe - that is when we'll complete their + * cancel processing */ +} + +void +kgnilnd_cleanup_dgram(kgn_dgram_t *dgram) +{ + /* release the dgram ref on conn */ + if (dgram->gndg_conn) { + kgnilnd_conn_decref(dgram->gndg_conn); + dgram->gndg_conn = NULL; + } +} + +void +kgnilnd_free_dgram(kgn_device_t *dev, kgn_dgram_t *dgram) +{ + LASSERTF(dgram->gndg_state == GNILND_DGRAM_USED || + dgram->gndg_state == GNILND_DGRAM_DONE, + "dgram 0x%p with bad state %s\n", + dgram, kgnilnd_dgram_state2str(dgram)); + + /* bit of poisoning to help detect bad driver data */ + dgram->gndg_magic = 0x6f5a6b5f; + atomic_dec(&dev->gnd_ndgrams); + + cfs_mem_cache_free(kgnilnd_data.kgn_dgram_cache, dgram); + CDEBUG(D_MALLOC|D_NETTRACE, "slab-freed 'dgram': %lu at %p.\n", + sizeof(*dgram), dgram); +} + +int +kgnilnd_post_dgram(kgn_device_t *dev, lnet_nid_t dstnid, kgn_connreq_type_t type, + int data_rc) +{ + int rc = 0; + kgn_dgram_t *dgram = NULL; + kgn_dgram_t *tmpdgram; + kgn_dgram_type_t dgtype; + gni_return_t grc; + __u64 srcnid; + ENTRY; + + switch (type) { + case GNILND_CONNREQ_REQ: + if (dstnid == LNET_NID_ANY) + dgtype = GNILND_DGRAM_WC_REQ; + else + dgtype = GNILND_DGRAM_REQ; + break; + case GNILND_CONNREQ_NAK: + LASSERTF(dstnid != LNET_NID_ANY, "can't NAK to LNET_NID_ANY\n"); + dgtype = GNILND_DGRAM_NAK; + break; + default: + CERROR("unknown connreq type %d\n", type); + LBUG(); + } + + rc = kgnilnd_alloc_dgram(&dgram, dev, dgtype); + if (rc < 0) { + rc = -ENOMEM; + GOTO(post_failed, rc); + } + + rc = kgnilnd_create_conn(&dgram->gndg_conn, dev); + if (rc) { + GOTO(post_failed, rc); + } + + if (dgram->gndg_type == 
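
The type-selection switch at the top of kgnilnd_post_dgram() reduces to a small table, summarized here for reference:

/*
 * connreq type          dstnid          resulting dgram type
 * ------------------------------------------------------------------
 * GNILND_CONNREQ_REQ    LNET_NID_ANY    GNILND_DGRAM_WC_REQ (wildcard)
 * GNILND_CONNREQ_REQ    real NID        GNILND_DGRAM_REQ    (active connect)
 * GNILND_CONNREQ_NAK    real NID        GNILND_DGRAM_NAK
 * GNILND_CONNREQ_NAK    LNET_NID_ANY    LASSERTF - never valid
 * anything else         -               LBUG()
 */
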
GNILND_DGRAM_WC_REQ) { + /* clear buffer for sanity on reuse of wildcard */ + memset(&dgram->gndg_conn_in, 0, sizeof(kgn_connreq_t)); + } + + if (dstnid == LNET_NID_ANY) { + /* set here to reset any dgram re-use */ + dgram->gndg_conn->gnc_state = GNILND_CONN_LISTEN; + } else { + __u32 host_id; + + rc = kgnilnd_nid_to_nicaddrs(LNET_NIDADDR(dstnid), 1, &host_id); + if (rc <= 0) { + rc = -ESRCH; + GOTO(post_failed, rc); + } + + dgram->gndg_conn->gnc_state = GNILND_CONN_CONNECTING; + + /* don't need to serialize, there are no CQs for the dgram + * EP on the kgn_net_t */ + grc = kgnilnd_ep_bind(dgram->gndg_conn->gnc_ephandle, host_id, dev->gnd_id); + + if (grc != GNI_RC_SUCCESS) { + rc = -ECONNABORTED; + GOTO(post_failed, rc); + } + + } + + /* If we are posting wildcards post using a net of 0, otherwise we'll use the + * net of the destination node. + */ + + if (dstnid == LNET_NID_ANY) { + srcnid = LNET_MKNID(LNET_MKNET(GNILND, 0), dev->gnd_nid); + } else { + srcnid = LNET_MKNID(LNET_NIDNET(dstnid), dev->gnd_nid); + } + + rc = kgnilnd_pack_connreq(&dgram->gndg_conn_out, dgram->gndg_conn, + srcnid, dstnid, type); + if (rc) { + GOTO(post_failed, rc); + } + + if (type == GNILND_CONNREQ_NAK) + dgram->gndg_conn_out.gncr_nakdata.gnnd_errno = data_rc; + + dgram->gndg_post_time = jiffies; + + /* XXX Nic: here is where we'd add in logical network multiplexing */ + + CDEBUG(D_NETTRACE, "dgram 0x%p type %s %s->%s cdm %d\n", + dgram, kgnilnd_dgram_type2str(dgram), + libcfs_nid2str(srcnid), + libcfs_nid2str(dstnid), dev->gnd_id); + + /* this allocates memory, can't hold locks across */ + grc = kgnilnd_ep_postdata_w_id(dgram->gndg_conn->gnc_ephandle, + &dgram->gndg_conn_out, sizeof(kgn_connreq_t), + &dgram->gndg_conn_in, sizeof(kgn_connreq_t), + (__u64)dgram); + + if (grc != GNI_RC_SUCCESS) { + CNETERR("dropping failed dgram post id 0x%p type %s" + " reqtype %s to %s: rc %d\n", + dgram, kgnilnd_dgram_type2str(dgram), + kgnilnd_connreq_type2str(&dgram->gndg_conn_out), + libcfs_nid2str(dstnid), grc); + rc = (grc == GNI_RC_ERROR_NOMEM) ? -ENOMEM : -EBADR; + GOTO(post_failed, rc); + } + + /* we don't need to add earlier - if someone does del_peer during post, + * that peer will get marked as unlinked and the callers wil take care of it. + * The dgram code is largely kgn_peer_t ignorant, so at worst, we'll just drop + * the completed dgram later when we cant find a peer to stuff it into */ + + spin_lock(&dev->gnd_dgram_lock); + + /* make sure we are not double posting targeted dgrams + * - we can multiple post WC dgrams to help with processing speed */ + if (dstnid != LNET_NID_ANY) { + tmpdgram = kgnilnd_find_dgram_locked(dev, dstnid); + + LASSERTF(tmpdgram == NULL, + "dgram 0x%p->%s already posted\n", + dgram, libcfs_nid2str(dstnid)); + } + + /* unmunge dstnid to help processing code cope... 
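
The source-NID rule used above (wildcards go out on net number 0, targeted dgrams on the destination's net) can be written as a one-liner. A hedged sketch with an assumed helper name:

/* illustration only: mirrors the srcnid selection in kgnilnd_post_dgram() */
static lnet_nid_t
example_pick_srcnid(kgn_device_t *dev, lnet_nid_t dstnid)
{
        if (dstnid == LNET_NID_ANY)
                /* wildcards are posted on net number 0 */
                return LNET_MKNID(LNET_MKNET(GNILND, 0), dev->gnd_nid);

        /* targeted dgrams use the destination's net so the peer can match it */
        return LNET_MKNID(LNET_NIDNET(dstnid), dev->gnd_nid);
}
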
*/ + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_PACK_DSTNID)) { + dgram->gndg_conn_out.gncr_dstnid = dstnid; + } + + list_add_tail(&dgram->gndg_list, kgnilnd_nid2dgramlist(dev, dstnid)); + dgram->gndg_state = GNILND_DGRAM_POSTED; + spin_unlock(&dev->gnd_dgram_lock); + +post_failed: + if (rc < 0 && dgram != NULL) { + kgnilnd_cleanup_dgram(dgram); + kgnilnd_free_dgram(dev, dgram); + } + + RETURN(rc); +} + +void +kgnilnd_release_dgram(kgn_device_t *dev, kgn_dgram_t *dgram) +{ + spin_lock(&dev->gnd_dgram_lock); + kgnilnd_cancel_dgram_locked(dgram); + spin_unlock(&dev->gnd_dgram_lock); + + kgnilnd_cleanup_dgram(dgram); + + /* if the dgram is 'canceled' it needs to be wait until the event + * comes up from kgni that tells us it is safe to release */ + if (dgram->gndg_state != GNILND_DGRAM_CANCELED) { + dgram->gndg_state = GNILND_DGRAM_DONE; + + LASSERTF(list_empty(&dgram->gndg_list), "dgram 0x%p on list\n", dgram); + + /* if it is a wildcard and we are in an appropriate state, repost + * the wildcard */ + + if ((dgram->gndg_type == GNILND_DGRAM_WC_REQ) && + (!kgnilnd_data.kgn_wc_kill && !kgnilnd_data.kgn_in_reset)) { + int rerc; + + rerc = kgnilnd_post_dgram(dev, LNET_NID_ANY, GNILND_CONNREQ_REQ, 0); + LASSERTF(rerc == 0, + "error %d: dev %d could not repost wildcard datagram id 0x%p\n", + rerc, dev->gnd_id, dgram); + } + + /* always free the old dgram */ + kgnilnd_free_dgram(dev, dgram); + } +} + + +int +kgnilnd_probe_for_dgram(kgn_device_t *dev, kgn_dgram_t **dgramp) +{ + kgn_dgram_t *dgram = NULL; + gni_post_state_t post_state; + gni_return_t grc; + int rc = 0; + __u64 readyid; + __u32 remote_addr = 0, remote_id = 0; + ENTRY; + + /* Probe with the lock held. That way if we get a dgram we dont have it canceled + * between finding the ready dgram and grabbing the lock to remove it from the + * list. Otherwise we could be left in an inconsistent state. We own the dgram + * once its off the list so we don't need to worry about others changing it at + * that point. */ + spin_lock(&dev->gnd_dgram_lock); + grc = kgnilnd_postdata_probe_by_id(dev->gnd_handle, &readyid); + if (grc != GNI_RC_SUCCESS) { + spin_unlock(&dev->gnd_dgram_lock); + /* return 0 to indicate nothing happened */ + RETURN(0); + } + + CDEBUG(D_NET, "ready "LPX64" on device 0x%p\n", + readyid, dev); + + dgram = (kgn_dgram_t *)readyid; + + LASSERTF(dgram->gndg_magic == GNILND_DGRAM_MAGIC, + "dgram 0x%p from id "LPX64" with bad magic %x\n", + dgram, readyid, dgram->gndg_magic); + + LASSERTF(dgram->gndg_state == GNILND_DGRAM_POSTED || + dgram->gndg_state == GNILND_DGRAM_CANCELED, + "dgram 0x%p with bad state %s\n", + dgram, kgnilnd_dgram_state2str(dgram)); + + LASSERTF(!list_empty(&dgram->gndg_list), + "dgram 0x%p with bad list state %s\n", + dgram, kgnilnd_dgram_state2str(dgram)); + + /* now we know that the datagram structure is ok, so pull off list */ + list_del_init(&dgram->gndg_list); + + /* while we have the gnn_dgram_lock and BEFORE we call test_by_id + * change the state from POSTED to PROCESSING to ensure that + * nobody cancels it after we've pulled it from the wire */ + if (dgram->gndg_state == GNILND_DGRAM_POSTED) { + dgram->gndg_state = GNILND_DGRAM_PROCESSING; + } + + spin_unlock(&dev->gnd_dgram_lock); + + /* we now "own" this datagram */ + + LASSERTF(dgram->gndg_conn != NULL, + "dgram 0x%p with NULL conn\n", dgram); + + grc = kgnilnd_ep_postdata_test_by_id(dgram->gndg_conn->gnc_ephandle, + (__u64)dgram, &post_state, + &remote_addr, &remote_id); + + LASSERTF(grc != GNI_RC_NO_MATCH, "kgni lied! 
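
Because the post id handed to kgni is just the dgram pointer cast to a __u64, the id returned by the probe converts straight back into the dgram; the magic field is the sanity check. A condensed sketch (hypothetical helper name):

/* illustration only: readyid -> dgram recovery as done in probe_for_dgram */
static kgn_dgram_t *
example_id_to_dgram(__u64 readyid)
{
        kgn_dgram_t *dgram = (kgn_dgram_t *)readyid;

        if (dgram->gndg_magic != GNILND_DGRAM_MAGIC)
                return NULL;    /* stale or corrupt id */

        return dgram;
}
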
probe_by_id told us that" + " id "LPU64" was ready\n", readyid); + + CDEBUG(D_NET, "grc %d dgram 0x%p type %s post_state %d " + "remote_addr %u remote_id %u\n", grc, dgram, + kgnilnd_dgram_type2str(dgram), + post_state, remote_addr, remote_id); + + if (unlikely(grc != GNI_RC_SUCCESS)) { + CNETERR("getting data for dgram 0x%p->%s failed rc %d. Dropping it\n", + dgram, libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid), + grc); + rc = -EINVAL; + GOTO(probe_for_out, rc); + } + + rc = kgnilnd_process_dgram(dgram, post_state); + + /* we should never get probe finding a dgram for us and then it + * being a WC dgram that is still in the middle of processing */ + LASSERTF(rc <= 0, "bad rc %d from process_dgram 0x%p state %d\n", + rc, dgram, post_state); + + if (rc == 0) { + /* dgram is good enough for the data to be used */ + dgram->gndg_state = GNILND_DGRAM_PROCESSING; + /* fake rc to mark that we've done something */ + rc = 1; + } else { + /* bring out your dead! */ + dgram->gndg_state = GNILND_DGRAM_DONE; + } + + *dgramp = dgram; + RETURN(rc); + +probe_for_out: + + kgnilnd_release_dgram(dev, dgram); + RETURN(rc); +} + +int +kgnilnd_setup_wildcard_dgram(kgn_device_t *dev) +{ + /* if kgn_wildcard is zero, return error */ + int rc = -ENOENT, i; + ENTRY; + + for (i = 0; i < *kgnilnd_tunables.kgn_nwildcard; i++) { + rc = kgnilnd_post_dgram(dev, LNET_NID_ANY, GNILND_CONNREQ_REQ, 0); + if (rc < 0) { + CERROR("error %d: could not post wildcard datagram # %d\n", + rc, i); + rc = -EINVAL; + GOTO(failed, rc); + } + } + +failed: + RETURN(rc); +} + +int +kgnilnd_cancel_net_dgrams(kgn_net_t *net) +{ + kgn_dgram_t *dg, *dgN; + struct list_head zombies; + int i; + ENTRY; + + /* we want to cancel any outstanding dgrams - we don't want to rely + * on del_peer_or_conn catching all of them. This helps protect us in cases + * where we don't quite keep the peer->dgram mapping in sync due to some + * race conditions */ + + LASSERTF(net->gnn_shutdown || kgnilnd_data.kgn_in_reset, + "called with LND invalid state: net shutdown %d " + "in reset %d\n", net->gnn_shutdown, + kgnilnd_data.kgn_in_reset); + + INIT_LIST_HEAD(&zombies); + + spin_lock(&net->gnn_dev->gnd_dgram_lock); + + for (i = 0; i < *kgnilnd_tunables.kgn_peer_hash_size; i++) { + list_for_each_entry_safe(dg, dgN, &net->gnn_dev->gnd_dgrams[i], gndg_list) { + + /* skip nids not on our net or are wildcards */ + + + if (dg->gndg_type == GNILND_DGRAM_WC_REQ || + net->gnn_netnum != LNET_NETNUM(LNET_NIDNET(dg->gndg_conn_out.gncr_dstnid))) + continue; + + kgnilnd_cancel_dgram_locked(dg); + } + } + + spin_unlock(&net->gnn_dev->gnd_dgram_lock); + + RETURN(0); +} + +int +kgnilnd_cancel_wc_dgrams(kgn_device_t *dev) +{ + kgn_dgram_t *dg, *dgN; + struct list_head zombies; + ENTRY; + + /* Time to kill the outstanding WC's + * WC's exist on net 0 only but match on any net... 
+ */ + + LASSERTF(kgnilnd_data.kgn_in_reset || kgnilnd_data.kgn_wc_kill, + "called with LND invalid state: WC shutdown %d " + "in reset %d\n", kgnilnd_data.kgn_wc_kill, + kgnilnd_data.kgn_in_reset); + + INIT_LIST_HEAD(&zombies); + spin_lock(&dev->gnd_dgram_lock); + + do { + dg = kgnilnd_find_dgram_locked(dev, LNET_NID_ANY); + if (dg != NULL) { + LASSERTF(dg->gndg_type == GNILND_DGRAM_WC_REQ, + "dgram 0x%p->%s with bad type %d (%s)\n", + dg, libcfs_nid2str(dg->gndg_conn_out.gncr_dstnid), + dg->gndg_type, kgnilnd_dgram_type2str(dg)); + + kgnilnd_cancel_dgram_locked(dg); + + /* WC could be DONE already, check and if so add to list to be released */ + if (dg->gndg_state == GNILND_DGRAM_DONE) { + list_del_init(&dg->gndg_list); + list_add_tail(&dg->gndg_list, &zombies); + } + } + } while (dg != NULL); + + spin_unlock(&dev->gnd_dgram_lock); + + list_for_each_entry_safe(dg, dgN, &zombies, gndg_list) { + list_del_init(&dg->gndg_list); + kgnilnd_release_dgram(dev, dg); + } + RETURN(0); + +} + +void +kgnilnd_wait_for_canceled_dgrams(kgn_device_t *dev) +{ + int i = 4; + int rc; + gni_return_t grc; + __u64 readyid; + kgn_dgram_t *dgram; + + /* use do while to get at least one check run to allow + * regression test for 762072 to hit bug if there */ + + /* This function races with the dgram mover during shutdown so it is possible for + * a dgram to be seen in kgnilnd_postdata_probe_wait_by_id but be handled in the + * dgram mover thread instead of inside of this function. + */ + + /* This should only be called from within shutdown, baseshutdown, or stack reset. + * there are no assertions here to verify since base_shutdown has nothing in it we can check + * the net is gone by then. + */ + + do { + i++; + CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, + "Waiting for %d canceled datagrams to clear on device %d\n", + atomic_read(&dev->gnd_canceled_dgrams), dev->gnd_id); + + /* check once a second */ + grc = kgnilnd_postdata_probe_wait_by_id(dev->gnd_handle, + 250, &readyid); + + if (grc != GNI_RC_SUCCESS) + continue; + + CDEBUG(D_NET, "ready "LPX64" on device %d->0x%p\n", + readyid, dev->gnd_id, dev); + + rc = kgnilnd_probe_for_dgram(dev, &dgram); + if (rc != 0) { + /* if we got a valid dgram or one that is now done, clean up */ + kgnilnd_release_dgram(dev, dgram); + } + } while (atomic_read(&dev->gnd_canceled_dgrams)); +} + +int +kgnilnd_start_connect(kgn_peer_t *peer) +{ + int rc = 0; + /* sync point for kgnilnd_del_peer_locked - do an early check to + * catch the most common hits where del_peer is done by the + * time we get here */ + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_GNP_CONNECTING1)) { + while (CFS_FAIL_TIMEOUT(CFS_FAIL_GNI_GNP_CONNECTING1, 1)) {}; + } + + write_lock(&kgnilnd_data.kgn_peer_conn_lock); + if (!kgnilnd_peer_active(peer) || peer->gnp_connecting != GNILND_PEER_CONNECT) { + /* raced with peer getting unlinked */ + write_unlock(&kgnilnd_data.kgn_peer_conn_lock); + rc = ESTALE; + GOTO(out, rc); + } + peer->gnp_connecting = GNILND_PEER_POSTING; + write_unlock(&kgnilnd_data.kgn_peer_conn_lock); + + set_mb(peer->gnp_last_dgram_time, jiffies); + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_GNP_CONNECTING2)) { + while (CFS_FAIL_TIMEOUT(CFS_FAIL_GNI_GNP_CONNECTING2, 1)) {}; + } + + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_GNP_CONNECTING3)) { + while (CFS_FAIL_TIMEOUT(CFS_FAIL_GNI_GNP_CONNECTING3, 1)) {}; + rc = cfs_fail_val ? 
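
The ((i & (-i)) == i) test in the wait loop above is a branch-free power-of-two check, used to promote every 2^n-th pass of the "waiting for canceled datagrams" message to D_WARNING while the rest stay at D_NET:

/* illustration only: i & -i isolates the lowest set bit, so the test is
 * true exactly when i has a single bit set (1, 2, 4, 8, ...) */
static int
example_is_power_of_two(int i)
{
        return i > 0 && (i & (-i)) == i;
}
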
cfs_fail_val : -ENOMEM; + } else { + rc = kgnilnd_post_dgram(peer->gnp_net->gnn_dev, + peer->gnp_nid, GNILND_CONNREQ_REQ, 0); + } + if (rc < 0) { + set_mb(peer->gnp_last_dgram_errno, rc); + GOTO(failed, rc); + } + + /* while we're posting someone could have decided this peer/dgram needed to + * die a quick death, so we check for state change and process accordingly */ + + write_lock(&kgnilnd_data.kgn_peer_conn_lock); + if (!kgnilnd_peer_active(peer) || peer->gnp_connecting == GNILND_PEER_NEEDS_DEATH) { + if (peer->gnp_connecting == GNILND_PEER_NEEDS_DEATH) { + peer->gnp_connecting = GNILND_PEER_KILL; + } + write_unlock(&kgnilnd_data.kgn_peer_conn_lock); + /* positive RC to avoid dgram cleanup - we'll have to + * wait for the kgni GNI_POST_TERMINATED event to + * finish cleaning up */ + rc = ESTALE; + kgnilnd_find_and_cancel_dgram(peer->gnp_net->gnn_dev, peer->gnp_nid); + GOTO(out, rc); + } + peer->gnp_connecting = GNILND_PEER_POSTED; + write_unlock(&kgnilnd_data.kgn_peer_conn_lock); + /* reaper thread will take care of any timeouts */ + CDEBUG(D_NET, "waiting for connect to finish to %s rc %d\n", + libcfs_nid2str(peer->gnp_nid), rc); + + RETURN(rc); + +failed: + CDEBUG(D_NET, "connect to %s failed: rc %d \n", + libcfs_nid2str(peer->gnp_nid), rc); +out: + RETURN(rc); +} + +int +kgnilnd_finish_connect(kgn_dgram_t *dgram) +{ + kgn_conn_t *conn = dgram->gndg_conn; + lnet_nid_t her_nid = dgram->gndg_conn_in.gncr_srcnid; + kgn_peer_t *new_peer, *peer = NULL; + kgn_tx_t *tx; + kgn_tx_t *txn; + kgn_mbox_info_t *mbox; + int rc; + int nstale; + + /* try to find a peer that matches the nid we got in the connreq + * kgnilnd_unpack_connreq makes sure that conn_in.gncr_srcnid is + * HER and conn_out.gncr_srcnid is ME for both active and WC dgrams */ + + /* assume this is a new peer - it makes locking cleaner when it isn't */ + /* no holding kgn_net_rw_sem - already are at the kgnilnd_dgram_mover level */ + + rc = kgnilnd_create_peer_safe(&new_peer, her_nid, NULL); + if (rc != 0) { + CERROR("Can't create peer for %s\n", libcfs_nid2str(her_nid)); + return rc; + } + + write_lock(&kgnilnd_data.kgn_peer_conn_lock); + + /* this transfers ref from create_peer to the kgn_peer table */ + kgnilnd_add_peer_locked(her_nid, new_peer, &peer); + + /* if we found an existing peer, is it really ready for a new conn ? 
*/ + if (peer != new_peer) { + /* if this was an active connect attempt but we can't find a peer waiting for it + * we will dump in the trash */ + + if (peer->gnp_connecting == GNILND_PEER_IDLE && dgram->gndg_conn_out.gncr_dstnid != LNET_NID_ANY) { + CDEBUG(D_NET, "dropping completed connreq for %s peer 0x%p->%s\n", + libcfs_nid2str(her_nid), peer, libcfs_nid2str(peer->gnp_nid)); + write_unlock(&kgnilnd_data.kgn_peer_conn_lock); + rc = ECANCELED; + GOTO(out, rc); + } + + /* check to see if we can catch a connecting peer before it is + * removed from the connd_peers list - if not, we need to + * let the connreqs race and be handled by kgnilnd_conn_isdup_locked() */ + if (peer->gnp_connecting != GNILND_PEER_IDLE) { + spin_lock(&peer->gnp_net->gnn_dev->gnd_connd_lock); + if (!list_empty(&peer->gnp_connd_list)) { + list_del_init(&peer->gnp_connd_list); + /* drop connd ref */ + kgnilnd_peer_decref(peer); + } + spin_unlock(&peer->gnp_net->gnn_dev->gnd_connd_lock); + /* clear rc to make sure we don't have fake error */ + rc = 0; + } + + /* no matter what, we are no longer waiting to connect this peer now */ + peer->gnp_connecting = GNILND_PEER_IDLE; + + /* Refuse to duplicate an existing connection (both sides might try to + * connect at once). NB we return success! We _are_ connected so we + * _don't_ have any blocked txs to complete with failure. */ + rc = kgnilnd_conn_isdup_locked(peer, conn); + if (rc != 0) { + write_unlock(&kgnilnd_data.kgn_peer_conn_lock); + CDEBUG(D_NET, "Not creating duplicate connection to %s: %d\n", + libcfs_nid2str(her_nid), rc); + rc = EALREADY; + GOTO(out, rc); + } + } + + nstale = kgnilnd_close_stale_conns_locked(peer, conn); + + /* either way with peer (new or existing), we are ok with ref counts here as the + * kgnilnd_add_peer_locked will use our ref on new_peer (from create_peer_safe) as the + * ref for the peer table. 
*/ + + /* at this point, the connection request is a winner */ + + /* mark 'DONE' to avoid cancel being called from release */ + dgram->gndg_state = GNILND_DGRAM_DONE; + + /* initialise timestamps before reaper looks at them */ + conn->gnc_last_rx = conn->gnc_last_rx_cq = jiffies; + + /* last_tx is initialized to jiffies - (keepalive*2) so that if the NOOP fails it will + * immediatly send a NOOP in the reaper thread during the call to + * kgnilnd_check_conn_timeouts_locked + */ + conn->gnc_last_tx = jiffies - (cfs_time_seconds(GNILND_TO2KA(conn->gnc_timeout)) * 2); + conn->gnc_state = GNILND_CONN_ESTABLISHED; + + /* refs are not transferred from dgram to tables, so increment to + * take ownership */ + kgnilnd_conn_addref(conn); + kgnilnd_peer_addref(peer); + conn->gnc_peer = peer; + list_add_tail(&conn->gnc_list, &peer->gnp_conns); + + kgnilnd_conn_addref(conn); /* +1 ref for conn table */ + list_add_tail(&conn->gnc_hashlist, + kgnilnd_cqid2connlist(conn->gnc_cqid)); + kgnilnd_data.kgn_conn_version++; + + /* Dont send NOOP if fail_loc is set + */ + if (!CFS_FAIL_CHECK(CFS_FAIL_GNI_ONLY_NOOP)) { + tx = kgnilnd_new_tx_msg(GNILND_MSG_NOOP, peer->gnp_net->gnn_ni->ni_nid); + if (tx == NULL) { + CNETERR("can't get TX to initiate NOOP to %s\n", + libcfs_nid2str(peer->gnp_nid)); + } else { + kgnilnd_queue_tx(conn, tx); + } + } + + /* Schedule all packets blocking for a connection */ + list_for_each_entry_safe(tx, txn, &peer->gnp_tx_queue, tx_list) { + /* lock held here is the peer_conn lock */ + kgnilnd_tx_del_state_locked(tx, peer, NULL, GNILND_TX_ALLOCD); + kgnilnd_queue_tx(conn, tx); + } + + /* If this is an active connection lets mark its timestamp on the MBoX */ + if (dgram->gndg_conn_out.gncr_dstnid != LNET_NID_ANY) { + mbox = &conn->gnc_fma_blk->gnm_mbox_info[conn->gnc_mbox_id]; + /* conn->gnc_last_rx is jiffies it better exist as it was just set */ + mbox->mbx_release_purg_active_dgram = conn->gnc_last_rx; + } + + /* Bug 765042: wake up scheduler for a race with finish_connect and + * complete_conn_closed with a conn in purgatory + * since we can't use CFS_RACE due to mutex_holds in kgnilnd_process_conns, + * we just check for set and then clear */ + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_FINISH_PURG)) { + cfs_fail_loc = 0x0; + /* get scheduler thread moving again */ + kgnilnd_schedule_device(conn->gnc_device); + } + + CDEBUG(D_NET, "New conn 0x%p->%s dev %d\n", + conn, libcfs_nid2str(her_nid), conn->gnc_device->gnd_id); + + /* make sure we reset peer reconnect interval now that we have a good conn */ + kgnilnd_peer_alive(peer); + peer->gnp_reconnect_interval = 0; + + /* clear the unlink attribute if we dont clear it kgnilnd_del_conn_or_peer will wait + * on the atomic forever + */ + if (peer->gnp_pending_unlink) { + peer->gnp_pending_unlink = 0; + kgnilnd_admin_decref(kgnilnd_data.kgn_npending_unlink); + CDEBUG(D_NET, "Clearing peer unlink %p\n",peer); + } + + /* add ref to make it hang around until after we drop the lock */ + kgnilnd_conn_addref(conn); + + /* Once the peer_conn lock is dropped, the conn could actually move into + * CLOSING->CLOSED->DONE in the scheduler thread, so hold the + * lock until we are really done */ + write_unlock(&kgnilnd_data.kgn_peer_conn_lock); + + /* Notify LNET that we now have a working connection to this peer. + * This is a Cray extension to the "standard" LND behavior. 
*/ + lnet_notify(peer->gnp_net->gnn_ni, peer->gnp_nid, + 1, cfs_time_current()); + + /* schedule the conn to pick up any SMSG sent by peer before we could + * process this dgram */ + kgnilnd_schedule_conn(conn); + + /* drop our 'hold' ref */ + kgnilnd_conn_decref(conn); + +out: + RETURN(rc); +} + +void +kgnilnd_send_nak(kgn_device_t *dev, lnet_nid_t dst_nid, int error) +{ + int rc = 0; + ENTRY; + + LASSERTF(dst_nid != LNET_NID_ANY, "bad dst_nid %s\n", libcfs_nid2str(dst_nid)); + + CDEBUG(D_NET, "NAK to %s errno %d\n", libcfs_nid2str(dst_nid), error); + + rc = kgnilnd_post_dgram(dev, dst_nid, GNILND_CONNREQ_NAK, error); + + if (rc < 0) { + CDEBUG(D_NET, "NAK to %s failed: rc %d \n", libcfs_nid2str(dst_nid), rc); + } + EXIT; +} + +int +kgnilnd_process_nak(kgn_dgram_t *dgram) +{ + kgn_connreq_t *connreq = &dgram->gndg_conn_in; + lnet_nid_t src_nid = connreq->gncr_srcnid; + int errno = connreq->gncr_nakdata.gnnd_errno; + kgn_peer_t *peer; + int rc = 0; + + write_lock(&kgnilnd_data.kgn_peer_conn_lock); + + peer = kgnilnd_find_peer_locked(src_nid); + if (peer == NULL) { + /* we likely dropped him from bad data when we processed + * the original REQ */ + write_unlock(&kgnilnd_data.kgn_peer_conn_lock); + return -EBADSLT; + } + + /* need to check peerstamp/connstamp against the ones we find + * to make sure we don't close new (and good?) conns that we + * formed after this connreq failed */ + if (peer->gnp_connecting == GNILND_PEER_IDLE) { + kgn_conn_t conn; + + if (list_empty(&peer->gnp_conns)) { + /* assume already procced datagram and it barfed up + * on this side too */ + CDEBUG(D_NET, "dropping NAK from %s; " + "peer %s is already not connected\n", + libcfs_nid2str(connreq->gncr_srcnid), + libcfs_nid2str(connreq->gncr_dstnid)); + write_unlock(&kgnilnd_data.kgn_peer_conn_lock); + return 0; + } + + /* stub up a connection with the connreq XXX_stamps to allow + * use to use close_stale_conns_locked */ + conn.gnc_peerstamp = connreq->gncr_peerstamp; + conn.gnc_my_connstamp = connreq->gncr_connstamp; + conn.gnc_peer_connstamp = connreq->gncr_connstamp; + conn.gnc_device = peer->gnp_net->gnn_dev; + + rc = kgnilnd_close_stale_conns_locked(peer, &conn); + + LCONSOLE_INFO("Received NAK from %s for %s errno %d; " + "closed %d connections\n", + libcfs_nid2str(connreq->gncr_srcnid), + libcfs_nid2str(connreq->gncr_dstnid), errno, rc); + } else { + rc = 0; + spin_lock(&dgram->gndg_conn->gnc_device->gnd_connd_lock); + + if (list_empty(&peer->gnp_connd_list)) { + /* if peer isn't on waiting list, try to find one to nuke */ + rc = kgnilnd_find_and_cancel_dgram(peer->gnp_net->gnn_dev, + peer->gnp_nid); + + if (rc) { + LCONSOLE_INFO("Received NAK from %s for %s errno %d; " + "canceled pending connect request\n", + libcfs_nid2str(connreq->gncr_srcnid), + libcfs_nid2str(connreq->gncr_dstnid), errno); + } + + /* if we can't find a waiting dgram, we just drop the nak - the conn + * connect must have failed (didn't find conn above and clear connecting + * -- so nothing to do besides drop */ + } else { + /* peer is on list, meaning it is a new connect attempt from the one + * we started that generated the NAK - so just drop NAK */ + + /* use negative to prevent error message */ + rc = -EAGAIN; + } + spin_unlock(&dgram->gndg_conn->gnc_device->gnd_connd_lock); + } + + /* success! 
we found a peer and at least marked pending_nak */ + write_unlock(&kgnilnd_data.kgn_peer_conn_lock); + + return 0; +} + +int +kgnilnd_process_connreq(kgn_dgram_t *dgram, int *needs_nak) +{ + int rc; + + rc = kgnilnd_unpack_connreq(dgram); + if (rc < 0) { + if (rc != -EBADF) { + /* only NAK if we have good srcnid to use */ + *needs_nak = 1; + } + goto connreq_out; + } + + switch (dgram->gndg_conn_in.gncr_type) { + case GNILND_CONNREQ_REQ: + /* wire up peer & conn, send queued TX */ + rc = kgnilnd_finish_connect(dgram); + + /* don't nak when the nid is hosed */ + if ((rc < 0)) { + *needs_nak = 1; + } + + break; + case GNILND_CONNREQ_NAK: + rc = kgnilnd_process_nak(dgram); + /* return early to prevent reconnect bump */ + return rc; + default: + CERROR("unexpected connreq type %s (%d) from %s\n", + kgnilnd_connreq_type2str(&dgram->gndg_conn_in), + dgram->gndg_conn_in.gncr_type, + libcfs_nid2str(dgram->gndg_conn_in.gncr_srcnid)); + rc = -EINVAL; + *needs_nak = 1; + break; + } + +connreq_out: + RETURN(rc); +} + +int +kgnilnd_probe_and_process_dgram(kgn_device_t *dev) +{ + int rc; + int needs_nak = 0; + lnet_nid_t nak_dstnid = LNET_NID_ANY; + lnet_nid_t orig_dstnid; + kgn_dgram_t *dgram = NULL; + kgn_peer_t *peer; + ENTRY; + + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_PAUSE_DGRAM_COMP)) { + rc = 0; + } else { + rc = kgnilnd_probe_for_dgram(dev, &dgram); + } + + if (rc == 0) { + RETURN(0); + } else if (rc < 0) { + GOTO(inform_peer, rc); + } else { + /* rc > 1 means it did something, reset for this func */ + rc = 0; + } + + switch (dgram->gndg_type) { + case GNILND_DGRAM_WC_REQ: + case GNILND_DGRAM_REQ: + rc = kgnilnd_process_connreq(dgram, &needs_nak); + break; + case GNILND_DGRAM_NAK: + CDEBUG(D_NETTRACE, "NAK to %s done\n", + libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid)); + break; + default: + CERROR("unknown datagram type %s (%d)\n", + kgnilnd_dgram_type2str(dgram), dgram->gndg_type); + break; + } + + /* stash data to use after releasing current datagram */ + /* don't stash net - we are operating on a net already, + * so the lock on rw_net_lock is sufficient */ + + nak_dstnid = dgram->gndg_conn_in.gncr_srcnid; + +inform_peer: + LASSERTF(dgram != NULL, "dgram 0x%p rc %d needs_nak %d\n", dgram, rc, needs_nak); + + orig_dstnid = dgram->gndg_conn_out.gncr_dstnid; + + kgnilnd_release_dgram(dev, dgram); + + CDEBUG(D_NET, "cleaning up dgram to %s, rc %d\n", + libcfs_nid2str(orig_dstnid), rc); + + /* if this was a WC_REQ that matched an existing peer, it'll get marked done + * in kgnilnd_finish_connect - if errors are from before we get to there, + * we just drop as it is a WC_REQ - the peer CAN'T be waiting for it */ + if ((orig_dstnid != LNET_NID_ANY) && (rc < 0)) { + /* if we have a negative rc, we want to find a peer to inform about + * the bad connection attempt. Sorry buddy, better luck next time! */ + + write_lock(&kgnilnd_data.kgn_peer_conn_lock); + peer = kgnilnd_find_peer_locked(orig_dstnid); + + if (peer != NULL) { + /* add ref to make sure he stays around past the possible unlink + * so we can tell LNet about him */ + kgnilnd_peer_addref(peer); + + /* if he still cares about the outstanding connect */ + if (peer->gnp_connecting >= GNILND_PEER_CONNECT) { + /* check if he is on the connd list and remove.. 
*/ + spin_lock(&peer->gnp_net->gnn_dev->gnd_connd_lock); + if (!list_empty(&peer->gnp_connd_list)) { + list_del_init(&peer->gnp_connd_list); + /* drop connd ref */ + kgnilnd_peer_decref(peer); + } + spin_unlock(&peer->gnp_net->gnn_dev->gnd_connd_lock); + + /* clear gnp_connecting so we don't have a non-connecting peer + * on gnd_connd_list */ + peer->gnp_connecting = GNILND_PEER_IDLE; + + set_mb(peer->gnp_last_dgram_errno, rc); + + kgnilnd_peer_increase_reconnect_locked(peer); + } + } + write_unlock(&kgnilnd_data.kgn_peer_conn_lock); + + /* now that we are outside the lock, tell Mommy */ + if (peer != NULL) { + kgnilnd_peer_notify(peer, rc); + kgnilnd_peer_decref(peer); + } + } + + if (needs_nak) { + kgnilnd_send_nak(dev, nak_dstnid, rc); + } + + RETURN(1); +} + +void +kgnilnd_reaper_dgram_check(kgn_device_t *dev) +{ + kgn_dgram_t *dgram, *tmp; + int i; + + spin_lock(&dev->gnd_dgram_lock); + + for (i = 0; i < (*kgnilnd_tunables.kgn_peer_hash_size - 1); i++) { + list_for_each_entry_safe(dgram, tmp, &dev->gnd_dgrams[i], gndg_list) { + unsigned long now = jiffies; + unsigned long timeout; + + /* don't timeout stuff if the network is mucked or shutting down */ + if (kgnilnd_check_hw_quiesce()) { + break; + } + + if ((dgram->gndg_state != GNILND_DGRAM_POSTED) || + (dgram->gndg_type == GNILND_DGRAM_WC_REQ)) { + continue; + } + CDEBUG(D_NETTRACE, "checking dgram 0x%p type %s " + "state %s conn 0x%p to %s age %lus\n", + dgram, kgnilnd_dgram_type2str(dgram), + kgnilnd_dgram_state2str(dgram), dgram->gndg_conn, + libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid), + cfs_duration_sec(now - dgram->gndg_post_time)); + + timeout = cfs_time_seconds(*kgnilnd_tunables.kgn_timeout); + + if (time_before(now, (dgram->gndg_post_time + timeout))) + continue; + + CNETERR("%s datagram to %s timed out @ %lus dgram " + "0x%p state %s conn 0x%p\n", + kgnilnd_dgram_type2str(dgram), + libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid), + cfs_duration_sec(now - dgram->gndg_post_time), + dgram, kgnilnd_dgram_state2str(dgram), + dgram->gndg_conn); + + kgnilnd_cancel_dgram_locked(dgram); + } + } + spin_unlock(&dev->gnd_dgram_lock); +} + + +/* use a thread for the possibly long-blocking wait_by_id to prevent + * stalling the global workqueues */ +int +kgnilnd_dgram_waitq(void *arg) +{ + kgn_device_t *dev = (kgn_device_t *) arg; + char name[16]; + gni_return_t grc; + __u64 readyid; + DEFINE_WAIT(mover_done); + + snprintf(name, sizeof(name), "kgnilnd_dgn_%02d", dev->gnd_id); + cfs_daemonize(name); + cfs_block_allsigs(); + + /* all gnilnd threads need to run fairly urgently */ + set_user_nice(current, *kgnilnd_tunables.kgn_nice); + + /* we dont shut down until the device shuts down ... 
*/ + while (!kgnilnd_data.kgn_shutdown) { + /* to quiesce or to not quiesce, that is the question */ + if (unlikely(kgnilnd_data.kgn_quiesce_trigger)) { + KGNILND_SPIN_QUIESCE; + } + + while (CFS_FAIL_TIMEOUT(CFS_FAIL_GNI_PAUSE_DGRAM_COMP, 1)) {} + + /* check once a second */ + grc = kgnilnd_postdata_probe_wait_by_id(dev->gnd_handle, + 1000, &readyid); + + if (grc == GNI_RC_SUCCESS) { + CDEBUG(D_INFO, "waking up dgram mover thread\n"); + kgnilnd_schedule_dgram(dev); + + /* wait for dgram thread to ping us before spinning again */ + prepare_to_wait(&dev->gnd_dgping_waitq, &mover_done, + TASK_INTERRUPTIBLE); + + /* don't sleep if we need to quiesce */ + if (likely(!kgnilnd_data.kgn_quiesce_trigger)) { + schedule(); + } + finish_wait(&dev->gnd_dgping_waitq, &mover_done); + } + } + + kgnilnd_thread_fini(); + return 0; +} + +int +kgnilnd_start_outbound_dgrams(kgn_device_t *dev) +{ + int did_something = 0, rc; + kgn_peer_t *peer = NULL; + + spin_lock(&dev->gnd_connd_lock); + + /* Active connect - we added this in kgnilnd_launch_tx */ + while (!list_empty(&dev->gnd_connd_peers)) { + peer = list_first_entry(&dev->gnd_connd_peers, + kgn_peer_t, gnp_connd_list); + + /* ref for connd removed in if/else below */ + list_del_init(&peer->gnp_connd_list); + + /* gnp_connecting and membership on gnd_connd_peers should be + * done coherently to avoid double adding, etc */ + /* don't need kgnilnd_data.kgn_peer_conn_lock here as that is only needed + * to get the peer to gnp_connecting in the first place. We just need to + * rely on gnd_connd_lock to serialize someone pulling him from the list + * BEFORE clearing gnp_connecting */ + LASSERTF(peer->gnp_connecting != GNILND_PEER_IDLE, "peer 0x%p->%s not connecting\n", + peer, libcfs_nid2str(peer->gnp_nid)); + + spin_unlock(&dev->gnd_connd_lock); + + CDEBUG(D_NET, "processing connect to %s\n", + libcfs_nid2str(peer->gnp_nid)); + + did_something += 1; + rc = kgnilnd_start_connect(peer); + + if (likely(rc >= 0)) { + /* 0 on success, positive on 'just drop peer' errors */ + kgnilnd_peer_decref(peer); + } else if (rc == -ENOMEM) { + /* if we are out of wildcards, add back to + * connd_list - then break out and we'll try later + * if other errors, we'll bail & cancel pending tx */ + write_lock(&kgnilnd_data.kgn_peer_conn_lock); + if (peer->gnp_connecting == GNILND_PEER_POSTING) { + peer->gnp_connecting = GNILND_PEER_CONNECT; + spin_lock(&dev->gnd_connd_lock); + list_add_tail(&peer->gnp_connd_list, + &dev->gnd_connd_peers); + } else { + /* connecting changed while we were posting */ + + LASSERTF(peer->gnp_connecting == GNILND_PEER_NEEDS_DEATH, "Peer is in invalid" + " state 0x%p->%s, connecting %d\n", + peer, libcfs_nid2str(peer->gnp_nid), peer->gnp_connecting); + peer->gnp_connecting = GNILND_PEER_KILL; + spin_lock(&dev->gnd_connd_lock); + /* remove the peer ref frrom the cond list */ + kgnilnd_peer_decref(peer); + /* let the system handle itself */ + } + write_unlock(&kgnilnd_data.kgn_peer_conn_lock); + /* the datagrams are a global pool, + * so break out of trying and hope some free + * up soon */ + did_something -= 1; + break; + } else { + /* something bad happened, you lose */ + CNETERR("could not start connecting to %s " + "rc %d: Will retry until TX timeout\n", + libcfs_nid2str(peer->gnp_nid), rc); + /* It didnt post so just set connecting back to zero now. + * The reaper will reattempt the connection if it needs too. + * If the peer needs death set it so the reaper will cleanup. 
+ */ + write_lock(&kgnilnd_data.kgn_peer_conn_lock); + if (peer->gnp_connecting == GNILND_PEER_POSTING) { + peer->gnp_connecting = GNILND_PEER_IDLE; + kgnilnd_peer_increase_reconnect_locked(peer); + } else { + LASSERTF(peer->gnp_connecting == GNILND_PEER_NEEDS_DEATH, "Peer is in invalid" + " state 0x%p->%s, connecting %d\n", + peer, libcfs_nid2str(peer->gnp_nid), peer->gnp_connecting); + peer->gnp_connecting = GNILND_PEER_KILL; + } + write_unlock(&kgnilnd_data.kgn_peer_conn_lock); + + /* hold onto ref until we are really done - if it was + * unlinked this could result in a destroy */ + kgnilnd_peer_decref(peer); + } + spin_lock(&dev->gnd_connd_lock); + } + + spin_unlock(&dev->gnd_connd_lock); + RETURN(did_something); +} + +static void +kgnilnd_dgram_poke_with_stick(unsigned long arg) +{ + int dev_id = arg; + kgn_device_t *dev = &kgnilnd_data.kgn_devices[dev_id]; + + wake_up(&dev->gnd_dgram_waitq); +} + +/* use single thread for dgrams - should be sufficient for performance */ +int +kgnilnd_dgram_mover(void *arg) +{ + kgn_device_t *dev = (kgn_device_t *)arg; + char name[16]; + int rc, did_something; + unsigned long next_purge_check = jiffies - 1; + unsigned long timeout; + struct timer_list timer; + DEFINE_WAIT(wait); + + snprintf(name, sizeof(name), "kgnilnd_dg_%02d", dev->gnd_id); + cfs_daemonize(name); + cfs_block_allsigs(); + /* all gnilnd threads need to run fairly urgently */ + set_user_nice(current, *kgnilnd_tunables.kgn_nice); + + /* we are ok not locking for these variables as the dgram waitq threads + * will block both due to tying up net (kgn_shutdown) and the completion + * event for the dgram_waitq (kgn_quiesce_trigger) */ + + while (!kgnilnd_data.kgn_shutdown) { + /* Safe: kgn_shutdown only set when quiescent */ + + /* race with stack reset - we want to hold off seeing any new incoming dgrams + * so we can force a dirty WC dgram for Bug 762072 - put right before + * quiesce check so that it'll go right into that and not do any + * dgram mucking */ + CFS_RACE(CFS_FAIL_GNI_WC_DGRAM_FREE); + + /* to quiesce or to not quiesce, that is the question */ + if (unlikely(kgnilnd_data.kgn_quiesce_trigger)) { + KGNILND_SPIN_QUIESCE; + } + did_something = 0; + + CFS_RACE(CFS_FAIL_GNI_QUIESCE_RACE); + + /* process any newly completed dgrams */ + down_read(&kgnilnd_data.kgn_net_rw_sem); + + rc = kgnilnd_probe_and_process_dgram(dev); + if (rc > 0) { + did_something += rc; + } + + up_read(&kgnilnd_data.kgn_net_rw_sem); + + /* start new outbound dgrams */ + did_something += kgnilnd_start_outbound_dgrams(dev); + + /* find dead dgrams */ + if (time_after_eq(jiffies, next_purge_check)) { + /* these don't need to be checked that often */ + kgnilnd_reaper_dgram_check(dev); + + next_purge_check = (long) jiffies + + cfs_time_seconds(kgnilnd_data.kgn_new_min_timeout / 4); + } + + /* careful with the jiffy wrap... 
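
The mover bounds the schedule() sleep below by arming a one-shot timer that fires kgnilnd_dgram_poke_with_stick(). The generic shape of that pattern, as a hedged standalone sketch (names assumed, quiesce/did_something checks omitted; the timer and waitqueue APIs come in via gnilnd.h):

/* illustration only: timer-bounded interruptible sleep */
static void
example_poke(unsigned long arg)
{
        wake_up((wait_queue_head_t *)arg);
}

static void
example_bounded_sleep(wait_queue_head_t *wq, long timeout_jiffies)
{
        struct timer_list timer;
        DEFINE_WAIT(wait);

        setup_timer(&timer, example_poke, (unsigned long)wq);
        mod_timer(&timer, jiffies + timeout_jiffies);

        prepare_to_wait(wq, &wait, TASK_INTERRUPTIBLE);
        schedule();
        finish_wait(wq, &wait);

        del_singleshot_timer_sync(&timer);
}
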
*/ + timeout = (long)(next_purge_check - jiffies); + + CDEBUG(D_INFO, "did %d timeout %lu next %lu jiffies %lu\n", + did_something, timeout, next_purge_check, jiffies); + + if (did_something || timeout <= 0) { + did_something = 0; + continue; + } + + prepare_to_wait(&dev->gnd_dgram_waitq, &wait, TASK_INTERRUPTIBLE); + + setup_timer(&timer, kgnilnd_dgram_poke_with_stick, dev->gnd_id); + mod_timer(&timer, (long) jiffies + timeout); + + /* last second chance for others to poke us */ + did_something += xchg(&dev->gnd_dgram_ready, GNILND_DGRAM_IDLE); + + /* check flag variables before comitting */ + if (!did_something && + !kgnilnd_data.kgn_shutdown && + !kgnilnd_data.kgn_quiesce_trigger) { + CDEBUG(D_INFO, "schedule timeout %ld (%lu sec)\n", + timeout, cfs_duration_sec(timeout)); + wake_up_all(&dev->gnd_dgping_waitq); + schedule(); + CDEBUG(D_INFO, "awake after schedule\n"); + } + + del_singleshot_timer_sync(&timer); + finish_wait(&dev->gnd_dgram_waitq, &wait); + } + + kgnilnd_thread_fini(); + return 0; +} + diff --git a/lnet/klnds/gnilnd/gnilnd_debug.c b/lnet/klnds/gnilnd/gnilnd_debug.c new file mode 100644 index 0000000..8230d98 --- /dev/null +++ b/lnet/klnds/gnilnd/gnilnd_debug.c @@ -0,0 +1,108 @@ +/* + * Copyright (C) 2009-2012 Cray, Inc. + * + * Author: Nic Henke + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ +#include "gnilnd.h" + +void +_kgnilnd_debug_msg(kgn_msg_t *msg, struct libcfs_debug_msg_data *msgdata, + const char *fmt, ...) +{ + va_list args; + + va_start(args, fmt); + /* XXX Nic TBD: add handling of gnm_u ? */ + libcfs_debug_vmsg2(msgdata, fmt, args, + " msg@0x%p m/v/ty/ck/pck/pl %08x/%d/%d/%x/%x/%d x%d:%s\n", + msg, msg->gnm_magic, msg->gnm_version, msg->gnm_type, + msg->gnm_cksum, msg->gnm_payload_cksum, + msg->gnm_payload_len, msg->gnm_seq, + kgnilnd_msgtype2str(msg->gnm_type)); + va_end(args); +} + +void +_kgnilnd_debug_conn(kgn_conn_t *conn, struct libcfs_debug_msg_data *msgdata, + const char *fmt, ...) +{ + va_list args; + + va_start(args, fmt); + libcfs_debug_vmsg2(msgdata, fmt, args, + " conn@0x%p->%s:%s cq %u, to %ds, " + " RX %d @ %lu/%lus; TX %d @ %lus/%lus; " + " NOOP %lus/%lu/%lus; sched %lus/%lus/%lus ago \n", + conn, conn->gnc_peer ? 
libcfs_nid2str(conn->gnc_peer->gnp_nid) : + "", kgnilnd_conn_state2str(conn), + conn->gnc_cqid, conn->gnc_timeout, + conn->gnc_rx_seq, + cfs_duration_sec(jiffies - conn->gnc_last_rx), + cfs_duration_sec(jiffies - conn->gnc_last_rx_cq), + conn->gnc_tx_seq, + cfs_duration_sec(jiffies - conn->gnc_last_tx), + cfs_duration_sec(jiffies - conn->gnc_last_tx_cq), + cfs_duration_sec(jiffies - conn->gnc_last_noop_want), + cfs_duration_sec(jiffies - conn->gnc_last_noop_sent), + cfs_duration_sec(jiffies - conn->gnc_last_noop_cq), + cfs_duration_sec(jiffies - conn->gnc_last_sched_ask), + cfs_duration_sec(jiffies - conn->gnc_last_sched_do), + cfs_duration_sec(jiffies - conn->gnc_device->gnd_sched_alive)); + + + va_end(args); +} + +void +_kgnilnd_debug_tx(kgn_tx_t *tx, struct libcfs_debug_msg_data *msgdata, + const char *fmt, ...) +{ + kgn_tx_ev_id_t *id = &tx->tx_id; + char *nid = ""; + va_list args; + + if (tx->tx_conn && tx->tx_conn->gnc_peer) { + nid = libcfs_nid2str(tx->tx_conn->gnc_peer->gnp_nid); + } + + va_start(args, fmt); + libcfs_debug_vmsg2(msgdata, fmt, args, + " tx@0x%p->%s id "LPX64"/%u/%d:%d msg %x/%s/%d q %s@%lds->0x%p f %x re %d\n", + tx, nid, id->txe_cookie, id->txe_smsg_id, id->txe_cqid, + id->txe_idx, tx->tx_msg.gnm_type, + kgnilnd_msgtype2str(tx->tx_msg.gnm_type), tx->tx_buftype, + kgnilnd_tx_state2str(tx->tx_list_state), + cfs_duration_sec((long)jiffies - tx->tx_qtime), tx->tx_list_p, + tx->tx_state, tx->tx_retrans); + va_end(args); +} + +void +_kgnilnd_api_rc_lbug(const char* rcstr, int rc, struct libcfs_debug_msg_data *msgdata, + const char *fmt, ...) +{ + va_list args; + + va_start(args, fmt); + libcfs_debug_vmsg2(msgdata, fmt, args, + " GNI API violated? Unexpected rc %s(%d)!\n", + rcstr, rc); + va_end(args); + LBUG(); +} diff --git a/lnet/klnds/gnilnd/gnilnd_hss_ops.h b/lnet/klnds/gnilnd/gnilnd_hss_ops.h new file mode 100644 index 0000000..ec75177 --- /dev/null +++ b/lnet/klnds/gnilnd/gnilnd_hss_ops.h @@ -0,0 +1,284 @@ +/* + * Copyright (C) 2010-2012 Cray, Inc. + * Author: Nic Henke + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ * + */ +#ifndef _GNILND_HSS_OPS_H +#define _GNILND_HSS_OPS_H + +/* for krca nid & nic translation */ +#include +#include + +/* the SimNow nodes can't load rca.ko, so we need to detect this + * and fake a table that'd work for lookups there */ + +typedef struct kgn_nid_entry { + __u32 nid; + __u32 nicaddr; +} kgn_nid_entry_t; + +typedef struct kgn_hssops +{ + /* function pointers for nid and nic conversion */ + /* from krca_lib.h */ + int (*nid_to_nicaddr)(__u32 nid, int numnic, __u32 *nicaddr); + int (*nicaddr_to_nid)(__u32 nicaddr, __u32 *nid); + void (*hb_to_l0)(void); +} kgn_hssops_t; + +/* pull in static store in gnilnd.c */ +extern kgn_hssops_t kgnilnd_hssops; + +#define GNILND_NO_RCA 0xdeadbeef +#define GNILND_NO_QUIESCE 0xdeadbeef + +static inline int +kgnilnd_lookup_rca_funcs(void) +{ + void *funcp; + + funcp = __symbol_get("send_hb_2_l0"); + if (funcp == 0) { + CERROR("couldn't find send_hb_2_l0\n"); + /* not fatal for now */ + } else { + kgnilnd_hssops.hb_to_l0 = funcp; + } + + /* if we find one, we should get the other */ + + funcp = __symbol_get("krca_nid_to_nicaddrs"); + if (funcp == 0) { + kgnilnd_hssops.nid_to_nicaddr = (void *)GNILND_NO_RCA; + kgnilnd_hssops.nicaddr_to_nid = (void *)GNILND_NO_RCA; + LCONSOLE_INFO("using SimNow nid table for RCA translation\n"); + return 0; + } + kgnilnd_hssops.nid_to_nicaddr = funcp; + + funcp = __symbol_get("krca_nicaddr_to_nid"); + if (funcp == 0) { + CERROR("found krca_nid_to_nicaddrs but not " + "krca_nicaddr_to_nid\n"); + return -ESRCH; + } + kgnilnd_hssops.nicaddr_to_nid = funcp; + return 0; +} + +#if defined(CONFIG_CRAY_GEMINI) +/* Gemini SimNow has a hard coded table to use - no RCA there */ +#define GNILND_MAX_NID_TABLE 0xffffffff +/* this is all of the nodes defined in the Baker SimNow "sim_platforms" page */ +static kgn_nid_entry_t kgn_nid_table[] = { + {0x1, 0x100}, {0x2, 0x101}, {0x3, 0x104}, {0x4, 0x105}, + {0x5, 0x108}, {0x6, 0x109}, {0x7, 0x10c}, {0x8, 0x10d}, + {0x9, 0x110}, {0xa, 0x111}, {0xb, 0x114}, {0xc, 0x115}, + {0xd, 0x118}, {0xe, 0x119}, {0xf, 0x11c}, {0x10, 0x11d}, + {0x11, 0x120}, {0x12, 0x121}, {0x13, 0x124}, {0x14, 0x125}, + {0x15, 0x128}, {0x16, 0x129}, {0x17, 0x12c}, {0x18, 0x12d}, + {0x19, 0x130}, {0x1a, 0x131}, {0x1b, 0x134}, {0x1c, 0x135}, + {0x1d, 0x138}, {0x1e, 0x139}, {0x1f, 0x13c}, {0x20, 0x13d}, + {0x21, 0x140}, {0x22, 0x141}, {0x23, 0x144}, {0x24, 0x145}, + {0x25, 0x148}, {0x26, 0x149}, {0x27, 0x14c}, {0x28, 0x14d}, + {0x29, 0x150}, {0x2a, 0x151}, {0x2b, 0x154}, {0x2c, 0x155}, + {0x2d, 0x158}, {0x2e, 0x159}, {0x2f, 0x15c}, {0x30, 0x15d}, + {0x31, 0x160}, {0x32, 0x161}, {0x33, 0x164}, {0x3d, 0x178}, + {0x34, 0x165}, {0x3e, 0x179}, {0x35, 0x168}, {0x3f, 0x17c}, + {0x36, 0x169}, {0x40, 0x17d}, {0x37, 0x16c}, {0x41, 0x180}, + {0x38, 0x16d}, {0x42, 0x181}, {0x39, 0x170}, {0x3a, 0x171}, + {0x3b, 0x174}, {0x3c, 0x175}, {0x43, 0x184}, {0x44, 0x185}, + {0x45, 0x188}, {0x46, 0x189}, {0x47, 0x18c}, {0x48, 0x18d}, + /* entries after this are for 'dead' peer tests */ + {0x63, 0x1ff}, {0x111, 0x209}, + {GNILND_MAX_NID_TABLE, GNILND_MAX_NID_TABLE} +}; +static int +gemini_nid_to_nicaddr(__u32 nid, int numnic, __u32 *nicaddr) +{ + int i; + + /* GNILND_NO_RCA, so use hardcoded table for Gemini SimNow */ + if (numnic > 1) { + CERROR("manual nid2nic translation doesn't support" + "multiple nic addrs (you asked for %d)\n", + numnic); + return -EINVAL; + } + + for (i = 0;;i++) { + if (kgn_nid_table[i].nid == GNILND_MAX_NID_TABLE) { + CERROR("could not translate %u to a NIC " + "address\n", nid); + return -ESRCH; + } + if 
(kgn_nid_table[i].nid == nid) { + *nicaddr = kgn_nid_table[i].nicaddr; + return 1; + } + } +} + +static int +gemini_nicaddr_to_nid(__u32 nicaddr, __u32 *nid) +{ + int i; + + /* GNILND_RCA_NOT_HOME, so use hardcoded table for SimNow */ + for (i = 0;;i++) { + if (kgn_nid_table[i].nicaddr == GNILND_MAX_NID_TABLE) { + CERROR("could not translate NIC address " + "%u\n", + nicaddr); + return -ESRCH; + } + if (kgn_nid_table[i].nicaddr == nicaddr) { + *nid = kgn_nid_table[i].nid; + return 1; + } + } +} + +static inline int +kgnilnd_setup_nic_translation(__u32 device_id) +{ + int rc; + + /* do lookup on first use */ + if (unlikely(kgnilnd_hssops.nid_to_nicaddr == NULL)) { + rc = kgnilnd_lookup_rca_funcs(); + if (rc) + return rc; + } + + /* if we have a real function, return - we'll use those going forward */ + if (likely(kgnilnd_hssops.nid_to_nicaddr != (void *)GNILND_NO_RCA)) + return 0; + + kgnilnd_hssops.nid_to_nicaddr = gemini_nid_to_nicaddr; + kgnilnd_hssops.nicaddr_to_nid = gemini_nicaddr_to_nid; + return 0; +} + +#elif defined(CONFIG_CRAY_ARIES) +/* for libcfs_ipif_query */ +#include + +/* Aries Sim doesn't have hardcoded tables, so we'll hijack the nic_pe + * and decode our address and nic addr from that - the rest are just offsets */ +static __u32 aries_sim_base_nid; +static __u32 aries_sim_nic; + +static int +aries_nid_to_nicaddr(__u32 nid, int numnic, __u32 *nicaddr) +{ + if (numnic > 1) { + CERROR("manual nid2nic translation doesn't support" + "multiple nic addrs (you asked for %d)\n", + numnic); + return -EINVAL; + } + if (nid < aries_sim_base_nid) { + CERROR("Request for invalid nid translation %u, minimum %u\n", + nid, aries_sim_base_nid); + return -ESRCH; + } + + *nicaddr = nid - aries_sim_base_nid; + return 1; +} + +static int +aries_nicaddr_to_nid(__u32 nicaddr, __u32 *nid) +{ + *nid = aries_sim_base_nid + nicaddr; + return 1; +} + +/* XXX Nic: This does not support multiple device!!!! 
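
Both SimNow lookup directions above walk the same sentinel-terminated table, stopping at GNILND_MAX_NID_TABLE. Condensed into one hedged sketch (nid-keyed variant shown):

/* illustration only: the scan shared by gemini_nid_to_nicaddr() and
 * gemini_nicaddr_to_nid() */
static int
example_nid_table_lookup(const kgn_nid_entry_t *tbl, __u32 nid, __u32 *nicaddr)
{
        int i;

        for (i = 0; tbl[i].nid != GNILND_MAX_NID_TABLE; i++) {
                if (tbl[i].nid == nid) {
                        *nicaddr = tbl[i].nicaddr;
                        return 1;       /* same "found one address" convention */
                }
        }

        return -ESRCH;
}
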
*/ +static inline int +kgnilnd_setup_nic_translation(__u32 device_id) +{ + char *if_name = "ipogif0"; + __u32 ipaddr, netmask, my_nid; + int up, rc; + + /* do lookup on first use */ + if (unlikely(kgnilnd_hssops.nid_to_nicaddr == NULL)) { + rc = kgnilnd_lookup_rca_funcs(); + if (rc) + return rc; + } + + /* if we have a real function, return - we'll use those going forward */ + if (likely(kgnilnd_hssops.nid_to_nicaddr != (void *)GNILND_NO_RCA)) + return 0; + + rc = libcfs_ipif_query(if_name, &up, &ipaddr, &netmask); + if (rc != 0) { + CERROR("can't get IP interface for %s: %d\n", if_name, rc); + return rc; + } + if (!up) { + CERROR("IP interface %s is down\n", if_name); + return -ENODEV; + } + + my_nid = ((ipaddr >> 8) & 0xFF) + (ipaddr & 0xFF); + aries_sim_nic = device_id; + aries_sim_base_nid = my_nid - aries_sim_nic; + + kgnilnd_hssops.nid_to_nicaddr = aries_nid_to_nicaddr; + kgnilnd_hssops.nicaddr_to_nid = aries_nicaddr_to_nid; + + return 0; +} +#else +#error "Undefined Network Type" +#endif + +/* we use RCA types here to get the compiler to whine when we have + * mismatched types */ +static inline int +kgnilnd_nid_to_nicaddrs(rca_nid_t nid, int numnic, nic_addr_t *nicaddrs) +{ + /* compile time checks to ensure that the RCA types match + * the LNet idea of NID and NIC */ + typecheck(__u32, nid); + typecheck(__u32, *nicaddrs); + + LASSERTF(kgnilnd_hssops.nid_to_nicaddr != NULL, "missing setup?\n"); + + return kgnilnd_hssops.nid_to_nicaddr(nid, numnic, nicaddrs); +} + +static inline int +kgnilnd_nicaddr_to_nid(nic_addr_t nicaddr, rca_nid_t *nid) +{ + /* compile time checks to ensure that the RCA types match + * the LNet idea of NID and NIC */ + typecheck(__u32, nicaddr); + typecheck(__u32, nid[0]); + + LASSERTF(kgnilnd_hssops.nicaddr_to_nid != NULL, "missing setup ?\n"); + + return kgnilnd_hssops.nicaddr_to_nid(nicaddr, nid); +} + +#endif /* _GNILND_HSS_OPS_H */ diff --git a/lnet/klnds/gnilnd/gnilnd_modparams.c b/lnet/klnds/gnilnd/gnilnd_modparams.c new file mode 100644 index 0000000..17cbfd6 --- /dev/null +++ b/lnet/klnds/gnilnd/gnilnd_modparams.c @@ -0,0 +1,500 @@ +/* + * Copyright (C) 2004 Cluster File Systems, Inc. + * + * Copyright (C) 2009-2012 Cray, Inc. + * + * Derived from work by: Eric Barton + * Author: Nic Henke + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
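
A worked instance of the Aries SimNow NID derivation above, with an assumed (purely illustrative) ipogif0 address:

/* example: ipogif0 = 10.128.3.31 -> host-order ipaddr = 0x0a80031f
 *   (ipaddr >> 8) & 0xFF = 3
 *   (ipaddr     ) & 0xFF = 31
 *   my_nid               = 3 + 31 = 34
 *   aries_sim_base_nid   = my_nid - device_id
 */
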
+ * + */ + +#include "gnilnd.h" + +static int credits = 256; +CFS_MODULE_PARM(credits, "i", int, 0444, + "# concurrent sends"); + +static int peer_credits = 16; +CFS_MODULE_PARM(peer_credits, "i", int, 0444, + "# LNet peer credits"); + +/* NB - we'll not actually limit sends to this, we just size the mailbox buffer + * such that at most we'll have concurrent_sends * max_immediate messages + * in the mailbox */ +static int concurrent_sends = 0; +CFS_MODULE_PARM(concurrent_sends, "i", int, 0444, + "# concurrent HW sends to 1 peer"); + +/* default for 2k nodes @ 16 peer credits */ +static int fma_cq_size = 32768; +CFS_MODULE_PARM(fma_cq_size, "i", int, 0444, + "size of the completion queue"); + +static int timeout = GNILND_BASE_TIMEOUT; +/* can't change @ runtime because LNet gets NI data at startup from + * this value */ +CFS_MODULE_PARM(timeout, "i", int, 0444, + "communications timeout (seconds)"); + +/* time to wait between datagram timeout and sending of next dgram */ +static int min_reconnect_interval = GNILND_MIN_RECONNECT_TO; +CFS_MODULE_PARM(min_reconnect_interval, "i", int, 0644, + "minimum connection retry interval (seconds)"); + +/* if this goes longer than timeout, we'll timeout the TX before + * the dgram */ +static int max_reconnect_interval = GNILND_MAX_RECONNECT_TO; +CFS_MODULE_PARM(max_reconnect_interval, "i", int, 0644, + "maximum connection retry interval (seconds)"); + +static int max_immediate = (2<<10); +CFS_MODULE_PARM(max_immediate, "i", int, 0644, + "immediate/RDMA breakpoint"); + +#ifdef CONFIG_CRAY_GEMINI +static int checksum = GNILND_CHECKSUM_SMSG_BTE; +#else +static int checksum = 0; +#endif +CFS_MODULE_PARM(checksum, "i", int, 0644, + "0: None, 1: headers, 2: short msg, 3: all traffic"); + +static int checksum_dump = 0; +CFS_MODULE_PARM(checksum_dump, "i", int, 0644, + "0: None, 1: dump log on failure, 2: payload data to D_INFO log"); + +static int bte_hash = 1; +CFS_MODULE_PARM(bte_hash, "i", int, 0644, + "enable hashing for BTE (RDMA) transfers"); + +static int bte_adapt = 1; +CFS_MODULE_PARM(bte_adapt, "i", int, 0644, + "enable adaptive request and response for BTE (RDMA) transfers"); + +static int bte_relaxed_ordering = 1; +CFS_MODULE_PARM(bte_relaxed_ordering, "i", int, 0644, + "enable relaxed ordering (PASSPW) for BTE (RDMA) transfers"); + +static int ptag = GNI_PTAG_LND; +CFS_MODULE_PARM(ptag, "i", int, 0444, + "ptag for Gemini CDM"); + +static int max_retransmits = 1024; +CFS_MODULE_PARM(max_retransmits, "i", int, 0644, + "max retransmits for FMA"); + +static int nwildcard = 4; +CFS_MODULE_PARM(nwildcard, "i", int, 0444, + "# wildcard datagrams to post per net (interface)"); + +static int nice = -20; +CFS_MODULE_PARM(nice, "i", int, 0444, + "nice value for kgnilnd threads, default -20"); + +static int rdmaq_intervals = 4; +CFS_MODULE_PARM(rdmaq_intervals, "i", int, 0644, + "# intervals per second for rdmaq throttling, default 4, 0 to disable"); + +static int loops = 100; +CFS_MODULE_PARM(loops, "i", int, 0644, + "# of loops before scheduler is friendly, default 100"); + +static int hash_size = 503; +CFS_MODULE_PARM(hash_size, "i", int, 0444, + "prime number for peer/conn hash sizing, default 503"); + +static int peer_health = 0; +CFS_MODULE_PARM(peer_health, "i", int, 0444, + "Disable peer timeout for LNet peer health, default off, > 0 to enable"); + +static int vmap_cksum = 0; +CFS_MODULE_PARM(vmap_cksum, "i", int, 0644, + "use vmap for all kiov checksumming, default off"); + +static int mbox_per_block = GNILND_FMABLK; +CFS_MODULE_PARM(mbox_per_block, 
"i", int, 0644, + "mailboxes per block"); + +static int nphys_mbox = 0; +CFS_MODULE_PARM(nphys_mbox, "i", int, 0444, + "# mbox to preallocate from physical memory, default 0"); + +static int mbox_credits = GNILND_MBOX_CREDITS; +CFS_MODULE_PARM(mbox_credits, "i", int, 0644, + "number of credits per mailbox"); + +static int sched_threads = GNILND_SCHED_THREADS; +CFS_MODULE_PARM(sched_threads, "i", int, 0444, + "number of threads for moving data"); + +static int net_hash_size = 11; +CFS_MODULE_PARM(net_hash_size, "i", int, 0444, + "prime number for net hash sizing, default 11"); + +static int hardware_timeout = GNILND_HARDWARE_TIMEOUT; +CFS_MODULE_PARM(hardware_timeout, "i", int, 0444, + "maximum time for traffic to get from one node to another"); + +static int mdd_timeout = GNILND_MDD_TIMEOUT; +CFS_MODULE_PARM(mdd_timeout, "i", int, 0644, + "maximum time (in minutes) for mdd to be held"); + +kgn_tunables_t kgnilnd_tunables = { + .kgn_min_reconnect_interval = &min_reconnect_interval, + .kgn_max_reconnect_interval = &max_reconnect_interval, + .kgn_credits = &credits, + .kgn_peer_credits = &peer_credits, + .kgn_concurrent_sends = &concurrent_sends, + .kgn_fma_cq_size = &fma_cq_size, + .kgn_timeout = &timeout, + .kgn_max_immediate = &max_immediate, + .kgn_checksum = &checksum, + .kgn_checksum_dump = &checksum_dump, + .kgn_bte_hash = &bte_hash, + .kgn_bte_adapt = &bte_adapt, + .kgn_bte_relaxed_ordering = &bte_relaxed_ordering, + .kgn_ptag = &ptag, + .kgn_max_retransmits = &max_retransmits, + .kgn_nwildcard = &nwildcard, + .kgn_nice = &nice, + .kgn_rdmaq_intervals = &rdmaq_intervals, + .kgn_loops = &loops, + .kgn_peer_hash_size = &hash_size, + .kgn_peer_health = &peer_health, + .kgn_vmap_cksum = &vmap_cksum, + .kgn_mbox_per_block = &mbox_per_block, + .kgn_nphys_mbox = &nphys_mbox, + .kgn_mbox_credits = &mbox_credits, + .kgn_sched_threads = &sched_threads, + .kgn_net_hash_size = &net_hash_size, + .kgn_hardware_timeout = &hardware_timeout, + .kgn_mdd_timeout = &mdd_timeout +}; + +#if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM +static cfs_sysctl_table_t kgnilnd_ctl_table[] = { + { + INIT_CTL_NAME(2) + .procname = "min_reconnect_interval", + .data = &min_reconnect_interval, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + INIT_CTL_NAME(3) + .procname = "max_reconnect_interval", + .data = &max_reconnect_interval, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + INIT_CTL_NAME(5) + .procname = "credits", + .data = &credits, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = &proc_dointvec + }, + { + INIT_CTL_NAME(6) + .procname = "peer_credits", + .data = &peer_credits, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = &proc_dointvec + }, + { + INIT_CTL_NAME(7) + .procname = "fma_cq_size", + .data = &fma_cq_size, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = &proc_dointvec + }, + { + INIT_CTL_NAME(8) + .procname = "timeout", + .data = &timeout, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = &proc_dointvec + }, + { + INIT_CTL_NAME(9) + .procname = "max_immediate", + .data = &max_immediate, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = &proc_dointvec + }, + { + INIT_CTL_NAME(10) + .procname = "checksum", + .data = &checksum, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + INIT_CTL_NAME(11) + .procname = "bte_hash", + .data = &bte_hash, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + INIT_CTL_NAME(12) + .procname 
= "bte_adapt", + .data = &bte_adapt, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + INIT_CTL_NAME(13) + .procname = "ptag", + .data = &ptag, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = &proc_dointvec + }, + { + INIT_CTL_NAME(14) + .procname = "nwildcard", + .data = &nwildcard, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = &proc_dointvec + }, + { + INIT_CTL_NAME(15) + .procname = "bte_relaxed_ordering", + .data = &bte_relaxed_ordering, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + INIT_CTL_NAME(16) + .procname = "checksum_dump", + .data = &checksum_dump, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + INIT_CTL_NAME(17) + .procname = "nice", + .data = &nice, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = &proc_dointvec + }, + { + INIT_CTL_NAME(18) + .procname = "rdmaq_intervals", + .data = &rdmaq_intervals, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + INIT_CTL_NAME(19) + .procname = "loops", + .data = &loops, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + INIT_CTL_NAME(20) + .procname = "hash_size", + .data = &hash_size, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = &proc_dointvec + }, + { + INIT_CTL_NAME(21) + .procname = "peer_health", + .data = &peer_health, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = &proc_dointvec + }, + { + INIT_CTL_NAME(22) + .procname = "vmap_cksum", + .data = &vmap_cksum, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + INIT_CTL_NAME(23) + .procname = "mbox_per_block", + .data = &mbox_per_block, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + INIT_CTL_NAME(24) + .procname = "mbox_credits" + .data = &mbox_credits, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + INIT_CTL_NAME(25) + .procname = "sched_threads" + .data = &sched_threads, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = &proc_dointvec + }, + { + INIT_CTL_NAME(26) + .procname = "net_hash_size", + .data = &net_hash_size, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = &proc_dointvec + }, + { + INIT_CTL_NAME(27) + .procname = "hardware_timeout", + .data = &hardware_timeout, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = &proc_dointvec + }, + { + INIT_CTL_NAME(28) + .procname = "mdd_timeout", + .data = &mdd_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + INIT_CTL_NAME(29) + .procname = "max_retransmits" + .data = &max_retransmits, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = &proc_dointvec + }, + { + INIT_CTL_NAME(30) + .procname = "concurrent_sends", + .data = &concurrent_sends, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = &proc_dointvec + }, + { + INIT_CTL_NAME(31) + .procname = "nphys_mbox", + .data = &nphys_mbox, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = &proc_dointvec + }, + {0} +}; + +static cfs_sysctl_table_t kgnilnd_top_ctl_table[] = { + { + INIT_CTL_NAME(202) + .procname = "gnilnd", + .data = NULL, + .maxlen = 0, + .mode = 0555, + .child = kgnilnd_ctl_table + }, + { INIT_CTL_NAME(0) } +}; +#endif + +int +kgnilnd_tunables_init() +{ + int rc = 0; + +#if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM + kgnilnd_tunables.kgn_sysctl = + cfs_register_sysctl_table(kgnilnd_top_ctl_table, 0); + + if (kgnilnd_tunables.kgn_sysctl == 
NULL) + CWARN("Can't setup /proc tunables\n"); +#endif + switch (*kgnilnd_tunables.kgn_checksum) { + default: + CERROR("Invalid checksum module parameter: %d\n", + *kgnilnd_tunables.kgn_checksum); + rc = -EINVAL; + GOTO(out, rc); + case GNILND_CHECKSUM_OFF: + /* no checksumming */ + break; + case GNILND_CHECKSUM_SMSG_HEADER: + LCONSOLE_INFO("SMSG header only checksumming enabled\n"); + break; + case GNILND_CHECKSUM_SMSG: + LCONSOLE_INFO("SMSG checksumming enabled\n"); + break; + case GNILND_CHECKSUM_SMSG_BTE: + LCONSOLE_INFO("SMSG + BTE checksumming enabled\n"); + break; + } + + if (*kgnilnd_tunables.kgn_max_immediate > GNILND_MAX_IMMEDIATE) { + LCONSOLE_ERROR("kgnilnd module parameter 'max_immediate' too large %d > %d\n", + *kgnilnd_tunables.kgn_max_immediate, GNILND_MAX_IMMEDIATE); + rc = -EINVAL; + GOTO(out, rc); + } + + if (*kgnilnd_tunables.kgn_mbox_per_block < 1) { + *kgnilnd_tunables.kgn_mbox_per_block = 1; + } + + if (*kgnilnd_tunables.kgn_concurrent_sends == 0) { + *kgnilnd_tunables.kgn_concurrent_sends = *kgnilnd_tunables.kgn_peer_credits; + } else if (*kgnilnd_tunables.kgn_concurrent_sends > *kgnilnd_tunables.kgn_peer_credits) { + LCONSOLE_ERROR("kgnilnd parameter 'concurrent_sends' too large: %d > %d (peer_credits)\n", + *kgnilnd_tunables.kgn_concurrent_sends, *kgnilnd_tunables.kgn_peer_credits); + rc = -EINVAL; + } +out: + return rc; +} + +void +kgnilnd_tunables_fini() +{ +#if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM + if (kgnilnd_tunables.kgn_sysctl != NULL) + cfs_unregister_sysctl_table(kgnilnd_tunables.kgn_sysctl); +#endif +} diff --git a/lnet/klnds/gnilnd/gnilnd_proc.c b/lnet/klnds/gnilnd/gnilnd_proc.c new file mode 100644 index 0000000..f161224 --- /dev/null +++ b/lnet/klnds/gnilnd/gnilnd_proc.c @@ -0,0 +1,1260 @@ +/* + * Copyright (C) 2009-2012 Cray, Inc. + * + * Author: Nic Henke + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +/* this code liberated and modified from lnet/lnet/router_proc.c */ + +#define DEBUG_SUBSYSTEM S_LND +#include "gnilnd.h" +#include <linux/seq_file.h> + +#define GNILND_PROC_STATS "stats" +#define GNILND_PROC_MDD "mdd" +#define GNILND_PROC_SMSG "smsg" +#define GNILND_PROC_CONN "conn" +#define GNILND_PROC_PEER "peer" +#define GNILND_PROC_CKSUM_TEST "cksum_test" + +static int +_kgnilnd_proc_run_cksum_test(int caseno, int nloops, int nob) +{ + lnet_kiov_t *src, *dest; + struct timespec begin, end, diff; + int niov; + int i = 0, j = 0, n; + __u16 cksum, cksum2; + __u64 mbytes; + + LIBCFS_ALLOC(src, LNET_MAX_IOV * sizeof(lnet_kiov_t)); + LIBCFS_ALLOC(dest, LNET_MAX_IOV * sizeof(lnet_kiov_t)); + + if (src == NULL || dest == NULL) { + CERROR("couldn't allocate iovs\n"); + GOTO(unwind, -ENOMEM); + } + + for (i = 0; i < LNET_MAX_IOV; i++) { + src[i].kiov_offset = 0; + src[i].kiov_len = CFS_PAGE_SIZE; + src[i].kiov_page = cfs_alloc_page(CFS_ALLOC_STD|CFS_ALLOC_ZERO); + + if (src[i].kiov_page == NULL) { + CERROR("couldn't allocate page %d\n", i); + GOTO(unwind, -ENOMEM); + } + + dest[i].kiov_offset = 0; + dest[i].kiov_len = CFS_PAGE_SIZE; + dest[i].kiov_page = cfs_alloc_page(CFS_ALLOC_STD|CFS_ALLOC_ZERO); + + if (dest[i].kiov_page == NULL) { + CERROR("couldn't allocate page %d\n", i); + GOTO(unwind, -ENOMEM); + } + } + + /* add extra 2 pages - one for offset of src, 2nd to allow dest offset */ + niov = (nob / PAGE_SIZE) + 2; + if (niov > LNET_MAX_IOV) { + CERROR("bytes %d too large, requires niov %d > %d\n", + nob, niov, LNET_MAX_IOV); + GOTO(unwind, -E2BIG); + } + + /* setup real data */ + src[0].kiov_offset = 317; + dest[0].kiov_offset = 592; + switch (caseno) { + default: + /* odd -> even */ + break; + case 1: + /* odd -> odd */ + dest[0].kiov_offset -= 1; + break; + case 2: + /* even -> even */ + src[0].kiov_offset += 1; + break; + case 3: + /* even -> odd */ + src[0].kiov_offset += 1; + dest[0].kiov_offset -= 1; + } + src[0].kiov_len = PAGE_SIZE - src[0].kiov_offset; + dest[0].kiov_len = PAGE_SIZE - dest[0].kiov_offset; + + for (i = 0; i < niov; i++) { + memset(page_address(src[i].kiov_page) + src[i].kiov_offset, + 0xf0 + i, src[i].kiov_len); + } + + lnet_copy_kiov2kiov(niov, dest, 0, niov, src, 0, nob); + + getnstimeofday(&begin); + + for (n = 0; n < nloops; n++) { + CDEBUG(D_BUFFS, "case %d loop %d src %d dest %d nob %d niov %d\n", + caseno, n, src[0].kiov_offset, dest[0].kiov_offset, nob, niov); + cksum = kgnilnd_cksum_kiov(niov, src, 0, nob - n, 1); + cksum2 = kgnilnd_cksum_kiov(niov, dest, 0, nob - n, 1); + + if (cksum != cksum2) { + CERROR("case %d loop %d different checksums %x expected %x\n", + j, n, cksum2, cksum); + GOTO(unwind, -ENOKEY); + } + } + + getnstimeofday(&end); + + mbytes = (nloops * nob * 2) / (1024*1024); + + diff = kgnilnd_ts_sub(end, begin); + + LCONSOLE_INFO("running "LPD64"MB took %ld.%ld seconds\n", + mbytes, diff.tv_sec, diff.tv_nsec); + +unwind: + CDEBUG(D_NET, "freeing %d pages\n", i); + for (i -= 1; i >= 0; i--) { + if (src[i].kiov_page != NULL) { + cfs_free_page(src[i].kiov_page); + } + if (dest[i].kiov_page != NULL) { + cfs_free_page(dest[i].kiov_page); + } + } + + if (src != NULL) + LIBCFS_FREE(src, LNET_MAX_IOV * sizeof(lnet_kiov_t)); + if (dest != NULL) + LIBCFS_FREE(dest, LNET_MAX_IOV * sizeof(lnet_kiov_t)); + return 0; +} + +static int +kgnilnd_proc_cksum_test_write(struct file *file, const char *ubuffer, + unsigned long count, void *data) +{ + char dummy[256 + 1] = { '\0' }; + int testno, nloops, nbytes; + int rc; + + if (kgnilnd_data.kgn_init < GNILND_INIT_ALL) { + 
CERROR("can't run cksum test, kgnilnd is not initialized yet\n"); + return -ENOSYS; + } + + if (count >= sizeof(dummy) || count == 0) + return -EINVAL; + + if (copy_from_user(dummy, ubuffer, count)) + return -EFAULT; + + if (sscanf(dummy, "%d:%d:%d", &testno, &nloops, &nbytes) == 3) { + rc = _kgnilnd_proc_run_cksum_test(testno, nloops, nbytes); + if (rc < 0) { + RETURN(rc); + } else { + /* spurious, but lets us know the parse was ok */ + RETURN(count); + } + } + RETURN(count); +} + +static int +kgnilnd_proc_stats_read(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + kgn_device_t *dev; + struct timeval now; + int rc; + + if (kgnilnd_data.kgn_init < GNILND_INIT_ALL) { + rc = sprintf(page, + "kgnilnd is not initialized yet\n"); + return rc; + } + + /* only do the first device */ + dev = &kgnilnd_data.kgn_devices[0]; + + /* sampling is racy, but so is reading this file! */ + smp_rmb(); + do_gettimeofday(&now); + + rc = sprintf(page, "time: %lu.%lu\n" + "ntx: %d\n" + "npeers: %d\n" + "nconns: %d\n" + "nEPs: %d\n" + "ndgrams: %d\n" + "nfmablk: %d\n" + "n_mdd: %d\n" + "n_mdd_held: %d\n" + "GART map bytes: %ld\n" + "TX queued maps: %d\n" + "TX phys nmaps: %d\n" + "TX phys bytes: %lu\n" + "TX virt nmaps: %d\n" + "TX virt bytes: "LPU64"\n" + "RDMAQ bytes_auth: %ld\n" + "RDMAQ bytes_left: %ld\n" + "RDMAQ nstalls: %d\n" + "dev mutex delay: %ld\n" + "dev n_yield: %d\n" + "dev n_schedule: %d\n" + "SMSG fast_try: %d\n" + "SMSG fast_ok: %d\n" + "SMSG fast_block: %d\n" + "SMSG ntx: %d\n" + "SMSG tx_bytes: %ld\n" + "SMSG nrx: %d\n" + "SMSG rx_bytes: %ld\n" + "RDMA ntx: %d\n" + "RDMA tx_bytes: %ld\n" + "RDMA nrx: %d\n" + "RDMA rx_bytes: %ld\n" + "VMAP short: %d\n" + "VMAP cksum: %d\n" + "KMAP short: %d\n", + now.tv_sec, now.tv_usec, + atomic_read(&kgnilnd_data.kgn_ntx), + atomic_read(&kgnilnd_data.kgn_npeers), + atomic_read(&kgnilnd_data.kgn_nconns), + atomic_read(&dev->gnd_neps), + atomic_read(&dev->gnd_ndgrams), + atomic_read(&dev->gnd_nfmablk), + atomic_read(&dev->gnd_n_mdd), atomic_read(&dev->gnd_n_mdd_held), + atomic64_read(&dev->gnd_nbytes_map), + atomic_read(&dev->gnd_nq_map), + dev->gnd_map_nphys, dev->gnd_map_physnop * PAGE_SIZE, + dev->gnd_map_nvirt, dev->gnd_map_virtnob, + atomic64_read(&dev->gnd_rdmaq_bytes_out), + atomic64_read(&dev->gnd_rdmaq_bytes_ok), + atomic_read(&dev->gnd_rdmaq_nstalls), + dev->gnd_mutex_delay, + atomic_read(&dev->gnd_n_yield), atomic_read(&dev->gnd_n_schedule), + atomic_read(&dev->gnd_fast_try), atomic_read(&dev->gnd_fast_ok), + atomic_read(&dev->gnd_fast_block), + atomic_read(&dev->gnd_short_ntx), atomic64_read(&dev->gnd_short_txbytes), + atomic_read(&dev->gnd_short_nrx), atomic64_read(&dev->gnd_short_rxbytes), + atomic_read(&dev->gnd_rdma_ntx), atomic64_read(&dev->gnd_rdma_txbytes), + atomic_read(&dev->gnd_rdma_nrx), atomic64_read(&dev->gnd_rdma_rxbytes), + atomic_read(&kgnilnd_data.kgn_nvmap_short), + atomic_read(&kgnilnd_data.kgn_nvmap_cksum), + atomic_read(&kgnilnd_data.kgn_nkmap_short)); + + return rc; +} + +static int +kgnilnd_proc_stats_write(struct file *file, const char *ubuffer, + unsigned long count, void *data) +{ + kgn_device_t *dev; + + if (kgnilnd_data.kgn_init < GNILND_INIT_ALL) { + CERROR("kgnilnd is not initialized for stats write\n"); + return -EINVAL; + } + + /* only do the first device */ + dev = &kgnilnd_data.kgn_devices[0]; + + atomic_set(&dev->gnd_short_ntx, 0); + atomic_set(&dev->gnd_short_nrx, 0); + atomic64_set(&dev->gnd_short_txbytes, 0); + atomic64_set(&dev->gnd_short_rxbytes, 0); + atomic_set(&dev->gnd_rdma_ntx, 
0); + atomic_set(&dev->gnd_rdma_nrx, 0); + atomic_set(&dev->gnd_fast_ok, 0); + atomic_set(&dev->gnd_fast_try, 0); + atomic_set(&dev->gnd_fast_block, 0); + atomic64_set(&dev->gnd_rdma_txbytes, 0); + atomic64_set(&dev->gnd_rdma_rxbytes, 0); + atomic_set(&dev->gnd_rdmaq_nstalls, 0); + set_mb(dev->gnd_mutex_delay, 0); + atomic_set(&dev->gnd_n_yield, 0); + atomic_set(&dev->gnd_n_schedule, 0); + atomic_set(&kgnilnd_data.kgn_nvmap_short, 0); + atomic_set(&kgnilnd_data.kgn_nvmap_cksum, 0); + atomic_set(&kgnilnd_data.kgn_nkmap_short, 0); + /* sampling is racy, but so is writing this file! */ + smp_wmb(); + return count; +} + +typedef struct { + kgn_device_t *gmdd_dev; + kgn_tx_t *gmdd_tx; + loff_t gmdd_off; +} kgn_mdd_seq_iter_t; + +int +kgnilnd_mdd_seq_seek(kgn_mdd_seq_iter_t *gseq, loff_t off) +{ + kgn_tx_t *tx; + struct list_head *r; + loff_t here; + int rc = 0; + + if (off == 0) { + gseq->gmdd_tx = NULL; + gseq->gmdd_off = 0; + return 0; + } + + tx = gseq->gmdd_tx; + + if (tx == NULL || gseq->gmdd_off > off) { + /* search from start */ + r = gseq->gmdd_dev->gnd_map_list.next; + here = 1; + } else { + /* continue current search */ + r = &tx->tx_map_list; + here = gseq->gmdd_off; + } + + gseq->gmdd_off = off; + + while (r != &gseq->gmdd_dev->gnd_map_list) { + kgn_tx_t *t; + + t = list_entry(r, kgn_tx_t, tx_map_list); + + if (here == off) { + gseq->gmdd_tx = t; + rc = 0; + goto out; + } + r = r->next; + here++; + } + + gseq->gmdd_tx = NULL; + rc = -ENOENT; +out: + return rc; +} + +static void * +kgnilnd_mdd_seq_start(struct seq_file *s, loff_t *pos) +{ + + kgn_mdd_seq_iter_t *gseq; + int rc; + + if (kgnilnd_data.kgn_init < GNILND_INIT_ALL) { + return NULL; + } + + LIBCFS_ALLOC(gseq, sizeof(*gseq)); + if (gseq == NULL) { + CERROR("could not allocate mdd sequence iterator\n"); + return NULL; + } + + /* only doing device 0 for now */ + gseq->gmdd_dev = &kgnilnd_data.kgn_devices[0]; + gseq->gmdd_tx = NULL; + + /* need to lock map while we poke - huge disturbance + * but without it, no way to get the data printed */ + spin_lock(&gseq->gmdd_dev->gnd_map_lock); + + /* set private to gseq for stop */ + s->private = gseq; + + rc = kgnilnd_mdd_seq_seek(gseq, *pos); + if (rc == 0) + return gseq; + else + return NULL; +} + +static void +kgnilnd_mdd_seq_stop(struct seq_file *s, void *iter) +{ + kgn_mdd_seq_iter_t *gseq = s->private; + + if (gseq != NULL) { + spin_unlock(&gseq->gmdd_dev->gnd_map_lock); + LIBCFS_FREE(gseq, sizeof(*gseq)); + } +} + +static void * +kgnilnd_mdd_seq_next(struct seq_file *s, void *iter, loff_t *pos) +{ + kgn_mdd_seq_iter_t *gseq = iter; + int rc; + loff_t next = *pos + 1; + + rc = kgnilnd_mdd_seq_seek(gseq, next); + if (rc != 0) { + return NULL; + } + *pos = next; + return gseq; +} + +static int +kgnilnd_mdd_seq_show(struct seq_file *s, void *iter) +{ + kgn_mdd_seq_iter_t *gseq = iter; + kgn_tx_t *tx; + __u64 nob; + __u32 physnop; + int id; + int buftype; + gni_mem_handle_t hndl; + + if (gseq->gmdd_off == 0) { + seq_printf(s, "%s %22s %16s %8s %8s %37s\n", + "tx", "tx_id", "nob", "physnop", + "buftype", "mem handle"); + return 0; + } + + tx = gseq->gmdd_tx; + LASSERT(tx != NULL); + + id = tx->tx_id.txe_smsg_id; + nob = tx->tx_nob; + physnop = tx->tx_phys_npages; + buftype = tx->tx_buftype; + hndl.qword1 = tx->tx_map_key.qword1; + hndl.qword2 = tx->tx_map_key.qword2; + + seq_printf(s, "%p %x %16"LPF64"u %8d %#8x "LPX64"."LPX64"x\n", + tx, id, nob, physnop, buftype, + hndl.qword1, hndl.qword2); + + return 0; +} + +static struct seq_operations kgn_mdd_sops = { + .start = 
kgnilnd_mdd_seq_start, + .stop = kgnilnd_mdd_seq_stop, + .next = kgnilnd_mdd_seq_next, + .show = kgnilnd_mdd_seq_show, + +}; + +static int +kgnilnd_mdd_seq_open(struct inode *inode, struct file *file) +{ + struct seq_file *sf; + int rc; + + rc = seq_open(file, &kgn_mdd_sops); + if (rc == 0) { + sf = file->private_data; + + /* NULL means we've not yet open() */ + sf->private = NULL; + } + return rc; +} + +static struct file_operations kgn_mdd_fops = { + .owner = THIS_MODULE, + .open = kgnilnd_mdd_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +typedef struct { + __u64 gsmsg_version; + kgn_device_t *gsmsg_dev; + kgn_fma_memblock_t *gsmsg_fmablk; + loff_t gsmsg_off; +} kgn_smsg_seq_iter_t; + +int +kgnilnd_smsg_seq_seek(kgn_smsg_seq_iter_t *gseq, loff_t off) +{ + kgn_fma_memblock_t *fmablk; + kgn_device_t *dev; + struct list_head *r; + loff_t here; + int rc = 0; + + /* offset 0 is the header, so we start real entries at + * here == off == 1 */ + if (off == 0) { + gseq->gsmsg_fmablk = NULL; + gseq->gsmsg_off = 0; + return 0; + } + + fmablk = gseq->gsmsg_fmablk; + dev = gseq->gsmsg_dev; + + spin_lock(&dev->gnd_fmablk_lock); + + if (fmablk != NULL && + gseq->gsmsg_version != atomic_read(&dev->gnd_fmablk_vers)) { + /* list changed */ + rc = -ESTALE; + goto out; + } + + if (fmablk == NULL || gseq->gsmsg_off > off) { + /* search from start */ + r = dev->gnd_fma_buffs.next; + here = 1; + } else { + /* continue current search */ + r = &fmablk->gnm_bufflist; + here = gseq->gsmsg_off; + } + + gseq->gsmsg_version = atomic_read(&dev->gnd_fmablk_vers); + gseq->gsmsg_off = off; + + while (r != &dev->gnd_fma_buffs) { + kgn_fma_memblock_t *t; + + t = list_entry(r, kgn_fma_memblock_t, gnm_bufflist); + + if (here == off) { + gseq->gsmsg_fmablk = t; + rc = 0; + goto out; + } + r = r->next; + here++; + } + + gseq->gsmsg_fmablk = NULL; + rc = -ENOENT; +out: + spin_unlock(&dev->gnd_fmablk_lock); + return rc; +} + +static void * +kgnilnd_smsg_seq_start(struct seq_file *s, loff_t *pos) +{ + + kgn_smsg_seq_iter_t *gseq; + int rc; + + if (kgnilnd_data.kgn_init < GNILND_INIT_ALL) { + return NULL; + } + + LIBCFS_ALLOC(gseq, sizeof(*gseq)); + if (gseq == NULL) { + CERROR("could not allocate smsg sequence iterator\n"); + return NULL; + } + + /* only doing device 0 for now */ + gseq->gsmsg_dev = &kgnilnd_data.kgn_devices[0]; + gseq->gsmsg_fmablk = NULL; + rc = kgnilnd_smsg_seq_seek(gseq, *pos); + if (rc == 0) + return gseq; + + LIBCFS_FREE(gseq, sizeof(*gseq)); + return NULL; +} + +static void +kgnilnd_smsg_seq_stop(struct seq_file *s, void *iter) +{ + kgn_smsg_seq_iter_t *gseq = iter; + + if (gseq != NULL) + LIBCFS_FREE(gseq, sizeof(*gseq)); +} + +static void * +kgnilnd_smsg_seq_next(struct seq_file *s, void *iter, loff_t *pos) +{ + kgn_smsg_seq_iter_t *gseq = iter; + int rc; + loff_t next = *pos + 1; + + rc = kgnilnd_smsg_seq_seek(gseq, next); + if (rc != 0) { + LIBCFS_FREE(gseq, sizeof(*gseq)); + return NULL; + } + *pos = next; + return gseq; +} + +static int +kgnilnd_smsg_seq_show(struct seq_file *s, void *iter) +{ + kgn_smsg_seq_iter_t *gseq = iter; + kgn_fma_memblock_t *fmablk; + kgn_device_t *dev; + int avail_mboxs, held_mboxs, num_mboxs; + unsigned int blk_size; + int live; + kgn_fmablk_state_t state; + gni_mem_handle_t hndl; + + if (gseq->gsmsg_off == 0) { + seq_printf(s, "%5s %4s %6s/%5s/%5s %9s %18s %37s\n", + "blk#", "type", "avail", "held", "total", "size", + "fmablk", "mem handle"); + return 0; + } + + fmablk = gseq->gsmsg_fmablk; + dev = gseq->gsmsg_dev; + LASSERT(fmablk 
!= NULL); + + spin_lock(&dev->gnd_fmablk_lock); + + if (gseq->gsmsg_version != atomic_read(&dev->gnd_fmablk_vers)) { + /* list changed */ + spin_unlock(&dev->gnd_fmablk_lock); + return -ESTALE; + } + + live = fmablk->gnm_hold_timeout == 0; + /* none are available if it isn't live... */ + avail_mboxs = live ? fmablk->gnm_avail_mboxs : 0; + held_mboxs = fmablk->gnm_held_mboxs; + num_mboxs = fmablk->gnm_num_mboxs; + blk_size = fmablk->gnm_blk_size; + state = fmablk->gnm_state; + hndl.qword1 = fmablk->gnm_hndl.qword1; + hndl.qword2 = fmablk->gnm_hndl.qword2; + + spin_unlock(&dev->gnd_fmablk_lock); + + if (live) { + seq_printf(s, "%5d %4s %6d/%5d/%5d %9d %18p "LPX64"."LPX64"\n", + (int) gseq->gsmsg_off, kgnilnd_fmablk_state2str(state), + avail_mboxs, held_mboxs, num_mboxs, blk_size, + fmablk, hndl.qword1, hndl.qword2); + } else { + seq_printf(s, "%5d %4s %6d/%5d/%5d %9d %18p %37s\n", + (int) gseq->gsmsg_off, kgnilnd_fmablk_state2str(state), + avail_mboxs, held_mboxs, num_mboxs, blk_size, + fmablk, "PURGATORY.HOLD"); + } + + return 0; +} + +static struct seq_operations kgn_smsg_sops = { + .start = kgnilnd_smsg_seq_start, + .stop = kgnilnd_smsg_seq_stop, + .next = kgnilnd_smsg_seq_next, + .show = kgnilnd_smsg_seq_show, + +}; + +static int +kgnilnd_smsg_seq_open(struct inode *inode, struct file *file) +{ + struct proc_dir_entry *dp = PDE(inode); + struct seq_file *sf; + int rc; + + rc = seq_open(file, &kgn_smsg_sops); + if (rc == 0) { + sf = file->private_data; + sf->private = dp->data; + } + + return rc; +} + +static struct file_operations kgn_smsg_fops = { + .owner = THIS_MODULE, + .open = kgnilnd_smsg_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +typedef struct { + __u64 gconn_version; + struct list_head *gconn_list; + kgn_conn_t *gconn_conn; + loff_t gconn_off; + int gconn_hashidx; +} kgn_conn_seq_iter_t; + +int +kgnilnd_conn_seq_seek(kgn_conn_seq_iter_t *gseq, loff_t off) +{ + struct list_head *list, *tmp; + loff_t here = 0; + int rc = 0; + + if (off == 0) { + gseq->gconn_hashidx = 0; + gseq->gconn_list = NULL; + } + + if (off > atomic_read(&kgnilnd_data.kgn_nconns)) { + gseq->gconn_list = NULL; + rc = -ENOENT; + } + + read_lock(&kgnilnd_data.kgn_peer_conn_lock); + if (gseq->gconn_list != NULL && + gseq->gconn_version != kgnilnd_data.kgn_conn_version) { + /* list changed */ + rc = -ESTALE; + goto out; + } + + if ((gseq->gconn_list == NULL) || + (gseq->gconn_off > off) || + (gseq->gconn_hashidx >= *kgnilnd_tunables.kgn_peer_hash_size)) { + /* search from start */ + gseq->gconn_hashidx = 0; + list = &kgnilnd_data.kgn_conns[gseq->gconn_hashidx]; + here = 0; + } else { + /* continue current search */ + list = gseq->gconn_list; + } + + gseq->gconn_version = kgnilnd_data.kgn_conn_version; + gseq->gconn_off = off; + +start_list: + + list_for_each(tmp, list) { + if (here == off) { + kgn_conn_t *conn; + conn = list_entry(tmp, kgn_conn_t, gnc_hashlist); + gseq->gconn_conn = conn; + rc = 0; + goto out; + } + here++; + } + /* if we got through this hash bucket with 'off' still to go, try next*/ + gseq->gconn_hashidx++; + if ((here <= off) && + (gseq->gconn_hashidx < *kgnilnd_tunables.kgn_peer_hash_size)) { + list = &kgnilnd_data.kgn_conns[gseq->gconn_hashidx]; + goto start_list; + } + + gseq->gconn_list = NULL; + rc = -ENOENT; +out: + read_unlock(&kgnilnd_data.kgn_peer_conn_lock); + return rc; +} + +static void * +kgnilnd_conn_seq_start(struct seq_file *s, loff_t *pos) +{ + + kgn_conn_seq_iter_t *gseq; + int rc; + + if (kgnilnd_data.kgn_init < GNILND_INIT_ALL) { + 
return NULL; + } + + LIBCFS_ALLOC(gseq, sizeof(*gseq)); + if (gseq == NULL) { + CERROR("could not allocate conn sequence iterator\n"); + return NULL; + } + + /* only doing device 0 for now */ + gseq->gconn_list = NULL; + rc = kgnilnd_conn_seq_seek(gseq, *pos); + if (rc == 0) + return gseq; + + LIBCFS_FREE(gseq, sizeof(*gseq)); + return NULL; +} + +static void +kgnilnd_conn_seq_stop(struct seq_file *s, void *iter) +{ + kgn_conn_seq_iter_t *gseq = iter; + + if (gseq != NULL) + LIBCFS_FREE(gseq, sizeof(*gseq)); +} + +static void * +kgnilnd_conn_seq_next(struct seq_file *s, void *iter, loff_t *pos) +{ + kgn_conn_seq_iter_t *gseq = iter; + int rc; + loff_t next = *pos + 1; + + rc = kgnilnd_conn_seq_seek(gseq, next); + if (rc != 0) { + LIBCFS_FREE(gseq, sizeof(*gseq)); + return NULL; + } + *pos = next; + return gseq; +} + +static int +kgnilnd_conn_seq_show(struct seq_file *s, void *iter) +{ + kgn_conn_seq_iter_t *gseq = iter; + kgn_peer_t *peer = NULL; + kgn_conn_t *conn; + + /* there is no header data for conns, so offset 0 is the first + * real entry. */ + + conn = gseq->gconn_conn; + LASSERT(conn != NULL); + + read_lock(&kgnilnd_data.kgn_peer_conn_lock); + if (gseq->gconn_list != NULL && + gseq->gconn_version != kgnilnd_data.kgn_conn_version) { + /* list changed */ + read_unlock(&kgnilnd_data.kgn_peer_conn_lock); + return -ESTALE; + } + + /* instead of saving off the data, just refcount */ + kgnilnd_conn_addref(conn); + if (conn->gnc_peer) { + /* don't use link - after unlock it could get nuked */ + peer = conn->gnc_peer; + kgnilnd_peer_addref(peer); + } + + read_unlock(&kgnilnd_data.kgn_peer_conn_lock); + + seq_printf(s, "%p->%s [%d] q %d/%d/%d " + "tx sq %u %dms/%dms " + "rx sq %u %dms/%dms " + "noop r/s %d/%d w/s/cq %lds/%lds/%lds " + "sched a/d %lds/%lds " + "tx_re "LPD64" TO %ds %s\n", + conn, peer ? 
libcfs_nid2str(peer->gnp_nid) : "", + atomic_read(&conn->gnc_refcount), + kgnilnd_count_list(&conn->gnc_fmaq), + atomic_read(&conn->gnc_nlive_fma), + atomic_read(&conn->gnc_nlive_rdma), + conn->gnc_tx_seq, + jiffies_to_msecs(jiffies - conn->gnc_last_tx), + jiffies_to_msecs(jiffies - conn->gnc_last_tx_cq), + conn->gnc_rx_seq, + jiffies_to_msecs(jiffies - conn->gnc_last_rx), + jiffies_to_msecs(jiffies - conn->gnc_last_rx_cq), + atomic_read(&conn->gnc_reaper_noop), + atomic_read(&conn->gnc_sched_noop), + cfs_duration_sec(jiffies - conn->gnc_last_noop_want), + cfs_duration_sec(jiffies - conn->gnc_last_noop_sent), + cfs_duration_sec(jiffies - conn->gnc_last_noop_cq), + cfs_duration_sec(jiffies - conn->gnc_last_sched_ask), + cfs_duration_sec(jiffies - conn->gnc_last_sched_do), + conn->gnc_tx_retrans, conn->gnc_timeout, + kgnilnd_conn_state2str(conn)); + + if (peer) + kgnilnd_peer_decref(peer); + kgnilnd_conn_decref(conn); + + return 0; +} + +static struct seq_operations kgn_conn_sops = { + .start = kgnilnd_conn_seq_start, + .stop = kgnilnd_conn_seq_stop, + .next = kgnilnd_conn_seq_next, + .show = kgnilnd_conn_seq_show, + +}; + +static int +kgnilnd_conn_seq_open(struct inode *inode, struct file *file) +{ + struct proc_dir_entry *dp = PDE(inode); + struct seq_file *sf; + int rc; + + rc = seq_open(file, &kgn_conn_sops); + if (rc == 0) { + sf = file->private_data; + sf->private = dp->data; + } + + return rc; +} + +static struct file_operations kgn_conn_fops = { + .owner = THIS_MODULE, + .open = kgnilnd_conn_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +typedef struct { + __u64 gpeer_version; + struct list_head *gpeer_list; + kgn_peer_t *gpeer_peer; + loff_t gpeer_off; + int gpeer_hashidx; +} kgn_peer_seq_iter_t; + +int +kgnilnd_peer_seq_seek(kgn_peer_seq_iter_t *gseq, loff_t off) +{ + struct list_head *list, *tmp; + loff_t here = 0; + int rc = 0; + + if (off == 0) { + gseq->gpeer_hashidx = 0; + gseq->gpeer_list = NULL; + } + + if (off > atomic_read(&kgnilnd_data.kgn_npeers)) { + gseq->gpeer_list = NULL; + rc = -ENOENT; + } + + read_lock(&kgnilnd_data.kgn_peer_conn_lock); + if (gseq->gpeer_list != NULL && + gseq->gpeer_version != kgnilnd_data.kgn_peer_version) { + /* list changed */ + rc = -ESTALE; + goto out; + } + + if ((gseq->gpeer_list == NULL) || + (gseq->gpeer_off > off) || + (gseq->gpeer_hashidx >= *kgnilnd_tunables.kgn_peer_hash_size)) { + /* search from start */ + gseq->gpeer_hashidx = 0; + list = &kgnilnd_data.kgn_peers[gseq->gpeer_hashidx]; + here = 0; + } else { + /* continue current search */ + list = gseq->gpeer_list; + } + + gseq->gpeer_version = kgnilnd_data.kgn_peer_version; + gseq->gpeer_off = off; + +start_list: + + list_for_each(tmp, list) { + if (here == off) { + kgn_peer_t *peer; + peer = list_entry(tmp, kgn_peer_t, gnp_list); + gseq->gpeer_peer = peer; + rc = 0; + goto out; + } + here++; + } + /* if we got through this hash bucket with 'off' still to go, try next*/ + gseq->gpeer_hashidx++; + if ((here <= off) && + (gseq->gpeer_hashidx < *kgnilnd_tunables.kgn_peer_hash_size)) { + list = &kgnilnd_data.kgn_peers[gseq->gpeer_hashidx]; + goto start_list; + } + + gseq->gpeer_list = NULL; + rc = -ENOENT; +out: + read_unlock(&kgnilnd_data.kgn_peer_conn_lock); + return rc; +} + +static void * +kgnilnd_peer_seq_start(struct seq_file *s, loff_t *pos) +{ + + kgn_peer_seq_iter_t *gseq; + int rc; + + if (kgnilnd_data.kgn_init < GNILND_INIT_ALL) { + return NULL; + } + + LIBCFS_ALLOC(gseq, sizeof(*gseq)); + if (gseq == NULL) { + CERROR("could not 
allocate peer sequence iterator\n"); + return NULL; + } + + /* only doing device 0 for now */ + gseq->gpeer_list = NULL; + rc = kgnilnd_peer_seq_seek(gseq, *pos); + if (rc == 0) + return gseq; + + LIBCFS_FREE(gseq, sizeof(*gseq)); + return NULL; +} + +static void +kgnilnd_peer_seq_stop(struct seq_file *s, void *iter) +{ + kgn_peer_seq_iter_t *gseq = iter; + + if (gseq != NULL) + LIBCFS_FREE(gseq, sizeof(*gseq)); +} + +static void * +kgnilnd_peer_seq_next(struct seq_file *s, void *iter, loff_t *pos) +{ + kgn_peer_seq_iter_t *gseq = iter; + int rc; + loff_t next = *pos + 1; + + rc = kgnilnd_peer_seq_seek(gseq, next); + if (rc != 0) { + LIBCFS_FREE(gseq, sizeof(*gseq)); + return NULL; + } + *pos = next; + return gseq; +} + +static int +kgnilnd_peer_seq_show(struct seq_file *s, void *iter) +{ + kgn_peer_seq_iter_t *gseq = iter; + kgn_peer_t *peer; + kgn_conn_t *conn; + char conn_str; + int purg_count = 0; + /* there is no header data for peers, so offset 0 is the first + * real entry. */ + + peer = gseq->gpeer_peer; + LASSERT(peer != NULL); + + read_lock(&kgnilnd_data.kgn_peer_conn_lock); + if (gseq->gpeer_list != NULL && + gseq->gpeer_version != kgnilnd_data.kgn_peer_version) { + /* list changed */ + read_unlock(&kgnilnd_data.kgn_peer_conn_lock); + return -ESTALE; + } + + /* instead of saving off the data, just refcount */ + kgnilnd_peer_addref(peer); + conn = kgnilnd_find_conn_locked(peer); + + if (peer->gnp_connecting) { + conn_str = 'S'; + } else if (conn != NULL) { + conn_str = 'C'; + } else { + conn_str = 'D'; + } + + list_for_each_entry(conn, &peer->gnp_conns, gnc_list) { + if (conn->gnc_in_purgatory) { + purg_count++; + } + } + + read_unlock(&kgnilnd_data.kgn_peer_conn_lock); + + seq_printf(s, "%p->%s [%d] NIC 0x%x q %d conn %c purg %d " + "last %d@%dms dgram %d@%dms " + "reconn %dms to %lus \n", + peer, libcfs_nid2str(peer->gnp_nid), + atomic_read(&peer->gnp_refcount), + peer->gnp_host_id, + kgnilnd_count_list(&peer->gnp_tx_queue), + conn_str, + purg_count, + peer->gnp_last_errno, + jiffies_to_msecs(jiffies - peer->gnp_last_alive), + peer->gnp_last_dgram_errno, + jiffies_to_msecs(jiffies - peer->gnp_last_dgram_time), + peer->gnp_reconnect_interval != 0 + ? 
jiffies_to_msecs(jiffies - peer->gnp_reconnect_time) + : 0, + peer->gnp_reconnect_interval); + + kgnilnd_peer_decref(peer); + + return 0; +} + +static struct seq_operations kgn_peer_sops = { + .start = kgnilnd_peer_seq_start, + .stop = kgnilnd_peer_seq_stop, + .next = kgnilnd_peer_seq_next, + .show = kgnilnd_peer_seq_show, +}; + +static int +kgnilnd_peer_seq_open(struct inode *inode, struct file *file) +{ + struct proc_dir_entry *dp = PDE(inode); + struct seq_file *sf; + int rc; + + rc = seq_open(file, &kgn_peer_sops); + if (rc == 0) { + sf = file->private_data; + sf->private = dp->data; + } + + return rc; +} + +static struct file_operations kgn_peer_fops = { + .owner = THIS_MODULE, + .open = kgnilnd_peer_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static struct proc_dir_entry *kgn_proc_root; + +void +kgnilnd_proc_init(void) +{ + struct proc_dir_entry *pde; + int rc = 0; + ENTRY; + + /* setup dir */ + kgn_proc_root = proc_mkdir(libcfs_lnd2modname(GNILND), NULL); + if (kgn_proc_root == NULL) { + CERROR("couldn't create proc dir %s\n", + libcfs_lnd2modname(GNILND)); + return; + } + + /* Initialize CKSUM_TEST */ + pde = create_proc_entry(GNILND_PROC_CKSUM_TEST, 0200, kgn_proc_root); + if (pde == NULL) { + CERROR("couldn't create proc entry %s\n", GNILND_PROC_CKSUM_TEST); + rc = -ENOENT; + GOTO(remove_dir, rc); + } + + pde->data = NULL; + pde->write_proc = kgnilnd_proc_cksum_test_write; + + /* Initialize STATS */ + pde = create_proc_entry(GNILND_PROC_STATS, 0644, kgn_proc_root); + if (pde == NULL) { + CERROR("couldn't create proc entry %s\n", GNILND_PROC_STATS); + rc = -ENOENT; + GOTO(remove_test, rc); + } + + pde->data = NULL; + pde->read_proc = kgnilnd_proc_stats_read; + pde->write_proc = kgnilnd_proc_stats_write; + + /* Initialize MDD */ + pde = create_proc_entry(GNILND_PROC_MDD, 0444, kgn_proc_root); + if (pde == NULL) { + CERROR("couldn't create proc entry %s\n", GNILND_PROC_MDD); + rc = -ENOENT; + GOTO(remove_stats, rc); + } + + pde->data = NULL; + pde->proc_fops = &kgn_mdd_fops; + + /* Initialize SMSG */ + pde = create_proc_entry(GNILND_PROC_SMSG, 0444, kgn_proc_root); + if (pde == NULL) { + CERROR("couldn't create proc entry %s\n", GNILND_PROC_SMSG); + rc = -ENOENT; + GOTO(remove_mdd, rc); + } + + pde->data = NULL; + pde->proc_fops = &kgn_smsg_fops; + + /* Initialize CONN */ + pde = create_proc_entry(GNILND_PROC_CONN, 0444, kgn_proc_root); + if (pde == NULL) { + CERROR("couldn't create proc entry %s\n", GNILND_PROC_CONN); + rc = -ENOENT; + GOTO(remove_smsg, rc); + } + + pde->data = NULL; + pde->proc_fops = &kgn_conn_fops; + + /* Initialize PEER */ + pde = create_proc_entry(GNILND_PROC_PEER, 0444, kgn_proc_root); + if (pde == NULL) { + CERROR("couldn't create proc entry %s\n", GNILND_PROC_PEER); + rc = -ENOENT; + GOTO(remove_conn, rc); + } + + pde->data = NULL; + pde->proc_fops = &kgn_peer_fops; + RETURN_EXIT; + +remove_conn: + remove_proc_entry(GNILND_PROC_CONN, kgn_proc_root); +remove_smsg: + remove_proc_entry(GNILND_PROC_SMSG, kgn_proc_root); +remove_mdd: + remove_proc_entry(GNILND_PROC_MDD, kgn_proc_root); +remove_stats: + remove_proc_entry(GNILND_PROC_STATS, kgn_proc_root); +remove_test: + remove_proc_entry(GNILND_PROC_CKSUM_TEST, kgn_proc_root); +remove_dir: + remove_proc_entry(kgn_proc_root->name, NULL); + + RETURN_EXIT; +} + +void +kgnilnd_proc_fini(void) +{ + remove_proc_entry(GNILND_PROC_PEER, kgn_proc_root); + remove_proc_entry(GNILND_PROC_CONN, kgn_proc_root); + remove_proc_entry(GNILND_PROC_MDD, kgn_proc_root); + 
remove_proc_entry(GNILND_PROC_SMSG, kgn_proc_root); + remove_proc_entry(GNILND_PROC_STATS, kgn_proc_root); + remove_proc_entry(GNILND_PROC_CKSUM_TEST, kgn_proc_root); + remove_proc_entry(kgn_proc_root->name, NULL); +} diff --git a/lnet/klnds/gnilnd/gnilnd_stack.c b/lnet/klnds/gnilnd/gnilnd_stack.c new file mode 100644 index 0000000..10ae493 --- /dev/null +++ b/lnet/klnds/gnilnd/gnilnd_stack.c @@ -0,0 +1,564 @@ +/* + * Copyright (C) 2012 Cray, Inc. + * + * Author: Nic Henke + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ +#include "gnilnd.h" + +/* Advance all timeouts by nap_time seconds. */ +void +kgnilnd_bump_timeouts(__u32 nap_time, char *reason) +{ + int i; + kgn_peer_t *peer; + kgn_conn_t *conn; + kgn_tx_t *tx; + kgn_device_t *dev; + kgn_dgram_t *dgram; + + LCONSOLE_INFO("%s: bumping all timeouts by %ds\n", reason, nap_time); + + LASSERTF(GNILND_IS_QUIESCED, "gnilnd not quiesced %d != %d\n", + atomic_read(&kgnilnd_data.kgn_nquiesce), + atomic_read(&kgnilnd_data.kgn_nthreads)); + + /* requiring that the threads are paused ensures a couple of things: + * - combined code paths for stack reset and quiesce event as stack reset + * runs with the threads paused + * - prevents traffic to the Gemini during a quiesce period + * - reduces the locking requirements + */ + + for (i = 0; i < *kgnilnd_tunables.kgn_peer_hash_size; i++) { + list_for_each_entry(peer, &kgnilnd_data.kgn_peers[i], gnp_list) { + + /* we can reconnect again at any time */ + peer->gnp_reconnect_time = jiffies; + /* reset now that network is healthy */ + peer->gnp_reconnect_interval = 0; + /* tell LNet dude is still alive */ + kgnilnd_peer_alive(peer); + + list_for_each_entry(tx, &peer->gnp_tx_queue, tx_list) { + tx->tx_qtime = jiffies; + } + + list_for_each_entry(conn, &peer->gnp_conns, gnc_list) { + unsigned long timeout; + + timeout = cfs_time_seconds(conn->gnc_timeout); + + /* bump last_rx/last_rx_cq on all conns - including + * closed ones, this will have the effect of + * bumping the purgatory timers for those */ + conn->gnc_last_rx = conn->gnc_last_rx_cq = jiffies; + + /* we don't timeout based on old gnc_last_tx, so + * we'll back it up and schedule the conn to trigger + * a NOOP */ + conn->gnc_last_tx = jiffies - timeout; + kgnilnd_schedule_conn(conn); + } + } + } + + for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) { + dev = &kgnilnd_data.kgn_devices[i]; + for (i = 0; i < (*kgnilnd_tunables.kgn_peer_hash_size - 1); i++) { + list_for_each_entry(dgram, &dev->gnd_dgrams[i], gndg_list) { + dgram->gndg_post_time = jiffies; + } + } + } +} + +/* Quiesce or wake up the stack. The caller must hold the kgn_quiesce_sem semaphore + * on entry, which holds off any pending stack shutdown. */ +void +kgnilnd_quiesce_wait(char *reason) +{ + int i; + + if (kgnilnd_data.kgn_quiesce_trigger) { + unsigned long quiesce_deadline, quiesce_to; + /* FREEZE TAG!!!! 
*/ + + /* morning sunshine */ + spin_lock(&kgnilnd_data.kgn_reaper_lock); + wake_up_all(&kgnilnd_data.kgn_reaper_waitq); + spin_unlock(&kgnilnd_data.kgn_reaper_lock); + + for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) { + kgn_device_t *dev = &kgnilnd_data.kgn_devices[i]; + + wake_up_all(&dev->gnd_waitq); + wake_up_all(&dev->gnd_dgram_waitq); + wake_up_all(&dev->gnd_dgping_waitq); + } + + /* we'll wait for 10x the timeout for the threads to pause */ + quiesce_to = cfs_time_seconds(*kgnilnd_tunables.kgn_timeout * 10); + quiesce_deadline = (long) jiffies + quiesce_to; + + /* wait for everyone to check-in as quiesced */ + i = 1; + while (!GNILND_IS_QUIESCED) { + i++; + LCONSOLE((((i) & (-i)) == i) ? D_WARNING : D_NET, + "%s: Waiting for %d threads to pause\n", + reason, + atomic_read(&kgnilnd_data.kgn_nthreads) - + atomic_read(&kgnilnd_data.kgn_nquiesce)); + CFS_RACE(CFS_FAIL_GNI_QUIESCE_RACE); + cfs_pause(cfs_time_seconds(1 * i)); + + LASSERTF(quiesce_deadline > jiffies, + "couldn't quiesce threads in %lu seconds, falling over now\n", + cfs_duration_sec(quiesce_to)); + } + + LCONSOLE_WARN("%s: All threads paused!\n", reason); + /* XXX Nic: Is there a set of counters we can grab here to + * ensure that there is no traffic until quiesce is over ?*/ + } else { + /* GO! GO! GO! */ + + for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) { + kgn_device_t *dev = &kgnilnd_data.kgn_devices[i]; + kgnilnd_schedule_dgram(dev); + } + + /* wait for everyone to check-in as running - they will be spinning + * and looking, so no need to poke any waitq */ + i = 1; + while (atomic_read(&kgnilnd_data.kgn_nquiesce) > 0) { + i++; + LCONSOLE((((i) & (-i)) == i) ? D_WARNING : D_NET, + "%s: Waiting for %d threads to wake up\n", + reason, + atomic_read(&kgnilnd_data.kgn_nquiesce)); + cfs_pause(cfs_time_seconds(1 * i)); + } + + LCONSOLE_WARN("%s: All threads awake!\n", reason); + } +} + +/* Reset the stack. 
*/ +void +kgnilnd_reset_stack(void) +{ + int i, rc = 0; + kgn_net_t *net; + kgn_peer_t *peer, *peerN; + LIST_HEAD (souls); + char *reason = "critical hardware error"; + __u32 seconds; + unsigned long start, end; + ENTRY; + + /* Race with del_peer and its atomics */ + CFS_RACE(CFS_FAIL_GNI_RACE_RESET); + + if (kgnilnd_data.kgn_init != GNILND_INIT_ALL) { + CERROR("can't reset the stack, gnilnd is not initialized\n"); + RETURN_EXIT; + } + + /* First make sure we are not already quiesced - we panic if so, + * as that could leave software in a bad state */ + LASSERTF(kgnilnd_data.kgn_quiesce_trigger == GNILND_QUIESCE_IDLE, + "can't reset the stack, already doing so: trigger %d\n", + kgnilnd_data.kgn_quiesce_trigger); + + set_mb(kgnilnd_data.kgn_quiesce_trigger, GNILND_QUIESCE_RESET); + + /* wake up the dgram waitq thread - but after trigger set to make sure it + * goes into quiesce */ + CFS_RACE(CFS_FAIL_GNI_WC_DGRAM_FREE); + /* same for scheduler that is dropping state transitiosn */ + CFS_RACE(CFS_FAIL_GNI_DROP_CLOSING); + CFS_RACE(CFS_FAIL_GNI_DROP_DESTROY_EP); + + kgnilnd_quiesce_wait(reason); + + start = jiffies; + + kgnilnd_data.kgn_in_reset = 1; + kgnilnd_data.kgn_nresets++; + LCONSOLE_WARN("%s: resetting all resources (count %d)\n", + reason, kgnilnd_data.kgn_nresets); + + for (i = 0; i < *kgnilnd_tunables.kgn_net_hash_size; i++) { + list_for_each_entry(net, &kgnilnd_data.kgn_nets[i], gnn_list) { + rc = kgnilnd_cancel_net_dgrams(net); + LASSERTF(rc == 0, "couldn't cleanup datagrams: %d\n", rc); + } + } + + /* error -ENOTRECOVERABLE is stack reset */ + kgnilnd_del_conn_or_peer(NULL, LNET_NID_ANY, GNILND_DEL_CONN, -ENOTRECOVERABLE); + + for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) { + kgn_device_t *dev = &kgnilnd_data.kgn_devices[i]; + kgnilnd_cancel_wc_dgrams(dev); + kgnilnd_wait_for_canceled_dgrams(dev); + } + + /* manually do some conn processing ala kgnilnd_process_conns */ + for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) { + kgn_device_t *dev = &kgnilnd_data.kgn_devices[i]; + kgn_conn_t *conn; + int conn_sched; + + /* go find all the closed conns that need to be nuked - the + * scheduler thread isn't running to do this for us */ + + CDEBUG(D_NET, "will try to clear up %d ready_conns\n", + kgnilnd_count_list(&dev->gnd_ready_conns)); + + /* use while/list_first_entry loop to ensure we can handle any + * DESTROY_EP conns added from kgnilnd_complete_closed_conn */ + while (!list_empty(&dev->gnd_ready_conns)) { + conn = list_first_entry(&dev->gnd_ready_conns, + kgn_conn_t, gnc_schedlist); + conn_sched = xchg(&conn->gnc_scheduled, GNILND_CONN_PROCESS); + + LASSERTF(conn_sched != GNILND_CONN_IDLE && + conn_sched != GNILND_CONN_PROCESS, + "conn %p on ready list but in bad state: %d\n", + conn, conn_sched); + + list_del_init(&conn->gnc_schedlist); + + if (conn->gnc_state == GNILND_CONN_CLOSING) { + /* bump to CLOSED to fake out send of CLOSE */ + conn->gnc_state = GNILND_CONN_CLOSED; + conn->gnc_close_sent = 1; + } + + if (conn->gnc_state == GNILND_CONN_DESTROY_EP) { + kgnilnd_destroy_conn_ep(conn); + } else { + kgnilnd_complete_closed_conn(conn); + } + + /* there really shouldn't be any other states here - + * they would have been cleared out in the del_peer_or_conn or the dgram + * aborts above. 
+ * there is an LASSERTF in kgnilnd_complete_closed_conn that will take + * care of catching anything else for us */ + + kgnilnd_schedule_process_conn(conn, -1); + + kgnilnd_conn_decref(conn); + } + } + + /* don't let the little weasily purgatory conns hide from us */ + for (i = 0; i < *kgnilnd_tunables.kgn_peer_hash_size; i++) { + list_for_each_entry_safe(peer, peerN, &kgnilnd_data.kgn_peers[i], gnp_list) { + kgn_conn_t *conn, *connN; + + list_for_each_entry_safe(conn, connN, &peer->gnp_conns, gnc_list) { + kgnilnd_detach_purgatory_locked(conn, &souls); + } + } + } + + CDEBUG(D_NET, "about to release %d purgatory entries\n", + kgnilnd_count_list(&souls)); + + kgnilnd_release_purgatory_list(&souls); + + /* validate we are now clean */ + for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) { + kgn_device_t *dev = &kgnilnd_data.kgn_devices[i]; + + /* now all the cons/mboxes should be cleaned up, including purgatory + * so go through and release the MDDs for our persistent PHYS fma_blks + */ + kgnilnd_unmap_phys_fmablk(dev); + + LASSERTF(atomic_read(&dev->gnd_nfmablk) == 0, + "reset failed: fma blocks still live %d\n", + atomic_read(&dev->gnd_nfmablk)); + + LASSERTF(atomic_read(&dev->gnd_neps) == 0, + "reset failed: EP handles still live %d\n", + atomic_read(&dev->gnd_neps)); + } + + LASSERTF(atomic_read(&kgnilnd_data.kgn_nconns) == 0, + "reset failed: conns left %d\n", + atomic_read(&kgnilnd_data.kgn_nconns)); + + /* fine to have peers left - they are waiting for new conns + * but should not be holding any open HW resources */ + + /* like the last part of kgnilnd_base_shutdown() */ + + CFS_RACE(CFS_FAIL_GNI_SR_DOWN_RACE); + + for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) { + kgnilnd_dev_fini(&kgnilnd_data.kgn_devices[i]); + } + + /* no need to free and recreate the TX descriptors + * we nuked all the ones that could be using HW resources in + * kgnilnd_close_matching_conns and asserted it worked in + * kgnilnd_dev_fini */ + + /* At this point, all HW is torn down, start to reset */ + + /* only reset our known devs */ + for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) { + kgn_device_t *dev = &kgnilnd_data.kgn_devices[i]; + rc = kgnilnd_dev_init(dev); + LASSERTF(rc == 0, "dev_init failed for dev %d\n", i); + kgnilnd_map_phys_fmablk(dev); + LASSERTF(rc == 0, "map_phys_fmablk failed for dev %d\n", i); + rc = kgnilnd_setup_wildcard_dgram(dev); + LASSERTF(rc == 0, "couldnt setup datagrams on dev %d: %d\n", + i, rc); + } + + /* Now the fun restarts... - release the hounds! */ + + end = jiffies; + seconds = cfs_duration_sec((long)end - start); + kgnilnd_bump_timeouts(seconds, reason); + + kgnilnd_data.kgn_in_reset = 0; + set_mb(kgnilnd_data.kgn_quiesce_trigger, GNILND_QUIESCE_IDLE); + kgnilnd_quiesce_wait(reason); + LCONSOLE_WARN("%s reset of all hardware resources\n", + rc ? "failed" : "successful"); + + RETURN_EXIT; +} + +/* A thread that handles quiece and reset hardware events. + * We do the same thing regardless of which device reported the event. */ +int +kgnilnd_ruhroh_thread(void *arg) +{ + int i = 1; + DEFINE_WAIT(wait); + + cfs_daemonize("kgnilnd_rr"); + cfs_block_allsigs(); + set_user_nice(current, *kgnilnd_tunables.kgn_nice); + kgnilnd_data.kgn_ruhroh_running = 1; + + while (1) { + + /* Block until there's a request.. A reset request could come in + * while we're handling a quiesce one, or vice versa. 
+ * Keep processing requests until there are none.*/ + prepare_to_wait(&kgnilnd_data.kgn_ruhroh_waitq, &wait, TASK_INTERRUPTIBLE); + while (!(kgnilnd_data.kgn_ruhroh_shutdown || + kgnilnd_data.kgn_needs_reset || kgnilnd_data.kgn_needs_pause)) + schedule(); + finish_wait(&kgnilnd_data.kgn_ruhroh_waitq, &wait); + + /* Exit if the driver is shutting down. */ + if (kgnilnd_data.kgn_ruhroh_shutdown) + break; + + /* Serialize with driver startup and shutdown. */ + down(&kgnilnd_data.kgn_quiesce_sem); + + CDEBUG(D_NET, "trigger %d reset %d to_bump %d pause %d\n", + kgnilnd_data.kgn_quiesce_trigger, + kgnilnd_data.kgn_needs_reset, + kgnilnd_data.kgn_bump_info_rdy, + kgnilnd_data.kgn_needs_pause); + + /* Do we need to do a pause/quiesce? */ + if (kgnilnd_data.kgn_needs_pause) { + + /* Pause all other kgnilnd threads. */ + set_mb(kgnilnd_data.kgn_quiesce_trigger, GNILND_QUIESCE_HW_QUIESCE); + kgnilnd_quiesce_wait("hardware quiesce flag"); + + /* If the hardware quiesce flag is set, wait for it to clear. + * This should happen relatively quickly, so we wait for it. + * This will hold up the eventd thread, but on everything but + * the simulator, this is ok-- there is one thread per core. + * + * Handle (possibly multiple) quiesce events while we wait. The + * memory barrier ensures that the core doesn't start fetching + * kgn_bump_info_rdy before it fetches kgn_needs_pause, and + * matches the second mb in kgnilnd_quiesce_end_callback(). */ + smp_rmb(); + while (kgnilnd_hw_in_quiesce() || kgnilnd_data.kgn_bump_info_rdy) { + + i++; + LCONSOLE((((i) & (-i)) == i) ? D_WARNING : D_NET, + "Waiting for hardware quiesce flag to clear\n"); + cfs_pause(cfs_time_seconds(1 * i)); + + /* If we got a quiesce event with bump info, DO THE BUMP!. */ + if (kgnilnd_data.kgn_bump_info_rdy) { + /* reset console rate limiting for each event */ + i = 1; + + /* Make sure the core doesn't start fetching + * kgni_quiesce_seconds until after it sees + * kgn_bump_info_rdy set. This is the match to the + * first mb in kgnilnd_quiesce_end_callback(). */ + smp_rmb(); + (void) kgnilnd_bump_timeouts(kgnilnd_data.kgn_quiesce_secs, + "hardware quiesce callback"); + set_mb(kgnilnd_data.kgn_quiesce_secs, 0); + set_mb(kgnilnd_data.kgn_bump_info_rdy, 0); + } + } + + /* Reset the kgn_needs_pause flag before coming out of + * the pause. This ordering avoids a race with the + * setting of this flag in kgnilnd_pause_threads(). */ + set_mb(kgnilnd_data.kgn_needs_pause, 0); + + /* ok, let the kids back into the pool */ + set_mb(kgnilnd_data.kgn_quiesce_trigger, GNILND_QUIESCE_IDLE); + kgnilnd_quiesce_wait("hardware quiesce"); + } + + /* Do a stack reset if needed. */ + if (kgnilnd_data.kgn_needs_reset) { + kgnilnd_reset_stack(); + set_mb(kgnilnd_data.kgn_needs_reset, 0); + } + + up(&kgnilnd_data.kgn_quiesce_sem); + } + + kgnilnd_data.kgn_ruhroh_running = 0; + return 0; +} + +/* Set pause request flag. Any functions that + * call this one are responsible for ensuring that + * variables they set up are visible on other cores before + * this flag setting. This executes in interrupt or kernel + * thread context. */ +void +kgnilnd_pause_threads(void) +{ + /* only device 0 gets the handle, see kgnilnd_dev_init */ + kgn_device_t *dev = &kgnilnd_data.kgn_devices[0]; + LASSERTF(dev != NULL, "dev 0 is NULL\n"); + + /* If we're currently in a pause triggered by the pause flag, + * there's no need to set it again. We clear the kgn_needs_pause + * flag before we reset kgn_quiesce_trigger to avoid a race. 
The + * read memory barrier matches the setmb() on the trigger in + * kgnilnd_ruhroh_task(). */ + smp_rmb(); + if (!(kgnilnd_data.kgn_quiesce_trigger == GNILND_QUIESCE_HW_QUIESCE && + GNILND_IS_QUIESCED)) { + CDEBUG(D_NET, "requesting thread pause\n"); + + kgnilnd_data.kgn_needs_pause = 1; + + wake_up(&kgnilnd_data.kgn_ruhroh_waitq); + } else { + CDEBUG(D_NET, "thread pause already underway\n"); + } +} + +/* Return non-zero if the GNI hardware quiesce flag is set */ +int +kgnilnd_hw_in_quiesce(void) +{ + /* only device 0 gets the handle, see kgnilnd_dev_init */ + kgn_device_t *dev0 = &kgnilnd_data.kgn_devices[0]; + + LASSERTF(dev0 != NULL, "dev 0 is NULL\n"); + + smp_rmb(); + return kgnilnd_get_quiesce_status(dev0->gnd_handle) != 0; +} + + +/* If the GNI hardware quiesce flag is set, initiate our pause and + * return non-zero. Also return non-zero if the stack is shutting down. */ +int +kgnilnd_check_hw_quiesce(void) +{ + if (likely(!kgnilnd_hw_in_quiesce())) + return 0; + + if (!kgnilnd_data.kgn_ruhroh_shutdown) { + CDEBUG(D_NET, "initiating thread pause\n"); + kgnilnd_pause_threads(); + } else { + CDEBUG(D_NET, "thread pause bypassed because of shutdown\n"); + } + + return 1; +} + +/* Callback from kngi with the quiesce duration. This executes + * in interrupt context. */ +void +kgnilnd_quiesce_end_callback(gni_nic_handle_t nic_handle, uint64_t msecs) +{ + /* only device 0 gets the handle, see kgnilnd_dev_init */ + kgn_device_t *dev = &kgnilnd_data.kgn_devices[0]; + LASSERTF(dev != NULL, "dev 0 is NULL\n"); + + if (!kgnilnd_data.kgn_ruhroh_shutdown) { + + CDEBUG(D_NET, "requesting timeout bump by "LPD64" msecs\n", msecs); + + /* Save the bump interval and request the bump. + * The memory barrier ensures that the interval is in place before + * the bump flag can be seen (in case a core is already running the + * ruhroh task), and that the bump request flag in place before + * the pause request can be seen (to ensure a core doesn't miss the bump + * request flag). */ + /* If another callback occurred before the ruhroh task + * finished processing the first bump request, we'd over-write its info. + * Nic says that callbacks occur so slowly that this isn't an issue. */ + set_mb(kgnilnd_data.kgn_quiesce_secs, msecs / MSEC_PER_SEC); + set_mb(kgnilnd_data.kgn_bump_info_rdy, 1); + kgnilnd_pause_threads(); + } else { + CDEBUG(D_NET, "timeout bump bypassed because of shutdown\n"); + } +} + +void +kgnilnd_critical_error(struct gni_err *err_handle) +{ + /* only device 0 gets the handle, see kgnilnd_dev_init */ + kgn_device_t *dev = &kgnilnd_data.kgn_devices[0]; + LASSERTF(dev != NULL, "dev 0 is NULL\n"); + + if (!kgnilnd_data.kgn_ruhroh_shutdown) { + CDEBUG(D_NET, "requesting stack reset\n"); + kgnilnd_data.kgn_needs_reset = 1; + wake_up(&kgnilnd_data.kgn_ruhroh_waitq); + } else { + CDEBUG(D_NET, "stack reset bypassed because of shutdown\n"); + } +} diff --git a/lnet/klnds/gnilnd/gnilnd_sysctl.c b/lnet/klnds/gnilnd/gnilnd_sysctl.c new file mode 100644 index 0000000..cd33d3e --- /dev/null +++ b/lnet/klnds/gnilnd/gnilnd_sysctl.c @@ -0,0 +1,252 @@ +/* + * Copyright (C) 2012 Cray, Inc. + * + * Author: Nic Henke + * Author: James Shimek + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. 
+ * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +/* this code liberated and modified from Lustre */ + +#define DEBUG_SUBSYSTEM S_LND + +#include "gnilnd.h" + +typedef struct kgn_sysctl_data { + int ksd_pause_trigger; + int ksd_quiesce_secs; + int ksd_rdmaq_override; +} kgn_sysctl_data_t; + +static kgn_sysctl_data_t kgnilnd_sysctl; + +#if defined(CONFIG_SYSCTL) + +static cfs_sysctl_table_header_t *kgnilnd_table_header = NULL; +#ifndef HAVE_SYSCTL_UNNUMBERED + +enum { + GNILND_VERSION = 1, + GNILND_THREAD_PAUSE, + GNILND_HW_QUIESCE, + GNILND_STACK_RESET, + GNILND_RDMAQ_OVERRIDE, +}; +#else +#define GNILND_VERSION CTL_UNNUMBERED +#define GNILND_THREAD_PAUSE CTL_UNNUMBERED +#define GNILND_HW_QUIESCE CTL_UNNUMBERED +#define GNILND_STACK_RESET CTL_UNNUMBERED +#define GNILND_RDMAQ_OVERRIDE CTL_UNNUMBERED +#endif + +static int LL_PROC_PROTO(proc_toggle_thread_pause) +{ + int old_val = kgnilnd_sysctl.ksd_pause_trigger; + int rc = 0; + ENTRY; + + rc = ll_proc_dointvec(table, write, filp, buffer, lenp, ppos); + if (!write) { + /* read */ + RETURN(rc); + } + + if (kgnilnd_data.kgn_init != GNILND_INIT_ALL) { + rc = -EINVAL; + RETURN(rc); + } + + if (old_val != kgnilnd_sysctl.ksd_pause_trigger) { + down(&kgnilnd_data.kgn_quiesce_sem); + CDEBUG(D_NET, "setting quiesce_trigger %d\n", old_val); + kgnilnd_data.kgn_quiesce_trigger = kgnilnd_sysctl.ksd_pause_trigger; + kgnilnd_quiesce_wait("admin sysctl"); + up(&kgnilnd_data.kgn_quiesce_sem); + } + + RETURN(rc); +} + +static int LL_PROC_PROTO(proc_hw_quiesce) +{ + int rc = 0; + kgn_device_t *dev; + ENTRY; + + rc = ll_proc_dointvec(table, write, filp, buffer, lenp, ppos); + if (!write) { + /* read */ + RETURN(rc); + } + + if (kgnilnd_data.kgn_init != GNILND_INIT_ALL) { + rc = -EINVAL; + RETURN(rc); + } + + + /* only device 0 gets the handle, see kgnilnd_dev_init */ + dev = &kgnilnd_data.kgn_devices[0]; + + LASSERTF(dev != NULL, "dev 0 is NULL\n"); + + kgnilnd_quiesce_end_callback(dev->gnd_handle, + kgnilnd_sysctl.ksd_quiesce_secs * MSEC_PER_SEC); + + RETURN(rc); +} + +int LL_PROC_PROTO(proc_trigger_stack_reset) +{ + int rc = 0; + int i = 1; + kgn_device_t *dev; + ENTRY; + + if (!write) { + /* read */ + rc = ll_proc_dointvec(table, write, filp, buffer, lenp, ppos); + RETURN(rc); + } + + /* only device 0 gets the handle, see kgnilnd_dev_init */ + dev = &kgnilnd_data.kgn_devices[0]; + + LASSERTF(dev != NULL, "dev 0 is NULL\n"); + + kgnilnd_critical_error(dev->gnd_err_handle); + + /* Wait for the reset to complete. This prevents any races in testing + * where we'd immediately try to send traffic again */ + while (kgnilnd_data.kgn_needs_reset != 0) { + i++; + LCONSOLE((((i) & (-i)) == i) ? 
D_WARNING : D_NET, + "Waiting for stack reset request to clear\n"); + cfs_pause(cfs_time_seconds(1 * i)); + } + + RETURN(rc); +} + +static int LL_PROC_PROTO(proc_toggle_rdmaq_override) +{ + int old_val = kgnilnd_sysctl.ksd_rdmaq_override; + int rc = 0; + ENTRY; + + rc = ll_proc_dointvec(table, write, filp, buffer, lenp, ppos); + if (!write) { + /* read */ + RETURN(rc); + } + + if (kgnilnd_data.kgn_init != GNILND_INIT_ALL) { + rc = -EINVAL; + RETURN(rc); + } + + if (old_val != kgnilnd_sysctl.ksd_rdmaq_override) { + long new_mb = kgnilnd_sysctl.ksd_rdmaq_override * (long)(1024*1024); + LCONSOLE_INFO("changing RDMAQ override to %d mbytes/sec\n", + kgnilnd_sysctl.ksd_rdmaq_override); + /* override proc is mbytes, but we calc in bytes */ + kgnilnd_data.kgn_rdmaq_override = new_mb; + smp_wmb(); + } + + RETURN(rc); +} + +static cfs_sysctl_table_t kgnilnd_table[] = { + /* + * NB No .strategy entries have been provided since sysctl(8) prefers + * to go via /proc for portability. + */ + { + INIT_CTL_NAME(GNILND_VERSION) + .procname = "version", + .data = KGNILND_BUILD_REV, + .maxlen = sizeof(KGNILND_BUILD_REV), + .mode = 0444, + .proc_handler = &proc_dostring + }, + { + INIT_CTL_NAME(GNILND_THREAD_PAUSE) + .procname = "thread_pause", + .data = &kgnilnd_sysctl.ksd_pause_trigger, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_toggle_thread_pause, + }, + { + INIT_CTL_NAME(GNILND_HW_QUIESCE) + .procname = "hw_quiesce", + .data = &kgnilnd_sysctl.ksd_quiesce_secs, + .maxlen = sizeof(__u32), + .mode = 0644, + .proc_handler = &proc_hw_quiesce, + }, + { + INIT_CTL_NAME(GNILND_STACK_RESET) + .procname = "stack_reset", + .data = NULL, + .maxlen = sizeof(int), + .mode = 0600, + .proc_handler = &proc_trigger_stack_reset, + }, + { + INIT_CTL_NAME(GNILND_RDMAQ_OVERRIDE) + .procname = "rdmaq_override", + .data = &kgnilnd_sysctl.ksd_rdmaq_override, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_toggle_rdmaq_override, + }, + { INIT_CTL_NAME(0) } +}; + +static cfs_sysctl_table_t kgnilnd_top_table[2] = { + { + INIT_CTL_NAME(CTL_GNILND) + .procname = "kgnilnd", + .data = NULL, + .maxlen = 0, + .mode = 0555, + .child = kgnilnd_table + }, + { INIT_CTL_NAME(0) } +}; + +void kgnilnd_insert_sysctl(void) +{ + if (kgnilnd_table_header == NULL) + kgnilnd_table_header = cfs_register_sysctl_table(kgnilnd_top_table, 0); +} + +void kgnilnd_remove_sysctl(void) +{ + if (kgnilnd_table_header != NULL) + cfs_unregister_sysctl_table(kgnilnd_table_header); + + kgnilnd_table_header = NULL; +} + +#else +void kgnilnd_insert_sysctl(void) {} +void kgnilnd_remove_sysctl(void) {} +#endif diff --git a/lnet/klnds/gnilnd/gnilnd_version.h b/lnet/klnds/gnilnd/gnilnd_version.h new file mode 100644 index 0000000..10f6278 --- /dev/null +++ b/lnet/klnds/gnilnd/gnilnd_version.h @@ -0,0 +1 @@ +#define KGNILND_BUILD_REV SVN_CODE_REV diff --git a/lnet/utils/debug.c b/lnet/utils/debug.c index 21b5de5..d720f3d 100644 --- a/lnet/utils/debug.c +++ b/lnet/utils/debug.c @@ -856,6 +856,7 @@ static struct mod_paths { { "kmxlnd", "lnet/klnds/mxlnd" }, { "ko2iblnd", "lnet/klnds/o2iblnd" }, { "kptllnd", "lnet/klnds/ptllnd" }, + { "kgnilnd", "lnet/klnds/gnilnd"}, { "kqswlnd", "lnet/klnds/qswlnd" }, { "kralnd", "lnet/klnds/ralnd" }, { "ksocklnd", "lnet/klnds/socklnd" }, diff --git a/lnet/utils/portals.c b/lnet/utils/portals.c index dedb75d..3c09a8d 100644 --- a/lnet/utils/portals.c +++ b/lnet/utils/portals.c @@ -567,7 +567,7 @@ jt_ptl_print_peers (int argc, char **argv) int rc; if (!g_net_is_compatible (argv[0], SOCKLND, RALND, PTLLND, 
MXLND, - O2IBLND, 0)) + O2IBLND, GNILND, 0)) return -1; for (index = 0;;index++) { @@ -620,6 +620,26 @@ jt_ptl_print_peers (int argc, char **argv) ptl_ipaddr_2_str(data.ioc_u32[0], buffer[1], sizeof(buffer[1]), 1), data.ioc_u32[1]); /* peer port */ + } else if (g_net_is_compatible(NULL, GNILND, 0)) { + int disconn = data.ioc_flags >> 16; + char *state; + + if (disconn) + state = "D"; + else + state = data.ioc_flags & 0xffff ? "C" : "U"; + + printf ("%-20s (%d) %s [%d] "LPU64" " + "sq %d/%d tx %d/%d/%d\n", + libcfs_nid2str(data.ioc_nid), /* peer nid */ + data.ioc_net, /* gemini device id */ + state, /* peer is Connecting, Up, or Down */ + data.ioc_count, /* peer refcount */ + data.ioc_u64[0], /* peerstamp */ + data.ioc_u32[2], data.ioc_u32[3], /* tx and rx seq */ + /* fmaq, nfma, nrdma */ + data.ioc_u32[0], data.ioc_u32[1], data.ioc_u32[4] + ); } else { printf ("%-20s [%d]\n", libcfs_nid2str(data.ioc_nid), data.ioc_count); @@ -647,11 +667,12 @@ jt_ptl_add_peer (int argc, char **argv) int port = 0; int rc; - if (!g_net_is_compatible (argv[0], SOCKLND, RALND, 0)) + if (!g_net_is_compatible (argv[0], SOCKLND, RALND, + GNILND, 0)) return -1; if (argc != 4) { - fprintf (stderr, "usage(tcp,ra): %s nid ipaddr port\n", + fprintf (stderr, "usage(tcp,ra,gni): %s nid ipaddr port\n", argv[0]); return 0; } @@ -699,7 +720,7 @@ jt_ptl_del_peer (int argc, char **argv) int rc; if (!g_net_is_compatible (argv[0], SOCKLND, RALND, MXLND, PTLLND, - O2IBLND, 0)) + O2IBLND, GNILND, 0)) return -1; if (g_net_is_compatible(NULL, SOCKLND, 0)) { @@ -768,7 +789,8 @@ jt_ptl_print_connections (int argc, char **argv) int index; int rc; - if (!g_net_is_compatible (argv[0], SOCKLND, RALND, MXLND, O2IBLND, 0)) + if (!g_net_is_compatible (argv[0], SOCKLND, RALND, MXLND, O2IBLND, + GNILND, 0)) return -1; for (index = 0; ; index++) { @@ -808,6 +830,10 @@ jt_ptl_print_connections (int argc, char **argv) printf ("%s mtu %d\n", libcfs_nid2str(data.ioc_nid), data.ioc_u32[0]); /* path MTU */ + } else if (g_net_is_compatible (NULL, GNILND, 0)) { + printf ("%-20s [%d]\n", + libcfs_nid2str(data.ioc_nid), + data.ioc_u32[0] /* device id */); } else { printf ("%s\n", libcfs_nid2str(data.ioc_nid)); } @@ -837,7 +863,8 @@ int jt_ptl_disconnect(int argc, char **argv) return 0; } - if (!g_net_is_compatible (NULL, SOCKLND, RALND, MXLND, O2IBLND, 0)) + if (!g_net_is_compatible (NULL, SOCKLND, RALND, MXLND, O2IBLND, + GNILND, 0)) return 0; if (argc >= 2 && @@ -879,7 +906,7 @@ int jt_ptl_push_connection (int argc, char **argv) return 0; } - if (!g_net_is_compatible (argv[0], SOCKLND, 0)) + if (!g_net_is_compatible (argv[0], SOCKLND, GNILND, 0)) return -1; if (argc > 1 &&