AC_SUBST(RALND)
])
+#
+# LN_CONFIG_GNILND
+#
+# check whether to use the Gemini Network Interface lnd
+#
+AC_DEFUN([LN_CONFIG_GNILND],
+[#### Gemini Network Interface
+AC_MSG_CHECKING([whether to enable GNI lnd])
+AC_ARG_ENABLE([gni],
+ AC_HELP_STRING([--enable-gni],
+ [enable GNI lnd]),
+ [],[enable_gni='no'])
+AC_MSG_RESULT([$enable_gni])
+
+if test x$enable_gni = xyes ; then
+ AC_MSG_CHECKING([if GNI kernel headers are present])
+ # placeholder
+ # GNICPPFLAGS was set in spec file
+ EXTRA_KCFLAGS_save="$EXTRA_KCFLAGS"
+ EXTRA_KCFLAGS="$EXTRA_KCFLAGS $GNICPPFLAGS"
+ LB_LINUX_TRY_COMPILE([
+ #include <linux/types.h>
+ #include <gni_pub.h>
+ ],[
+ gni_cdm_handle_t kgni_domain;
+ gni_return_t rc;
+ int rrc;
+
+ rc = gni_cdm_create(0, 1, 1, 0, &kgni_domain);
+
+ rrc = (rc == GNI_RC_SUCCESS) ? 0 : 1;
+
+ return rrc;
+ ],[
+ AC_MSG_RESULT([yes])
+ GNILND="gnilnd"
+ ],[
+ AC_MSG_RESULT([no])
+ AC_MSG_ERROR([can't compile gnilnd with given GNICPPFLAGS: $GNICPPFLAGS])
+ ])
+ # at this point, we have gnilnd basic support, now check for extra features
+ AC_MSG_CHECKING([whether to use RCA in gnilnd])
+ LB_LINUX_TRY_COMPILE([
+ #include <linux/types.h>
+ #include <gni_pub.h>
+ #include <krca_lib.h>
+ ],[
+ gni_cdm_handle_t kgni_domain;
+ gni_return_t rc;
+ krca_ticket_t ticket = KRCA_NULL_TICKET;
+ int rrc;
+ __u32 nid = 0, nic_addr;
+
+ rc = gni_cdm_create(0, 1, 1, 0, &kgni_domain);
+
+ rrc = (rc == GNI_RC_SUCCESS) ? 0 : 1;
+
+ rrc += krca_nid_to_nicaddrs(nid, 1, &nic_addr);
+
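+ /* arbitrary service index and id - we only care that the RCA
+ * calls compile and link, not that they succeed at runtime */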
+ rrc += krca_register(&ticket, RCA_MAKE_SERVICE_INDEX(RCA_IO_CLASS, 9), 99, 0);
+
+ return rrc;
+ ],[
+ AC_MSG_RESULT([yes])
+ GNICPPFLAGS="$GNICPPFLAGS -DGNILND_USE_RCA=1"
+ GNILNDRCA="gnilndrca"
+ ],[
+ AC_MSG_RESULT([no])
+ ])
+ EXTRA_KCFLAGS="$EXTRA_KCFLAGS_save"
+fi
+AC_SUBST(GNICPPFLAGS)
+AC_SUBST(GNILNDRCA)
+AC_SUBST(GNILND)
+])
#
#
# LN_CONFIG_USERSPACE
#
-# This is defined but empty because it is called from
+# This is defined but empty because it is called from
# build/autoconf/lustre-build.m4 which is shared by all branches.
#
AC_DEFUN([LN_CONFIG_USERSPACE],
LN_CONFIG_QUADRICS
LN_CONFIG_O2IB
LN_CONFIG_RALND
+LN_CONFIG_GNILND
LN_CONFIG_PTLLND
LN_CONFIG_MX
# 2.6.32
AM_CONDITIONAL(BUILD_MXLND, test x$MXLND = "xmxlnd")
AM_CONDITIONAL(BUILD_O2IBLND, test x$O2IBLND = "xo2iblnd")
AM_CONDITIONAL(BUILD_RALND, test x$RALND = "xralnd")
+AM_CONDITIONAL(BUILD_GNILND, test x$GNILND = "xgnilnd")
+AM_CONDITIONAL(BUILD_GNILND_RCA, test x$GNILNDRCA = "xgnilndrca")
AM_CONDITIONAL(BUILD_PTLLND, test x$PTLLND = "xptllnd")
AM_CONDITIONAL(BUILD_USOCKLND, test x$USOCKLND = "xusocklnd")
])
lnet/klnds/qswlnd/autoMakefile
lnet/klnds/ralnd/Makefile
lnet/klnds/ralnd/autoMakefile
+lnet/klnds/gnilnd/Makefile
+lnet/klnds/gnilnd/autoMakefile
lnet/klnds/socklnd/Makefile
lnet/klnds/socklnd/autoMakefile
lnet/klnds/ptllnd/Makefile
@BUILD_MXLND_TRUE@subdir-m += mxlnd
@BUILD_RALND_TRUE@subdir-m += ralnd
+@BUILD_GNILND_TRUE@subdir-m += gnilnd
@BUILD_O2IBLND_TRUE@subdir-m += o2iblnd
@BUILD_QSWLND_TRUE@subdir-m += qswlnd
@BUILD_PTLLND_TRUE@subdir-m += ptllnd
# Lustre is a trademark of Sun Microsystems, Inc.
#
-SUBDIRS = socklnd qswlnd mxlnd ralnd ptllnd o2iblnd
+SUBDIRS = socklnd qswlnd mxlnd ralnd gnilnd ptllnd o2iblnd
--- /dev/null
+MODULES := kgnilnd
+kgnilnd-objs := gnilnd.o gnilnd_cb.o gnilnd_modparams.o gnilnd_debug.o gnilnd_proc.o \
+ gnilnd_sysctl.o gnilnd_stack.o gnilnd_conn.o
+
+EXTRA_POST_CFLAGS := -D"SVN_CODE_REV=KBUILD_STR(${SVN_CODE_REV})" @GNICPPFLAGS@
+
+EXTRA_DIST = $(kgnilnd-objs:%.o=%.c) gnilnd.h gnilnd_api_wrap.h
+
+@INCLUDE_RULES@
--- /dev/null
+# Copyright (C) 2009 Cray, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+if MODULES
+if BUILD_GNILND
+modulenet_DATA = kgnilnd$(KMODEXT)
+endif
+endif
+
+MOSTLYCLEANFILES = @MOSTLYCLEANFILES@
--- /dev/null
+/*
+ * Copyright (C) 2012 Cray, Inc.
+ *
+ * Author: Igor Gorodetsky <iogordet@cray.com>
+ * Author: Nic Henke <nic@cray.com>
+ * Author: James Shimek <jshimek@cray.com>
+ *
+ * This file is part of Lustre, http://www.lustre.org.
+ *
+ * Lustre is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Lustre is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Lustre; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+#include "gnilnd.h"
+
+/* Primary entry points from LNET. There are no guarantees against reentrance. */
+lnd_t the_kgnilnd = {
+ .lnd_type = GNILND,
+ .lnd_startup = kgnilnd_startup,
+ .lnd_shutdown = kgnilnd_shutdown,
+ .lnd_ctl = kgnilnd_ctl,
+ .lnd_send = kgnilnd_send,
+ .lnd_recv = kgnilnd_recv,
+ .lnd_eager_recv = kgnilnd_eager_recv,
+ .lnd_query = kgnilnd_query,
+};
+
+kgn_data_t kgnilnd_data;
+kgn_hssops_t kgnilnd_hssops;
+
+/* needs write_lock on kgn_peer_conn_lock */
+int
+kgnilnd_close_stale_conns_locked(kgn_peer_t *peer, kgn_conn_t *newconn)
+{
+ kgn_conn_t *conn;
+ struct list_head *ctmp, *cnxt;
+ int loopback;
+ int count = 0;
+
+ loopback = peer->gnp_nid == peer->gnp_net->gnn_ni->ni_nid;
+
+ list_for_each_safe(ctmp, cnxt, &peer->gnp_conns) {
+ conn = list_entry(ctmp, kgn_conn_t, gnc_list);
+
+ if (conn->gnc_state != GNILND_CONN_ESTABLISHED)
+ continue;
+
+ if (conn == newconn)
+ continue;
+
+ if (conn->gnc_device != newconn->gnc_device)
+ continue;
+
+ /* This is a two connection loopback - one talking to the other */
+ if (loopback &&
+ newconn->gnc_my_connstamp == conn->gnc_peer_connstamp &&
+ newconn->gnc_peer_connstamp == conn->gnc_my_connstamp) {
+ CDEBUG(D_NET, "skipping prune of %p, "
+ "loopback and matching stamps"
+ " connstamp "LPU64"("LPU64")"
+ " peerstamp "LPU64"("LPU64")\n",
+ conn, newconn->gnc_my_connstamp,
+ conn->gnc_peer_connstamp,
+ newconn->gnc_peer_connstamp,
+ conn->gnc_my_connstamp);
+ continue;
+ }
+
+ if (conn->gnc_peerstamp != newconn->gnc_peerstamp) {
+ LASSERTF(conn->gnc_peerstamp < newconn->gnc_peerstamp,
+ "conn 0x%p peerstamp "LPU64" >= "
+ "newconn 0x%p peerstamp "LPU64"\n",
+ conn, conn->gnc_peerstamp,
+ newconn, newconn->gnc_peerstamp);
+
+ CDEBUG(D_NET, "Closing stale conn nid: %s "
+ " peerstamp:"LPX64"("LPX64")\n",
+ libcfs_nid2str(peer->gnp_nid),
+ conn->gnc_peerstamp, newconn->gnc_peerstamp);
+ } else {
+
+ LASSERTF(conn->gnc_peer_connstamp < newconn->gnc_peer_connstamp,
+ "conn 0x%p peer_connstamp "LPU64" >= "
+ "newconn 0x%p peer_connstamp "LPU64"\n",
+ conn, conn->gnc_peer_connstamp,
+ newconn, newconn->gnc_peer_connstamp);
+
+ CDEBUG(D_NET, "Closing stale conn nid: %s"
+ " connstamp:"LPU64"("LPU64")\n",
+ libcfs_nid2str(peer->gnp_nid),
+ conn->gnc_peer_connstamp, newconn->gnc_peer_connstamp);
+ }
+
+ count++;
+ kgnilnd_close_conn_locked(conn, -ESTALE);
+ }
+
+ if (count != 0) {
+ CWARN("Closed %d stale conns to %s\n", count, libcfs_nid2str(peer->gnp_nid));
+ }
+
+ RETURN(count);
+}
+
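+/* check whether 'newconn' duplicates one of peer's existing conns: returns 0
+ * if it is not a duplicate, 1-3 to identify how it conflicts with an existing
+ * conn (stale peerstamp, stale connstamp, or identical connstamp) */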
+int
+kgnilnd_conn_isdup_locked(kgn_peer_t *peer, kgn_conn_t *newconn)
+{
+ kgn_conn_t *conn;
+ struct list_head *tmp;
+ int loopback;
+ ENTRY;
+
+ loopback = peer->gnp_nid == peer->gnp_net->gnn_ni->ni_nid;
+
+ list_for_each(tmp, &peer->gnp_conns) {
+ conn = list_entry(tmp, kgn_conn_t, gnc_list);
+ CDEBUG(D_NET, "checking conn 0x%p for peer %s"
+ " lo %d new "LPU64" existing "LPU64
+ " new peer "LPU64" existing peer "LPU64
+ " new dev %p existing dev %p\n",
+ conn, libcfs_nid2str(peer->gnp_nid),
+ loopback,
+ newconn->gnc_peerstamp, conn->gnc_peerstamp,
+ newconn->gnc_peer_connstamp, conn->gnc_peer_connstamp,
+ newconn->gnc_device, conn->gnc_device);
+
+ /* conn is in the process of closing */
+ if (conn->gnc_state != GNILND_CONN_ESTABLISHED)
+ continue;
+
+ /* 'newconn' is from an earlier version of 'peer'!!! */
+ if (newconn->gnc_peerstamp < conn->gnc_peerstamp)
+ RETURN(1);
+
+ /* 'conn' is from an earlier version of 'peer': it will be
+ * removed when we cull stale conns later on... */
+ if (newconn->gnc_peerstamp > conn->gnc_peerstamp)
+ continue;
+
+ /* Different devices are OK */
+ if (conn->gnc_device != newconn->gnc_device)
+ continue;
+
+ /* It's me connecting to myself */
+ if (loopback &&
+ newconn->gnc_my_connstamp == conn->gnc_peer_connstamp &&
+ newconn->gnc_peer_connstamp == conn->gnc_my_connstamp)
+ continue;
+
+ /* 'newconn' is an earlier connection from 'peer'!!! */
+ if (newconn->gnc_peer_connstamp < conn->gnc_peer_connstamp)
+ RETURN(2);
+
+ /* 'conn' is an earlier connection from 'peer': it will be
+ * removed when we cull stale conns later on... */
+ if (newconn->gnc_peer_connstamp > conn->gnc_peer_connstamp)
+ continue;
+
+ /* 'newconn' has the SAME connection stamp; 'peer' isn't
+ * playing the game... */
+ RETURN(3);
+ }
+
+ RETURN(0);
+}
+
+int
+kgnilnd_create_conn(kgn_conn_t **connp, kgn_device_t *dev)
+{
+ kgn_conn_t *conn;
+ gni_return_t rrc;
+ int rc = 0;
+
+ LASSERT (!in_interrupt());
+ atomic_inc(&kgnilnd_data.kgn_nconns);
+
+ /* divide by 2 to allow for complete reset and immediate reconnect */
+ if (atomic_read(&kgnilnd_data.kgn_nconns) >= GNILND_MAX_CQID/2) {
+ CERROR("Too many conn are live: %d > %d\n",
+ atomic_read(&kgnilnd_data.kgn_nconns), GNILND_MAX_CQID/2);
+ atomic_dec(&kgnilnd_data.kgn_nconns);
+ return -E2BIG;
+ }
+
+ LIBCFS_ALLOC(conn, sizeof(*conn));
+ if (conn == NULL) {
+ atomic_dec(&kgnilnd_data.kgn_nconns);
+ return -ENOMEM;
+ }
+
+ LIBCFS_ALLOC(conn->gnc_tx_ref_table, GNILND_MAX_MSG_ID * sizeof(void *));
+ if (conn->gnc_tx_ref_table == NULL) {
+ CERROR("Can't allocate conn tx_ref_table\n");
+ rc = -ENOMEM;
+ GOTO(failed, rc);
+ }
+
+ atomic_set(&conn->gnc_refcount, 1);
+ atomic_set(&conn->gnc_reaper_noop, 0);
+ atomic_set(&conn->gnc_sched_noop, 0);
+ INIT_LIST_HEAD(&conn->gnc_list);
+ INIT_LIST_HEAD(&conn->gnc_hashlist);
+ INIT_LIST_HEAD(&conn->gnc_schedlist);
+ INIT_LIST_HEAD(&conn->gnc_fmaq);
+ INIT_LIST_HEAD(&conn->gnc_mdd_list);
+ spin_lock_init(&conn->gnc_list_lock);
+ spin_lock_init(&conn->gnc_tx_lock);
+
+ /* set tx id to nearly the end to make sure we find wrapping
+ * issues soon */
+ conn->gnc_next_tx = (int) GNILND_MAX_MSG_ID - 10;
+
+ /* if this fails, we have conflicts and MAX_TX is too large */
+ CLASSERT(GNILND_MAX_MSG_ID < GNILND_MSGID_CLOSE);
+
+ /* get a new unique CQ id for this conn */
+ write_lock(&kgnilnd_data.kgn_peer_conn_lock);
+ conn->gnc_my_connstamp = kgnilnd_data.kgn_connstamp++;
+ conn->gnc_cqid = kgnilnd_get_cqid_locked();
+ write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+
+ if (conn->gnc_cqid == 0) {
+ CERROR("Could not allocate unique CQ ID for conn 0x%p\n", conn);
+ rc = -E2BIG;
+ GOTO(failed, rc);
+ }
+
+ CDEBUG(D_NET, "alloc cqid %u for conn 0x%p\n",
+ conn->gnc_cqid, conn);
+
+ /* needs to be set before gnc_ephandle to allow kgnilnd_destroy_conn_ep to
+ * check context */
+ conn->gnc_device = dev;
+
+ conn->gnc_timeout = MAX(*kgnilnd_tunables.kgn_timeout,
+ GNILND_MIN_TIMEOUT);
+ kgnilnd_update_reaper_timeout(conn->gnc_timeout);
+
+ /* this is the ep_handle for doing SMSG & BTE */
+ mutex_lock(&dev->gnd_cq_mutex);
+ rrc = kgnilnd_ep_create(dev->gnd_handle, dev->gnd_snd_fma_cqh,
+ &conn->gnc_ephandle);
+ mutex_unlock(&dev->gnd_cq_mutex);
+ if (rrc != GNI_RC_SUCCESS) {
+ rc = -ENETDOWN;
+ GOTO(failed, rc);
+ }
+
+ CDEBUG(D_NET, "created conn 0x%p ep_hndl 0x%p\n",
+ conn, conn->gnc_ephandle);
+
+ /* add ref for EP canceling */
+ kgnilnd_conn_addref(conn);
+ atomic_inc(&dev->gnd_neps);
+
+ *connp = conn;
+ return 0;
+
+failed:
+ atomic_dec(&kgnilnd_data.kgn_nconns);
+ if (conn->gnc_tx_ref_table != NULL) {
+ LIBCFS_FREE(conn->gnc_tx_ref_table,
+ GNILND_MAX_MSG_ID * sizeof(void *));
+ }
+ LIBCFS_FREE(conn, sizeof(*conn));
+ return rc;
+}
+
+/* needs to be called with kgn_peer_conn_lock held (read or write) */
+kgn_conn_t *
+kgnilnd_find_conn_locked(kgn_peer_t *peer)
+{
+ kgn_conn_t *conn = NULL;
+ ENTRY;
+
+ /* if we are in reset, this conn is going to die soon */
+ if (unlikely(kgnilnd_data.kgn_in_reset)) {
+ RETURN(NULL);
+ }
+
+ /* just return the first ESTABLISHED connection */
+ list_for_each_entry(conn, &peer->gnp_conns, gnc_list) {
+ /* kgnilnd_finish_connect doesn't put connections on the
+ * peer list until they are actually established */
+ LASSERTF(conn->gnc_state >= GNILND_CONN_ESTABLISHED,
+ "found conn %p state %s on peer %p (%s)\n",
+ conn, kgnilnd_conn_state2str(conn), peer,
+ libcfs_nid2str(peer->gnp_nid));
+ if (conn->gnc_state != GNILND_CONN_ESTABLISHED)
+ continue;
+
+ RETURN(conn);
+ }
+ RETURN(NULL);
+}
+
+/* needs write_lock on kgn_peer_conn_lock held */
+kgn_conn_t *
+kgnilnd_find_or_create_conn_locked(kgn_peer_t *peer)
+{
+ kgn_device_t *dev = peer->gnp_net->gnn_dev;
+ kgn_conn_t *conn;
+
+ conn = kgnilnd_find_conn_locked(peer);
+
+ if (conn != NULL) {
+ return conn;
+ }
+
+ /* if the peer was previously connecting, check if we should
+ * trigger another connection attempt yet. */
+ if (time_before(jiffies, peer->gnp_reconnect_time)) {
+ return NULL;
+ }
+
+ /* This check prevents us from creating a new connection to a peer while we are
+ * still in the process of closing an existing connection to the peer.
+ */
+ list_for_each_entry(conn, &peer->gnp_conns, gnc_list) {
+ if (conn->gnc_ephandle != NULL) {
+ CDEBUG(D_NET, "Not connecting non-null ephandle found peer 0x%p->%s\n", peer,
+ libcfs_nid2str(peer->gnp_nid));
+ return NULL;
+ }
+ }
+
+ if (peer->gnp_connecting != GNILND_PEER_IDLE) {
+ /* anything but IDLE means a connect attempt is already in flight
+ * (or being torn down) - don't start another one */
+ return NULL;
+ }
+
+ CDEBUG(D_NET, "starting connect to %s\n",
+ libcfs_nid2str(peer->gnp_nid));
+ peer->gnp_connecting = GNILND_PEER_CONNECT;
+ kgnilnd_peer_addref(peer); /* extra ref for connd */
+
+ spin_lock(&dev->gnd_connd_lock);
+ list_add_tail(&peer->gnp_connd_list, &dev->gnd_connd_peers);
+ spin_unlock(&dev->gnd_connd_lock);
+
+ kgnilnd_schedule_dgram(dev);
+ CDEBUG(D_NETTRACE, "scheduling new connect\n");
+
+ return NULL;
+}
+
+/* Caller is responsible for deciding if/when to call this */
+void
+kgnilnd_destroy_conn_ep(kgn_conn_t *conn)
+{
+ gni_return_t rrc;
+ gni_ep_handle_t tmp_ep;
+
+ /* only if we actually initialized it,
+ * then set NULL to tell kgnilnd_destroy_conn to leave it alone */
+
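+ /* atomically swap out the handle so exactly one caller sees the
+ * non-NULL value and tears the EP down */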
+ tmp_ep = xchg(&conn->gnc_ephandle, NULL);
+ if (tmp_ep != NULL) {
+ /* we never re-use the EP, so unbind is not needed */
+ mutex_lock(&conn->gnc_device->gnd_cq_mutex);
+ rrc = kgnilnd_ep_destroy(tmp_ep);
+
+ mutex_unlock(&conn->gnc_device->gnd_cq_mutex);
+
+ /* if this fails, it could hork up kgni smsg retransmit and others
+ * since we could free the SMSG mbox memory, etc. */
+ LASSERTF(rrc == GNI_RC_SUCCESS, "rrc %d conn 0x%p ep 0x%p\n",
+ rrc, conn, conn->gnc_ephandle);
+
+ atomic_dec(&conn->gnc_device->gnd_neps);
+
+ /* clear out count added in kgnilnd_close_conn_locked
+ * conn will have a peer once it hits finish_connect, where it
+ * is the first spot we'll mark it ESTABLISHED as well */
+ if (conn->gnc_peer) {
+ kgnilnd_admin_decref(conn->gnc_peer->gnp_dirty_eps);
+ }
+
+ /* drop ref for EP */
+ kgnilnd_conn_decref(conn);
+ }
+}
+
+void
+kgnilnd_destroy_conn(kgn_conn_t *conn)
+{
+ LASSERTF(!in_interrupt() &&
+ !conn->gnc_scheduled &&
+ !conn->gnc_in_purgatory &&
+ conn->gnc_ephandle == NULL &&
+ list_empty(&conn->gnc_list) &&
+ list_empty(&conn->gnc_hashlist) &&
+ list_empty(&conn->gnc_schedlist) &&
+ list_empty(&conn->gnc_mdd_list),
+ "conn 0x%p->%s IRQ %d sched %d purg %d ep 0x%p lists %d/%d/%d/%d\n",
+ conn, conn->gnc_peer ? libcfs_nid2str(conn->gnc_peer->gnp_nid)
+ : "<?>",
+ !!in_interrupt(), conn->gnc_scheduled,
+ conn->gnc_in_purgatory,
+ conn->gnc_ephandle,
+ list_empty(&conn->gnc_list),
+ list_empty(&conn->gnc_hashlist),
+ list_empty(&conn->gnc_schedlist),
+ list_empty(&conn->gnc_mdd_list));
+
+ /* Tripping these is especially bad, as it means we have items on the
+ * lists that didn't keep their refcount on the connection - or
+ * somebody evil released their own */
+ LASSERTF(list_empty(&conn->gnc_fmaq) &&
+ atomic_read(&conn->gnc_nlive_fma) == 0 &&
+ atomic_read(&conn->gnc_nlive_rdma) == 0,
+ "conn 0x%p fmaq %d@0x%p nfma %d nrdma %d\n",
+ conn, kgnilnd_count_list(&conn->gnc_fmaq), &conn->gnc_fmaq,
+ atomic_read(&conn->gnc_nlive_fma), atomic_read(&conn->gnc_nlive_rdma));
+
+ CDEBUG(D_NET, "destroying conn %p ephandle %p error %d\n",
+ conn, conn->gnc_ephandle, conn->gnc_error);
+
+ /* if there is an FMA blk left here, we'll tear it down */
+ if (conn->gnc_fma_blk) {
+ kgnilnd_release_mbox(conn, 0);
+ }
+
+ if (conn->gnc_peer != NULL)
+ kgnilnd_peer_decref(conn->gnc_peer);
+
+ if (conn->gnc_tx_ref_table != NULL) {
+ LIBCFS_FREE(conn->gnc_tx_ref_table,
+ GNILND_MAX_MSG_ID * sizeof(void *));
+ }
+
+ LIBCFS_FREE(conn, sizeof(*conn));
+ atomic_dec(&kgnilnd_data.kgn_nconns);
+}
+
+/* peer_alive and peer_notify done in the style of the o2iblnd */
+void
+kgnilnd_peer_alive(kgn_peer_t *peer)
+{
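+ /* set_mb stores the new timestamp with a memory barrier so other
+ * CPUs sampling gnp_last_alive see a current value */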
+ set_mb(peer->gnp_last_alive, jiffies);
+}
+
+void
+kgnilnd_peer_notify(kgn_peer_t *peer, int error)
+{
+ int tell_lnet = 0;
+ int nnets = 0;
+ int rc;
+ int i, j;
+ kgn_conn_t *conn;
+ kgn_net_t **nets;
+ kgn_net_t *net;
+
+ if (CFS_FAIL_CHECK(CFS_FAIL_GNI_DONT_NOTIFY))
+ return;
+
+ /* Tell LNet we are giving up on this peer - but only
+ * if it isn't already reconnected or trying to reconnect */
+ read_lock(&kgnilnd_data.kgn_peer_conn_lock);
+
+ /* use kgnilnd_find_conn_locked to avoid any conns in the process of being nuked
+ *
+ * don't tell LNet if we are in reset - we assume that everyone will be able to
+ * reconnect just fine
+ */
+ conn = kgnilnd_find_conn_locked(peer);
+
+ CDEBUG(D_NETTRACE, "peer 0x%p->%s ting %d conn 0x%p, rst %d error %d\n",
+ peer, libcfs_nid2str(peer->gnp_nid), peer->gnp_connecting, conn,
+ kgnilnd_data.kgn_in_reset, error);
+
+ if ((peer->gnp_connecting == GNILND_PEER_IDLE) &&
+ (conn == NULL) &&
+ (!kgnilnd_data.kgn_in_reset) &&
+ (!kgnilnd_conn_clean_errno(error))) {
+ tell_lnet = 1;
+ }
+
+ read_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+
+ if (!tell_lnet) {
+ /* short circuit if we don't need to notify LNet */
+ return;
+ }
+
+ rc = down_read_trylock(&kgnilnd_data.kgn_net_rw_sem);
+
+ if (rc) {
+ /* only walk the nets if we got the semaphore - if the trylock
+ * failed, LNet is in shutdown or something similar */
+
+ for (i = 0; i < *kgnilnd_tunables.kgn_net_hash_size; i++) {
+ list_for_each_entry(net, &kgnilnd_data.kgn_nets[i], gnn_list) {
+ /* if gnn_shutdown is set for any net, shutdown is in progress - just return */
+ if (net->gnn_shutdown) {
+ up_read(&kgnilnd_data.kgn_net_rw_sem);
+ return;
+ }
+ nnets++;
+ }
+ }
+
+ if (nnets == 0) {
+ /* shutdown in progress most likely */
+ up_read(&kgnilnd_data.kgn_net_rw_sem);
+ return;
+ }
+
+ LIBCFS_ALLOC(nets, nnets * sizeof(*nets));
+
+ if (nets == NULL) {
+ up_read(&kgnilnd_data.kgn_net_rw_sem);
+ CERROR("Failed to allocate nets[%d]\n", nnets);
+ return;
+ }
+
+ j = 0;
+ for (i = 0; i < *kgnilnd_tunables.kgn_net_hash_size; i++) {
+ list_for_each_entry(net, &kgnilnd_data.kgn_nets[i], gnn_list) {
+ nets[j] = net;
+ kgnilnd_net_addref(net);
+ j++;
+ }
+ }
+ up_read(&kgnilnd_data.kgn_net_rw_sem);
+
+ for (i = 0; i < nnets; i++) {
+ lnet_nid_t peer_nid;
+
+ net = nets[i];
+
+ peer_nid = kgnilnd_lnd2lnetnid(net->gnn_ni->ni_nid,
+ peer->gnp_nid);
+
+ CDEBUG(D_NET, "peer 0x%p->%s last_alive %lu (%lus ago)\n",
+ peer, libcfs_nid2str(peer_nid), peer->gnp_last_alive,
+ cfs_duration_sec(jiffies - peer->gnp_last_alive));
+
+ lnet_notify(net->gnn_ni, peer_nid, 0, peer->gnp_last_alive);
+
+ kgnilnd_net_decref(net);
+ }
+
+ LIBCFS_FREE(nets, nnets * sizeof(*nets));
+ }
+}
+
+/* need write_lock on kgn_peer_conn_lock */
+void
+kgnilnd_close_conn_locked(kgn_conn_t *conn, int error)
+{
+ kgn_peer_t *peer = conn->gnc_peer;
+ ENTRY;
+
+ LASSERT(!in_interrupt());
+
+ /* store error for tx completion */
+ conn->gnc_error = error;
+ peer->gnp_last_errno = error;
+
+ /* use real error from peer if possible */
+ if (error == -ECONNRESET) {
+ error = conn->gnc_peer_error;
+ }
+
+ /* if we NETERROR, make sure it is rate limited */
+ if (!kgnilnd_conn_clean_errno(error)) {
+ CNETERR("closing conn to %s: error %d\n",
+ libcfs_nid2str(peer->gnp_nid), error);
+ } else {
+ CDEBUG(D_NET, "closing conn to %s: error %d\n",
+ libcfs_nid2str(peer->gnp_nid), error);
+ }
+
+ LASSERTF(conn->gnc_state == GNILND_CONN_ESTABLISHED,
+ "conn %p to %s with bogus state %s\n", conn,
+ libcfs_nid2str(conn->gnc_peer->gnp_nid),
+ kgnilnd_conn_state2str(conn));
+ LASSERT(!list_empty(&conn->gnc_hashlist));
+ LASSERT(!list_empty(&conn->gnc_list));
+
+ /* mark peer count here so any place the EP gets destroyed will
+ * open up the peer count so that a new ESTABLISHED conn is then free
+ * to send new messages -- sending before the previous EPs are destroyed
+ * could end up with messages on the network for the old conn _after_
+ * the new conn and break the mbox safety protocol */
+ kgnilnd_admin_addref(conn->gnc_peer->gnp_dirty_eps);
+
+ /* Remove from conn hash table: no new callbacks */
+ list_del_init(&conn->gnc_hashlist);
+ kgnilnd_data.kgn_conn_version++;
+
+ /* if we are in reset, go right to CLOSED as there is no scheduler
+ * thread to move from CLOSING to CLOSED */
+ if (unlikely(kgnilnd_data.kgn_in_reset)) {
+ conn->gnc_state = GNILND_CONN_CLOSED;
+ } else {
+ conn->gnc_state = GNILND_CONN_CLOSING;
+ }
+
+ /* leave on peer->gnp_conns to make sure we don't let the reaper
+ * or others try to unlink this peer until the conn is fully
+ * processed for closing */
+
+ if (kgnilnd_check_purgatory_conn(conn)) {
+ kgnilnd_add_purgatory_locked(conn, conn->gnc_peer);
+ }
+
+ /* Reset RX timeout to ensure we wait for an incoming CLOSE
+ * for the full timeout. If we get a CLOSE we know the
+ * peer has stopped all RDMA. Otherwise if we wait for
+ * the full timeout we can also be sure all RDMA has stopped. */
+ conn->gnc_last_rx = conn->gnc_last_rx_cq = jiffies;
+ mb();
+
+ /* schedule sending CLOSE - if we are in quiesce, this adds to
+ * gnd_ready_conns and allows us to find it in quiesce processing */
+ kgnilnd_schedule_conn(conn);
+
+ /* lose peer's ref */
+ kgnilnd_conn_decref(conn);
+ /* -1 for conn table */
+ kgnilnd_conn_decref(conn);
+
+ EXIT;
+}
+
+void
+kgnilnd_close_conn(kgn_conn_t *conn, int error)
+{
+ write_lock(&kgnilnd_data.kgn_peer_conn_lock);
+ /* need to check the state here - this call is racy and we don't
+ * know the state until after the lock is grabbed */
+ if (conn->gnc_state == GNILND_CONN_ESTABLISHED) {
+ kgnilnd_close_conn_locked(conn, error);
+ }
+ write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+}
+
+void
+kgnilnd_complete_closed_conn(kgn_conn_t *conn)
+{
+ LIST_HEAD (sinners);
+ kgn_tx_t *tx, *txn;
+ int nlive = 0;
+ int nrdma = 0;
+ int nq_rdma = 0;
+ int logmsg;
+ ENTRY;
+
+ /* Dump log on cksum error - wait until complete phase to let
+ * RX of error happen */
+ if (*kgnilnd_tunables.kgn_checksum_dump &&
+ (conn != NULL && conn->gnc_peer_error == -ENOKEY)) {
+ libcfs_debug_dumplog();
+ }
+
+ /* _CLOSED set in kgnilnd_process_fmaq once we decide to
+ * send the CLOSE or not */
+ LASSERTF(conn->gnc_state == GNILND_CONN_CLOSED,
+ "conn 0x%p->%s with bad state %s\n",
+ conn, conn->gnc_peer ?
+ libcfs_nid2str(conn->gnc_peer->gnp_nid) :
+ "<?>",
+ kgnilnd_conn_state2str(conn));
+
+ LASSERT(list_empty(&conn->gnc_hashlist));
+
+ /* we've sent the close, start nuking */
+
+ /* we don't use lists to track things that we can get out of the
+ * tx_ref table... */
+
+ /* need to hold locks for tx_list_state, sampling it is too racy:
+ * - the lock actually protects tx != NULL, but we can't take the proper
+ * lock until we check tx_list_state, which would be too late and
+ * we could have the TX change under us.
+ * gnd_rdmaq_lock and gnd_lock are not used together, so taking both
+ * should be fine */
+ spin_lock(&conn->gnc_device->gnd_rdmaq_lock);
+ spin_lock(&conn->gnc_device->gnd_lock);
+
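+ /* walk the entire msgid table - any non-NULL slot is a TX still
+ * outstanding on this conn that must be cleaned up */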
+ for (nrdma = 0; nrdma < GNILND_MAX_MSG_ID; nrdma++) {
+ tx = conn->gnc_tx_ref_table[nrdma];
+
+ if (tx != NULL) {
+ /* only log the first error; also quiet CLOSE, as we often don't
+ * see CQ events for it by the time we get here... and really don't care */
+ if (nlive || tx->tx_msg.gnm_type == GNILND_MSG_CLOSE)
+ tx->tx_state |= GNILND_TX_QUIET_ERROR;
+ nlive++;
+ GNIDBG_TX(D_NET, tx, "cleaning up on close, nlive %d", nlive);
+
+ /* don't worry about gnc_lock here as nobody else should be
+ * touching this conn */
+ kgnilnd_tx_del_state_locked(tx, NULL, conn, GNILND_TX_ALLOCD);
+ list_add_tail(&tx->tx_list, &sinners);
+ }
+ }
+ spin_unlock(&conn->gnc_device->gnd_lock);
+ spin_unlock(&conn->gnc_device->gnd_rdmaq_lock);
+
+ /* nobody should have marked this as needing scheduling after
+ * we called close - so only ref should be us handling it */
+ LASSERTF(conn->gnc_scheduled == GNILND_CONN_PROCESS,
+ "conn 0x%p scheduled %d\n", conn, conn->gnc_scheduled);
+
+ /* now reset a few to actual counters... */
+ nrdma = atomic_read(&conn->gnc_nlive_rdma);
+ nq_rdma = atomic_read(&conn->gnc_nq_rdma);
+
+ if (!list_empty(&sinners)) {
+ list_for_each_entry_safe(tx, txn, &sinners, tx_list) {
+ /* clear tx_list to make tx_add_list_locked happy */
+ list_del_init(&tx->tx_list);
+ /* The error codes determine if we hold onto the MDD */
+ kgnilnd_tx_done(tx, conn->gnc_error);
+ }
+ }
+
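+ /* only log the close summary when we actually canceled something */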
+ logmsg = (nlive + nrdma + nq_rdma);
+
+ if (logmsg) {
+ if (conn->gnc_peer_error != 0) {
+ CNETERR("Closed conn 0x%p->%s (errno %d, peer errno %d): "
+ "canceled %d TX, %d/%d RDMA\n",
+ conn, libcfs_nid2str(conn->gnc_peer->gnp_nid),
+ conn->gnc_error, conn->gnc_peer_error,
+ nlive, nq_rdma, nrdma);
+ } else {
+ CNETERR("Closed conn 0x%p->%s (errno %d): "
+ "canceled %d TX, %d/%d RDMA\n",
+ conn, libcfs_nid2str(conn->gnc_peer->gnp_nid),
+ conn->gnc_error,
+ nlive, nq_rdma, nrdma);
+ }
+ }
+
+ kgnilnd_destroy_conn_ep(conn);
+
+ /* Bug 765042 - race this with completing a new conn to same peer - we need
+ * finish_connect to detach purgatory before we can do it ourselves here */
+ CFS_RACE(CFS_FAIL_GNI_FINISH_PURG);
+
+ /* now it is safe to remove from peer list - anyone looking at
+ * gnp_conns now is free to unlink if not on purgatory */
+ write_lock(&kgnilnd_data.kgn_peer_conn_lock);
+
+ conn->gnc_state = GNILND_CONN_DONE;
+
+ /* Decrement counter if we are marked by del_conn_or_peers for closing
+ */
+ if (conn->gnc_needs_closing)
+ kgnilnd_admin_decref(kgnilnd_data.kgn_npending_conns);
+
+ /* Remove from peer's list of valid connections if it's not in purgatory */
+ if (!conn->gnc_in_purgatory) {
+ list_del_init(&conn->gnc_list);
+ }
+
+ /* NB - only unlinking if we set pending in del_peer_locked from admin or
+ * shutdown */
+ if (kgnilnd_peer_active(conn->gnc_peer) &&
+ conn->gnc_peer->gnp_pending_unlink &&
+ kgnilnd_can_unlink_peer_locked(conn->gnc_peer)) {
+ kgnilnd_unlink_peer_locked(conn->gnc_peer);
+ }
+
+ write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+
+ /* I'm telling Mommy! - use peer_error if they initiated close */
+ kgnilnd_peer_notify(conn->gnc_peer,
+ conn->gnc_error == -ECONNRESET ? conn->gnc_peer_error
+ : conn->gnc_error);
+
+ EXIT;
+}
+
+int
+kgnilnd_set_conn_params(kgn_dgram_t *dgram)
+{
+ kgn_conn_t *conn = dgram->gndg_conn;
+ kgn_connreq_t *connreq = &dgram->gndg_conn_in;
+ kgn_gniparams_t *rem_param = &connreq->gncr_gnparams;
+ gni_return_t rrc;
+ int rc = 0;
+
+ /* set timeout vals in conn early so we can use them for the NAK */
+
+ /* use max of the requested and our timeout, peer will do the same */
+ conn->gnc_timeout = MAX(conn->gnc_timeout, connreq->gncr_timeout);
+
+ /* only ep_bind really mucks around with the CQ */
+ /* only ep_bind if we are not connecting to ourselves and the dstnid is not a wildcard. this check
+ * is necessary as an ep can only be bound once, and we must make sure we don't bind when already bound.
+ */
+ if (connreq->gncr_dstnid != LNET_NID_ANY && dgram->gndg_conn_out.gncr_dstnid != connreq->gncr_srcnid) {
+ mutex_lock(&conn->gnc_device->gnd_cq_mutex);
+ rrc = kgnilnd_ep_bind(conn->gnc_ephandle,
+ connreq->gncr_gnparams.gnpr_host_id,
+ conn->gnc_cqid);
+ mutex_unlock(&conn->gnc_device->gnd_cq_mutex);
+ if (rrc != GNI_RC_SUCCESS) {
+ rc = -ECONNABORTED;
+ goto return_out;
+ }
+ }
+
+ rrc = kgnilnd_ep_set_eventdata(conn->gnc_ephandle, conn->gnc_cqid,
+ connreq->gncr_gnparams.gnpr_cqid);
+ if (rrc != GNI_RC_SUCCESS) {
+ rc = -ECONNABORTED;
+ goto cleanup_out;
+ }
+
+ /* Initialize SMSG */
+ rrc = kgnilnd_smsg_init(conn->gnc_ephandle, &conn->gnpr_smsg_attr,
+ &connreq->gncr_gnparams.gnpr_smsg_attr);
+ if (unlikely(rrc == GNI_RC_INVALID_PARAM)) {
+ gni_smsg_attr_t *local = &conn->gnpr_smsg_attr;
+ gni_smsg_attr_t *remote = &connreq->gncr_gnparams.gnpr_smsg_attr;
+ /* help folks figure out if there is a tunable off, etc. */
+ LCONSOLE_ERROR("SMSG attribute mismatch. Data from local/remote:"
+ " type %d/%d msg_maxsize %u/%u"
+ " mbox_maxcredit %u/%u. Please check kgni"
+ " logs for further data\n",
+ local->msg_type, remote->msg_type,
+ local->msg_maxsize, remote->msg_maxsize,
+ local->mbox_maxcredit, remote->mbox_maxcredit);
+ }
+ if (rrc != GNI_RC_SUCCESS) {
+ rc = -ECONNABORTED;
+ goto cleanup_out;
+ }
+
+ /* log this for help in debugging SMSG buffer re-use */
+ CDEBUG(D_NET, "conn %p src %s dst %s smsg %p acquired"
+ " local cqid %u SMSG %p->%u hndl "LPX64"."LPX64
+ " remote cqid %u SMSG %p->%u hndl "LPX64"."LPX64"\n",
+ conn, libcfs_nid2str(connreq->gncr_srcnid),
+ libcfs_nid2str(connreq->gncr_dstnid),
+ &conn->gnpr_smsg_attr,
+ conn->gnc_cqid,
+ conn->gnpr_smsg_attr.msg_buffer,
+ conn->gnpr_smsg_attr.mbox_offset,
+ conn->gnpr_smsg_attr.mem_hndl.qword1,
+ conn->gnpr_smsg_attr.mem_hndl.qword2,
+ rem_param->gnpr_cqid,
+ rem_param->gnpr_smsg_attr.msg_buffer,
+ rem_param->gnpr_smsg_attr.mbox_offset,
+ rem_param->gnpr_smsg_attr.mem_hndl.qword1,
+ rem_param->gnpr_smsg_attr.mem_hndl.qword2);
+
+ conn->gnc_peerstamp = connreq->gncr_peerstamp;
+ conn->gnc_peer_connstamp = connreq->gncr_connstamp;
+
+ /* We update the reaper timeout once we have a valid conn and timeout */
+ kgnilnd_update_reaper_timeout(GNILND_TO2KA(conn->gnc_timeout));
+
+ return 0;
+
+cleanup_out:
+ rrc = kgnilnd_ep_unbind(conn->gnc_ephandle);
+ /* not sure I can just let this fly */
+ LASSERTF(rrc == GNI_RC_SUCCESS,
+ "bad rc from gni_ep_unbind trying to cleanup: %d\n", rrc);
+
+return_out:
+ LASSERTF(rc != 0, "SOFTWARE BUG: rc == 0\n");
+ CERROR("Error setting connection params from %s: %d\n",
+ libcfs_nid2str(connreq->gncr_srcnid), rc);
+ return rc;
+}
+
+/* needs down_read on kgn_net_rw_sem held from before this call until
+ * after the write_lock on kgn_peer_conn_lock - this ensures we stay sane
+ * with kgnilnd_shutdown - it'll get the sem and set shutdown, then get the
+ * kgn_peer_conn_lock to start del_peer'ing. If we hold the sem until after
+ * kgn_peer_conn_lock is held, we guarantee that nobody calls
+ * kgnilnd_add_peer_locked without checking gnn_shutdown */
+int
+kgnilnd_create_peer_safe(kgn_peer_t **peerp, lnet_nid_t nid, kgn_net_t *net)
+{
+ kgn_peer_t *peer;
+ int rc;
+
+ LASSERT(nid != LNET_NID_ANY);
+
+ /* We don't pass the net around in the dgram anymore, so this is where we find it.
+ * This will work unless we are in shutdown or the nid has a net that is invalid.
+ * Either way an error code needs to be returned in that case.
+ *
+ * If the net passed in is not NULL then we can use it; this saves looking it up
+ * when the calling function has access to the data.
+ */
+ if (net == NULL) {
+ rc = kgnilnd_find_net(nid, &net);
+ if (rc < 0)
+ return rc;
+ } else {
+ /* kgnilnd_find_net adds a reference on the net; since we are not
+ * calling it, we must take the reference manually so the net
+ * refcounts are correct when tearing down the net
+ */
+ kgnilnd_net_addref(net);
+ }
+
+ LIBCFS_ALLOC(peer, sizeof(*peer));
+ if (peer == NULL) {
+ kgnilnd_net_decref(net);
+ return -ENOMEM;
+ }
+ peer->gnp_nid = nid;
+
+ /* translate from nid to nic addr & store */
+ rc = kgnilnd_nid_to_nicaddrs(LNET_NIDADDR(nid), 1, &peer->gnp_host_id);
+ if (rc <= 0) {
+ kgnilnd_net_decref(net);
+ LIBCFS_FREE(peer, sizeof(*peer));
+ return -ESRCH;
+ }
+ CDEBUG(D_NET, "peer 0x%p->%s -> NIC 0x%x\n", peer,
+ libcfs_nid2str(nid), peer->gnp_host_id);
+
+ atomic_set(&peer->gnp_refcount, 1); /* 1 ref for caller */
+ atomic_set(&peer->gnp_dirty_eps, 0);
+
+ INIT_LIST_HEAD(&peer->gnp_list);
+ INIT_LIST_HEAD(&peer->gnp_connd_list);
+ INIT_LIST_HEAD(&peer->gnp_conns);
+ INIT_LIST_HEAD(&peer->gnp_tx_queue);
+
+ /* the first reconnect should happen immediately, so we leave
+ * gnp_reconnect_interval set to 0 */
+
+ LASSERTF(net != NULL, "peer 0x%p->%s with NULL net\n",
+ peer, libcfs_nid2str(nid));
+
+ /* must have kgn_net_rw_sem held for this... */
+ if (net->gnn_shutdown) {
+ /* shutdown has started already */
+ kgnilnd_net_decref(net);
+ LIBCFS_FREE(peer, sizeof(*peer));
+ return -ESHUTDOWN;
+ }
+
+ peer->gnp_net = net;
+
+ atomic_inc(&kgnilnd_data.kgn_npeers);
+
+ *peerp = peer;
+ return 0;
+}
+
+void
+kgnilnd_destroy_peer(kgn_peer_t *peer)
+{
+ CDEBUG(D_NET, "peer %s %p deleted\n",
+ libcfs_nid2str(peer->gnp_nid), peer);
+ LASSERTF(atomic_read(&peer->gnp_refcount) == 0,
+ "peer 0x%p->%s refs %d\n",
+ peer, libcfs_nid2str(peer->gnp_nid),
+ atomic_read(&peer->gnp_refcount));
+ LASSERTF(atomic_read(&peer->gnp_dirty_eps) == 0,
+ "peer 0x%p->%s dirty eps %d\n",
+ peer, libcfs_nid2str(peer->gnp_nid),
+ atomic_read(&peer->gnp_dirty_eps));
+ LASSERTF(peer->gnp_net != NULL, "peer %p (%s) with NULL net\n",
+ peer, libcfs_nid2str(peer->gnp_nid));
+ LASSERTF(!kgnilnd_peer_active(peer),
+ "peer 0x%p->%s\n",
+ peer, libcfs_nid2str(peer->gnp_nid));
+ LASSERTF(peer->gnp_connecting == GNILND_PEER_IDLE || peer->gnp_connecting == GNILND_PEER_KILL,
+ "peer 0x%p->%s, connecting %d\n",
+ peer, libcfs_nid2str(peer->gnp_nid), peer->gnp_connecting);
+ LASSERTF(list_empty(&peer->gnp_conns),
+ "peer 0x%p->%s\n",
+ peer, libcfs_nid2str(peer->gnp_nid));
+ LASSERTF(list_empty(&peer->gnp_tx_queue),
+ "peer 0x%p->%s\n",
+ peer, libcfs_nid2str(peer->gnp_nid));
+ LASSERTF(list_empty(&peer->gnp_connd_list),
+ "peer 0x%p->%s\n",
+ peer, libcfs_nid2str(peer->gnp_nid));
+
+ /* NB a peer's connections keep a reference on their peer until
+ * they are destroyed, so we can be assured that _all_ state to do
+ * with this peer has been cleaned up when its refcount drops to
+ * zero. */
+
+ atomic_dec(&kgnilnd_data.kgn_npeers);
+ kgnilnd_net_decref(peer->gnp_net);
+
+ LIBCFS_FREE(peer, sizeof(*peer));
+}
+
+/* the conn might not have made it all the way through to a connected
+ * state - but we need to purgatory any conn that a remote peer might
+ * have seen through a posted dgram as well */
+void
+kgnilnd_add_purgatory_locked(kgn_conn_t *conn, kgn_peer_t *peer)
+{
+ kgn_mbox_info_t *mbox = NULL;
+ ENTRY;
+
+ /* NB - the caller should own conn by removing him from the
+ * scheduler thread when finishing the close */
+
+ LASSERTF(peer != NULL, "conn %p with NULL peer\n", conn);
+
+ /* If this is still true, need to add the calls to unlink back in and
+ * figure out how to close the hole on loopback conns */
+ LASSERTF(kgnilnd_peer_active(peer), "can't use inactive peer %s (%p)"
+ " we'll never recover the resources\n",
+ libcfs_nid2str(peer->gnp_nid), peer);
+
+ CDEBUG(D_NET, "conn %p peer %p dev %p\n", conn, peer,
+ conn->gnc_device);
+
+ /* add ref for mbox purgatory hold */
+ kgnilnd_peer_addref(peer);
+ kgnilnd_conn_addref(conn);
+ conn->gnc_in_purgatory = 1;
+
+ mbox = &conn->gnc_fma_blk->gnm_mbox_info[conn->gnc_mbox_id];
+ mbox->mbx_prev_nid = peer->gnp_nid;
+ mbox->mbx_add_purgatory = jiffies;
+ kgnilnd_release_mbox(conn, 1);
+
+ LASSERTF(list_empty(&conn->gnc_mdd_list),
+ "conn 0x%p->%s with active purgatory hold MDD %d\n",
+ conn, libcfs_nid2str(peer->gnp_nid),
+ kgnilnd_count_list(&conn->gnc_mdd_list));
+
+ EXIT;
+}
+
+/* Instead of detaching everything from purgatory here, we just mark the conn as needing
+ * detach; the next time the reaper checks the conn it will detach it.
+ * Calling function requires write_lock held on kgn_peer_conn_lock
+ */
+void
+kgnilnd_mark_for_detach_purgatory_all_locked(kgn_peer_t *peer)
+{
+ kgn_conn_t *conn;
+
+ list_for_each_entry(conn, &peer->gnp_conns, gnc_list) {
+ if (conn->gnc_in_purgatory && !conn->gnc_needs_detach) {
+ conn->gnc_needs_detach = 1;
+ kgnilnd_admin_addref(kgnilnd_data.kgn_npending_detach);
+ }
+ }
+}
+
+/* Calling function needs a write_lock held on kgn_peer_conn_lock */
+void
+kgnilnd_detach_purgatory_locked(kgn_conn_t *conn, struct list_head *conn_list)
+{
+ kgn_mbox_info_t *mbox = NULL;
+
+ /* if needed, add the conn purgatory data to the list passed in */
+ if (conn->gnc_in_purgatory) {
+ CDEBUG(D_NET, "peer %p->%s purg_conn %p@%s mdd_list #tx %d\n",
+ conn->gnc_peer, libcfs_nid2str(conn->gnc_peer->gnp_nid),
+ conn, kgnilnd_conn_state2str(conn),
+ kgnilnd_count_list(&conn->gnc_mdd_list));
+
+ mbox = &conn->gnc_fma_blk->gnm_mbox_info[conn->gnc_mbox_id];
+ mbox->mbx_detach_of_purgatory = jiffies;
+
+ /* conn->gnc_list is the entry point on peer->gnp_conns, so detaching it
+ * here removes it from the list of 'valid' peer connections.
+ * We put the current conn onto a list of conns to call kgnilnd_release_purgatory_locked()
+ * and as such the caller of kgnilnd_detach_purgatory_locked() now owns that conn, since it's not
+ * on the peer's conn_list anymore.
+ */
+
+ kgnilnd_peer_decref(conn->gnc_peer);
+ list_del_init(&conn->gnc_list);
+
+ /* NB - only unlinking if we set pending in del_peer_locked from admin or
+ * shutdown */
+ if (kgnilnd_peer_active(conn->gnc_peer) &&
+ conn->gnc_peer->gnp_pending_unlink &&
+ kgnilnd_can_unlink_peer_locked(conn->gnc_peer)) {
+ kgnilnd_unlink_peer_locked(conn->gnc_peer);
+ }
+ /* The reaper will not call detach unless the conn is fully through kgnilnd_complete_closed_conn.
+ * If the conn is somehow not in a DONE state, we are attempting to detach even though
+ * the conn has not been fully cleaned up. If we detach while the conn is still closing,
+ * we will end up with an orphaned connection that has a valid ep_handle but is not on a
+ * peer.
+ */
+
+ LASSERTF(conn->gnc_state == GNILND_CONN_DONE, "Conn in invalid state %p@%s \n",
+ conn, kgnilnd_conn_state2str(conn));
+
+ /* move from peer to the delayed release list */
+ list_add_tail(&conn->gnc_list, conn_list);
+ }
+}
+
+void
+kgnilnd_release_purgatory_list(struct list_head *conn_list)
+{
+ kgn_device_t *dev;
+ kgn_conn_t *conn, *connN;
+ kgn_mdd_purgatory_t *gmp, *gmpN;
+
+ list_for_each_entry_safe(conn, connN, conn_list, gnc_list) {
+ dev = conn->gnc_device;
+
+ kgnilnd_release_mbox(conn, -1);
+ conn->gnc_in_purgatory = 0;
+
+ list_del_init(&conn->gnc_list);
+
+ /* gnc_needs_detach is set in kgnilnd_del_conn_or_peer. It is used to keep track
+ * of conns that have been marked for detach by kgnilnd_del_conn_or_peer.
+ * The function uses kgn_npending_detach to verify the conn has
+ * actually been detached.
+ */
+
+ if (conn->gnc_needs_detach)
+ kgnilnd_admin_decref(kgnilnd_data.kgn_npending_detach);
+
+ /* if this guy is really dead (we are doing release from reaper),
+ * make sure we tell LNet - if this is from other context,
+ * the checks in the function will prevent an errant
+ * notification */
+ kgnilnd_peer_notify(conn->gnc_peer, conn->gnc_error);
+
+ list_for_each_entry_safe(gmp, gmpN, &conn->gnc_mdd_list,
+ gmp_list) {
+ CDEBUG(D_NET,
+ "dev %p releasing held mdd "LPX64"."LPX64"\n",
+ conn->gnc_device, gmp->gmp_map_key.qword1,
+ gmp->gmp_map_key.qword2);
+
+ atomic_dec(&dev->gnd_n_mdd_held);
+ kgnilnd_mem_mdd_release(conn->gnc_device->gnd_handle,
+ &gmp->gmp_map_key);
+ /* ignoring the return code - if kgni/ghal can't find it
+ * it must be released already */
+
+ list_del_init(&gmp->gmp_list);
+ LIBCFS_FREE(gmp, sizeof(*gmp));
+ }
+ /* lose conn ref for purgatory */
+ kgnilnd_conn_decref(conn);
+ }
+}
+
+/* needs write_lock on kgnilnd_data.kgn_peer_conn_lock held */
+void
+kgnilnd_peer_increase_reconnect_locked(kgn_peer_t *peer)
+{
+ int current_to;
+
+ current_to = peer->gnp_reconnect_interval;
+
+ /* we'll try to reconnect fast the first time, then back-off */
+ if (current_to == 0) {
+ peer->gnp_reconnect_time = jiffies - 1;
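+ /* jiffies - 1 is already in the past, so the time_before() check in
+ * kgnilnd_find_or_create_conn_locked lets the reconnect fire immediately */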
+ current_to = *kgnilnd_tunables.kgn_min_reconnect_interval;
+ } else {
+ peer->gnp_reconnect_time = jiffies + cfs_time_seconds(current_to);
+ /* add 50% of min timeout & retry */
+ current_to += *kgnilnd_tunables.kgn_min_reconnect_interval / 2;
+ }
+
+ current_to = MIN(current_to,
+ *kgnilnd_tunables.kgn_max_reconnect_interval);
+
+ peer->gnp_reconnect_interval = current_to;
+ CDEBUG(D_NET, "peer %s can reconnect at %lu interval %lu\n",
+ libcfs_nid2str(peer->gnp_nid), peer->gnp_reconnect_time,
+ peer->gnp_reconnect_interval);
+}
+
+/* needs kgnilnd_data.kgn_peer_conn_lock held */
+kgn_peer_t *
+kgnilnd_find_peer_locked(lnet_nid_t nid)
+{
+ struct list_head *peer_list = kgnilnd_nid2peerlist(nid);
+ kgn_peer_t *peer;
+
+ /* Chopping nid down to only NIDADDR using LNET_NIDADDR so we only
+ * have a single peer per device instead of a peer per nid/net combo.
+ */
+
+ list_for_each_entry(peer, peer_list, gnp_list) {
+ if (LNET_NIDADDR(nid) != LNET_NIDADDR(peer->gnp_nid))
+ continue;
+
+ CDEBUG(D_NET, "got peer [%p] -> %s c %d (%d)\n",
+ peer, libcfs_nid2str(nid),
+ peer->gnp_connecting,
+ atomic_read(&peer->gnp_refcount));
+ return peer;
+ }
+ return NULL;
+}
+
+/* need write_lock on kgn_peer_conn_lock */
+void
+kgnilnd_unlink_peer_locked(kgn_peer_t *peer)
+{
+ LASSERTF(list_empty(&peer->gnp_conns),
+ "peer 0x%p->%s\n",
+ peer, libcfs_nid2str(peer->gnp_nid));
+ LASSERTF(list_empty(&peer->gnp_tx_queue),
+ "peer 0x%p->%s\n",
+ peer, libcfs_nid2str(peer->gnp_nid));
+ LASSERTF(kgnilnd_peer_active(peer),
+ "peer 0x%p->%s\n",
+ peer, libcfs_nid2str(peer->gnp_nid));
+ CDEBUG(D_NET, "unlinking peer 0x%p->%s\n",
+ peer, libcfs_nid2str(peer->gnp_nid));
+
+ list_del_init(&peer->gnp_list);
+ kgnilnd_data.kgn_peer_version++;
+ kgnilnd_admin_decref(kgnilnd_data.kgn_npending_unlink);
+ /* lose peerlist's ref */
+ kgnilnd_peer_decref(peer);
+}
+
+int
+kgnilnd_get_peer_info(int index,
+ kgn_peer_t **found_peer,
+ lnet_nid_t *id, __u32 *nic_addr,
+ int *refcount, int *connecting)
+{
+ struct list_head *ptmp;
+ kgn_peer_t *peer;
+ int i;
+ int rc = -ENOENT;
+
+ read_lock(&kgnilnd_data.kgn_peer_conn_lock);
+
+ for (i = 0; i < *kgnilnd_tunables.kgn_peer_hash_size; i++) {
+
+ list_for_each(ptmp, &kgnilnd_data.kgn_peers[i]) {
+ peer = list_entry(ptmp, kgn_peer_t, gnp_list);
+
+ if (index-- > 0)
+ continue;
+
+ CDEBUG(D_NET, "found peer %p (%s) at index %d\n",
+ peer, libcfs_nid2str(peer->gnp_nid), index);
+
+ *found_peer = peer;
+ *id = peer->gnp_nid;
+ *nic_addr = peer->gnp_host_id;
+ *refcount = atomic_read(&peer->gnp_refcount);
+ *connecting = peer->gnp_connecting;
+
+ rc = 0;
+ goto out;
+ }
+ }
+out:
+ read_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+ if (rc)
+ CDEBUG(D_NET, "no gni peer at index %d\n", index);
+ return rc;
+}
+
+/* requires write_lock on kgn_peer_conn_lock held */
+void
+kgnilnd_add_peer_locked(lnet_nid_t nid, kgn_peer_t *new_stub_peer, kgn_peer_t **peerp)
+{
+ kgn_peer_t *peer, *peer2;
+
+ LASSERTF(new_stub_peer != NULL, "bad stub peer for nid %s\n",
+ libcfs_nid2str(nid));
+
+ peer2 = kgnilnd_find_peer_locked(nid);
+ if (peer2 != NULL) {
+ /* A peer was created during the lock transition, so drop
+ * the new one we created */
+ kgnilnd_peer_decref(new_stub_peer);
+ peer = peer2;
+ } else {
+ peer = new_stub_peer;
+ /* peer table takes existing ref on peer */
+
+ LASSERTF(!kgnilnd_peer_active(peer),
+ "peer 0x%p->%s already in peer table\n",
+ peer, libcfs_nid2str(peer->gnp_nid));
+ list_add_tail(&peer->gnp_list,
+ kgnilnd_nid2peerlist(nid));
+ kgnilnd_data.kgn_peer_version++;
+ }
+
+ LASSERTF(peer->gnp_net != NULL, "peer 0x%p->%s with NULL net\n",
+ peer, libcfs_nid2str(peer->gnp_nid));
+ *peerp = peer;
+}
+
+int
+kgnilnd_add_peer(kgn_net_t *net, lnet_nid_t nid, kgn_peer_t **peerp)
+{
+ kgn_peer_t *peer;
+ int rc;
+ ENTRY;
+
+ if (nid == LNET_NID_ANY)
+ return -EINVAL;
+
+ /* NB - this will not block during normal operations -
+ * the only writer of this is in the startup/shutdown path. */
+ rc = down_read_trylock(&kgnilnd_data.kgn_net_rw_sem);
+ if (!rc) {
+ rc = -ESHUTDOWN;
+ RETURN(rc);
+ }
+ rc = kgnilnd_create_peer_safe(&peer, nid, net);
+ if (rc != 0) {
+ up_read(&kgnilnd_data.kgn_net_rw_sem);
+ RETURN(rc);
+ }
+
+ write_lock(&kgnilnd_data.kgn_peer_conn_lock);
+ up_read(&kgnilnd_data.kgn_net_rw_sem);
+
+ kgnilnd_add_peer_locked(nid, peer, peerp);
+
+ CDEBUG(D_NET, "peer 0x%p->%s connecting %d\n",
+ *peerp, libcfs_nid2str((*peerp)->gnp_nid),
+ (*peerp)->gnp_connecting);
+
+ write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+ RETURN(0);
+}
+
+/* needs write_lock on kgn_peer_conn_lock */
+void
+kgnilnd_cancel_peer_connect_locked(kgn_peer_t *peer, struct list_head *zombies)
+{
+ kgn_tx_t *tx, *txn;
+
+ /* we do care about state of gnp_connecting - we could be between
+ * reconnect attempts, so try to find the dgram and cancel the TX
+ * anyway. If we are in the process of posting, DON'T do anything;
+ * once it fails or succeeds we can nuke the connect attempt.
+ * We have no idea where in kgnilnd_post_dgram we are, so we can't
+ * attempt to cancel until the function is done.
+ */
+
+ /* make sure peer isn't in process of connecting or waiting for connect*/
+ spin_lock(&peer->gnp_net->gnn_dev->gnd_connd_lock);
+ if (!(list_empty(&peer->gnp_connd_list))) {
+ list_del_init(&peer->gnp_connd_list);
+ /* remove connd ref */
+ kgnilnd_peer_decref(peer);
+ }
+ spin_unlock(&peer->gnp_net->gnn_dev->gnd_connd_lock);
+
+ if (peer->gnp_connecting == GNILND_PEER_POSTING || peer->gnp_connecting == GNILND_PEER_NEEDS_DEATH) {
+ peer->gnp_connecting = GNILND_PEER_NEEDS_DEATH;
+ /* We are in the process of posting right now; the xchg set it up for
+ * us to cancel the connect, so we are finished for now */
+ } else {
+ /* no need for an exchange - we have the peer lock and it's ready for us to nuke */
+ LASSERTF(peer->gnp_connecting != GNILND_PEER_POSTING,
+ "Peer in invalid state 0x%p->%s, connecting %d\n",
+ peer, libcfs_nid2str(peer->gnp_nid), peer->gnp_connecting);
+ peer->gnp_connecting = GNILND_PEER_IDLE;
+ set_mb(peer->gnp_last_dgram_errno, -ETIMEDOUT);
+ kgnilnd_find_and_cancel_dgram(peer->gnp_net->gnn_dev,
+ peer->gnp_nid);
+ }
+
+ /* The least we can do is nuke the tx's no matter what.... */
+ list_for_each_entry_safe(tx, txn, &peer->gnp_tx_queue, tx_list) {
+ kgnilnd_tx_del_state_locked(tx, peer, NULL,
+ GNILND_TX_ALLOCD);
+ list_add_tail(&tx->tx_list, zombies);
+ }
+}
+
+/* needs write_lock on kgn_peer_conn_lock */
+void
+kgnilnd_del_peer_locked(kgn_peer_t *peer, int error)
+{
+ /* this peer could be passive and only held for purgatory,
+ * take a ref to ensure it doesn't disappear in this function */
+ kgnilnd_peer_addref(peer);
+
+ CFS_RACE(CFS_FAIL_GNI_FIND_TARGET);
+
+ /* if purgatory release cleared it out, don't try again */
+ if (kgnilnd_peer_active(peer)) {
+ /* always do this to allow kgnilnd_start_connect and
+ * kgnilnd_finish_connect to catch this before they
+ * wrap up their operations */
+ if (kgnilnd_can_unlink_peer_locked(peer)) {
+ /* already released purgatory, so only active
+ * conns hold it */
+ kgnilnd_unlink_peer_locked(peer);
+ } else {
+ kgnilnd_close_peer_conns_locked(peer, error);
+ /* peer unlinks itself when last conn is closed */
+ }
+ }
+
+ /* we are done, release back to the wild */
+ kgnilnd_peer_decref(peer);
+}
+
+int
+kgnilnd_del_conn_or_peer(kgn_net_t *net, lnet_nid_t nid, int command,
+ int error)
+{
+ LIST_HEAD (souls);
+ LIST_HEAD (zombies);
+ struct list_head *ptmp, *pnxt;
+ kgn_peer_t *peer;
+ int lo;
+ int hi;
+ int i;
+ int rc = -ENOENT;
+
+ write_lock(&kgnilnd_data.kgn_peer_conn_lock);
+
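+ /* a specific nid hashes to a single bucket, so we only scan that one;
+ * a wildcard has to scan the whole peer table */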
+ if (nid != LNET_NID_ANY)
+ lo = hi = kgnilnd_nid2peerlist(nid) - kgnilnd_data.kgn_peers;
+ else {
+ lo = 0;
+ hi = *kgnilnd_tunables.kgn_peer_hash_size - 1;
+ /* wildcards always succeed */
+ rc = 0;
+ }
+
+ for (i = lo; i <= hi; i++) {
+ list_for_each_safe(ptmp, pnxt, &kgnilnd_data.kgn_peers[i]) {
+ peer = list_entry(ptmp, kgn_peer_t, gnp_list);
+
+ LASSERTF(peer->gnp_net != NULL,
+ "peer %p (%s) with NULL net\n",
+ peer, libcfs_nid2str(peer->gnp_nid));
+
+ if (net != NULL && peer->gnp_net != net)
+ continue;
+
+ if (!(nid == LNET_NID_ANY || LNET_NIDADDR(peer->gnp_nid) == LNET_NIDADDR(nid)))
+ continue;
+
+ /* In both cases, we want to stop any in-flight
+ * connect attempts */
+ kgnilnd_cancel_peer_connect_locked(peer, &zombies);
+
+ switch (command) {
+ case GNILND_DEL_CONN:
+ kgnilnd_close_peer_conns_locked(peer, error);
+ break;
+ case GNILND_DEL_PEER:
+ peer->gnp_pending_unlink = 1;
+ kgnilnd_admin_addref(kgnilnd_data.kgn_npending_unlink);
+ kgnilnd_mark_for_detach_purgatory_all_locked(peer);
+ kgnilnd_del_peer_locked(peer, error);
+ break;
+ case GNILND_CLEAR_PURGATORY:
+ /* Mark everything ready for detach - the reaper will clean up
+ * once we release the kgn_peer_conn_lock
+ */
+ kgnilnd_mark_for_detach_purgatory_all_locked(peer);
+ peer->gnp_last_errno = -EISCONN;
+ /* clear reconnect so he can reconnect soon */
+ peer->gnp_reconnect_time = 0;
+ peer->gnp_reconnect_interval = 0;
+ break;
+ default:
+ CERROR("bad command %d\n", command);
+ LBUG();
+ }
+ /* we matched something */
+ rc = 0;
+ }
+ }
+
+ write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+
+ /* release all of the souls found held in purgatory */
+ kgnilnd_release_purgatory_list(&souls);
+
+ /* nuke peer TX */
+ kgnilnd_txlist_done(&zombies, error);
+
+ /* This function does not return until the commands it initiated have completed,
+ * since they have to work their way through the other threads. In the case of shutdown,
+ * threads are not woken up until after this call is initiated, so we cannot wait; we just
+ * need to return. The same applies for stack reset: we shouldn't wait, as the reset thread
+ * handles closing.
+ */
+
+ CFS_RACE(CFS_FAIL_GNI_RACE_RESET);
+
+ if (error == -ENOTRECOVERABLE || error == -ESHUTDOWN) {
+ return rc;
+ }
+
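+ /* poll about once a second until the other threads drain the
+ * pending closes, detaches and unlinks */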
+ i = 4;
+ while (atomic_read(&kgnilnd_data.kgn_npending_conns) ||
+ atomic_read(&kgnilnd_data.kgn_npending_detach) ||
+ atomic_read(&kgnilnd_data.kgn_npending_unlink)) {
+
+ cfs_pause(cfs_time_seconds(1));
+ i++;
+
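+ /* (i & -i) == i only when i is a power of two, so the console
+ * warning fires at i == 8, 16, 32, ... and D_NET otherwise */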
+ CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, "Waiting on %d peers %d closes %d detaches\n",
+ atomic_read(&kgnilnd_data.kgn_npending_unlink),
+ atomic_read(&kgnilnd_data.kgn_npending_conns),
+ atomic_read(&kgnilnd_data.kgn_npending_detach));
+ }
+
+ return rc;
+}
+
+kgn_conn_t *
+kgnilnd_get_conn_by_idx(int index)
+{
+ kgn_peer_t *peer;
+ struct list_head *ptmp;
+ kgn_conn_t *conn;
+ struct list_head *ctmp;
+ int i;
+
+ for (i = 0; i < *kgnilnd_tunables.kgn_peer_hash_size; i++) {
+ read_lock(&kgnilnd_data.kgn_peer_conn_lock);
+ list_for_each(ptmp, &kgnilnd_data.kgn_peers[i]) {
+
+ peer = list_entry(ptmp, kgn_peer_t, gnp_list);
+
+ list_for_each(ctmp, &peer->gnp_conns) {
+ conn = list_entry(ctmp, kgn_conn_t, gnc_list);
+
+ if (conn->gnc_state != GNILND_CONN_ESTABLISHED)
+ continue;
+
+ if (index-- > 0)
+ continue;
+
+ CDEBUG(D_NET, "++conn[%p] -> %s (%d)\n", conn,
+ libcfs_nid2str(conn->gnc_peer->gnp_nid),
+ atomic_read(&conn->gnc_refcount));
+ kgnilnd_conn_addref(conn);
+ read_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+ return conn;
+ }
+ }
+ read_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+ }
+
+ return NULL;
+}
+
+int
+kgnilnd_get_conn_info(kgn_peer_t *peer,
+ int *device_id, __u64 *peerstamp,
+ int *tx_seq, int *rx_seq,
+ int *fmaq_len, int *nfma, int *nrdma)
+{
+ kgn_conn_t *conn;
+ int rc = 0;
+
+ read_lock(&kgnilnd_data.kgn_peer_conn_lock);
+
+ conn = kgnilnd_find_conn_locked(peer);
+ if (conn == NULL) {
+ rc = -ENOENT;
+ goto out;
+ }
+
+ *device_id = conn->gnc_device->gnd_host_id;
+ *peerstamp = conn->gnc_peerstamp;
+ *tx_seq = conn->gnc_tx_seq;
+ *rx_seq = conn->gnc_rx_seq;
+ *fmaq_len = kgnilnd_count_list(&conn->gnc_fmaq);
+ *nfma = atomic_read(&conn->gnc_nlive_fma);
+ *nrdma = atomic_read(&conn->gnc_nlive_rdma);
+out:
+ read_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+ return rc;
+}
+
+/* needs write_lock on kgn_peer_conn_lock */
+int
+kgnilnd_close_peer_conns_locked(kgn_peer_t *peer, int why)
+{
+ kgn_conn_t *conn;
+ struct list_head *ctmp, *cnxt;
+ int count = 0;
+
+ list_for_each_safe(ctmp, cnxt, &peer->gnp_conns) {
+ conn = list_entry(ctmp, kgn_conn_t, gnc_list);
+
+ if (conn->gnc_state != GNILND_CONN_ESTABLISHED)
+ continue;
+
+ count++;
+ /* we mark gnc_needs_closing and increment kgn_npending_conns so that
+ * kgnilnd_del_conn_or_peer can wait on the other threads closing
+ * and cleaning up the connection.
+ */
+ if (!conn->gnc_needs_closing) {
+ conn->gnc_needs_closing = 1;
+ kgnilnd_admin_addref(kgnilnd_data.kgn_npending_conns);
+ }
+ kgnilnd_close_conn_locked(conn, why);
+ }
+ return count;
+}
+
+int
+kgnilnd_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg)
+{
+ struct libcfs_ioctl_data *data = arg;
+ kgn_net_t *net = ni->ni_data;
+ int rc = -EINVAL;
+
+ LASSERT(ni == net->gnn_ni);
+
+ switch (cmd) {
+ case IOC_LIBCFS_GET_PEER: {
+ lnet_nid_t nid = 0;
+ kgn_peer_t *peer = NULL;
+ __u32 nic_addr = 0;
+ __u64 peerstamp = 0;
+ int peer_refcount = 0, peer_connecting = 0;
+ int device_id = 0;
+ int tx_seq = 0, rx_seq = 0;
+ int fmaq_len = 0, nfma = 0, nrdma = 0;
+
+ rc = kgnilnd_get_peer_info(data->ioc_count, &peer,
+ &nid, &nic_addr, &peer_refcount,
+ &peer_connecting);
+ if (rc)
+ break;
+
+ /* Barf */
+ /* LNET_MKNID is used to mask from LNet the multiplexing/demultiplexing of connections and peers.
+ * LNet assumes a conn and peer per net; LNET_MKNID/LNET_NIDADDR lets us show LNet what it
+ * wants to see instead of the underlying network that is being used to send the data
+ */
+ data->ioc_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), LNET_NIDADDR(nid));
+ data->ioc_flags = peer_connecting;
+ data->ioc_count = peer_refcount;
+
+ rc = kgnilnd_get_conn_info(peer, &device_id, &peerstamp,
+ &tx_seq, &rx_seq, &fmaq_len,
+ &nfma, &nrdma);
+
+ /* This is allowable - a persistent peer may not
+ * have a connection */
+ if (rc) {
+ /* flag to indicate we are not connected -
+ * need to print as such */
+ data->ioc_flags |= (1<<16);
+ rc = 0;
+ } else {
+ /* still barf */
+ data->ioc_net = device_id;
+ data->ioc_u64[0] = peerstamp;
+ data->ioc_u32[0] = fmaq_len;
+ data->ioc_u32[1] = nfma;
+ data->ioc_u32[2] = tx_seq;
+ data->ioc_u32[3] = rx_seq;
+ data->ioc_u32[4] = nrdma;
+ }
+ break;
+ }
+ case IOC_LIBCFS_ADD_PEER: {
+ /* just dummy value to allow using common interface */
+ kgn_peer_t *peer;
+ rc = kgnilnd_add_peer(net, data->ioc_nid, &peer);
+ break;
+ }
+ case IOC_LIBCFS_DEL_PEER: {
+ /* NULL is passed in so it affects all peers in existence without regard to network
+ * as the peer may not exist on the network LNET believes it to be on.
+ */
+ rc = kgnilnd_del_conn_or_peer(NULL, data->ioc_nid,
+ GNILND_DEL_PEER, -EUCLEAN);
+ break;
+ }
+ case IOC_LIBCFS_GET_CONN: {
+ kgn_conn_t *conn = kgnilnd_get_conn_by_idx(data->ioc_count);
+
+ if (conn == NULL)
+ rc = -ENOENT;
+ else {
+ rc = 0;
+ /* LNET_MKNID is used to build the correct address based on what LNET wants to see instead of
+ * the generic connection that is used to send the data
+ */
+ data->ioc_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), LNET_NIDADDR(conn->gnc_peer->gnp_nid));
+ data->ioc_u32[0] = conn->gnc_device->gnd_id;
+ kgnilnd_conn_decref(conn);
+ }
+ break;
+ }
+ case IOC_LIBCFS_CLOSE_CONNECTION: {
+ /* use error = -ENETRESET to indicate it was lctl disconnect */
+ /* NULL is passed in so it affects all the nets as the connection is virtual
+ * and may not exist on the network LNET believes it to be on.
+ */
+ rc = kgnilnd_del_conn_or_peer(NULL, data->ioc_nid,
+ GNILND_DEL_CONN, -ENETRESET);
+ break;
+ }
+ case IOC_LIBCFS_PUSH_CONNECTION: {
+ /* we use this to flush purgatory */
+ rc = kgnilnd_del_conn_or_peer(NULL, data->ioc_nid,
+ GNILND_CLEAR_PURGATORY, -EUCLEAN);
+ break;
+ }
+ case IOC_LIBCFS_REGISTER_MYNID: {
+ /* Ignore if this is a noop */
+ if (data->ioc_nid == ni->ni_nid) {
+ rc = 0;
+ } else {
+ CERROR("obsolete IOC_LIBCFS_REGISTER_MYNID: %s(%s)\n",
+ libcfs_nid2str(data->ioc_nid),
+ libcfs_nid2str(ni->ni_nid));
+ rc = -EINVAL;
+ }
+ break;
+ }
+ }
+
+ return rc;
+}
+
+void
+kgnilnd_query(lnet_ni_t *ni, lnet_nid_t nid, cfs_time_t *when)
+{
+ kgn_net_t *net = ni->ni_data;
+ kgn_tx_t *tx;
+ kgn_peer_t *peer = NULL;
+ kgn_conn_t *conn = NULL;
+ lnet_process_id_t id = {.nid = nid, .pid = LUSTRE_SRV_LNET_PID};
+ ENTRY;
+
+ /* I expect to find him, so only take a read lock */
+ read_lock(&kgnilnd_data.kgn_peer_conn_lock);
+ peer = kgnilnd_find_peer_locked(nid);
+ if (peer != NULL) {
+ /* LIE if in a quiesce - we will update the timeouts after,
+ * but we don't want sends failing during it */
+ if (kgnilnd_data.kgn_quiesce_trigger) {
+ *when = jiffies;
+ read_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+ GOTO(out, 0);
+ }
+
+ /* Update to best guess, might refine on later checks */
+ *when = peer->gnp_last_alive;
+
+ /* we have a peer, how about a conn? */
+ conn = kgnilnd_find_conn_locked(peer);
+
+ if (conn == NULL) {
+ /* if there is no conn, check peer last errno to see if clean disconnect
+ * - if it was, we lie to LNet because we believe a TX would complete
+ * on reconnect */
+ if (kgnilnd_conn_clean_errno(peer->gnp_last_errno)) {
+ *when = jiffies;
+ }
+ /* we still want to fire a TX and new conn in this case */
+ } else {
+ /* gnp_last_alive is valid, run for the hills */
+ read_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+ GOTO(out, 0);
+ }
+ }
+ /* if we get here, either we have no peer or no conn for him, so fire off
+ * new TX to trigger conn setup */
+ read_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+
+ /* if we couldn't find him, we'll fire up a TX and get connected -
+ * if we don't do this, after ni_peer_timeout, LNet will declare him dead.
+ * So really we treat kgnilnd_query as a bit of a 'connect now' type
+ * event because it'll only do this when it wants to send
+ *
+ * Use a real TX for this to get the proper gnp_tx_queue behavior, etc
+ * normally we'd use kgnilnd_send_ctlmsg for this, but we don't really
+ * care that this goes out quickly since we already know we need a new conn
+ * formed */
+ if (CFS_FAIL_CHECK(CFS_FAIL_GNI_NOOP_SEND))
+ return;
+
+ tx = kgnilnd_new_tx_msg(GNILND_MSG_NOOP, ni->ni_nid);
+ if (tx != NULL) {
+ kgnilnd_launch_tx(tx, net, &id);
+ }
+out:
+ CDEBUG(D_NETTRACE, "peer 0x%p->%s when %lu\n", peer,
+ libcfs_nid2str(nid), *when);
+ EXIT;
+}
+
+int
+kgnilnd_dev_init(kgn_device_t *dev)
+{
+ gni_return_t rrc;
+ int rc = 0;
+ unsigned int cq_size;
+ ENTRY;
+
+	/* size of these CQs should be able to accommodate the outgoing
+	 * RDMA and SMSG transactions. Since we don't yet know what we
+	 * really need here, we'll take credits * 2 * 3 to allow a bunch.
+	 * We need to dig into this more with the performance work. */
+ cq_size = *kgnilnd_tunables.kgn_credits * 2 * 3;
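+	/* e.g. a (hypothetical) kgn_credits value of 256 would give a
+	 * 256 * 2 * 3 = 1536 entry CQ */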
+
+ rrc = kgnilnd_cdm_create(dev->gnd_id, *kgnilnd_tunables.kgn_ptag,
+ GNILND_COOKIE, 0,
+ &dev->gnd_domain);
+ if (rrc != GNI_RC_SUCCESS) {
+ CERROR("Can't create CDM %d (%d)\n", dev->gnd_id, rrc);
+ rc = -ENODEV;
+ GOTO(failed, rc);
+ }
+
+ rrc = kgnilnd_cdm_attach(dev->gnd_domain, dev->gnd_id,
+ &dev->gnd_host_id, &dev->gnd_handle);
+ if (rrc != GNI_RC_SUCCESS) {
+ CERROR("Can't attach CDM to device %d (%d)\n",
+ dev->gnd_id, rrc);
+ rc = -ENODEV;
+ GOTO(failed, rc);
+ }
+
+ rc = kgnilnd_setup_nic_translation(dev->gnd_host_id);
+ if (rc != 0) {
+ rc = -ENODEV;
+ GOTO(failed, rc);
+ }
+
+	/* only dev 0 gets the errors - no need to reset the stack twice.
+	 * This works because we have a single PTAG; if we had more,
+	 * we'd need multiple handlers. */
+ if (dev->gnd_id == 0) {
+ rrc = kgnilnd_subscribe_errors(dev->gnd_handle, GNI_ERRMASK_CRITICAL,
+ 0, NULL, kgnilnd_critical_error,
+ &dev->gnd_err_handle);
+ if (rrc != GNI_RC_SUCCESS) {
+ CERROR("Can't subscribe for errors on device %d: rc %d\n",
+ dev->gnd_id, rrc);
+ rc = -ENODEV;
+ GOTO(failed, rc);
+ }
+
+		rrc = kgnilnd_set_quiesce_callback(dev->gnd_handle,
+						   kgnilnd_quiesce_end_callback);
+		if (rrc != GNI_RC_SUCCESS) {
+			CERROR("Can't subscribe for quiesce callback on device %d: rc %d\n",
+				dev->gnd_id, rrc);
+ rc = -ENODEV;
+ GOTO(failed, rc);
+ }
+ }
+
+ rc = kgnilnd_nicaddr_to_nid(dev->gnd_host_id, &dev->gnd_nid);
+ if (rc < 0) {
+ /* log messages during startup */
+ if (kgnilnd_data.kgn_init < GNILND_INIT_ALL) {
+ CERROR("couldn't translate host_id 0x%x to nid. rc %d\n",
+ dev->gnd_host_id, rc);
+ }
+ rc = -ESRCH;
+ GOTO(failed, rc);
+ }
+ CDEBUG(D_NET, "NIC %x -> NID %d\n", dev->gnd_host_id, dev->gnd_nid);
+
+ rrc = kgnilnd_cq_create(dev->gnd_handle, cq_size,
+ 0, kgnilnd_device_callback,
+ dev->gnd_id, &dev->gnd_snd_rdma_cqh);
+ if (rrc != GNI_RC_SUCCESS) {
+ CERROR("Can't create rdma send cq size %u for device "
+ "%d (%d)\n", cq_size, dev->gnd_id, rrc);
+ rc = -EINVAL;
+ GOTO(failed, rc);
+ }
+
+ rrc = kgnilnd_cq_create(dev->gnd_handle, cq_size,
+ 0, kgnilnd_device_callback, dev->gnd_id,
+ &dev->gnd_snd_fma_cqh);
+ if (rrc != GNI_RC_SUCCESS) {
+ CERROR("Can't create fma send cq size %u for device %d (%d)\n",
+ cq_size, dev->gnd_id, rrc);
+ rc = -EINVAL;
+ GOTO(failed, rc);
+ }
+
+ /* This one we size differently - overflows are possible and it needs to be
+ * sized based on machine size */
+ rrc = kgnilnd_cq_create(dev->gnd_handle,
+ *kgnilnd_tunables.kgn_fma_cq_size,
+ 0, kgnilnd_device_callback, dev->gnd_id,
+ &dev->gnd_rcv_fma_cqh);
+ if (rrc != GNI_RC_SUCCESS) {
+ CERROR("Can't create fma cq size %d for device %d (%d)\n",
+ *kgnilnd_tunables.kgn_fma_cq_size, dev->gnd_id, rrc);
+ rc = -EINVAL;
+ GOTO(failed, rc);
+ }
+
+ RETURN(0);
+
+failed:
+ kgnilnd_dev_fini(dev);
+ RETURN(rc);
+}
+
+void
+kgnilnd_dev_fini(kgn_device_t *dev)
+{
+ gni_return_t rrc;
+ ENTRY;
+
+	/* At quiesce or reset time, need to loop through and clear gnd_ready_conns? */
+ LASSERTF(list_empty(&dev->gnd_ready_conns) &&
+ list_empty(&dev->gnd_map_tx) &&
+ list_empty(&dev->gnd_rdmaq),
+ "dev 0x%p ready_conns %d@0x%p map_tx %d@0x%p rdmaq %d@0x%p\n",
+ dev, kgnilnd_count_list(&dev->gnd_ready_conns), &dev->gnd_ready_conns,
+ kgnilnd_count_list(&dev->gnd_map_tx), &dev->gnd_map_tx,
+ kgnilnd_count_list(&dev->gnd_rdmaq), &dev->gnd_rdmaq);
+
+ /* These should follow from tearing down all connections */
+ LASSERTF(dev->gnd_map_nphys == 0 && dev->gnd_map_physnop == 0,
+ "%d physical mappings of %d pages still mapped\n",
+ dev->gnd_map_nphys, dev->gnd_map_physnop);
+
+ LASSERTF(dev->gnd_map_nvirt == 0 && dev->gnd_map_virtnob == 0,
+ "%d virtual mappings of "LPU64" bytes still mapped\n",
+ dev->gnd_map_nvirt, dev->gnd_map_virtnob);
+
+ LASSERTF(atomic_read(&dev->gnd_n_mdd) == 0 &&
+ atomic_read(&dev->gnd_n_mdd_held) == 0 &&
+ atomic64_read(&dev->gnd_nbytes_map) == 0,
+ "%d SMSG mappings of %ld bytes still mapped or held %d\n",
+ atomic_read(&dev->gnd_n_mdd),
+ atomic64_read(&dev->gnd_nbytes_map), atomic_read(&dev->gnd_n_mdd_held));
+
+ LASSERT(list_empty(&dev->gnd_map_list));
+
+ /* What other assertions needed to ensure all connections torn down ? */
+
+ /* check all counters == 0 (EP, MDD, etc) */
+
+	/* if we are resetting due to quiesce (stack reset), don't check
+	 * thread states */
+ LASSERTF(kgnilnd_data.kgn_quiesce_trigger ||
+ atomic_read(&kgnilnd_data.kgn_nthreads) == 0,
+ "tried to shutdown with threads active\n");
+
+ if (dev->gnd_rcv_fma_cqh) {
+ rrc = kgnilnd_cq_destroy(dev->gnd_rcv_fma_cqh);
+ LASSERTF(rrc == GNI_RC_SUCCESS,
+ "bad rc from gni_cq_destroy on rcv_fma_cqh: %d\n", rrc);
+ dev->gnd_rcv_fma_cqh = NULL;
+ }
+
+ if (dev->gnd_snd_rdma_cqh) {
+ rrc = kgnilnd_cq_destroy(dev->gnd_snd_rdma_cqh);
+ LASSERTF(rrc == GNI_RC_SUCCESS,
+ "bad rc from gni_cq_destroy on send_rdma_cqh: %d\n", rrc);
+ dev->gnd_snd_rdma_cqh = NULL;
+ }
+
+ if (dev->gnd_snd_fma_cqh) {
+ rrc = kgnilnd_cq_destroy(dev->gnd_snd_fma_cqh);
+ LASSERTF(rrc == GNI_RC_SUCCESS,
+ "bad rc from gni_cq_destroy on snd_fma_cqh: %d\n", rrc);
+ dev->gnd_snd_fma_cqh = NULL;
+ }
+
+ if (dev->gnd_err_handle) {
+ rrc = kgnilnd_release_errors(dev->gnd_err_handle);
+ LASSERTF(rrc == GNI_RC_SUCCESS,
+ "bad rc from gni_release_errors: %d\n", rrc);
+ dev->gnd_err_handle = NULL;
+ }
+
+ if (dev->gnd_domain) {
+ rrc = kgnilnd_cdm_destroy(dev->gnd_domain);
+ LASSERTF(rrc == GNI_RC_SUCCESS,
+ "bad rc from gni_cdm_destroy: %d\n", rrc);
+ dev->gnd_domain = NULL;
+ }
+
+ EXIT;
+}
+
+int
+kgnilnd_base_startup(void)
+{
+ struct timeval tv;
+ int pkmem = atomic_read(&libcfs_kmemory);
+ int rc;
+ int i;
+ kgn_device_t *dev;
+ struct task_struct *thrd;
+ ENTRY;
+
+ LASSERTF(kgnilnd_data.kgn_init == GNILND_INIT_NOTHING,
+ "init %d\n", kgnilnd_data.kgn_init);
+
+ /* zero pointers, flags etc */
+ memset(&kgnilnd_data, 0, sizeof(kgnilnd_data));
+ memset(&kgnilnd_hssops, 0, sizeof(kgnilnd_hssops));
+
+ /* CAVEAT EMPTOR: Every 'Fma' message includes the sender's NID and
+ * a unique (for all time) connstamp so we can uniquely identify
+ * the sender. The connstamp is an incrementing counter
+ * initialised with seconds + microseconds at startup time. So we
+ * rely on NOT creating connections more frequently on average than
+ * 1MHz to ensure we don't use old connstamps when we reboot. */
+ do_gettimeofday(&tv);
+ kgnilnd_data.kgn_connstamp =
+ kgnilnd_data.kgn_peerstamp =
+ (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
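+	/* e.g. booting at tv_sec = 1000000000 starts the stamps at
+	 * 1000000000000000 + tv_usec, comfortably ahead of anything an
+	 * earlier boot could have reached at under 1MHz conn creation */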
+
+ init_rwsem(&kgnilnd_data.kgn_net_rw_sem);
+
+ for (i = 0; i < GNILND_MAXDEVS; i++) {
+		kgn_device_t *dev = &kgnilnd_data.kgn_devices[i];
+		int j; /* separate index - reusing i would clobber the device loop */
+
+ dev->gnd_id = i;
+ INIT_LIST_HEAD(&dev->gnd_ready_conns);
+ INIT_LIST_HEAD(&dev->gnd_map_tx);
+ INIT_LIST_HEAD(&dev->gnd_fma_buffs);
+ mutex_init(&dev->gnd_cq_mutex);
+ sema_init(&dev->gnd_fmablk_sem, 1);
+ spin_lock_init(&dev->gnd_fmablk_lock);
+ init_waitqueue_head(&dev->gnd_waitq);
+ init_waitqueue_head(&dev->gnd_dgram_waitq);
+ init_waitqueue_head(&dev->gnd_dgping_waitq);
+ spin_lock_init(&dev->gnd_lock);
+ INIT_LIST_HEAD(&dev->gnd_map_list);
+ spin_lock_init(&dev->gnd_map_lock);
+ atomic_set(&dev->gnd_nfmablk, 0);
+ atomic_set(&dev->gnd_fmablk_vers, 1);
+ atomic_set(&dev->gnd_neps, 0);
+ atomic_set(&dev->gnd_canceled_dgrams, 0);
+ INIT_LIST_HEAD(&dev->gnd_connd_peers);
+ spin_lock_init(&dev->gnd_connd_lock);
+ spin_lock_init(&dev->gnd_dgram_lock);
+ spin_lock_init(&dev->gnd_rdmaq_lock);
+ INIT_LIST_HEAD(&dev->gnd_rdmaq);
+
+ /* alloc & setup nid based dgram table */
+ LIBCFS_ALLOC(dev->gnd_dgrams,
+ sizeof(struct list_head) * *kgnilnd_tunables.kgn_peer_hash_size);
+
+ if (dev->gnd_dgrams == NULL) {
+ rc = -ENOMEM;
+ GOTO(failed, rc);
+ }
+
+		for (j = 0; j < *kgnilnd_tunables.kgn_peer_hash_size; j++) {
+			INIT_LIST_HEAD(&dev->gnd_dgrams[j]);
+		}
+ atomic_set(&dev->gnd_ndgrams, 0);
+
+ /* setup timer for RDMAQ processing */
+ setup_timer(&dev->gnd_rdmaq_timer, kgnilnd_schedule_device_timer,
+ (unsigned long)dev);
+ }
+
+ /* CQID 0 isn't allowed, set to MAX_MSG_ID - 1 to check for conflicts early */
+ kgnilnd_data.kgn_next_cqid = GNILND_MAX_MSG_ID - 1;
+ kgnilnd_data.kgn_new_min_timeout = *kgnilnd_tunables.kgn_timeout;
+ init_waitqueue_head(&kgnilnd_data.kgn_reaper_waitq);
+ init_waitqueue_head(&kgnilnd_data.kgn_ruhroh_waitq);
+ spin_lock_init(&kgnilnd_data.kgn_reaper_lock);
+
+ sema_init(&kgnilnd_data.kgn_quiesce_sem, 1);
+ atomic_set(&kgnilnd_data.kgn_nquiesce, 0);
+ atomic_set(&kgnilnd_data.kgn_npending_conns, 0);
+ atomic_set(&kgnilnd_data.kgn_npending_unlink, 0);
+ atomic_set(&kgnilnd_data.kgn_npending_detach, 0);
+	/* OK to call kgnilnd_base_shutdown() to cleanup now */
+ kgnilnd_data.kgn_init = GNILND_INIT_DATA;
+ PORTAL_MODULE_USE;
+
+ rwlock_init(&kgnilnd_data.kgn_peer_conn_lock);
+
+ LIBCFS_ALLOC(kgnilnd_data.kgn_peers,
+ sizeof(struct list_head) * *kgnilnd_tunables.kgn_peer_hash_size);
+
+ if (kgnilnd_data.kgn_peers == NULL) {
+ rc = -ENOMEM;
+ GOTO(failed, rc);
+ }
+
+ for (i = 0; i < *kgnilnd_tunables.kgn_peer_hash_size; i++) {
+ INIT_LIST_HEAD(&kgnilnd_data.kgn_peers[i]);
+ }
+
+ LIBCFS_ALLOC(kgnilnd_data.kgn_conns,
+ sizeof(struct list_head) * *kgnilnd_tunables.kgn_peer_hash_size);
+
+ if (kgnilnd_data.kgn_conns == NULL) {
+ rc = -ENOMEM;
+ GOTO(failed, rc);
+ }
+
+ for (i = 0; i < *kgnilnd_tunables.kgn_peer_hash_size; i++) {
+ INIT_LIST_HEAD(&kgnilnd_data.kgn_conns[i]);
+ }
+
+ LIBCFS_ALLOC(kgnilnd_data.kgn_nets,
+ sizeof(struct list_head) * *kgnilnd_tunables.kgn_net_hash_size);
+
+ if (kgnilnd_data.kgn_nets == NULL) {
+ rc = -ENOMEM;
+ GOTO(failed, rc);
+ }
+
+ for (i = 0; i < *kgnilnd_tunables.kgn_net_hash_size; i++) {
+ INIT_LIST_HEAD(&kgnilnd_data.kgn_nets[i]);
+ }
+
+ kgnilnd_data.kgn_mbox_cache =
+ cfs_mem_cache_create("kgn_mbox_block",
+ KMALLOC_MAX_SIZE,
+ 0, /* offset */
+ SLAB_HWCACHE_ALIGN); /* flags */
+ if (kgnilnd_data.kgn_mbox_cache == NULL) {
+ CERROR("Can't create slab for physical mbox blocks\n");
+ rc = -ENOMEM;
+ GOTO(failed, rc);
+ }
+
+ kgnilnd_data.kgn_rx_cache =
+ cfs_mem_cache_create("kgn_rx_t",
+ sizeof(kgn_rx_t),
+ 0, /* offset */
+ 0); /* flags */
+ if (kgnilnd_data.kgn_rx_cache == NULL) {
+ CERROR("Can't create slab for kgn_rx_t descriptors\n");
+ rc = -ENOMEM;
+ GOTO(failed, rc);
+ }
+
+ kgnilnd_data.kgn_tx_cache =
+ cfs_mem_cache_create("kgn_tx_t",
+ sizeof(kgn_tx_t),
+ 0, /* offset */
+ 0); /* flags */
+ if (kgnilnd_data.kgn_tx_cache == NULL) {
+ CERROR("Can't create slab for kgn_tx_t\n");
+ rc = -ENOMEM;
+ GOTO(failed, rc);
+ }
+
+ kgnilnd_data.kgn_tx_phys_cache =
+ cfs_mem_cache_create("kgn_tx_phys",
+ LNET_MAX_IOV * sizeof(gni_mem_segment_t),
+ 0, /* offset */
+ 0); /* flags */
+ if (kgnilnd_data.kgn_tx_phys_cache == NULL) {
+ CERROR("Can't create slab for kgn_tx_phys\n");
+ rc = -ENOMEM;
+ GOTO(failed, rc);
+ }
+
+ kgnilnd_data.kgn_dgram_cache =
+ cfs_mem_cache_create("kgn_dgram_t",
+ sizeof(kgn_dgram_t),
+ 0, /* offset */
+ 0); /* flags */
+ if (kgnilnd_data.kgn_dgram_cache == NULL) {
+ CERROR("Can't create slab for outgoing datagrams\n");
+ rc = -ENOMEM;
+ GOTO(failed, rc);
+ }
+
+ /* allocate a MAX_IOV array of page pointers for each cpu */
+ kgnilnd_data.kgn_cksum_map_pages = kmalloc(num_possible_cpus() * sizeof (struct page *),
+ GFP_KERNEL);
+ if (kgnilnd_data.kgn_cksum_map_pages == NULL) {
+ CERROR("Can't allocate vmap cksum pages\n");
+ rc = -ENOMEM;
+ GOTO(failed, rc);
+ }
+ kgnilnd_data.kgn_cksum_npages = num_possible_cpus();
+ memset(kgnilnd_data.kgn_cksum_map_pages, 0,
+ kgnilnd_data.kgn_cksum_npages * sizeof (struct page *));
+
+ for (i = 0; i < kgnilnd_data.kgn_cksum_npages; i++) {
+ kgnilnd_data.kgn_cksum_map_pages[i] = kmalloc(LNET_MAX_IOV * sizeof (struct page *),
+ GFP_KERNEL);
+ if (kgnilnd_data.kgn_cksum_map_pages[i] == NULL) {
+ CERROR("Can't allocate vmap cksum pages for cpu %d\n", i);
+ rc = -ENOMEM;
+ GOTO(failed, rc);
+ }
+ }
+
+ LASSERT(kgnilnd_data.kgn_ndevs == 0);
+
+ /* Use all available GNI devices */
+ for (i = 0; i < GNILND_MAXDEVS; i++) {
+ dev = &kgnilnd_data.kgn_devices[kgnilnd_data.kgn_ndevs];
+
+ rc = kgnilnd_dev_init(dev);
+ if (rc == 0) {
+ /* Increment here so base_shutdown cleans it up */
+ kgnilnd_data.kgn_ndevs++;
+
+ rc = kgnilnd_allocate_phys_fmablk(dev);
+ if (rc) {
+ GOTO(failed, rc);
+ }
+ }
+ }
+
+ if (kgnilnd_data.kgn_ndevs == 0) {
+ CERROR("Can't initialise any GNI devices\n");
+ rc = -ENODEV;
+ GOTO(failed, rc);
+ }
+
+ rc = kgnilnd_thread_start(kgnilnd_reaper, NULL, "kgnilnd_rpr", 0);
+ if (rc != 0) {
+ CERROR("Can't spawn gnilnd reaper: %d\n", rc);
+ GOTO(failed, rc);
+ }
+
+ /*
+ * Start ruhroh thread. We can't use kgnilnd_thread_start() because
+ * we don't want this thread included in kgnilnd_data.kgn_nthreads
+ * count. This thread controls quiesce, so it mustn't
+ * quiesce itself.
+ */
+ thrd = kthread_run(kgnilnd_ruhroh_thread, NULL, "%s_%02d", "kgnilnd_rr", 0);
+ if (IS_ERR(thrd)) {
+ rc = PTR_ERR(thrd);
+ CERROR("Can't spawn gnilnd ruhroh thread: %d\n", rc);
+ GOTO(failed, rc);
+ }
+
+ /* threads will load balance across devs as they are available */
+ for (i = 0; i < *kgnilnd_tunables.kgn_sched_threads; i++) {
+ rc = kgnilnd_thread_start(kgnilnd_scheduler, (void *)((long)i),
+ "kgnilnd_sd", i);
+ if (rc != 0) {
+ CERROR("Can't spawn gnilnd scheduler[%d]: %d\n",
+ i, rc);
+ GOTO(failed, rc);
+ }
+ }
+
+ for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) {
+ dev = &kgnilnd_data.kgn_devices[i];
+ rc = kgnilnd_thread_start(kgnilnd_dgram_mover, dev,
+ "kgnilnd_dg", dev->gnd_id);
+ if (rc != 0) {
+ CERROR("Can't spawn gnilnd dgram_mover[%d]: %d\n",
+ dev->gnd_id, rc);
+ GOTO(failed, rc);
+ }
+
+ rc = kgnilnd_thread_start(kgnilnd_dgram_waitq, dev,
+ "kgnilnd_dgn", dev->gnd_id);
+ if (rc != 0) {
+ CERROR("Can't spawn gnilnd dgram_waitq[%d]: %d\n",
+ dev->gnd_id, rc);
+ GOTO(failed, rc);
+ }
+
+ rc = kgnilnd_setup_wildcard_dgram(dev);
+
+ if (rc != 0) {
+ CERROR("Can't create wildcard dgrams[%d]: %d\n",
+ dev->gnd_id, rc);
+ GOTO(failed, rc);
+ }
+ }
+
+
+ /* flag everything initialised */
+ kgnilnd_data.kgn_init = GNILND_INIT_ALL;
+ /*****************************************************/
+
+ CDEBUG(D_MALLOC, "initial kmem %d\n", pkmem);
+ RETURN(0);
+
+failed:
+ kgnilnd_base_shutdown();
+ kgnilnd_data.kgn_init = GNILND_INIT_NOTHING;
+ RETURN(rc);
+}
+
+void
+kgnilnd_base_shutdown(void)
+{
+ int i;
+ ENTRY;
+
+ while (CFS_FAIL_TIMEOUT(CFS_FAIL_GNI_PAUSE_SHUTDOWN, 1)) {};
+
+ kgnilnd_data.kgn_wc_kill = 1;
+
+ for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) {
+ kgn_device_t *dev = &kgnilnd_data.kgn_devices[i];
+ kgnilnd_cancel_wc_dgrams(dev);
+ kgnilnd_del_conn_or_peer(NULL, LNET_NID_ANY, GNILND_DEL_PEER, -ESHUTDOWN);
+ kgnilnd_wait_for_canceled_dgrams(dev);
+ }
+
+ /* Peer state all cleaned up BEFORE setting shutdown, so threads don't
+ * have to worry about shutdown races. NB connections may be created
+ * while there are still active connds, but these will be temporary
+ * since peer creation always fails after the listener has started to
+ * shut down.
+ * all peers should have been cleared out on the nets */
+ LASSERTF(atomic_read(&kgnilnd_data.kgn_npeers) == 0,
+ "peers left %d\n", atomic_read(&kgnilnd_data.kgn_npeers));
+
+ /* Wait for the ruhroh thread to shut down. */
+ kgnilnd_data.kgn_ruhroh_shutdown = 1;
+ wake_up(&kgnilnd_data.kgn_ruhroh_waitq);
+ i = 2;
+ while (kgnilnd_data.kgn_ruhroh_running != 0) {
+ i++;
+ CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET,
+ "Waiting for ruhroh thread to terminate\n");
+ cfs_pause(cfs_time_seconds(1));
+ }
+
+ /* Flag threads to terminate */
+ kgnilnd_data.kgn_shutdown = 1;
+
+ for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) {
+ kgn_device_t *dev = &kgnilnd_data.kgn_devices[i];
+
+ /* should clear all the MDDs */
+ kgnilnd_unmap_phys_fmablk(dev);
+
+ kgnilnd_schedule_device(dev);
+ wake_up_all(&dev->gnd_dgram_waitq);
+ wake_up_all(&dev->gnd_dgping_waitq);
+ LASSERT(list_empty(&dev->gnd_connd_peers));
+ }
+
+ spin_lock(&kgnilnd_data.kgn_reaper_lock);
+ wake_up_all(&kgnilnd_data.kgn_reaper_waitq);
+ spin_unlock(&kgnilnd_data.kgn_reaper_lock);
+
+ /* Wait for threads to exit */
+ i = 2;
+ while (atomic_read(&kgnilnd_data.kgn_nthreads) != 0) {
+ i++;
+ CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
+ "Waiting for %d threads to terminate\n",
+ atomic_read(&kgnilnd_data.kgn_nthreads));
+ cfs_pause(cfs_time_seconds(1));
+ }
+
+ LASSERTF(atomic_read(&kgnilnd_data.kgn_npeers) == 0,
+ "peers left %d\n", atomic_read(&kgnilnd_data.kgn_npeers));
+
+ if (kgnilnd_data.kgn_peers != NULL) {
+ for (i = 0; i < *kgnilnd_tunables.kgn_peer_hash_size; i++)
+ LASSERT(list_empty(&kgnilnd_data.kgn_peers[i]));
+
+ LIBCFS_FREE(kgnilnd_data.kgn_peers,
+ sizeof (struct list_head) *
+ *kgnilnd_tunables.kgn_peer_hash_size);
+ }
+
+ down_write(&kgnilnd_data.kgn_net_rw_sem);
+ if (kgnilnd_data.kgn_nets != NULL) {
+ for (i = 0; i < *kgnilnd_tunables.kgn_net_hash_size; i++)
+ LASSERT(list_empty(&kgnilnd_data.kgn_nets[i]));
+
+ LIBCFS_FREE(kgnilnd_data.kgn_nets,
+ sizeof (struct list_head) *
+ *kgnilnd_tunables.kgn_net_hash_size);
+ }
+ up_write(&kgnilnd_data.kgn_net_rw_sem);
+
+ LASSERTF(atomic_read(&kgnilnd_data.kgn_nconns) == 0,
+ "conns left %d\n", atomic_read(&kgnilnd_data.kgn_nconns));
+
+ if (kgnilnd_data.kgn_conns != NULL) {
+ for (i = 0; i < *kgnilnd_tunables.kgn_peer_hash_size; i++)
+ LASSERT(list_empty(&kgnilnd_data.kgn_conns[i]));
+
+ LIBCFS_FREE(kgnilnd_data.kgn_conns,
+ sizeof (struct list_head) *
+ *kgnilnd_tunables.kgn_peer_hash_size);
+ }
+
+ for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) {
+		kgn_device_t *dev = &kgnilnd_data.kgn_devices[i];
+		int j; /* don't reuse i - it indexes the device loop */
+ kgnilnd_dev_fini(dev);
+
+ LASSERTF(atomic_read(&dev->gnd_ndgrams) == 0,
+ "dgrams left %d\n", atomic_read(&dev->gnd_ndgrams));
+
+ if (dev->gnd_dgrams != NULL) {
+			for (j = 0; j < *kgnilnd_tunables.kgn_peer_hash_size; j++)
+				LASSERT(list_empty(&dev->gnd_dgrams[j]));
+
+ LIBCFS_FREE(dev->gnd_dgrams,
+ sizeof (struct list_head) *
+ *kgnilnd_tunables.kgn_peer_hash_size);
+ }
+
+ kgnilnd_free_phys_fmablk(dev);
+ }
+
+ if (kgnilnd_data.kgn_mbox_cache != NULL) {
+ i = cfs_mem_cache_destroy(kgnilnd_data.kgn_mbox_cache);
+ LASSERTF(i == 0, "rc %d destroying kgn_mbox_cache\n", i);
+ }
+
+ if (kgnilnd_data.kgn_rx_cache != NULL) {
+ i = cfs_mem_cache_destroy(kgnilnd_data.kgn_rx_cache);
+ LASSERTF(i == 0, "rc %d destroying kgn_rx_cache\n", i);
+ }
+
+ if (kgnilnd_data.kgn_tx_cache != NULL) {
+ i = cfs_mem_cache_destroy(kgnilnd_data.kgn_tx_cache);
+ LASSERTF(i == 0, "rc %d destroying kgn_tx_cache\n", i);
+ }
+
+ if (kgnilnd_data.kgn_tx_phys_cache != NULL) {
+ i = cfs_mem_cache_destroy(kgnilnd_data.kgn_tx_phys_cache);
+ LASSERTF(i == 0, "rc %d destroying kgn_tx_phys_cache\n", i);
+ }
+
+ if (kgnilnd_data.kgn_dgram_cache != NULL) {
+ i = cfs_mem_cache_destroy(kgnilnd_data.kgn_dgram_cache);
+ LASSERTF(i == 0, "rc %d destroying kgn_dgram_cache\n", i);
+ }
+
+ if (kgnilnd_data.kgn_cksum_map_pages != NULL) {
+ for (i = 0; i < kgnilnd_data.kgn_cksum_npages; i++) {
+ if (kgnilnd_data.kgn_cksum_map_pages[i] != NULL) {
+ kfree(kgnilnd_data.kgn_cksum_map_pages[i]);
+ }
+ }
+ kfree(kgnilnd_data.kgn_cksum_map_pages);
+ }
+
+ CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n",
+ atomic_read(&libcfs_kmemory));
+
+ kgnilnd_data.kgn_init = GNILND_INIT_NOTHING;
+ PORTAL_MODULE_UNUSE;
+
+ EXIT;
+}
+
+int
+kgnilnd_startup(lnet_ni_t *ni)
+{
+ int rc, devno;
+ kgn_net_t *net;
+ ENTRY;
+
+ LASSERTF(ni->ni_lnd == &the_kgnilnd,
+ "bad LND 0x%p != the_kgnilnd @ 0x%p\n",
+ ni->ni_lnd, &the_kgnilnd);
+
+ if (kgnilnd_data.kgn_init == GNILND_INIT_NOTHING) {
+ rc = kgnilnd_base_startup();
+ if (rc != 0)
+ RETURN(rc);
+ }
+
+ /* Serialize with shutdown. */
+ down(&kgnilnd_data.kgn_quiesce_sem);
+
+ LIBCFS_ALLOC(net, sizeof(*net));
+ if (net == NULL) {
+ CERROR("could not allocate net for new interface instance\n");
+ rc = -ENOMEM;
+ /* no need to cleanup the CDM... */
+ GOTO(failed, rc);
+ }
+ INIT_LIST_HEAD(&net->gnn_list);
+ ni->ni_data = net;
+ net->gnn_ni = ni;
+ ni->ni_maxtxcredits = *kgnilnd_tunables.kgn_credits;
+ ni->ni_peertxcredits = *kgnilnd_tunables.kgn_peer_credits;
+
+ if (*kgnilnd_tunables.kgn_peer_health) {
+ int fudge;
+
+ /* give this a bit of leeway - we don't have a hard timeout
+ * as we only check timeouts periodically - see comment in kgnilnd_reaper */
+ fudge = (GNILND_TO2KA(*kgnilnd_tunables.kgn_timeout) / GNILND_REAPER_NCHECKS);
+
+ ni->ni_peertimeout = *kgnilnd_tunables.kgn_timeout + fudge;
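+		/* worked example, assuming kgn_timeout is the GNILND_BASE_TIMEOUT
+		 * default of 60s: fudge = GNILND_TO2KA(60) / GNILND_REAPER_NCHECKS
+		 * = ((60 - 1) / 2) / 4 = 7, so ni_peertimeout = 67 seconds */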
+
+ LCONSOLE_INFO("Enabling LNet peer health for gnilnd, timeout %ds\n",
+ ni->ni_peertimeout);
+ }
+
+ atomic_set(&net->gnn_refcount, 1);
+
+ /* if we have multiple devices, spread the nets around */
+ net->gnn_netnum = LNET_NETNUM(LNET_NIDNET(ni->ni_nid));
+
+ devno = LNET_NIDNET(ni->ni_nid) % GNILND_MAXDEVS;
+ net->gnn_dev = &kgnilnd_data.kgn_devices[devno];
+
+	/* allocate a 'dummy' cdm for datagram use. We can only have a single
+	 * datagram between a nid:inst_id and nid2:inst_id. The fake cdm
+	 * gives us an additional inst_id to use, allowing the datagrams to flow
+	 * like rivers of honey and beer */
+
+ /* the instance id for the cdm is the NETNUM offset by MAXDEVS -
+ * ensuring we'll have a unique id */
+
+ ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), net->gnn_dev->gnd_nid);
+ CDEBUG(D_NET, "adding net %p nid=%s on dev %d \n",
+ net, libcfs_nid2str(ni->ni_nid), net->gnn_dev->gnd_id);
+ /* until the gnn_list is set, we need to cleanup ourselves as
+ * kgnilnd_shutdown is just gonna get confused */
+
+ down_write(&kgnilnd_data.kgn_net_rw_sem);
+ list_add_tail(&net->gnn_list, kgnilnd_netnum2netlist(net->gnn_netnum));
+ up_write(&kgnilnd_data.kgn_net_rw_sem);
+
+ /* we need a separate thread to call probe_wait_by_id until
+ * we get a function callback notifier from kgni */
+ up(&kgnilnd_data.kgn_quiesce_sem);
+ RETURN(0);
+failed:
+ up(&kgnilnd_data.kgn_quiesce_sem);
+ kgnilnd_shutdown(ni);
+ RETURN(rc);
+}
+
+void
+kgnilnd_shutdown(lnet_ni_t *ni)
+{
+ kgn_net_t *net = ni->ni_data;
+ int i;
+ int rc;
+ ENTRY;
+
+ CFS_RACE(CFS_FAIL_GNI_SR_DOWN_RACE);
+
+ LASSERTF(kgnilnd_data.kgn_init == GNILND_INIT_ALL,
+ "init %d\n", kgnilnd_data.kgn_init);
+
+ /* Serialize with startup. */
+ down(&kgnilnd_data.kgn_quiesce_sem);
+ CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n",
+ atomic_read(&libcfs_kmemory));
+
+ if (net == NULL) {
+ CERROR("got NULL net for ni %p\n", ni);
+ rc = -EINVAL;
+ GOTO(out, rc);
+ }
+
+ LASSERTF(ni == net->gnn_ni,
+ "ni %p gnn_ni %p\n", net, net->gnn_ni);
+
+ ni->ni_data = NULL;
+
+ LASSERT(!net->gnn_shutdown);
+ LASSERTF(atomic_read(&net->gnn_refcount) != 0,
+ "net %p refcount %d\n",
+ net, atomic_read(&net->gnn_refcount));
+
+ if (!list_empty(&net->gnn_list)) {
+ /* serialize with peer creation */
+ down_write(&kgnilnd_data.kgn_net_rw_sem);
+ net->gnn_shutdown = 1;
+ up_write(&kgnilnd_data.kgn_net_rw_sem);
+
+ kgnilnd_cancel_net_dgrams(net);
+
+ kgnilnd_del_conn_or_peer(net, LNET_NID_ANY, GNILND_DEL_PEER, -ESHUTDOWN);
+
+ /* if we are quiesced, need to wake up - we need those threads
+ * alive to release peers, etc */
+ if (GNILND_IS_QUIESCED) {
+ set_mb(kgnilnd_data.kgn_quiesce_trigger, GNILND_QUIESCE_IDLE);
+ kgnilnd_quiesce_wait("shutdown");
+ }
+
+ kgnilnd_wait_for_canceled_dgrams(net->gnn_dev);
+
+		/* We wait until the net's refs are 1; then we release the final
+		 * ref, which is ours. This makes sure everything else is done
+		 * before we free the net.
+		 */
+ i = 4;
+ while (atomic_read(&net->gnn_refcount) != 1) {
+ i++;
+ CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET,
+ "Waiting for %d references to clear on net %d\n",
+ atomic_read(&net->gnn_refcount),
+ net->gnn_netnum);
+ cfs_pause(cfs_time_seconds(1));
+ }
+
+ /* release ref from kgnilnd_startup */
+ kgnilnd_net_decref(net);
+ /* serialize with reaper and conn_task looping */
+ down_write(&kgnilnd_data.kgn_net_rw_sem);
+ list_del_init(&net->gnn_list);
+ up_write(&kgnilnd_data.kgn_net_rw_sem);
+
+ }
+
+ /* not locking, this can't race with writers */
+ LASSERTF(atomic_read(&net->gnn_refcount) == 0,
+ "net %p refcount %d\n",
+ net, atomic_read(&net->gnn_refcount));
+ LIBCFS_FREE(net, sizeof(*net));
+
+out:
+ down_read(&kgnilnd_data.kgn_net_rw_sem);
+ for (i = 0; i < *kgnilnd_tunables.kgn_net_hash_size; i++) {
+ if (!list_empty(&kgnilnd_data.kgn_nets[i])) {
+ up_read(&kgnilnd_data.kgn_net_rw_sem);
+ break;
+ }
+
+ if (i == *kgnilnd_tunables.kgn_net_hash_size - 1) {
+ up_read(&kgnilnd_data.kgn_net_rw_sem);
+ kgnilnd_base_shutdown();
+ }
+ }
+ CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n",
+ atomic_read(&libcfs_kmemory));
+
+ up(&kgnilnd_data.kgn_quiesce_sem);
+ EXIT;
+ return;
+}
+
+void __exit
+kgnilnd_module_fini(void)
+{
+ lnet_unregister_lnd(&the_kgnilnd);
+ kgnilnd_proc_fini();
+ kgnilnd_remove_sysctl();
+ kgnilnd_tunables_fini();
+}
+
+int __init
+kgnilnd_module_init(void)
+{
+ int rc;
+
+ rc = kgnilnd_tunables_init();
+ if (rc != 0)
+ return rc;
+
+ printk(KERN_INFO "Lustre: kgnilnd build version: "KGNILND_BUILD_REV"\n");
+
+ kgnilnd_insert_sysctl();
+ kgnilnd_proc_init();
+
+ lnet_register_lnd(&the_kgnilnd);
+
+ return 0;
+}
+
+MODULE_AUTHOR("Cray, Inc. <nic@cray.com>");
+MODULE_DESCRIPTION("Kernel Gemini LND v"KGNILND_BUILD_REV);
+MODULE_LICENSE("GPL");
+
+module_init(kgnilnd_module_init);
+module_exit(kgnilnd_module_fini);
--- /dev/null
+/*
+ * Copyright (C) 2004 Cluster File Systems, Inc.
+ *
+ * Copyright (C) 2009-2012 Cray, Inc.
+ *
+ * Derived from work by: Eric Barton <eric@bartonsoftware.com>
+ * Author: Nic Henke <nic@cray.com>
+ * Author: James Shimek <jshimek@cray.com>
+ *
+ * This file is part of Lustre, http://www.lustre.org.
+ *
+ * Lustre is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Lustre is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Lustre; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+#ifndef _GNILND_GNILND_H_
+#define _GNILND_GNILND_H_
+
+#define DEBUG_SUBSYSTEM S_LND
+
+#include <libcfs/libcfs.h>
+#include <lnet/lnet.h>
+#include <lnet/lib-lnet.h>
+#include <lnet/lnet-sysctl.h>
+
+#include <gni_pub.h>
+#include "gnilnd_version.h"
+#include "gnilnd_hss_ops.h"
+
+/* tunables determined at compile time */
+#define GNILND_MIN_TIMEOUT 5 /* minimum timeout interval (seconds) */
+#define GNILND_BASE_TIMEOUT 60 /* default sane timeout */
+#define GNILND_TO2KA(t) (((t)-1)/2) /* timeout -> keepalive interval */
+#define GNILND_MIN_RECONNECT_TO (GNILND_BASE_TIMEOUT/4)
+#define GNILND_MAX_RECONNECT_TO GNILND_BASE_TIMEOUT
+#define GNILND_HARDWARE_TIMEOUT 15 /* maximum time for data to travel between nodes */
+#define GNILND_MDD_TIMEOUT 15 /* MDD hold timeout in minutes */
+
+/* reaper thread wakeup interval */
+#define GNILND_REAPER_THREAD_WAKE 1
+/* reaper thread checks each conn NCHECKS times every kgnilnd_data.kgn_new_min_timeout */
+#define GNILND_REAPER_NCHECKS 4
+
+/* fixed constants */
+#define GNILND_MAXDEVS 1 /* max # of GNI devices currently supported */
+#define GNILND_MBOX_CREDITS 256 /* number of credits per mailbox */
+#define GNILND_COOKIE 0xa3579 /* cookie used along with the ptag by GNI */
+
+/* checksum values */
+#define GNILND_CHECKSUM_OFF 0 /* checksum turned off */
+#define GNILND_CHECKSUM_SMSG_HEADER 1 /* Only checksum SMSG header */
+#define GNILND_CHECKSUM_SMSG 2 /* checksum entire SMSG packet */
+#define GNILND_CHECKSUM_SMSG_BTE 3 /* Full checksum support */
+
+/* tune down some COMPUTE options as they won't see the same number of connections and
+ * don't need the throughput of multiple threads by default */
+#if defined(CONFIG_CRAY_COMPUTE)
+#define GNILND_SCHED_THREADS 1 /* default # of kgnilnd_scheduler threads */
+#define GNILND_FMABLK 64 /* default number of mboxes per fmablk */
+#else
+#define GNILND_SCHED_THREADS 3 /* default # of kgnilnd_scheduler threads */
+#define GNILND_FMABLK 1024 /* default number of mboxes per fmablk */
+#endif
+
+/* EXTRA_BITS are there to allow us to hide NOOP/CLOSE and anything else out of band */
+#define GNILND_EXTRA_BITS 1
+/* maximum number of conns & bits for cqid in the SMSG event data */
+#define GNILND_CQID_NBITS (21 - GNILND_EXTRA_BITS)
+#define GNILND_MSGID_TX_NBITS (32 - GNILND_CQID_NBITS)
+#define GNILND_MAX_CQID (1 << GNILND_CQID_NBITS)
+#define GNILND_MAX_MSG_ID (1 << GNILND_MSGID_TX_NBITS)
+#define GNILND_MAX_MSG_SIZE (*kgnilnd_tunables.kgn_max_immediate + sizeof(kgn_msg_t))
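+
+/* worked example with GNILND_EXTRA_BITS = 1: GNILND_CQID_NBITS = 20, so
+ * GNILND_MAX_CQID = 1 << 20 = 1048576 conns, leaving GNILND_MSGID_TX_NBITS = 12
+ * and GNILND_MAX_MSG_ID = 4096 in-flight TX ids per conn within the 32-bit
+ * SMSG event data */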
+
+/* need sane upper bound to limit copy overhead */
+#define GNILND_MAX_IMMEDIATE (64<<10)
+
+/* payload size to add to the base mailbox size.
+ * This subtracts 2 from concurrent_sends because 4 messages are already
+ * included in the size gni_smsg_buff_size_needed calculates; this payload
+ * is then added to the value returned from that function. */
+#define GNILND_MBOX_PAYLOAD \
+	(GNILND_MAX_MSG_SIZE * \
+	 ((*kgnilnd_tunables.kgn_concurrent_sends - 2) * 2))
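+/* e.g. a (hypothetical) kgn_concurrent_sends of 8 yields
+ * GNILND_MAX_MSG_SIZE * (8 - 2) * 2 = GNILND_MAX_MSG_SIZE * 12 of extra
+ * payload room on top of what gni_smsg_buff_size_needed returns */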
+
+/* timeout -> deadman timer for kgni mdd holds */
+#define GNILND_TIMEOUT2DEADMAN ((*kgnilnd_tunables.kgn_mdd_timeout) * 1000 * 60)
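+/* e.g. the GNILND_MDD_TIMEOUT default of 15 minutes works out to
+ * 15 * 1000 * 60 = 900000ms for the kgni mdd deadman timer */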
+
+/* timeout for failing sends; t is in jiffies */
+#define GNILND_TIMEOUTRX(t) (t + cfs_time_seconds(*kgnilnd_tunables.kgn_hardware_timeout))
+
+/* time at which to release from purgatory in the reaper thread, in jiffies */
+#define GNILND_PURG_RELEASE(t) (GNILND_TIMEOUTRX(t) * 3)
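+/* e.g. with the GNILND_HARDWARE_TIMEOUT default of 15, a send whose deadline
+ * is t jiffies is failed at t + 15s and its conn is held in purgatory until
+ * 3 * (t + 15s) */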
+
+/* Macro for finding last_rx: two datapoints are compared
+ * and the most recent one, in jiffies, is returned.
+ */
+#define GNILND_LASTRX(conn) (time_after(conn->gnc_last_rx, conn->gnc_last_rx_cq) \
+ ? conn->gnc_last_rx : conn->gnc_last_rx_cq)
+
+/************************************************************************
+ * Enum, flag and tag data
+ */
+#define GNILND_INIT_NOTHING 0
+#define GNILND_INIT_DATA 1
+#define GNILND_INIT_ALL 2
+
+/* If you change the ordering away from MAPPED = UNMAPPED + 1, things break */
+#define GNILND_BUF_NONE 0 /* buffer type not set */
+#define GNILND_BUF_IMMEDIATE 1 /* immediate data */
+#define GNILND_BUF_IMMEDIATE_KIOV 2 /* immediate data in a kiov */
+#define GNILND_BUF_PHYS_UNMAPPED 3 /* physical: not mapped yet */
+#define GNILND_BUF_PHYS_MAPPED 4 /* physical: mapped already */
+#define GNILND_BUF_VIRT_UNMAPPED 5 /* virtual: not mapped yet */
+#define GNILND_BUF_VIRT_MAPPED 6 /* virtual: mapped already */
+
+#define GNILND_TX_WAITING_REPLY (1<<1) /* expecting to receive reply */
+#define GNILND_TX_WAITING_COMPLETION (1<<2) /* waiting for smsg_send to complete */
+#define GNILND_TX_PENDING_RDMA (1<<3) /* RDMA transaction pending until we get prev. completion */
+#define GNILND_TX_QUIET_ERROR (1<<4) /* don't print error on tx_done */
+#define GNILND_TX_FAIL_SMSG (1<<5) /* pass down error injection for SMSG fail */
+
+/* stash above max CQID to avoid any collision */
+#define GNILND_MSGID_NOOP (GNILND_MAX_CQID + 128)
+#define GNILND_MSGID_CLOSE (GNILND_MSGID_NOOP + 1)
+
+/* kgn_msg_t::gnm_type */
+#define GNILND_MSG_NONE 0x00 /* illegal message */
+#define GNILND_MSG_NOOP 0x01 /* empty gnm_u (keepalive) */
+#define GNILND_MSG_IMMEDIATE 0x02 /* gnm_u.immediate */
+#define GNILND_MSG_PUT_REQ 0x03 /* gnm_u.putreq (src->sink) */
+#define GNILND_MSG_PUT_NAK 0x04 /* gnm_u.completion (no PUT match: sink->src) */
+#define GNILND_MSG_PUT_ACK 0x05 /* gnm_u.putack (PUT matched: sink->src) */
+#define GNILND_MSG_PUT_DONE 0x06 /* gnm_u.completion (src->sink) */
+#define GNILND_MSG_GET_REQ 0x07 /* gnm_u.get (sink->src) */
+#define GNILND_MSG_GET_NAK 0x08 /* gnm_u.completion (no GET match: src->sink) */
+#define GNILND_MSG_GET_DONE 0x09 /* gnm_u.completion (src->sink) */
+#define GNILND_MSG_CLOSE 0x0a /* empty gnm_u */
+
+/* defines for gnc_*scheduled states */
+#define GNILND_CONN_IDLE 0
+#define GNILND_CONN_SCHED 1
+#define GNILND_CONN_WANTS_SCHED 2
+#define GNILND_CONN_PROCESS 3
+
+#define GNILND_DEV_IDLE 0
+#define GNILND_DEV_IRQ 1
+#define GNILND_DEV_LOOP 2
+
+#define GNILND_DGRAM_IDLE 0
+#define GNILND_DGRAM_SCHED 1
+#define GNILND_DGRAM_PROCESS 2
+
+#define GNILND_PEER_IDLE 0
+#define GNILND_PEER_CONNECT 1
+#define GNILND_PEER_POSTING 2
+#define GNILND_PEER_POSTED 3
+#define GNILND_PEER_NEEDS_DEATH 4
+#define GNILND_PEER_KILL 5
+
+/* for gnc_close_recvd */
+#define GNILND_CLOSE_RX 1
+#define GNILND_CLOSE_INJECT1 2
+#define GNILND_CLOSE_INJECT2 3
+#define GNILND_CLOSE_EARLY 4
+
+/* defines for why quiesce trigger set */
+#define GNILND_QUIESCE_IDLE 0
+#define GNILND_QUIESCE_ADMIN 1
+#define GNILND_QUIESCE_RESET 2
+#define GNILND_QUIESCE_HW_QUIESCE 3
+
+#define GNILND_PEER_CLEAN 0
+#define GNILND_PEER_PERSISTING 1
+
+#define GNILND_DEL_CONN 0
+#define GNILND_DEL_PEER 1
+#define GNILND_CLEAR_PURGATORY 2
+
+typedef enum kgn_fmablk_state {
+ GNILND_FMABLK_IDLE = 0, /* is allocated or ready to be freed */
+ GNILND_FMABLK_PHYS, /* allocated out of slab of physical memory */
+ GNILND_FMABLK_VIRT, /* 'standard' vmalloc hunk */
+ GNILND_FMABLK_FREED, /* after free */
+} kgn_fmablk_state_t;
+
+typedef enum kgn_tx_list_state {
+ GNILND_TX_IDLE = 0, /* TX is on the idle list, kgn_idle_txs */
+ GNILND_TX_ALLOCD, /* TX has been alloced (off of idle), could be in any state transition */
+ GNILND_TX_PEERQ, /* TX on peer->gnp_tx_queue (no live conn) */
+ GNILND_TX_MAPQ, /* TX on dev:gnd_map_tx for buffer mapping */
+	GNILND_TX_FMAQ, /* TX waiting to be sent on conn FMA */
+ GNILND_TX_LIVE_FMAQ, /* TX live on the FMA wire, waiting for completion or reply */
+ GNILND_TX_RDMAQ, /* TX waiting to send FMA confirmation to auth RDMA PUT */
+ GNILND_TX_LIVE_RDMAQ, /* TX live on the RDMA wire, waiting for completion */
+ GNILND_TX_DYING, /* TX got caught on MAPQ or RDMAQ while conn was closing, needs someone to call tx_done */
+ GNILND_TX_FREED /* TX is free! */
+} kgn_tx_list_state_t;
+
+typedef enum kgn_conn_state {
+ /* don't start @ 0 - prevent memset(0) badness */
+ GNILND_CONN_DUMMY = 0,
+ GNILND_CONN_LISTEN,
+ GNILND_CONN_CONNECTING,
+ GNILND_CONN_ESTABLISHED,
+ GNILND_CONN_CLOSING,
+ GNILND_CONN_CLOSED,
+ GNILND_CONN_DONE,
+ GNILND_CONN_DESTROY_EP
+} kgn_conn_state_t;
+
+/* changing these requires a change to GNILND_CONNREQ_VERSION and
+ * will result in dropped packets instead of NAKs. Adding to this is
+ * acceptable without changing the CONNREQ_VERSION, but code should
+ * be ready to handle NAKs on version mismatch */
+typedef enum kgn_connreq_type {
+ GNILND_CONNREQ_REQ = 1, /* how YOU doin' ? */
+ GNILND_CONNREQ_NAK, /* NO soup for you! */
+ GNILND_CONNREQ_CLOSE, /* we should see other people */
+} kgn_connreq_type_t;
+
+typedef enum kgn_dgram_state {
+ /* don't use 0 to avoid thinking a memset of zero is valid data */
+ GNILND_DGRAM_USED = 1,
+ GNILND_DGRAM_POSTING,
+ GNILND_DGRAM_POSTED,
+ GNILND_DGRAM_PROCESSING,
+ GNILND_DGRAM_CANCELED,
+ GNILND_DGRAM_DONE,
+} kgn_dgram_state_t;
+
+typedef enum kgn_dgram_type {
+ GNILND_DGRAM_REQ = 1, /* how YOU doin' ? */
+ GNILND_DGRAM_WC_REQ, /* you talkin' to ME? */
+ GNILND_DGRAM_NAK, /* NO soup for you! */
+ GNILND_DGRAM_CLOSE, /* we should see other people */
+} kgn_dgram_type_t;
+
+/************************************************************************
+ * Wire message structs. These are sent in sender's byte order
+ * (i.e. receiver checks magic and flips if required).
+ */
+
+#define GNILND_MSG_MAGIC LNET_PROTO_GNI_MAGIC /* unique magic */
+#define GNILND_DGRAM_MAGIC 0x0DDBA11
+
+/* kgn_msg_t - FMA/SMSG wire struct
+ v2:
+ * - added checksum to FMA
+ * - moved seq before payload
+ * - WIRE_ATTR added for alignment
+ v3:
+ * - added gnm_payload_len for FMA payload size
+ v4:
+ * - added gncm_retval to completion, allowing return code transmission
+ *   on RDMA NAKs
+ v5:
+ * - changed how CQID and TX ids are assigned
+ v6:
+ * - added retval on CLOSE
+ v7:
+ * - added payload checksumming
+ v8:
+ * - reworked checksumming a bit, changed payload checksums
+*/
+#define GNILND_MSG_VERSION 8
+/* kgn_connreq_t connection request datagram wire struct
+ v2:
+ * added NAKs
+*/
+
+#define GNILND_CONNREQ_VERSION 2
+
+typedef struct kgn_gniparams {
+ __u32 gnpr_host_id; /* ph. host ID of the NIC */
+ __u32 gnpr_cqid; /* cqid I want peer to use when sending events to me */
+ gni_smsg_attr_t gnpr_smsg_attr; /* my short msg. attributes */
+} WIRE_ATTR kgn_gniparams_t;
+
+typedef struct kgn_nak_data {
+	__s32 gnnd_errno; /* errno reason for NAK */
+} WIRE_ATTR kgn_nak_data_t;
+
+/* the first bits of the connreq struct CANNOT CHANGE FORM EVER
+ * without breaking the ability for us to properly NAK someone */
+typedef struct kgn_connreq { /* connection request/response */
+	__u32 gncr_magic; /* I'm a gnilnd connreq */
+ __u32 gncr_cksum; /* checksum (0 == disabled) */
+ __u16 gncr_type; /* REQ, NAK, etc */
+ __u16 gncr_version; /* this is my version number */
+ __u32 gncr_timeout; /* sender's timeout */
+ __u64 gncr_srcnid; /* sender's NID */
+ __u64 gncr_dstnid; /* who sender expects to listen */
+ __u64 gncr_peerstamp; /* sender's instance stamp */
+ __u64 gncr_connstamp; /* sender's connection stamp */
+
+ /* everything before this needs to stay static, adding after should
+ * result in a change to GNILND_CONNREQ_VERSION */
+
+ union {
+ kgn_gniparams_t gncr_gnparams; /* sender's endpoint info */
+ kgn_nak_data_t gncr_nakdata; /* data (rc, etc) for NAK */
+ };
+} WIRE_ATTR kgn_connreq_t;
+
+typedef struct {
+ gni_mem_handle_t gnrd_key;
+ __u64 gnrd_addr;
+ __u32 gnrd_nob;
+} WIRE_ATTR kgn_rdma_desc_t;
+
+typedef struct {
+ lnet_hdr_t gnim_hdr; /* LNet header */
+ /* LNet payload is in FMA "Message Data" */
+} WIRE_ATTR kgn_immediate_msg_t;
+
+typedef struct {
+ lnet_hdr_t gnprm_hdr; /* LNet header */
+ __u64 gnprm_cookie; /* opaque completion cookie */
+} WIRE_ATTR kgn_putreq_msg_t;
+
+typedef struct {
+ __u64 gnpam_src_cookie; /* reflected completion cookie */
+ __u64 gnpam_dst_cookie; /* opaque completion cookie */
+ kgn_rdma_desc_t gnpam_desc; /* sender's sink buffer */
+} WIRE_ATTR kgn_putack_msg_t;
+
+typedef struct {
+ lnet_hdr_t gngm_hdr; /* LNet header */
+ __u64 gngm_cookie; /* opaque completion cookie */
+ kgn_rdma_desc_t gngm_desc; /* sender's sink buffer */
+} WIRE_ATTR kgn_get_msg_t;
+
+typedef struct {
+ int gncm_retval; /* error on NAK, size on REQ */
+ __u64 gncm_cookie; /* reflected completion cookie */
+} WIRE_ATTR kgn_completion_msg_t;
+
+typedef struct { /* NB must fit in FMA "Prefix" */
+	__u32 gnm_magic; /* I'm a gni message */
+ __u16 gnm_version; /* this is my version number */
+ __u16 gnm_type; /* msg type */
+ __u64 gnm_srcnid; /* sender's NID */
+ __u64 gnm_connstamp; /* sender's connection stamp */
+ __u32 gnm_seq; /* incrementing sequence number */
+	__u16 gnm_cksum; /* checksum (0 == no checksum) */
+	__u16 gnm_payload_cksum; /* payload checksum (0 == no checksum) */
+ __u32 gnm_payload_len; /* size of the FMA payload sent */
+ union {
+ kgn_immediate_msg_t immediate;
+ kgn_putreq_msg_t putreq;
+ kgn_putack_msg_t putack;
+ kgn_get_msg_t get;
+ kgn_completion_msg_t completion;
+ } gnm_u;
+} WIRE_ATTR kgn_msg_t;
+
+/************************************************************************
+ * runtime tunable data
+ */
+
+typedef struct kgn_tunables {
+ int *kgn_min_reconnect_interval; /* connreq starting timeout & retransmit interval */
+ int *kgn_max_reconnect_interval; /* ...exponentially increasing to this */
+ int *kgn_credits; /* # concurrent sends */
+ int *kgn_fma_cq_size; /* # entries in receive CQ */
+ int *kgn_peer_credits; /* # LNet peer credits */
+ int *kgn_concurrent_sends; /* max # of max_immediate in mbox */
+ int *kgn_timeout; /* comms timeout (seconds) */
+ int *kgn_max_immediate; /* immediate payload breakpoint */
+ int *kgn_checksum; /* checksum data */
+ int *kgn_checksum_dump; /* dump raw data to D_INFO log when checksumming */
+ int *kgn_bte_hash; /* hashing on BTE transfers */
+ int *kgn_bte_adapt; /* adaptive routing on BTE transfers */
+ int *kgn_bte_relaxed_ordering; /* relaxed ordering (PASSPW) on BTE transfers */
+ int *kgn_ptag; /* PTAG for cdm_create */
+ int *kgn_max_retransmits; /* max number of FMA retransmits */
+ int *kgn_nwildcard; /* # wildcard per net to post */
+ int *kgn_nice; /* nice value for kgnilnd threads */
+ int *kgn_rdmaq_intervals; /* # intervals per second for rdmaq throttle */
+ int *kgn_loops; /* # of loops sched does before flush/heartbeat tickle */
+ int *kgn_peer_hash_size; /* size of kgn_peers */
+ int *kgn_peer_health; /* enable/disable peer health */
+ int *kgn_vmap_cksum; /* enable/disable vmap of kiov checksums */
+ int *kgn_mbox_per_block; /* mailboxes per fmablk */
+ int *kgn_nphys_mbox; /* # mailboxes to preallocate with physical memory */
+ int *kgn_mbox_credits; /* max credits per fma */
+ int *kgn_sched_threads; /* number of kgnilnd_scheduler threads */
+ int *kgn_net_hash_size; /* size of kgn_net_ht */
+ int *kgn_hardware_timeout; /* max time for a message to get across the network */
+ int *kgn_mdd_timeout; /* max time for ghal to hold an mdd in minutes */
+#if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM
+ cfs_sysctl_table_header_t *kgn_sysctl; /* sysctl interface */
+#endif
+} kgn_tunables_t;
+
+typedef struct kgn_mbox_info {
+ lnet_nid_t mbx_prev_nid;
+ unsigned long mbx_create_conn_memset;
+ unsigned long mbx_add_purgatory;
+ unsigned long mbx_detach_of_purgatory;
+ unsigned long mbx_release_from_purgatory;
+ unsigned long mbx_release_purg_active_dgram;
+} kgn_mbox_info_t;
+
+typedef struct kgn_fma_memblock {
+ struct list_head gnm_bufflist; /* memblock is part of device's gnd_fma_buffs */
+ kgn_fmablk_state_t gnm_state; /* how this memory allocated & state of it */
+ int gnm_hold_timeout; /* hold_timeout if used at unmap time */
+ int gnm_num_mboxs; /* total mboxes allocated */
+ int gnm_avail_mboxs; /* number of available mailboxes in the block */
+ int gnm_held_mboxs; /* number of purgatory held mailboxes */
+ int gnm_mbox_size; /* size of the single mailbox */
+ int gnm_next_avail_mbox; /* next available mailbox */
+ long gnm_max_timeout; /* max timeout for possible purgatory hold */
+ unsigned int gnm_blk_size; /* how big is our hunk o memory ? */
+ void *gnm_block; /* pointer to mem. block */
+ gni_mem_handle_t gnm_hndl; /* mem. handle of the block */
+ unsigned long *gnm_bit_array; /* bit array tracking allocation of mailboxes */
+ kgn_mbox_info_t *gnm_mbox_info; /* array of mbox_information about each mbox */
+} kgn_fma_memblock_t;
+
+typedef struct kgn_device {
+ gni_nic_handle_t gnd_handle; /* device handle */
+ gni_cdm_handle_t gnd_domain; /* GNI communication domain */
+ gni_err_handle_t gnd_err_handle; /* device error handle */
+ unsigned long gnd_sched_alive; /* scheduler thread alive stamp */
+ gni_cq_handle_t gnd_rcv_fma_cqh; /* FMA rcv. completion queue handle */
+ gni_cq_handle_t gnd_snd_rdma_cqh; /* rdma send completion queue handle */
+	gni_cq_handle_t gnd_snd_fma_cqh; /* fma send completion queue handle */
+ struct mutex gnd_cq_mutex; /* CQ access serialization */
+ __u32 gnd_host_id; /* ph. host ID of the NIC */
+ int gnd_id; /* device id, also index in kgn_devices */
+ __u32 gnd_nid; /* ph host ID translated to NID */
+ struct list_head gnd_fma_buffs; /* list of FMA memory blocks */
+ struct semaphore gnd_fmablk_sem; /* semaphore for FMA block memory alloc/free */
+ spinlock_t gnd_fmablk_lock; /* lock for mbox alloc/release */
+ atomic_t gnd_nfmablk; /* # of fmablk live */
+ atomic_t gnd_fmablk_vers; /* gnd_fma_bufs stamp */
+ atomic_t gnd_neps; /* # EP allocated to conns */
+ short gnd_ready; /* stuff to do in scheduler thread */
+ struct list_head gnd_ready_conns; /* connections ready to tx/rx */
+ struct list_head gnd_map_tx; /* TX: needing buffer mapping */
+ wait_queue_head_t gnd_waitq; /* scheduler wakeup */
+ spinlock_t gnd_lock; /* serialise gnd_ready_conns */
+ struct list_head gnd_connd_peers; /* peers waiting for a connection */
+ spinlock_t gnd_connd_lock; /* serialise connd_peers */
+ wait_queue_head_t gnd_dgram_waitq; /* dgram_mover thread wakeup */
+ wait_queue_head_t gnd_dgping_waitq; /* dgram thread ping-pong */
+ int gnd_dgram_ready; /* dgrams need movin' */
+ struct list_head *gnd_dgrams; /* nid hash to dgrams */
+ atomic_t gnd_ndgrams; /* # dgrams extant */
+ spinlock_t gnd_dgram_lock; /* serialize gnd_dgrams */
+ struct list_head gnd_map_list; /* list of all mapped regions */
+ int gnd_map_version; /* version flag for map list */
+ atomic_t gnd_n_mdd; /* number of total MDD - fma, tx, etc */
+ atomic_t gnd_n_mdd_held; /* number of total MDD held - fma, tx, etc */
+ atomic_t gnd_nq_map; /* # queued waiting for mapping (MDD/GART) */
+ atomic64_t gnd_nbytes_map; /* bytes of total GART maps - fma, tx, etc */
+ __u32 gnd_map_nphys; /* # TX phys mappings */
+ __u32 gnd_map_physnop; /* # TX phys pages mapped */
+ __u32 gnd_map_nvirt; /* # TX virt mappings */
+ __u64 gnd_map_virtnob; /* # TX virt bytes mapped */
+ spinlock_t gnd_map_lock; /* serialize gnd_map_XXX */
+ struct list_head gnd_rdmaq; /* RDMA to be sent */
+ spinlock_t gnd_rdmaq_lock; /* play nice with others */
+ atomic64_t gnd_rdmaq_bytes_out; /* # bytes authorized */
+ atomic64_t gnd_rdmaq_bytes_ok; /* # bytes allowed until deadline */
+ atomic_t gnd_rdmaq_nstalls; /* # stalls due to throttle */
+ unsigned long gnd_rdmaq_deadline; /* when does bucket roll over ? */
+ struct timer_list gnd_rdmaq_timer; /* wakey-wakey */
+ atomic_t gnd_short_ntx; /* TX stats: short messages */
+ atomic64_t gnd_short_txbytes; /* TX stats: short message payload*/
+ atomic_t gnd_rdma_ntx; /* TX stats: rdma messages */
+ atomic64_t gnd_rdma_txbytes; /* TX stats: rdma message payload*/
+ atomic_t gnd_short_nrx; /* RX stats: short messages */
+ atomic64_t gnd_short_rxbytes; /* RX stats: short message payload*/
+ atomic_t gnd_rdma_nrx; /* RX stats: rdma messages */
+ atomic64_t gnd_rdma_rxbytes; /* RX stats: rdma message payload*/
+ atomic_t gnd_fast_try; /* # of times fast send tried */
+ atomic_t gnd_fast_ok; /* # of times fast send ok */
+ atomic_t gnd_fast_block; /* # of times fast send blocked */
+ unsigned long gnd_mutex_delay;
+ atomic_t gnd_n_yield;
+ atomic_t gnd_n_schedule;
+ atomic_t gnd_canceled_dgrams; /* # of outstanding cancels */
+} kgn_device_t;
+
+typedef struct kgn_net {
+ struct list_head gnn_list; /* chain on kgni_data::kgn_nets */
+ kgn_device_t *gnn_dev; /* device for this net */
+ lnet_ni_t *gnn_ni; /* network interface instance */
+ atomic_t gnn_refcount; /* # current references */
+ int gnn_shutdown; /* lnd_shutdown set */
+ __u16 gnn_netnum; /* stash netnum for quicker lookup */
+} kgn_net_t;
+
+static inline lnet_nid_t
+kgnilnd_lnd2lnetnid(lnet_nid_t ni_nid, lnet_nid_t kgnilnd_nid)
+{
+ return LNET_MKNID(LNET_NIDNET(ni_nid), LNET_NIDADDR(kgnilnd_nid));
+}
+
+static inline lnet_nid_t
+kgnilnd_lnet2lndnid(lnet_nid_t lnet_nid, lnet_nid_t kgnilnd_nid)
+{
+ return LNET_MKNID(LNET_NIDNET(kgnilnd_nid), LNET_NIDADDR(lnet_nid));
+}
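+
+/* usage sketch (hypothetical NIDs): if the lnet_ni lives on net gni1 while the
+ * LND-level NID is on gni0, kgnilnd_lnd2lnetnid() keeps the gni1 net and
+ * substitutes the LND address - the same LNET_MKNID()/LNET_NIDADDR()
+ * remapping used in the ioctl and startup paths above */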
+
+/* The code for this is a bit ugly - but really this just boils down to a __u64
+ * that can have various parts accessed separately.
+ *
+ * The lower 32 bits are the ID
+ * we give to SMSG for our completion event - it needs to be globally unique across
+ * all TX currently in flight. We separate that out into the CQID so that we can
+ * reference the connection (kgnilnd_cqid2conn_locked) and then the msg_id to pull
+ * the actual TX out of the per-connection gnc_tx_ref_table.
+ *
+ * The upper 32 bits are just extra stuff we put into the cookie to ensure this TX
+ * has a unique value we can send with RDMA setup messages to ensure the completion for
+ * those is unique across the wire. The extra 32 bits are there to ensure that TX id
+ * reuse is separated.
+ */
+
+typedef struct kgn_tx_ev_id {
+ union {
+ __u64 txe_cookie; /* are you my mommy ? */
+ struct {
+ __u32 txe_chips; /* extra bits to ensure ID unique across reuse */
+ union {
+ __u32 txe_smsg_id; /* ID for SMSG CQ event */
+ /* N.B: Never ever ever ever use the bit shifts directly,
+ * you are just asking for a world of pain and are at the
+ * mercy of the compiler layouts */
+ struct {
+ __u32 txe_cqid :GNILND_CQID_NBITS;
+ __u32 txe_idx :GNILND_MSGID_TX_NBITS;
+ };
+ };
+ };
+ };
+} kgn_tx_ev_id_t;
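+
+/* decomposition sketch (hypothetical values, assuming the usual low-bits-first
+ * bitfield packing): txe_smsg_id = 0x00100005 splits into txe_cqid = 5
+ * (the low GNILND_CQID_NBITS = 20 bits) and txe_idx = 1 (the remaining
+ * 12 bits); txe_cqid finds the conn via kgnilnd_cqid2conn_locked() and
+ * txe_idx indexes that conn's gnc_tx_ref_table */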
+
+typedef struct kgn_dgram {
+ struct list_head gndg_list; /* on hash dev::gnd_dgrams */
+ kgn_dgram_state_t gndg_state; /* state of this dgram */
+ kgn_dgram_type_t gndg_type; /* REQ, NAK, etc */
+	__u32 gndg_magic; /* safety word */
+ unsigned long gndg_post_time; /* time when we posted */
+ struct kgn_conn *gndg_conn; /* unbound conn with ep & smsg */
+ kgn_connreq_t gndg_conn_out; /* connreq from local node */
+ kgn_connreq_t gndg_conn_in; /* connreq from remote node */
+} kgn_dgram_t;
+
+typedef struct kgn_tx { /* message descriptor */
+ struct list_head tx_list; /* TX queues - peer, conn, rdma */
+	kgn_tx_list_state_t tx_list_state; /* where in the state machine is this TX? */
+ struct list_head *tx_list_p; /* pointer to current list */
+ struct kgn_conn *tx_conn; /* owning conn */
+ lnet_msg_t *tx_lntmsg[2]; /* ptl msgs to finalize on completion */
+ unsigned long tx_qtime; /* when tx started to wait for something (jiffies) */
+	unsigned long tx_cred_wait; /* time spent waiting for smsg creds */
+ struct list_head tx_map_list; /* list entry on device map list */
+ unsigned int tx_nob; /* # bytes of payload */
+ int tx_buftype; /* payload buffer type */
+ int tx_phys_npages; /* # physical pages */
+ gni_mem_handle_t tx_map_key; /* mapping key */
+ gni_mem_segment_t *tx_phys; /* page descriptors */
+ kgn_msg_t tx_msg; /* FMA message buffer */
+ kgn_tx_ev_id_t tx_id; /* who are you, who ? who ? */
+ __u8 tx_state; /* state of the descriptor */
+ int tx_retrans; /* retrans count of RDMA */
+ int tx_rc; /* if we need to stash the ret code until we see completion */
+ void *tx_buffer; /* source/sink buffer */
+ union {
+ gni_post_descriptor_t tx_rdma_desc; /* rdma descriptor */
+ struct page *tx_imm_pages[GNILND_MAX_IMMEDIATE/PAGE_SIZE]; /* page array to map kiov for immediate send */
+ };
+
+ /* we only use one or the other */
+ union {
+		kgn_putack_msg_t tx_putinfo; /* data for deferred rdma & re-try */
+ kgn_get_msg_t tx_getinfo; /* data for rdma re-try*/
+ };
+} kgn_tx_t;
+
+typedef struct kgn_conn {
+ kgn_device_t *gnc_device; /* which device */
+ struct kgn_peer *gnc_peer; /* owning peer */
+ struct list_head gnc_list; /* stash on peer's conn list - or pending purgatory lists as we clear them */
+ struct list_head gnc_hashlist; /* stash in connection hash table */
+ struct list_head gnc_schedlist; /* schedule (on gnd_?_conns) for attention */
+ struct list_head gnc_fmaq; /* txs queued for FMA */
+ struct list_head gnc_mdd_list; /* hold list for MDD on hard conn reset */
+ __u64 gnc_peerstamp; /* peer's unique stamp */
+ __u64 gnc_peer_connstamp; /* peer's unique connection stamp */
+ __u64 gnc_my_connstamp; /* my unique connection stamp */
+ unsigned long gnc_first_rx; /* when I first received an FMA message (jiffies) */
+ unsigned long gnc_last_tx; /* when I last sent an FMA message (jiffies) */
+	unsigned long gnc_last_rx; /* when I last received an FMA message (jiffies) */
+ unsigned long gnc_last_tx_cq; /* when I last received an FMA CQ (jiffies) */
+ unsigned long gnc_last_rx_cq; /* when I last received an FMA CQ (jiffies) */
+ unsigned long gnc_last_noop_want; /* time I wanted to send NOOP */
+ unsigned long gnc_last_noop_sent; /* time I did gni_smsg_send on NOOP */
+ unsigned long gnc_last_noop_cq; /* time when NOOP completed */
+ unsigned long gnc_last_sched_ask; /* time when conn added to ready_conns */
+ unsigned long gnc_last_sched_do; /* time when conn processed from ready_conns */
+ atomic_t gnc_reaper_noop; /* # reaper triggered NOOP */
+ atomic_t gnc_sched_noop; /* # sched triggered NOOP */
+ unsigned int gnc_timeout; /* infer peer death if no rx for this many seconds */
+ __u32 gnc_cqid; /* my completion callback id (non-unique) */
+ __u32 gnc_tx_seq; /* tx msg sequence number */
+ __u32 gnc_rx_seq; /* rx msg sequence number */
+ __u64 gnc_tx_retrans; /* # retrans on SMSG */
+ atomic_t gnc_nlive_fma; /* # live FMA */
+ atomic_t gnc_nq_rdma; /* # queued (on device) RDMA */
+ atomic_t gnc_nlive_rdma; /* # live RDMA */
+ short gnc_close_sent; /* I've sent CLOSE */
+ short gnc_close_recvd; /* I've received CLOSE */
+ short gnc_in_purgatory; /* in the sin bin */
+ int gnc_error; /* errno when conn being closed due to error */
+ int gnc_peer_error; /* errno peer sent us on CLOSE */
+ kgn_conn_state_t gnc_state; /* connection state */
+	int gnc_scheduled; /* being attended to */
+ atomic_t gnc_refcount; /* # users */
+ spinlock_t gnc_list_lock; /* serialise tx lists, max_rx_age */
+ gni_ep_handle_t gnc_ephandle; /* GNI endpoint */
+ kgn_fma_memblock_t *gnc_fma_blk; /* pointer to fma block for our mailbox */
+ gni_smsg_attr_t gnpr_smsg_attr; /* my short msg. attributes */
+ spinlock_t gnc_tx_lock; /* protect tx alloc/free */
+ __u8 gnc_tx_bits[GNILND_MAX_MSG_ID/8]; /* bit table for tx id */
+ int gnc_next_tx; /* next tx to use in tx_ref_table */
+ kgn_tx_t **gnc_tx_ref_table; /* table of TX descriptors for this conn */
+ int gnc_mbox_id; /* id of mbox in fma_blk */
+ short gnc_needs_detach; /* flag set in detach_purgatory_all_locked so reaper will clear out purgatory */
+ short gnc_needs_closing; /* flag set in del_conns when called from kgnilnd_del_peer_or_conn */
+} kgn_conn_t;
+
+typedef struct kgn_mdd_purgatory {
+ gni_mem_handle_t gmp_map_key; /* mapping key */
+ struct list_head gmp_list; /* entry point for purgatory list */
+} kgn_mdd_purgatory_t;
+
+typedef struct kgn_peer {
+ struct list_head gnp_list; /* stash on global peer list */
+ struct list_head gnp_connd_list; /* schedule on kgn_connd_peers */
+ struct list_head gnp_conns; /* all active connections and all conns in purgatory for the peer */
+ struct list_head gnp_tx_queue; /* msgs waiting for a conn */
+ kgn_net_t *gnp_net; /* net instance for this peer */
+ lnet_nid_t gnp_nid; /* who's on the other end(s) */
+ atomic_t gnp_refcount; /* # users */
+ __u32 gnp_host_id; /* ph. host ID of the peer */
+ short gnp_connecting; /* connection forming */
+ short gnp_pending_unlink; /* need last conn close to trigger unlink */
+ int gnp_last_errno; /* last error conn saw */
+ unsigned long gnp_last_alive; /* last time I had valid comms */
+ int gnp_last_dgram_errno; /* last error dgrams saw */
+ unsigned long gnp_last_dgram_time; /* last time I tried to connect */
+ unsigned long gnp_reconnect_time; /* CURRENT_SECONDS when reconnect OK */
+ unsigned long gnp_reconnect_interval; /* exponential backoff */
+ atomic_t gnp_dirty_eps; /* # of old but yet to be destroyed EPs from conns */
+} kgn_peer_t;
+
+/* the kgn_rx_t is a struct for handing to LNET as the private pointer for things
+ * like lnet_parse. It allows a single pointer to let us get enough
+ * information in _recv and friends */
+typedef struct kgn_rx {
+ kgn_conn_t *grx_conn; /* connection */
+ kgn_msg_t *grx_msg; /* message */
+ lnet_msg_t *grx_lntmsg; /* lnet msg for this rx (eager only) */
+ int grx_eager; /* if eager, we copied msg to somewhere */
+ struct timespec grx_received; /* time this msg received */
+} kgn_rx_t;
+
+typedef struct kgn_data {
+ int kgn_init; /* initialisation state */
+ int kgn_shutdown; /* shut down? */
+ int kgn_wc_kill; /* Should I repost the WC */
+ atomic_t kgn_nthreads; /* # live threads */
+ int kgn_nresets; /* number of stack resets */
+ int kgn_in_reset; /* are we in stack reset ? */
+
+ kgn_device_t kgn_devices[GNILND_MAXDEVS]; /* device/ptag/cq etc */
+ int kgn_ndevs; /* # devices */
+
+ int kgn_ruhroh_running; /* ruhroh thread is running */
+ int kgn_ruhroh_shutdown; /* ruhroh thread should or is shut down */
+ wait_queue_head_t kgn_ruhroh_waitq; /* ruhroh thread wakeup */
+ int kgn_quiesce_trigger; /* should we quiesce ? */
+ atomic_t kgn_nquiesce; /* how many quiesced ? */
+ struct semaphore kgn_quiesce_sem; /* serialize ruhroh task, startup and shutdown */
+ int kgn_needs_reset; /* we need stack reset */
+
+ /* These next three members implement communication from gnilnd into
+ * the ruhroh task. To ensure correct operation of the task, code that
+ * writes into them must use memory barriers so the changes become
+ * visible to other cores in the order the members appear below (a
+ * write-side sketch follows the extern declarations below). */
+ __u32 kgn_quiesce_secs; /* seconds to bump timeouts */
+ int kgn_bump_info_rdy; /* we have info needed to bump */
+ int kgn_needs_pause; /* we need to pause for network quiesce */
+
+ struct list_head *kgn_nets; /* hashtable of kgn_net instances */
+ struct rw_semaphore kgn_net_rw_sem; /* serialise gnn_shutdown, kgn_nets */
+
+ rwlock_t kgn_peer_conn_lock; /* stabilize peer/conn ops */
+ struct list_head *kgn_peers; /* hash table of all my known peers */
+ atomic_t kgn_npeers; /* # peers extant */
+ int kgn_peer_version; /* version flag for peer tables */
+
+ struct list_head *kgn_conns; /* conns hashed by cqid */
+ atomic_t kgn_nconns; /* # connections extant */
+ __u64 kgn_peerstamp; /* when I started up */
+ __u64 kgn_connstamp; /* conn stamp generator */
+ int kgn_conn_version; /* version flag for conn tables */
+ int kgn_next_cqid; /* cqid generator */
+
+ long kgn_new_min_timeout; /* minimum timeout on any new conn */
+ wait_queue_head_t kgn_reaper_waitq; /* reaper sleeps here */
+ spinlock_t kgn_reaper_lock; /* serialise */
+
+ cfs_mem_cache_t *kgn_rx_cache; /* rx descriptor space */
+ cfs_mem_cache_t *kgn_tx_cache; /* tx descriptor memory */
+ cfs_mem_cache_t *kgn_tx_phys_cache; /* tx phys descriptor memory */
+ atomic_t kgn_ntx; /* # tx in use */
+ cfs_mem_cache_t *kgn_dgram_cache; /* outgoing datagrams */
+
+ struct page ***kgn_cksum_map_pages; /* page arrays for mapping pages on checksum */
+ __u64 kgn_cksum_npages; /* Number of pages allocated for checksumming */
+ atomic_t kgn_nvmap_cksum; /* # times we vmapped for checksums */
+ atomic_t kgn_nvmap_short; /* # times we vmapped for short kiov */
+
+ atomic_t kgn_nkmap_short; /* # time we kmapped for a short kiov */
+ long kgn_rdmaq_override; /* bytes per second override */
+
+ struct kmem_cache *kgn_mbox_cache; /* mailboxes from not-GART */
+
+ atomic_t kgn_npending_unlink; /* # of peers pending unlink */
+ atomic_t kgn_npending_conns; /* # of conns with pending closes */
+ atomic_t kgn_npending_detach; /* # of conns with a pending detach */
+
+} kgn_data_t;
+
+extern kgn_data_t kgnilnd_data;
+extern kgn_tunables_t kgnilnd_tunables;
+
+extern void kgnilnd_destroy_peer(kgn_peer_t *peer);
+extern void kgnilnd_destroy_conn(kgn_conn_t *conn);
+extern void kgnilnd_schedule_conn(kgn_conn_t *conn);
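+
+/* Write-side sketch (editor's illustration; the helper name is
+ * hypothetical, not part of the driver): publish a timeout bump to the
+ * ruhroh task. The smp_wmb() calls order the stores so a reader that
+ * sees kgn_needs_pause is guaranteed to also see kgn_quiesce_secs and
+ * kgn_bump_info_rdy, per the member comment in kgn_data_t above. */
+static inline void
+kgnilnd_example_request_pause(__u32 secs)
+{
+ kgnilnd_data.kgn_quiesce_secs = secs;
+ smp_wmb(); /* secs visible before the ready flag */
+ kgnilnd_data.kgn_bump_info_rdy = 1;
+ smp_wmb(); /* ready flag visible before the pause request */
+ kgnilnd_data.kgn_needs_pause = 1;
+ wake_up(&kgnilnd_data.kgn_ruhroh_waitq);
+}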
+
+static inline int
+kgnilnd_thread_start(int(*fn)(void *arg), void *arg, char *name, int id)
+{
+ struct task_struct *thrd = kthread_run(fn, arg, "%s_%02d", name, id);
+ if (IS_ERR(thrd))
+ return PTR_ERR(thrd);
+
+ atomic_inc(&kgnilnd_data.kgn_nthreads);
+ return 0;
+}
+
+static inline void
+kgnilnd_thread_fini(void)
+{
+ atomic_dec(&kgnilnd_data.kgn_nthreads);
+}
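+
+/* Usage sketch (hypothetical thread body, for illustration only): a
+ * thread launched via kgnilnd_thread_start() must call
+ * kgnilnd_thread_fini() on exit so kgn_nthreads stays balanced for the
+ * quiesce accounting below. */
+static inline int
+kgnilnd_example_thread(void *arg)
+{
+ while (!kgnilnd_data.kgn_shutdown) {
+ /* ... service queues, check quiesce trigger ... */
+ }
+ kgnilnd_thread_fini();
+ return 0;
+}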
+
+/* like mutex_trylock but with a jiffies spinner. This allows certain
+ * parts of the code to avoid a scheduler trip when the mutex is only
+ * held briefly
+ *
+ * Spin trying to acquire the mutex for up to one jiffy. Returns 1 if
+ * the mutex was acquired successfully, and 0 on contention.
+ *
+ * NOTE: this function follows the spin_trylock() convention, so its
+ * return value is the inverse of down_trylock()'s! Be careful about
+ * this when converting semaphore users to mutexes.
+ *
+ * This function must not be used in interrupt context. The
+ * mutex must be released by the same task that acquired it.
+ */
+static inline int kgnilnd_mutex_trylock(struct mutex *lock)
+{
+ int ret;
+ unsigned long timeout;
+
+ LASSERT(!in_interrupt());
+
+ for (timeout = jiffies + 1; time_before(jiffies, timeout);) {
+
+ ret = mutex_trylock(lock);
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
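+
+/* Usage sketch (hypothetical caller; dev and gnd_cq_mutex are
+ * illustrative names): spin briefly for a mutex that is only held for
+ * short sections, then fall back to a sleeping lock on contention. */
+#if 0
+ if (!kgnilnd_mutex_trylock(&dev->gnd_cq_mutex))
+ mutex_lock(&dev->gnd_cq_mutex); /* give up and sleep */
+#endif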
+
+/* Copied from DEBUG_REQ in Lustre - the dance is needed to save stack space */
+
+extern void
+_kgnilnd_debug_msg(kgn_msg_t *msg,
+ struct libcfs_debug_msg_data *data, const char *fmt, ... );
+
+#define kgnilnd_debug_msg(msgdata, mask, cdls, msg, fmt, a...) \
+do { \
+ CFS_CHECK_STACK(msgdata, mask, cdls); \
+ \
+ if (((mask) & D_CANTMASK) != 0 || \
+ ((libcfs_debug & (mask)) != 0 && \
+ (libcfs_subsystem_debug & DEBUG_SUBSYSTEM) != 0)) \
+ _kgnilnd_debug_msg((msg), msgdata, fmt, ##a); \
+} while(0)
+
+/* for most callers (level is a constant) this is resolved at compile time */
+#define GNIDBG_MSG(level, msg, fmt, args...) \
+do { \
+ if ((level) & (D_ERROR | D_WARNING | D_NETERROR)) { \
+ static cfs_debug_limit_state_t cdls; \
+ LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, level, &cdls); \
+ kgnilnd_debug_msg(&msgdata, level, &cdls, msg, \
+ "$$ "fmt" from %s ", ## args, \
+ libcfs_nid2str((msg)->gnm_srcnid)); \
+ } else { \
+ LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, level, NULL); \
+ kgnilnd_debug_msg(&msgdata, level, NULL, msg, \
+ "$$ "fmt" from %s ", ## args, \
+ libcfs_nid2str((msg)->gnm_srcnid)); \
+ } \
+} while (0)
+
+/* user puts 'to nid' in msg for us */
+#define GNIDBG_TOMSG(level, msg, fmt, args...) \
+do { \
+ if ((level) & (D_ERROR | D_WARNING | D_NETERROR)) { \
+ static cfs_debug_limit_state_t cdls; \
+ LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, level, &cdls); \
+ kgnilnd_debug_msg(&msgdata, level, &cdls, msg, \
+ "$$ "fmt" ", ## args); \
+ } else { \
+ LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, level, NULL); \
+ kgnilnd_debug_msg(&msgdata, level, NULL, msg, \
+ "$$ "fmt" ", ## args); \
+ } \
+} while (0)
+
+extern void
+_kgnilnd_debug_conn(kgn_conn_t *conn,
+ struct libcfs_debug_msg_data *data, const char *fmt, ... );
+
+#define kgnilnd_debug_conn(msgdata, mask, cdls, conn, fmt, a...) \
+do { \
+ CFS_CHECK_STACK(msgdata, mask, cdls); \
+ \
+ if (((mask) & D_CANTMASK) != 0 || \
+ ((libcfs_debug & (mask)) != 0 && \
+ (libcfs_subsystem_debug & DEBUG_SUBSYSTEM) != 0)) \
+ _kgnilnd_debug_conn((conn), msgdata, fmt, ##a); \
+} while(0)
+
+/* for most callers (level is a constant) this is resolved at compile time */
+#define GNIDBG_CONN(level, conn, fmt, args...) \
+do { \
+ if ((level) & (D_ERROR | D_WARNING | D_NETERROR)) { \
+ static cfs_debug_limit_state_t cdls; \
+ LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, level, &cdls); \
+ kgnilnd_debug_conn(&msgdata, level, &cdls, conn, \
+ "$$ "fmt" ", ## args); \
+ } else { \
+ LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, level, NULL); \
+ kgnilnd_debug_conn(&msgdata, level, NULL, conn, \
+ "$$ "fmt" ", ## args); \
+ } \
+} while (0)
+
+extern void
+_kgnilnd_debug_tx(kgn_tx_t *tx,
+ struct libcfs_debug_msg_data *data, const char *fmt, ... );
+
+#define kgnilnd_debug_tx(msgdata, mask, cdls, tx, fmt, a...) \
+do { \
+ CFS_CHECK_STACK(msgdata, mask, cdls); \
+ \
+ if (((mask) & D_CANTMASK) != 0 || \
+ ((libcfs_debug & (mask)) != 0 && \
+ (libcfs_subsystem_debug & DEBUG_SUBSYSTEM) != 0)) \
+ _kgnilnd_debug_tx((tx), msgdata, fmt, ##a); \
+} while(0)
+
+/* for most callers (level is a constant) this is resolved at compile time */
+#define GNIDBG_TX(level, tx, fmt, args...) \
+do { \
+ if ((level) & (D_ERROR | D_WARNING | D_NETERROR)) { \
+ static cfs_debug_limit_state_t cdls; \
+ LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, level, &cdls); \
+ kgnilnd_debug_tx(&msgdata, level, &cdls, tx, \
+ "$$ "fmt" ", ## args); \
+ } else { \
+ LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, level, NULL); \
+ kgnilnd_debug_tx(&msgdata, level, NULL, tx, \
+ "$$ "fmt" ", ## args); \
+ } \
+} while (0)
+
+#define GNITX_ASSERTF(tx, cond, fmt, a...) \
+({ \
+ if (unlikely(!(cond))) { \
+ GNIDBG_TX(D_EMERG, tx, "ASSERTION(" #cond ") failed:" fmt, a); \
+ LBUG(); \
+ } \
+})
+
+#define GNILND_IS_QUIESCED \
+ (atomic_read(&kgnilnd_data.kgn_nquiesce) == \
+ atomic_read(&kgnilnd_data.kgn_nthreads))
+
+#define KGNILND_SPIN_QUIESCE \
+do { \
+ /* E.T. phone home */ \
+ atomic_inc(&kgnilnd_data.kgn_nquiesce); \
+ CDEBUG(D_NET, "Waiting for thread pause to be over...\n"); \
+ while (kgnilnd_data.kgn_quiesce_trigger) { \
+ set_current_state(TASK_INTERRUPTIBLE); \
+ cfs_schedule_timeout_and_set_state(TASK_INTERRUPTIBLE, \
+ cfs_time_seconds(1)); \
+ } \
+ /* Mom, my homework is done */ \
+ CDEBUG(D_NET, "Waking up from thread pause\n"); \
+ atomic_dec(&kgnilnd_data.kgn_nquiesce); \
+} while(0)
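+
+/* Usage sketch (hypothetical main loop): long-running threads poll the
+ * trigger at a safe point and park in KGNILND_SPIN_QUIESCE until the
+ * pause ends; GNILND_IS_QUIESCED tells the ruhroh task that every
+ * thread has arrived. */
+#if 0
+ while (!kgnilnd_data.kgn_shutdown) {
+ if (unlikely(kgnilnd_data.kgn_quiesce_trigger))
+ KGNILND_SPIN_QUIESCE;
+ /* ... normal work ... */
+ }
+#endif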
+
+/* use macros for addref/decref to get the calling function name in the CDEBUG */
+#ifndef LIBCFS_DEBUG
+#error "this code uses actions inside LASSERT for ref counting"
+#endif
+
+#define kgnilnd_admin_addref(atomic) \
+do { \
+ int val = atomic_inc_return(&atomic); \
+ LASSERTF(val > 0, #atomic " refcount %d\n", val); \
+ CDEBUG(D_NETTRACE, #atomic " refcount %d\n", val); \
+} while (0)
+
+#define kgnilnd_admin_decref(atomic) \
+do { \
+ int val = atomic_dec_return(&atomic); \
+ LASSERTF(val >= 0, #atomic " refcount %d\n", val); \
+ CDEBUG(D_NETTRACE, #atomic " refcount %d\n", val); \
+} while (0)
+
+#define kgnilnd_net_addref(net) \
+do { \
+ int val = atomic_inc_return(&net->gnn_refcount); \
+ LASSERTF(val > 1, "net %p refcount %d\n", net, val); \
+ CDEBUG(D_NETTRACE, "net %p->%s++ (%d)\n", net, \
+ libcfs_nid2str(net->gnn_ni->ni_nid), val); \
+} while (0)
+
+#define kgnilnd_net_decref(net) \
+do { \
+ int val = atomic_dec_return(&net->gnn_refcount); \
+ LASSERTF(val >= 0, "net %p refcount %d\n", net, val); \
+ CDEBUG(D_NETTRACE, "net %p->%s-- (%d)\n", net, \
+ libcfs_nid2str(net->gnn_ni->ni_nid), val); \
+} while (0)
+
+#define kgnilnd_peer_addref(peer) \
+do { \
+ int val = atomic_inc_return(&peer->gnp_refcount); \
+ LASSERTF(val > 1, "peer %p refcount %d\n", peer, val); \
+ CDEBUG(D_NETTRACE, "peer %p->%s++ (%d)\n", peer, \
+ libcfs_nid2str(peer->gnp_nid), val); \
+} while (0)
+
+#define kgnilnd_peer_decref(peer) \
+do { \
+ int val = atomic_dec_return(&peer->gnp_refcount); \
+ LASSERTF(val >= 0, "peer %p refcount %d\n", peer, val); \
+ CDEBUG(D_NETTRACE, "peer %p->%s--(%d)\n", peer, \
+ libcfs_nid2str(peer->gnp_nid), val); \
+ if (atomic_read(&peer->gnp_refcount) == 0) \
+ kgnilnd_destroy_peer(peer); \
+} while(0)
+
+#define kgnilnd_conn_addref(conn) \
+do { \
+ int val; \
+ \
+ smp_wmb(); \
+ val = atomic_inc_return(&conn->gnc_refcount); \
+ LASSERTF(val >= 0, "conn %p refc %d to %s\n", \
+ conn, val, \
+ conn->gnc_peer \
+ ? libcfs_nid2str(conn->gnc_peer->gnp_nid) \
+ : "<?>"); \
+ CDEBUG(D_NETTRACE, "conn %p->%s++ (%d)\n", conn, \
+ conn->gnc_peer \
+ ? libcfs_nid2str(conn->gnc_peer->gnp_nid) \
+ : "<?>", \
+ val); \
+} while (0)
+
+/* we hijack conn_decref && gnc_refcount = 1 to allow us to push the conn
+ * through the scheduler thread to get the EP destroyed. This avoids some
+ * messy semaphore business and allows us to reuse the connd_list and existing
+ * linkage and avoid creating extra lists just for destroying EPs */
+
+/* Safety Disclaimer:
+ * Q: If we decrement the refcount and then check it again, is it possible that
+ * another caller could have passed through this macro concurrently? If so,
+ * then it is possible that both will attempt to call kgnilnd_destroy_conn().
+ *
+ * A: Yes, entirely possible in most cases, but we can't get concurrent users
+ * once we are refcount <= 2. It hinges around gnc_state and membership of
+ * gnc_hashlist. There are two ways to find a connection - either ask for
+ * it from the peer, kgnilnd_find_conn_locked(peer) or from the CQ id,
+ * kgnilnd_cqid2conn_locked(id). While a conn is live, we'll have at least
+ * 4 refcounts
+ *
+ * - #1 from create (kgnilnd_create_conn)
+ * - #2 for EP (kgnilnd_create_conn)
+ * - #3 - living on peer (gnc_list, kgnilnd_finish_connect)
+ * - #4 living in global hash (gnc_hashlist, kgnilnd_finish_connect).
+ *
+ * Actually, only 3 live, as at the end of kgnilnd_finish_connect, we drop:
+ * - #1 - the ref the dgram inherited from kgnilnd_create_conn.
+ *
+ * There could be more from TX descriptors during the lifetime of a live
+ * conn.
+ *
+ * If we nuke the conn before finish_connect, we won't have parallel paths
+ * because nobody besides the dgram handler for the single outstanding
+ * dgram can find the connection as it isn't in any searchable tables yet.
+ *
+ * This leaves connection close, we'll drop 2 refs (#4 and #3) but only
+ * after calling kgnilnd_schedule_conn, which would add a new ref (#5). At
+ * this point gnc_refcount=2 (#2, #5). We have a 'maybe' send of the CLOSE
+ * now on the next scheduler loop, this could be #6 (schedule_conn again)
+ * and #7 (TX on gnc_fmaq). Both would be cleared quickly as that TX is
+ * sent. Now the gnc_state == CLOSED, so we hit
+ * kgnilnd_complete_closed_conn. At this point, nobody can 'find' this conn
+ * - we've nuked them from the peer and CQ id tables, so we own them and
+ * are guaranteed serial access - hence the complete lack of conn list
+ * locking in kgnilnd_complete_closed_conn. We are free then to mark the
+ * conn DESTROY_EP (add #6 for schedule_conn), then lose #5 in
+ * kgnilnd_process_conns. Then the next scheduler loop would call
+ * kgnilnd_destroy_conn_ep (drop #2 for EP) and lose #6 (refcount=0) in
+ * kgnilnd_process_conns.
+ *
+ * Clearly, we are totally safe. Clearly.
+ */
+
+#define kgnilnd_conn_decref(conn) \
+do { \
+ int val; \
+ \
+ smp_wmb(); \
+ val = atomic_dec_return(&conn->gnc_refcount); \
+ LASSERTF(val >= 0, "conn %p refc %d to %s\n", \
+ conn, val, \
+ conn->gnc_peer \
+ ? libcfs_nid2str(conn->gnc_peer->gnp_nid) \
+ : "<?>"); \
+ CDEBUG(D_NETTRACE, "conn %p->%s-- (%d)\n", conn, \
+ conn->gnc_peer \
+ ? libcfs_nid2str(conn->gnc_peer->gnp_nid) \
+ : "<?>", \
+ val); \
+ smp_rmb(); \
+ if ((atomic_read(&conn->gnc_refcount) == 1) && \
+ (conn->gnc_ephandle != NULL) && \
+ (conn->gnc_state != GNILND_CONN_DESTROY_EP)) { \
+ set_mb(conn->gnc_state, GNILND_CONN_DESTROY_EP); \
+ kgnilnd_schedule_conn(conn); \
+ } else if (atomic_read(&conn->gnc_refcount) == 0) { \
+ kgnilnd_destroy_conn(conn); \
+ } \
+} while (0)
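+
+/* Usage sketch (hypothetical caller): hold a conn ref across any window
+ * where kgn_peer_conn_lock is dropped, and release it only through the
+ * decref macro so the DESTROY_EP/destroy handoff above runs at the
+ * right refcount. */
+#if 0
+ read_lock(&kgnilnd_data.kgn_peer_conn_lock);
+ conn = kgnilnd_find_conn_locked(peer);
+ if (conn != NULL)
+ kgnilnd_conn_addref(conn); /* ref for this caller */
+ read_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+ /* ... use conn ... */
+ kgnilnd_conn_decref(conn); /* drop caller's ref */
+#endif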
+
+static inline struct list_head *
+kgnilnd_nid2peerlist(lnet_nid_t nid)
+{
+ unsigned int hash = ((unsigned int)LNET_NIDADDR(nid)) % *kgnilnd_tunables.kgn_peer_hash_size;
+
+ RETURN(&kgnilnd_data.kgn_peers[hash]);
+}
+
+static inline struct list_head *
+kgnilnd_netnum2netlist(__u16 netnum)
+{
+ unsigned int hash = ((unsigned int) netnum) % *kgnilnd_tunables.kgn_net_hash_size;
+
+ RETURN(&kgnilnd_data.kgn_nets[hash]);
+}
+
+static inline int
+kgnilnd_peer_active(kgn_peer_t *peer)
+{
+ /* Am I in the peer hash table? */
+ return (!list_empty(&peer->gnp_list));
+}
+
+/* need write_lock on kgn_peer_conn_lock */
+static inline int
+kgnilnd_can_unlink_peer_locked(kgn_peer_t *peer)
+{
+ CDEBUG(D_NET, "peer 0x%p->%s conns? %d tx? %d\n",
+ peer, libcfs_nid2str(peer->gnp_nid),
+ !list_empty(&peer->gnp_conns),
+ !list_empty(&peer->gnp_tx_queue));
+
+ /* kgn_peer_conn_lock protects us from conflict with
+ * kgnilnd_peer_notify and gnp_persistent */
+ RETURN ((list_empty(&peer->gnp_conns)) &&
+ (list_empty(&peer->gnp_tx_queue)));
+}
+
+/* returns positive if error was for a clean shutdown of conn */
+static inline int
+kgnilnd_conn_clean_errno(int errno)
+{
+ /* - ESHUTDOWN - LND is unloading
+ * - EUCLEAN - admin requested via "lctl del_peer"
+ * - ENETRESET - admin requested via "lctl disconnect"
+ * - ENOTRECOVERABLE - stack reset
+ * - EISCONN - cleared via "lctl push"
+ * not doing ESTALE - that isn't clean */
+ RETURN ((errno == 0) ||
+ (errno == -ESHUTDOWN) ||
+ (errno == -EUCLEAN) ||
+ (errno == -ENETRESET) ||
+ (errno == -EISCONN) ||
+ (errno == -ENOTRECOVERABLE));
+}
+
+/* returns positive if error results in purgatory hold */
+static inline int
+kgnilnd_check_purgatory_errno(int errno)
+{
+ /* We don't want a purgatory hold in these cases:
+ * - EUCLEAN - admin requested via "lctl del_peer"
+ * - ESHUTDOWN - LND is unloading
+ */
+ RETURN ((errno != -ESHUTDOWN) &&
+ (errno != -EUCLEAN));
+}
+
+/* returns positive if a purgatory hold is needed */
+static inline int
+kgnilnd_check_purgatory_conn(kgn_conn_t *conn)
+{
+ int loopback = 0;
+
+ if (conn->gnc_peer) {
+ loopback = conn->gnc_peer->gnp_nid ==
+ conn->gnc_peer->gnp_net->gnn_ni->ni_nid;
+ } else {
+ /* short circuit - a conn that didn't complete
+ * setup never needs a purgatory hold */
+ RETURN(0);
+ }
+ CDEBUG(D_NETTRACE, "conn 0x%p->%s loopback %d close_recvd %d\n",
+ conn, conn->gnc_peer ?
+ libcfs_nid2str(conn->gnc_peer->gnp_nid) :
+ "<?>",
+ loopback, conn->gnc_close_recvd);
+
+ /* we only use a purgatory hold if we've not received the CLOSE msg
+ * from our peer - without that message, we can't know the state of
+ * the other end of this connection and must put it into purgatory
+ * to prevent reuse and corruption.
+ * The theory is that a TX error can be communicated in all other cases
+ */
+ RETURN(likely(!loopback) && !conn->gnc_close_recvd &&
+ kgnilnd_check_purgatory_errno(conn->gnc_error));
+}
+
+static inline const char *
+kgnilnd_tx_state2str(kgn_tx_list_state_t state);
+
+static inline struct list_head *
+kgnilnd_tx_state2list(kgn_peer_t *peer, kgn_conn_t *conn,
+ kgn_tx_list_state_t to_state)
+{
+ switch (to_state) {
+ case GNILND_TX_PEERQ:
+ return &peer->gnp_tx_queue;
+ case GNILND_TX_FMAQ:
+ return &conn->gnc_fmaq;
+ case GNILND_TX_LIVE_FMAQ:
+ case GNILND_TX_LIVE_RDMAQ:
+ case GNILND_TX_DYING:
+ return NULL;
+ case GNILND_TX_MAPQ:
+ return &conn->gnc_device->gnd_map_tx;
+ case GNILND_TX_RDMAQ:
+ return &conn->gnc_device->gnd_rdmaq;
+ default:
+ /* IDLE, FREED or ALLOCD is not a valid "on list" state */
+ CERROR("invalid state requested: %s\n",
+ kgnilnd_tx_state2str(to_state));
+ LBUG();
+ break;
+ }
+}
+
+/* should hold tx, conn or peer lock when calling */
+static inline void
+kgnilnd_tx_add_state_locked(kgn_tx_t *tx, kgn_peer_t *peer,
+ kgn_conn_t *conn, kgn_tx_list_state_t state,
+ int add_tail)
+{
+ struct list_head *list = NULL;
+
+ /* make sure we have a sane TX state to start */
+ GNITX_ASSERTF(tx, (tx->tx_list_p == NULL &&
+ tx->tx_list_state == GNILND_TX_ALLOCD) &&
+ list_empty(&tx->tx_list),
+ "bad state with tx_list %s",
+ list_empty(&tx->tx_list) ? "empty" : "not empty");
+
+ /* WTF - you are already on that state buttmunch */
+ GNITX_ASSERTF(tx, state != tx->tx_list_state,
+ "already at %s", kgnilnd_tx_state2str(state));
+
+ /* get proper list from the state requested */
+ list = kgnilnd_tx_state2list(peer, conn, state);
+
+ /* add refcount */
+ switch (state) {
+ case GNILND_TX_PEERQ:
+ kgnilnd_peer_addref(peer);
+ break;
+ case GNILND_TX_ALLOCD:
+ /* no refs needed */
+ break;
+ case GNILND_TX_FMAQ:
+ kgnilnd_conn_addref(conn);
+ break;
+ case GNILND_TX_MAPQ:
+ atomic_inc(&conn->gnc_device->gnd_nq_map);
+ kgnilnd_conn_addref(conn);
+ break;
+ case GNILND_TX_LIVE_FMAQ:
+ atomic_inc(&conn->gnc_nlive_fma);
+ kgnilnd_conn_addref(conn);
+ break;
+ case GNILND_TX_LIVE_RDMAQ:
+ atomic_inc(&conn->gnc_nlive_rdma);
+ kgnilnd_conn_addref(conn);
+ break;
+ case GNILND_TX_RDMAQ:
+ atomic_inc(&conn->gnc_nq_rdma);
+ kgnilnd_conn_addref(conn);
+ break;
+ case GNILND_TX_DYING:
+ kgnilnd_conn_addref(conn);
+ break;
+ default:
+ CERROR("invalid state requested: %s\n",
+ kgnilnd_tx_state2str(state));
+ LBUG();
+ break;
+ }
+
+ /* if this changes, change kgnilnd_alloc_tx */
+ tx->tx_list_state = state;
+
+ /* some states don't have lists - we track them in the per conn
+ * TX table instead. Waste not, want not! */
+ if (list != NULL) {
+ tx->tx_list_p = list;
+ if (add_tail)
+ list_add_tail(&tx->tx_list, list);
+ else
+ list_add(&tx->tx_list, list);
+ } else {
+ /* set dummy list_p to keep the bookkeeping happy and make
+ * debugging a hair easier */
+ tx->tx_list_p = (void *)state;
+ }
+
+ GNIDBG_TX(D_NET, tx, "onto %s->0x%p",
+ kgnilnd_tx_state2str(state), list);
+}
+
+static inline void
+kgnilnd_tx_del_state_locked(kgn_tx_t *tx, kgn_peer_t *peer,
+ kgn_conn_t *conn, kgn_tx_list_state_t new_state)
+{
+ /* There is only one "off-list" state */
+ GNITX_ASSERTF(tx, new_state == GNILND_TX_ALLOCD,
+ "invalid new_state %s", kgnilnd_tx_state2str(new_state));
+
+ /* new_state == ALLOCD means we are deallocating this tx,
+ * so make sure it was on a valid list to start with */
+ GNITX_ASSERTF(tx, (tx->tx_list_p != NULL) &&
+ (((tx->tx_list_state == GNILND_TX_LIVE_FMAQ) ||
+ (tx->tx_list_state == GNILND_TX_LIVE_RDMAQ) ||
+ (tx->tx_list_state == GNILND_TX_DYING)) == list_empty(&tx->tx_list)),
+ "bad state", NULL);
+
+ GNIDBG_TX(D_NET, tx, "off %p", tx->tx_list_p);
+
+ /* drop refcount */
+ switch (tx->tx_list_state) {
+ case GNILND_TX_PEERQ:
+ kgnilnd_peer_decref(peer);
+ break;
+ case GNILND_TX_FREED:
+ case GNILND_TX_IDLE:
+ case GNILND_TX_ALLOCD:
+ /* no refs needed */
+ break;
+ case GNILND_TX_DYING:
+ kgnilnd_conn_decref(conn);
+ break;
+ case GNILND_TX_FMAQ:
+ kgnilnd_conn_decref(conn);
+ break;
+ case GNILND_TX_MAPQ:
+ atomic_dec(&conn->gnc_device->gnd_nq_map);
+ kgnilnd_conn_decref(conn);
+ break;
+ case GNILND_TX_LIVE_FMAQ:
+ atomic_dec(&conn->gnc_nlive_fma);
+ kgnilnd_conn_decref(conn);
+ break;
+ case GNILND_TX_LIVE_RDMAQ:
+ atomic_dec(&conn->gnc_nlive_rdma);
+ kgnilnd_conn_decref(conn);
+ break;
+ case GNILND_TX_RDMAQ:
+ atomic_dec(&conn->gnc_nq_rdma);
+ kgnilnd_conn_decref(conn);
+ /* don't need to assert on default, already did in set */
+ }
+
+ /* for ALLOCD, this might already be true, but no harm doing it again */
+ list_del_init(&tx->tx_list);
+ tx->tx_list_p = NULL;
+ tx->tx_list_state = new_state;
+}
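+
+/* Usage sketch (hypothetical, with the locks noted above held): a TX
+ * moves between lists by passing through the off-list ALLOCD state, so
+ * the refcounts taken in add and dropped in del stay balanced. */
+#if 0
+ kgnilnd_tx_del_state_locked(tx, peer, NULL, GNILND_TX_ALLOCD);
+ kgnilnd_tx_add_state_locked(tx, NULL, conn, GNILND_TX_FMAQ, 1);
+#endif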
+
+static inline int
+kgnilnd_tx_mapped(kgn_tx_t *tx)
+{
+ return (tx->tx_buftype == GNILND_BUF_VIRT_MAPPED ||
+ tx->tx_buftype == GNILND_BUF_PHYS_MAPPED);
+}
+
+static inline struct list_head *
+kgnilnd_cqid2connlist(__u32 cqid)
+{
+ unsigned int hash = cqid % *kgnilnd_tunables.kgn_peer_hash_size;
+
+ return (&kgnilnd_data.kgn_conns[hash]);
+}
+
+static inline kgn_conn_t *
+kgnilnd_cqid2conn_locked(__u32 cqid)
+{
+ struct list_head *conns = kgnilnd_cqid2connlist(cqid);
+ struct list_head *tmp;
+ kgn_conn_t *conn;
+
+ list_for_each(tmp, conns) {
+ conn = list_entry(tmp, kgn_conn_t, gnc_hashlist);
+
+ if (conn->gnc_cqid == cqid)
+ return conn;
+ }
+
+ return NULL;
+}
+
+/* returns 1..GNILND_MAX_CQID on success, 0 on failure */
+static inline __u32
+kgnilnd_get_cqid_locked(void)
+{
+ int looped = 0;
+ __u32 cqid;
+
+ do {
+ cqid = kgnilnd_data.kgn_next_cqid++;
+ if (kgnilnd_data.kgn_next_cqid >= GNILND_MAX_CQID) {
+ if (looped) {
+ return 0;
+ }
+ kgnilnd_data.kgn_next_cqid = 1;
+ looped = 1;
+ }
+ } while (kgnilnd_cqid2conn_locked(cqid) != NULL);
+
+ return cqid;
+}
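+
+/* Usage sketch (hypothetical caller, kgn_peer_conn_lock write-held): */
+#if 0
+ conn->gnc_cqid = kgnilnd_get_cqid_locked();
+ if (conn->gnc_cqid == 0)
+ rc = -E2BIG; /* all GNILND_MAX_CQID ids in use */
+#endif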
+
+static inline void
+kgnilnd_validate_tx_ev_id(kgn_tx_ev_id_t *ev_id, kgn_tx_t **txp, kgn_conn_t **connp)
+{
+ kgn_tx_t *tx = NULL;
+ kgn_conn_t *conn = NULL;
+
+ /* set to NULL so any early return is an error */
+ *txp = NULL;
+ *connp = NULL;
+
+ LASSERTF((ev_id->txe_idx > 0) &&
+ (ev_id->txe_idx < GNILND_MAX_MSG_ID),
+ "bogus txe_idx %d >= %d\n",
+ ev_id->txe_idx, GNILND_MAX_MSG_ID);
+
+ LASSERTF((ev_id->txe_cqid > 0) &&
+ (ev_id->txe_cqid < GNILND_MAX_CQID),
+ "bogus txe_cqid %d >= %d\n",
+ ev_id->txe_cqid, GNILND_MAX_CQID);
+
+ read_lock(&kgnilnd_data.kgn_peer_conn_lock);
+ conn = kgnilnd_cqid2conn_locked(ev_id->txe_cqid);
+
+ if (conn == NULL) {
+ /* Conn was destroyed? */
+ read_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+ CDEBUG(D_NET, "CQID %d lookup failed\n", ev_id->txe_cqid);
+ return;
+ }
+ /* just insurance */
+ kgnilnd_conn_addref(conn);
+ read_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+
+ /* we know this is safe - as the TX won't be reused until AFTER
+ * the conn is unlinked from the cqid hash, so we can use the TX
+ * (serializing to avoid any cache oddness) freely from the conn tx ref table */
+
+ spin_lock(&conn->gnc_tx_lock);
+ tx = conn->gnc_tx_ref_table[ev_id->txe_idx];
+ spin_unlock(&conn->gnc_tx_lock);
+
+ /* We could have a tx that was cleared out by other forces
+ * lctl disconnect or del_peer. */
+ if (tx == NULL) {
+ CNETERR("txe_idx %d is gone, ignoring event\n", ev_id->txe_idx);
+ kgnilnd_conn_decref(conn);
+ return;
+ }
+
+ /* check tx->tx_msg magic to make sure kgni didn't eat it */
+ GNITX_ASSERTF(tx, tx->tx_msg.gnm_magic == GNILND_MSG_MAGIC,
+ "came back from kgni with bad magic %x", tx->tx_msg.gnm_magic);
+
+ GNITX_ASSERTF(tx, tx->tx_id.txe_idx == ev_id->txe_idx,
+ "conn 0x%p->%s tx_ref_table hosed: wanted txe_idx %d "
+ "found tx %p txe_idx %d",
+ conn, libcfs_nid2str(conn->gnc_peer->gnp_nid),
+ ev_id->txe_idx, tx, tx->tx_id.txe_idx);
+
+ GNITX_ASSERTF(tx, tx->tx_conn != NULL, "tx with NULL connection", NULL);
+
+ GNITX_ASSERTF(tx, tx->tx_conn == conn, "tx conn does not equal conn", NULL);
+
+ *txp = tx;
+ *connp = conn;
+
+ GNIDBG_TX(D_NET, tx, "validated to 0x%p", conn);
+}
+
+/* set_normalized_timespec isn't exported from the kernel, so
+ * we need to do the same thing inline */
+static inline struct timespec
+kgnilnd_ts_sub(struct timespec lhs, struct timespec rhs)
+{
+ time_t sec;
+ long nsec;
+ struct timespec ts;
+
+ sec = lhs.tv_sec - rhs.tv_sec;
+ nsec = lhs.tv_nsec - rhs.tv_nsec;
+
+ while (nsec >= NSEC_PER_SEC) {
+ nsec -= NSEC_PER_SEC;
+ ++sec;
+ }
+ while (nsec < 0) {
+ nsec += NSEC_PER_SEC;
+ --sec;
+ }
+ ts.tv_sec = sec;
+ ts.tv_nsec = nsec;
+ return ts;
+}
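+
+/* Usage sketch (hypothetical): age an eager RX against the timestamp
+ * stashed in kgn_rx_t::grx_received. */
+#if 0
+ struct timespec now, age;
+
+ getnstimeofday(&now);
+ age = kgnilnd_ts_sub(now, rx->grx_received);
+ CDEBUG(D_NET, "rx aged %ld.%09ld sec\n", age.tv_sec, age.tv_nsec);
+#endif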
+
+static inline int
+kgnilnd_count_list(struct list_head *q)
+{
+ struct list_head *e;
+ int n = 0;
+
+ list_for_each(e, q) {
+ n++;
+ }
+
+ return n;
+}
+
+/* kgnilnd_find_net adds a reference to the net it finds
+ * this is so the net will not be removed before the calling function
+ * has time to use the data returned. This reference needs to be released
+ * by the calling function once it has finished using the returned net
+ */
+
+static inline int
+kgnilnd_find_net(lnet_nid_t nid, kgn_net_t **netp)
+{
+ kgn_net_t *net;
+ int rc;
+
+ rc = down_read_trylock(&kgnilnd_data.kgn_net_rw_sem);
+
+ if (!rc) {
+ return -ESHUTDOWN;
+ }
+
+ list_for_each_entry(net, kgnilnd_netnum2netlist(LNET_NETNUM(LNET_NIDNET(nid))), gnn_list) {
+ if (!net->gnn_shutdown && LNET_NIDNET(net->gnn_ni->ni_nid) == LNET_NIDNET(nid)) {
+ kgnilnd_net_addref(net);
+ up_read(&kgnilnd_data.kgn_net_rw_sem);
+ *netp = net;
+ return 0;
+ }
+ }
+
+ up_read(&kgnilnd_data.kgn_net_rw_sem);
+
+ return -ENONET;
+}
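+
+/* Usage sketch (hypothetical caller): the net comes back with a
+ * reference held, which must be dropped once the caller is done. */
+#if 0
+ kgn_net_t *net;
+ int rc;
+
+ rc = kgnilnd_find_net(msg->gnm_srcnid, &net);
+ if (rc < 0)
+ return rc;
+ /* ... use net ... */
+ kgnilnd_net_decref(net);
+#endif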
+
+#ifdef CONFIG_DEBUG_SLAB
+#define KGNILND_POISON(ptr, c, s) do {} while(0)
+#else
+#define KGNILND_POISON(ptr, c, s) memset(ptr, c, s)
+#endif
+
+int kgnilnd_dev_init(kgn_device_t *dev);
+void kgnilnd_dev_fini(kgn_device_t *dev);
+int kgnilnd_startup(lnet_ni_t *ni);
+void kgnilnd_shutdown(lnet_ni_t *ni);
+int kgnilnd_base_startup(void);
+void kgnilnd_base_shutdown(void);
+
+int kgnilnd_allocate_phys_fmablk(kgn_device_t *device);
+int kgnilnd_map_phys_fmablk(kgn_device_t *device);
+void kgnilnd_unmap_phys_fmablk(kgn_device_t *device);
+void kgnilnd_free_phys_fmablk(kgn_device_t *device);
+
+int kgnilnd_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg);
+void kgnilnd_query(lnet_ni_t *ni, lnet_nid_t nid, cfs_time_t *when);
+int kgnilnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg);
+int kgnilnd_eager_recv(lnet_ni_t *ni, void *private,
+ lnet_msg_t *lntmsg, void **new_private);
+int kgnilnd_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg,
+ int delayed, unsigned int niov,
+ struct iovec *iov, lnet_kiov_t *kiov,
+ unsigned int offset, unsigned int mlen, unsigned int rlen);
+
+__u16 kgnilnd_cksum_kiov(unsigned int nkiov, lnet_kiov_t *kiov, unsigned int offset, unsigned int nob, int dump_blob);
+
+/* purgatory functions */
+void kgnilnd_add_purgatory_locked(kgn_conn_t *conn, kgn_peer_t *peer);
+void kgnilnd_mark_for_detach_purgatory_all_locked(kgn_peer_t *peer);
+void kgnilnd_detach_purgatory_locked(kgn_conn_t *conn, struct list_head *conn_list);
+void kgnilnd_release_purgatory_list(struct list_head *conn_list);
+
+void kgnilnd_update_reaper_timeout(long timeout);
+void kgnilnd_unmap_buffer(kgn_tx_t *tx, int error);
+kgn_tx_t *kgnilnd_new_tx_msg(int type, lnet_nid_t source);
+void kgnilnd_tx_done(kgn_tx_t *tx, int completion);
+void kgnilnd_txlist_done(struct list_head *txlist, int error);
+void kgnilnd_unlink_peer_locked(kgn_peer_t *peer);
+void kgnilnd_schedule_conn(kgn_conn_t *conn);
+void kgnilnd_schedule_process_conn(kgn_conn_t *conn, int sched_intent);
+
+void kgnilnd_schedule_dgram(kgn_device_t *dev);
+int kgnilnd_create_peer_safe(kgn_peer_t **peerp, lnet_nid_t nid, kgn_net_t *net);
+void kgnilnd_add_peer_locked(lnet_nid_t nid, kgn_peer_t *new_stub_peer, kgn_peer_t **peerp);
+int kgnilnd_add_peer(kgn_net_t *net, lnet_nid_t nid, kgn_peer_t **peerp);
+
+kgn_peer_t *kgnilnd_find_peer_locked(lnet_nid_t nid);
+int kgnilnd_del_conn_or_peer(kgn_net_t *net, lnet_nid_t nid, int command, int error);
+void kgnilnd_peer_increase_reconnect_locked(kgn_peer_t *peer);
+void kgnilnd_queue_reply(kgn_conn_t *conn, kgn_tx_t *tx);
+void kgnilnd_queue_tx(kgn_conn_t *conn, kgn_tx_t *tx);
+void kgnilnd_launch_tx(kgn_tx_t *tx, kgn_net_t *net, lnet_process_id_t *target);
+int kgnilnd_send_mapped_tx(kgn_tx_t *tx, int try_map_if_full);
+void kgnilnd_consume_rx(kgn_rx_t *rx);
+
+void kgnilnd_schedule_device(kgn_device_t *dev);
+void kgnilnd_device_callback(__u32 devid, __u64 arg);
+void kgnilnd_schedule_device_timer(unsigned long arg);
+
+int kgnilnd_reaper(void *arg);
+int kgnilnd_scheduler(void *arg);
+int kgnilnd_dgram_mover(void *arg);
+
+int kgnilnd_create_conn(kgn_conn_t **connp, kgn_device_t *dev);
+int kgnilnd_conn_isdup_locked(kgn_peer_t *peer, kgn_conn_t *newconn);
+kgn_conn_t *kgnilnd_find_conn_locked(kgn_peer_t *peer);
+int kgnilnd_get_conn(kgn_conn_t **connp, kgn_peer_t *peer);
+kgn_conn_t *kgnilnd_find_or_create_conn_locked(kgn_peer_t *peer);
+void kgnilnd_peer_cancel_tx_queue(kgn_peer_t *peer);
+void kgnilnd_cancel_peer_connect_locked(kgn_peer_t *peer, struct list_head *zombies);
+int kgnilnd_close_stale_conns_locked(kgn_peer_t *peer, kgn_conn_t *newconn);
+void kgnilnd_peer_alive(kgn_peer_t *peer);
+void kgnilnd_peer_notify(kgn_peer_t *peer, int error);
+void kgnilnd_close_conn_locked(kgn_conn_t *conn, int error);
+void kgnilnd_close_conn(kgn_conn_t *conn, int error);
+void kgnilnd_complete_closed_conn(kgn_conn_t *conn);
+void kgnilnd_destroy_conn_ep(kgn_conn_t *conn);
+
+int kgnilnd_close_peer_conns_locked(kgn_peer_t *peer, int why);
+
+int kgnilnd_tunables_init(void);
+void kgnilnd_tunables_fini(void);
+void kgnilnd_init_msg(kgn_msg_t *msg, int type, lnet_nid_t source);
+
+void kgnilnd_bump_timeouts(__u32 nap_time, char *reason);
+void kgnilnd_pause_threads(void);
+int kgnilnd_hw_in_quiesce(void);
+int kgnilnd_check_hw_quiesce(void);
+void kgnilnd_quiesce_wait(char *reason);
+void kgnilnd_quiesce_end_callback(gni_nic_handle_t nic_handle, uint64_t msecs);
+int kgnilnd_ruhroh_thread(void *arg);
+void kgnilnd_reset_stack(void);
+void kgnilnd_critical_error(gni_err_handle_t err_handle);
+
+void kgnilnd_insert_sysctl(void);
+void kgnilnd_remove_sysctl(void);
+void kgnilnd_proc_init(void);
+void kgnilnd_proc_fini(void);
+
+/* gnilnd_conn.c */
+void kgnilnd_release_mbox(kgn_conn_t *conn, int purgatory_hold);
+
+int kgnilnd_find_and_cancel_dgram(kgn_device_t *dev, lnet_nid_t dst_nid);
+void kgnilnd_cancel_dgram_locked(kgn_dgram_t *dgram);
+void kgnilnd_release_dgram(kgn_device_t *dev, kgn_dgram_t *dgram);
+
+int kgnilnd_setup_wildcard_dgram(kgn_device_t *dev);
+int kgnilnd_cancel_net_dgrams(kgn_net_t *net);
+int kgnilnd_cancel_wc_dgrams(kgn_device_t *dev);
+void kgnilnd_wait_for_canceled_dgrams(kgn_device_t *dev);
+
+int kgnilnd_dgram_waitq(void *arg);
+
+int kgnilnd_set_conn_params(kgn_dgram_t *dgram);
+
+/* struct2str functions - we omit a default: case so the compiler will
+ * warn when a case is missing. This lets us hide these down here out of
+ * the way while still catching any updates to the enums/types above */
+
+#define DO_TYPE(x) case x: return #x;
+static inline const char *
+kgnilnd_fmablk_state2str(kgn_fmablk_state_t state)
+{
+ /* Only want single char string for this */
+ switch (state) {
+ case GNILND_FMABLK_IDLE:
+ return "I";
+ case GNILND_FMABLK_PHYS:
+ return "P";
+ case GNILND_FMABLK_VIRT:
+ return "V";
+ case GNILND_FMABLK_FREED:
+ return "F";
+ }
+ return "<unknown state>";
+}
+
+static inline const char *
+kgnilnd_msgtype2str(int type)
+{
+ switch (type) {
+ DO_TYPE(GNILND_MSG_NONE);
+ DO_TYPE(GNILND_MSG_NOOP);
+ DO_TYPE(GNILND_MSG_IMMEDIATE);
+ DO_TYPE(GNILND_MSG_PUT_REQ);
+ DO_TYPE(GNILND_MSG_PUT_NAK);
+ DO_TYPE(GNILND_MSG_PUT_ACK);
+ DO_TYPE(GNILND_MSG_PUT_DONE);
+ DO_TYPE(GNILND_MSG_GET_REQ);
+ DO_TYPE(GNILND_MSG_GET_NAK);
+ DO_TYPE(GNILND_MSG_GET_DONE);
+ DO_TYPE(GNILND_MSG_CLOSE);
+ }
+ return "<unknown msg type>";
+}
+
+static inline const char *
+kgnilnd_tx_state2str(kgn_tx_list_state_t state)
+{
+ switch (state) {
+ DO_TYPE(GNILND_TX_IDLE);
+ DO_TYPE(GNILND_TX_ALLOCD);
+ DO_TYPE(GNILND_TX_PEERQ);
+ DO_TYPE(GNILND_TX_MAPQ);
+ DO_TYPE(GNILND_TX_FMAQ);
+ DO_TYPE(GNILND_TX_LIVE_FMAQ);
+ DO_TYPE(GNILND_TX_RDMAQ);
+ DO_TYPE(GNILND_TX_LIVE_RDMAQ);
+ DO_TYPE(GNILND_TX_DYING);
+ DO_TYPE(GNILND_TX_FREED);
+ }
+ return "<unknown state>";
+}
+
+static inline const char *
+kgnilnd_conn_state2str(kgn_conn_t *conn)
+{
+ kgn_conn_state_t state = conn->gnc_state;
+ switch (state) {
+ DO_TYPE(GNILND_CONN_DUMMY);
+ DO_TYPE(GNILND_CONN_LISTEN);
+ DO_TYPE(GNILND_CONN_CONNECTING);
+ DO_TYPE(GNILND_CONN_ESTABLISHED);
+ DO_TYPE(GNILND_CONN_CLOSING);
+ DO_TYPE(GNILND_CONN_CLOSED);
+ DO_TYPE(GNILND_CONN_DONE);
+ DO_TYPE(GNILND_CONN_DESTROY_EP);
+ }
+ return "<?state?>";
+}
+
+static inline const char *
+kgnilnd_connreq_type2str(kgn_connreq_t *connreq)
+{
+ kgn_connreq_type_t type = connreq->gncr_type;
+
+ switch (type) {
+ DO_TYPE(GNILND_CONNREQ_REQ);
+ DO_TYPE(GNILND_CONNREQ_NAK);
+ DO_TYPE(GNILND_CONNREQ_CLOSE);
+ }
+ return "<?type?>";
+}
+
+static inline const char *
+kgnilnd_dgram_state2str(kgn_dgram_t *dgram)
+{
+ kgn_dgram_state_t state = dgram->gndg_state;
+
+ switch (state) {
+ DO_TYPE(GNILND_DGRAM_USED);
+ DO_TYPE(GNILND_DGRAM_POSTING);
+ DO_TYPE(GNILND_DGRAM_POSTED);
+ DO_TYPE(GNILND_DGRAM_PROCESSING);
+ DO_TYPE(GNILND_DGRAM_DONE);
+ DO_TYPE(GNILND_DGRAM_CANCELED);
+ }
+ return "<?state?>";
+}
+
+static inline const char *
+kgnilnd_dgram_type2str(kgn_dgram_t *dgram)
+{
+ kgn_dgram_type_t type = dgram->gndg_type;
+
+ switch (type) {
+ DO_TYPE(GNILND_DGRAM_REQ);
+ DO_TYPE(GNILND_DGRAM_WC_REQ);
+ DO_TYPE(GNILND_DGRAM_NAK);
+ DO_TYPE(GNILND_DGRAM_CLOSE);
+ }
+ return "<?type?>";
+}
+
+
+#undef DO_TYPE
+
+/* API wrapper functions - include late to pick up all of the other defines */
+#include "gnilnd_api_wrap.h"
+
+#endif /* _GNILND_GNILND_H_ */
--- /dev/null
+/*
+ * Copyright (C) 2009-2012 Cray, Inc.
+ *
+ * Author: Nic Henke <nic@cray.com>
+ *
+ * This file is part of Lustre, http://www.lustre.org.
+ *
+ * Lustre is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Lustre is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Lustre; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+#ifndef _GNILND_API_WRAP_H
+#define _GNILND_API_WRAP_H
+
+/* LNet is allocated failure locations 0xe000 to 0xffff */
+
+/* GNILND has 0xf0XX */
+#define CFS_FAIL_GNI 0xf000
+#define CFS_FAIL_GNI_PHYS_MAP 0xf001
+#define CFS_FAIL_GNI_VIRT_MAP 0xf002
+#define CFS_FAIL_GNI_GET_UNMAP 0xf003
+#define CFS_FAIL_GNI_PUT_UNMAP 0xf004
+#define CFS_FAIL_GNI_MAP_TX 0xf005
+#define CFS_FAIL_GNI_SMSG_SEND 0xf006
+#define CFS_FAIL_GNI_CLOSE_SEND 0xf007
+#define CFS_FAIL_GNI_CDM_CREATE 0xf008
+#define CFS_FAIL_GNI_CDM_DESTROY 0xf009
+#define CFS_FAIL_GNI_CDM_ATTACH 0xf00a
+#define CFS_FAIL_GNI_CQ_CREATE 0xf00b
+#define CFS_FAIL_GNI_CQ_DESTROY 0xf00c
+#define CFS_FAIL_GNI_EP_BIND 0xf00d
+#define CFS_FAIL_GNI_EP_UNBIND 0xf00e
+#define CFS_FAIL_GNI_EP_SET_EVDATA 0xf00f
+#define CFS_FAIL_GNI_SMSG_INIT 0xf010
+#define CFS_FAIL_GNI_SMSG_RELEASE 0xf011
+#define CFS_FAIL_GNI_POST_RDMA 0xf012
+#define CFS_FAIL_GNI_GET_COMPLETED 0xf013
+#define CFS_FAIL_GNI_EP_DESTROY 0xf015
+#define CFS_FAIL_GNI_VIRT_UNMAP 0xf016
+#define CFS_FAIL_GNI_MDD_RELEASE 0xf017
+#define CFS_FAIL_GNI_NOOP_SEND 0xf018
+#define CFS_FAIL_GNI_ERR_SUBSCRIBE 0xf01a
+#define CFS_FAIL_GNI_QUIESCE_RACE 0xf01b
+#define CFS_FAIL_GNI_DG_TERMINATE 0xf01c
+#define CFS_FAIL_GNI_REG_QUIESCE 0xf01d
+#define CFS_FAIL_GNI_IN_QUIESCE 0xf01e
+#define CFS_FAIL_GNI_DELAY_RDMA 0xf01f
+#define CFS_FAIL_GNI_SR_DOWN_RACE 0xf020
+#define CFS_FAIL_GNI_ALLOC_TX 0xf021
+#define CFS_FAIL_GNI_FMABLK_AVAIL 0xf022
+#define CFS_FAIL_GNI_EP_CREATE 0xf023
+#define CFS_FAIL_GNI_CQ_GET_EVENT 0xf024
+#define CFS_FAIL_GNI_PROBE 0xf025
+#define CFS_FAIL_GNI_EP_TEST 0xf026
+#define CFS_FAIL_GNI_CONNREQ_DROP 0xf027
+#define CFS_FAIL_GNI_CONNREQ_PROTO 0xf028
+#define CFS_FAIL_GNI_CONND_PILEUP 0xf029
+#define CFS_FAIL_GNI_PHYS_SETUP 0xf02a
+#define CFS_FAIL_GNI_FIND_TARGET 0xf02b
+#define CFS_FAIL_GNI_WC_DGRAM_FREE 0xf02c
+#define CFS_FAIL_GNI_DROP_CLOSING 0xf02d
+#define CFS_FAIL_GNI_RX_CLOSE_CLOSING 0xf02e
+#define CFS_FAIL_GNI_RX_CLOSE_CLOSED 0xf02f
+#define CFS_FAIL_GNI_EP_POST 0xf030
+#define CFS_FAIL_GNI_PACK_SRCNID 0xf031
+#define CFS_FAIL_GNI_PACK_DSTNID 0xf032
+#define CFS_FAIL_GNI_PROBE_WAIT 0xf033
+#define CFS_FAIL_GNI_SMSG_CKSUM1 0xf034
+#define CFS_FAIL_GNI_SMSG_CKSUM2 0xf035
+#define CFS_FAIL_GNI_SMSG_CKSUM3 0xf036
+#define CFS_FAIL_GNI_DROP_DESTROY_EP 0xf037
+#define CFS_FAIL_GNI_SMSG_GETNEXT 0xf038
+#define CFS_FAIL_GNI_FINISH_PURG 0xf039
+#define CFS_FAIL_GNI_PURG_REL_DELAY 0xf03a
+#define CFS_FAIL_GNI_DONT_NOTIFY 0xf03b
+#define CFS_FAIL_GNI_VIRT_SMALL_MAP 0xf03c
+#define CFS_FAIL_GNI_DELAY_RDMAQ 0xf03d
+#define CFS_FAIL_GNI_PAUSE_SHUTDOWN 0xf03e
+#define CFS_FAIL_GNI_PAUSE_DGRAM_COMP 0xf03f
+#define CFS_FAIL_GNI_NET_LOOKUP 0xf040
+#define CFS_FAIL_GNI_RECV_TIMEOUT 0xf041
+#define CFS_FAIL_GNI_SEND_TIMEOUT 0xf042
+#define CFS_FAIL_GNI_ONLY_NOOP 0xf043
+#define CFS_FAIL_GNI_FINISH_PURG2 0xf044
+#define CFS_FAIL_GNI_RACE_RESET 0xf045
+#define CFS_FAIL_GNI_GNP_CONNECTING1 0xf046
+#define CFS_FAIL_GNI_GNP_CONNECTING2 0xf047
+#define CFS_FAIL_GNI_GNP_CONNECTING3 0xf048
+#define CFS_FAIL_GNI_PUT_ACK_AGAIN 0xf050
+#define CFS_FAIL_GNI_GET_REQ_AGAIN 0xf051
+
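+/* These values feed the standard libcfs fail_loc mechanism; for example
+ * (illustrative only, assuming the usual lctl interface is available):
+ * lctl set_param fail_loc=0xf006 # fail the next SMSG send
+ * CFS_FAIL_CHECK() in the wrappers below then fires for that location. */
+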
+/* helper macros */
+extern void
+_kgnilnd_api_rc_lbug(const char *rcstr, int rc, struct libcfs_debug_msg_data *data,
+ const char *fmt, ...)
+ __attribute__ ((format (printf, 4, 5)));
+
+#define kgnilnd_api_rc_lbug(msgdata, rc, fmt, a...) \
+do { \
+ CFS_CHECK_STACK(msgdata, D_ERROR, NULL); \
+ /* we don't mask this - it is always at D_ERROR */ \
+ _kgnilnd_api_rc_lbug(kgnilnd_api_rc2str(rc), (rc), msgdata, fmt, ##a); \
+} while (0)
+
+#define DO_RETCODE(x) case x: return #x;
+static inline const char *
+kgnilnd_api_rc2str(gni_return_t rrc)
+{
+ switch (rrc) {
+ DO_RETCODE(GNI_RC_SUCCESS);
+ DO_RETCODE(GNI_RC_NOT_DONE);
+ DO_RETCODE(GNI_RC_INVALID_PARAM);
+ DO_RETCODE(GNI_RC_ERROR_RESOURCE);
+ DO_RETCODE(GNI_RC_TIMEOUT);
+ DO_RETCODE(GNI_RC_PERMISSION_ERROR);
+ DO_RETCODE(GNI_RC_DESCRIPTOR_ERROR);
+ DO_RETCODE(GNI_RC_ALIGNMENT_ERROR);
+ DO_RETCODE(GNI_RC_INVALID_STATE);
+ DO_RETCODE(GNI_RC_NO_MATCH);
+ DO_RETCODE(GNI_RC_SIZE_ERROR);
+ DO_RETCODE(GNI_RC_TRANSACTION_ERROR);
+ DO_RETCODE(GNI_RC_ILLEGAL_OP);
+ DO_RETCODE(GNI_RC_ERROR_NOMEM);
+ }
+ LBUG();
+}
+#undef DO_RETCODE
+
+/* log an error and LBUG for unhandled rc from gni api function
+ * the fmt should be something like:
+ * gni_api_call(arg1, arg2, arg3)
+ */
+
+/* apick_fn and apick_fmt should be defined for each site */
+#undef apick_fn
+#undef apick_fmt
+
+#define GNILND_API_RC_LBUG(args...) \
+do { \
+ LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, D_ERROR, NULL); \
+ kgnilnd_api_rc_lbug(&msgdata, rrc, apick_fn"("apick_fmt")", ##args); \
+} while (0)
+
+#define GNILND_API_SWBUG(args...) \
+do { \
+ CERROR("likely SOFTWARE BUG "apick_fn"("apick_fmt") rc %s\n", \
+ ##args, kgnilnd_api_rc2str(rrc)); \
+} while (0)
+
+#define GNILND_API_EINVAL(args...) \
+do { \
+ CERROR("invalid parameter to "apick_fn"("apick_fmt") rc %s\n", \
+ ##args, kgnilnd_api_rc2str(rrc)); \
+} while (0)
+
+#define GNILND_API_RESOURCE(args...) \
+do { \
+ CERROR("no resources for "apick_fn"("apick_fmt") rc %s\n", \
+ ##args, kgnilnd_api_rc2str(rrc)); \
+} while (0)
+
+#define GNILND_API_BUSY(args...) \
+do { \
+ CERROR("resources busy for "apick_fn"("apick_fmt") rc %s\n", \
+ ##args, kgnilnd_api_rc2str(rrc)); \
+} while (0)
+
+#undef DEBUG_SMSG_CREDITS
+#ifdef DEBUG_SMSG_CREDITS
+#define CRAY_CONFIG_GHAL_GEMINI
+#include <gni_priv.h>
+#define GNIDBG_SMSG_CREDS(level, conn) \
+do { \
+ gni_ep_smsg_mbox_t *smsg = conn->gnc_ephandle->smsg; \
+ CDEBUG(level, "SMSGDBG: conn %p mcred %d/%d bcred %d/%d " \
+ "s_seq %d/%d/%d r_seq %d/%d/%d retr %d\n", \
+ conn, smsg->mbox_credits, smsg->back_mbox_credits, \
+ smsg->buffer_credits, smsg->back_buffer_credits, \
+ smsg->s_seqno, smsg->s_seqno_back_mbox_credits, \
+ smsg->s_seqno_back_buffer_credits, smsg->r_seqno, \
+ smsg->r_seqno_back_mbox_credits, \
+ smsg->r_seqno_back_buffer_credits, smsg->retransmit_count); \
+} while (0)
+#else
+#define GNIDBG_SMSG_CREDS(level, conn) do {} while(0)
+#endif
+
+/* these are all wrappers around gni_XXX functions.
+ * This allows us to handle all the return codes and api checks without
+ * dirtying up the logic code */
+
+/* TODO: RETURN wrapper that translates integer to GNI API RC string */
+
+#define apick_fn "kgnilnd_cdm_create"
+#define apick_fmt "%u, %u, %u, %u, 0x%p"
+static inline gni_return_t kgnilnd_cdm_create(
+ IN uint32_t inst_id,
+ IN uint8_t ptag,
+ IN uint32_t cookie,
+ IN uint32_t modes,
+ OUT gni_cdm_handle_t *cdm_hndl
+ )
+{
+ gni_return_t rrc;
+
+ /* Error injection */
+ if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CDM_CREATE)) {
+ rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_INVALID_PARAM;
+ } else {
+ rrc = gni_cdm_create(inst_id, ptag, cookie, modes, cdm_hndl);
+ }
+
+ switch (rrc) {
+ case GNI_RC_SUCCESS:
+ break;
+ case GNI_RC_ERROR_RESOURCE:
+ case GNI_RC_INVALID_PARAM:
+ /* Try to bail gracefully */
+ GNILND_API_SWBUG(
+ inst_id, ptag, cookie, modes, cdm_hndl);
+ break;
+ default:
+ GNILND_API_RC_LBUG(
+ inst_id, ptag, cookie, modes, cdm_hndl);
+
+ /* LBUG never returns, but just for style and consistency */
+ break;
+ }
+ RETURN(rrc);
+}
+
+#undef apick_fn
+#undef apick_fmt
+
+#define apick_fn "kgnilnd_cdm_attach"
+#define apick_fmt "0x%p, %u, 0x%p, 0x%p"
+static inline gni_return_t kgnilnd_cdm_attach(
+ IN gni_cdm_handle_t cdm_hndl,
+ IN uint32_t device_id,
+ OUT uint32_t *local_addr,
+ OUT gni_nic_handle_t *nic_hndl
+ )
+{
+ gni_return_t rrc;
+
+ /* Error injection */
+ if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CDM_ATTACH)) {
+ rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_INVALID_PARAM;
+ } else {
+ rrc = gni_cdm_attach(cdm_hndl, device_id, local_addr, nic_hndl);
+ }
+
+ switch (rrc) {
+ case GNI_RC_SUCCESS:
+ break;
+ case GNI_RC_NO_MATCH:
+ case GNI_RC_INVALID_PARAM:
+ GNILND_API_SWBUG(
+ cdm_hndl, device_id, local_addr, nic_hndl);
+ break;
+ case GNI_RC_ERROR_RESOURCE:
+ case GNI_RC_INVALID_STATE:
+ GNILND_API_RESOURCE(
+ cdm_hndl, device_id, local_addr, nic_hndl);
+ break;
+ default:
+ GNILND_API_RC_LBUG(
+ cdm_hndl, device_id, local_addr, nic_hndl);
+
+ /* LBUG never returns, but just for style and consistency */
+ break;
+ }
+ RETURN(rrc);
+}
+#undef apick_fmt
+#undef apick_fn
+
+#define apick_fn "kgnilnd_cdm_destroy"
+#define apick_fmt "0x%p"
+static inline gni_return_t kgnilnd_cdm_destroy(
+ IN gni_cdm_handle_t cdm_hndl
+ )
+{
+ gni_return_t rrc;
+
+ /* Error injection */
+ if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CDM_DESTROY)) {
+ rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_INVALID_PARAM;
+ } else {
+ rrc = gni_cdm_destroy(
+ cdm_hndl);
+ }
+
+ switch (rrc) {
+ case GNI_RC_SUCCESS:
+ break;
+ case GNI_RC_INVALID_PARAM:
+ GNILND_API_SWBUG(
+ cdm_hndl);
+ break;
+ default:
+ GNILND_API_RC_LBUG(
+ cdm_hndl);
+
+ /* LBUG never returns, but just for style and consistency */
+ break;
+ }
+ RETURN(rrc);
+}
+#undef apick_fn
+#undef apick_fmt
+
+#define apick_fn "kgnilnd_subscribe_errors"
+#define apick_fmt "0x%p,%x,%u,0x%p,0x%p,0x%p"
+static inline gni_return_t kgnilnd_subscribe_errors(
+ IN gni_nic_handle_t nic_handle,
+ IN gni_error_mask_t mask,
+ IN uint32_t EEQ_size,
+ IN void (*EQ_new_event)(gni_err_handle_t),
+ IN void (*app_crit_err)(gni_err_handle_t),
+ OUT gni_err_handle_t *err_handle
+ )
+{
+ gni_return_t rrc;
+
+ /* Error injection */
+ if (CFS_FAIL_CHECK(CFS_FAIL_GNI_ERR_SUBSCRIBE)) {
+ rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_INVALID_PARAM;
+ } else {
+ rrc = gni_subscribe_errors(
+ nic_handle, mask, EEQ_size, EQ_new_event, app_crit_err,
+ err_handle);
+ }
+
+ switch (rrc) {
+ case GNI_RC_SUCCESS:
+ break;
+ case GNI_RC_INVALID_PARAM:
+ GNILND_API_SWBUG(
+ nic_handle, mask, EEQ_size, EQ_new_event, app_crit_err,
+ err_handle);
+ break;
+ case GNI_RC_ERROR_RESOURCE:
+ GNILND_API_RESOURCE(
+ nic_handle, mask, EEQ_size, EQ_new_event, app_crit_err,
+ err_handle);
+ break;
+ default:
+ GNILND_API_RC_LBUG(
+ nic_handle, mask, EEQ_size, EQ_new_event, app_crit_err,
+ err_handle);
+ /* LBUG never returns, but just for style and consistency */
+ break;
+ }
+ RETURN(rrc);
+}
+#undef apick_fn
+#undef apick_fmt
+
+#define apick_fn "kgnilnd_release_errors"
+#define apick_fmt "0x%p"
+static inline gni_return_t kgnilnd_release_errors(
+ IN gni_err_handle_t err_handle
+ )
+{
+ gni_return_t rrc;
+
+ rrc = gni_release_errors(
+ err_handle);
+
+ switch (rrc) {
+ case GNI_RC_SUCCESS:
+ break;
+ case GNI_RC_INVALID_PARAM:
+ case GNI_RC_NOT_DONE:
+ GNILND_API_SWBUG(
+ err_handle);
+ break;
+ default:
+ GNILND_API_RC_LBUG(
+ err_handle);
+ /* LBUG never returns, but just for style and consistency */
+ break;
+ }
+ RETURN(rrc);
+}
+#undef apick_fn
+#undef apick_fmt
+
+#define apick_fn "kgnilnd_set_quiesce_callback"
+#define apick_fmt "0x%p,0x%p"
+static inline gni_return_t kgnilnd_set_quiesce_callback(
+ IN gni_nic_handle_t nic_handle,
+ IN void (*qsce_func)(gni_nic_handle_t, uint64_t msecs)
+ )
+{
+ gni_return_t rrc;
+
+ /* Error injection */
+ if (CFS_FAIL_CHECK(CFS_FAIL_GNI_REG_QUIESCE)) {
+ rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_INVALID_PARAM;
+ } else {
+ rrc = gni_set_quiesce_callback(
+ nic_handle, qsce_func);
+ }
+
+ switch (rrc) {
+ case GNI_RC_SUCCESS:
+ break;
+ case GNI_RC_INVALID_STATE:
+ case GNI_RC_INVALID_PARAM:
+ GNILND_API_SWBUG(
+ nic_handle, qsce_func);
+ break;
+ default:
+ GNILND_API_RC_LBUG(
+ nic_handle, qsce_func);
+ /* LBUG never returns, but just for style and consistency */
+ break;
+ }
+ RETURN(rrc);
+}
+#undef apick_fn
+#undef apick_fmt
+
+#define apick_fn "kgnilnd_get_quiesce_status"
+#define apick_fmt "0x%p"
+static inline gni_return_t kgnilnd_get_quiesce_status(
+ IN gni_nic_handle_t nic_handle
+ )
+{
+ uint32_t rrc;
+
+ /* this has weird RC -
+ * 0 - quiesce not in progress
+ * 1 - quiesce is turned on
+ */
+
+ /* Error injection */
+ if (CFS_FAIL_CHECK(CFS_FAIL_GNI_IN_QUIESCE)) {
+ rrc = 1;
+ } else {
+ rrc = gni_get_quiesce_status(
+ nic_handle);
+ }
+
+ switch (rrc) {
+ case 1:
+ case 0:
+ break;
+ default:
+ GNILND_API_RC_LBUG(
+ nic_handle);
+ /* LBUG never returns, but just for style and consistency */
+ break;
+ }
+ RETURN(rrc);
+}
+#undef apick_fn
+#undef apick_fmt
+
+#define apick_fn "kgnilnd_cq_create"
+#define apick_fmt "0x%p, %u, %u, 0x%p, "LPX64", 0x%p"
+static inline gni_return_t kgnilnd_cq_create(
+ IN gni_nic_handle_t nic_hndl,
+ IN uint32_t entry_count,
+ IN uint32_t delay_index,
+ IN gni_cq_event_hndlr_f *event_handler,
+ IN uint64_t usr_event_data,
+ OUT gni_cq_handle_t *cq_hndl
+ )
+{
+ gni_return_t rrc;
+
+ /* Error injection */
+ if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CQ_CREATE)) {
+ rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_INVALID_PARAM;
+ } else {
+ rrc = gni_cq_create(
+ nic_hndl, entry_count, delay_index, event_handler,
+ usr_event_data, cq_hndl);
+ }
+
+ switch (rrc) {
+ case GNI_RC_SUCCESS:
+ break;
+ case GNI_RC_INVALID_PARAM:
+ GNILND_API_SWBUG(
+ nic_hndl, entry_count, delay_index, event_handler,
+ usr_event_data, cq_hndl);
+ break;
+ case GNI_RC_ERROR_RESOURCE:
+ GNILND_API_RESOURCE(
+ nic_hndl, entry_count, delay_index, event_handler,
+ usr_event_data, cq_hndl);
+ break;
+ default:
+ GNILND_API_RC_LBUG(
+ nic_hndl, entry_count, delay_index, event_handler,
+ usr_event_data, cq_hndl);
+
+ /* LBUG never returns, but just for style and consistency */
+ break;
+ }
+ RETURN(rrc);
+}
+#undef apick_fn
+#undef apick_fmt
+
+#define apick_fn "kgnilnd_cq_destroy"
+#define apick_fmt "0x%p"
+static inline gni_return_t kgnilnd_cq_destroy(
+ IN gni_cq_handle_t cq_hndl
+ )
+{
+ gni_return_t rrc;
+
+ /* Error injection */
+ if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CQ_DESTROY)) {
+ rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_INVALID_PARAM;
+ } else {
+
+ rrc = gni_cq_destroy(
+ cq_hndl);
+ }
+
+ switch (rrc) {
+ case GNI_RC_SUCCESS:
+ break;
+ case GNI_RC_INVALID_PARAM:
+ GNILND_API_SWBUG(
+ cq_hndl);
+ break;
+ case GNI_RC_ERROR_RESOURCE:
+ GNILND_API_BUSY(
+ cq_hndl);
+ break;
+ default:
+ GNILND_API_RC_LBUG(
+ cq_hndl);
+
+ /* LBUG never returns, but just for style and consistency */
+ break;
+ }
+ RETURN(rrc);
+}
+#undef apick_fn
+#undef apick_fmt
+
+#define apick_fn "kgnilnd_cq_get_event"
+#define apick_fmt "0x%p, 0x%p"
+static inline gni_return_t kgnilnd_cq_get_event(
+ IN gni_cq_handle_t cq_hndl,
+ OUT gni_cq_entry_t *event_data
+ )
+{
+ gni_return_t rrc;
+
+ /* no error injection - CQs are touchy about the data.
+ * where appropriate, we'll do this on the CQs that should be able to
+ * handle the various errors */
+ rrc = gni_cq_get_event(
+ cq_hndl, event_data);
+
+ switch (rrc) {
+ case GNI_RC_SUCCESS:
+ case GNI_RC_NOT_DONE:
+ case GNI_RC_TRANSACTION_ERROR:
+ break;
+ case GNI_RC_ERROR_RESOURCE:
+ LASSERTF(GNI_CQ_OVERRUN(*event_data),
+ "kgni returned ERROR_RESOURCE but cq_hndl 0x%p is not "
+ "overrun\n", cq_hndl);
+ break;
+ case GNI_RC_INVALID_PARAM:
+ GNILND_API_SWBUG(
+ cq_hndl, event_data);
+ break;
+ default:
+ GNILND_API_RC_LBUG(
+ cq_hndl, event_data);
+
+ /* LBUG never returns, but just for style and consistency */
+ break;
+ }
+ return rrc;
+}
+#undef apick_fn
+#undef apick_fmt
+
+#define apick_fn "kgnilnd_smsg_init"
+#define apick_fmt "0x%p, 0x%p, 0x%p"
+static inline gni_return_t kgnilnd_smsg_init(
+ IN gni_ep_handle_t ep_hndl,
+ IN gni_smsg_attr_t *local_smsg_attr,
+ IN gni_smsg_attr_t *remote_smsg_attr
+ )
+{
+ gni_return_t rrc;
+
+ /* Error injection */
+ if (CFS_FAIL_CHECK(CFS_FAIL_GNI_SMSG_INIT)) {
+ rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_ERROR_RESOURCE;
+ } else {
+ rrc = gni_smsg_init(
+ ep_hndl, local_smsg_attr, remote_smsg_attr);
+ }
+
+ switch (rrc) {
+ /* both of these are OK, upper SW needs to handle */
+ case GNI_RC_SUCCESS:
+ case GNI_RC_NOT_DONE:
+ break;
+ case GNI_RC_INVALID_PARAM:
+ case GNI_RC_INVALID_STATE:
+ GNILND_API_SWBUG(
+ ep_hndl, local_smsg_attr, remote_smsg_attr);
+ break;
+ case GNI_RC_ERROR_RESOURCE:
+ GNILND_API_RESOURCE(
+ ep_hndl, local_smsg_attr, remote_smsg_attr);
+ break;
+ default:
+ GNILND_API_RC_LBUG(
+ ep_hndl, local_smsg_attr, remote_smsg_attr);
+
+ /* LBUG never returns, but just for style and consistency */
+ break;
+ }
+ RETURN(rrc);
+}
+#undef apick_fn
+#undef apick_fmt
+
+#define apick_fn "kgnilnd_smsg_send"
+#define apick_fmt "0x%p, 0x%p, %d, 0x%p, %u %u"
+static inline gni_return_t kgnilnd_smsg_send(
+ IN gni_ep_handle_t ep_hndl,
+ IN void *header,
+ IN uint32_t header_length,
+ IN void *data,
+ IN uint32_t data_length,
+ IN uint32_t msg_id
+ )
+{
+ gni_return_t rrc;
+
+ /* Error injection */
+ if (CFS_FAIL_CHECK(CFS_FAIL_GNI_SMSG_SEND)) {
+ if (cfs_fail_loc & CFS_FAIL_RAND) {
+ rrc = GNI_RC_NOT_DONE;
+ } else {
+ rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_NOT_DONE;
+ }
+ } else {
+ rrc = gni_smsg_send(
+ ep_hndl, header, header_length, data, data_length, msg_id);
+ }
+
+ switch (rrc) {
+ /* both of these are OK, upper SW needs to handle */
+ case GNI_RC_SUCCESS:
+ case GNI_RC_NOT_DONE:
+ break;
+ case GNI_RC_INVALID_PARAM:
+ GNILND_API_SWBUG(
+ ep_hndl, header, header_length, data, data_length, msg_id);
+ break;
+ case GNI_RC_ERROR_RESOURCE:
+ GNILND_API_RESOURCE(
+ ep_hndl, header, header_length, data, data_length, msg_id);
+ break;
+ default:
+ GNILND_API_RC_LBUG(
+ ep_hndl, header, header_length, data, data_length, msg_id);
+
+ /* LBUG never returns, but just for style and consistency */
+ break;
+ }
+ RETURN(rrc);
+}
+#undef apick_fn
+#undef apick_fmt
+
+#define apick_fn "kgnilnd_smsg_getnext"
+#define apick_fmt "0x%p,0x%p"
+static inline gni_return_t kgnilnd_smsg_getnext(
+ IN gni_ep_handle_t ep_hndl,
+ OUT void **header
+ )
+{
+ gni_return_t rrc;
+
+ /* Error injection */
+ if (CFS_FAIL_CHECK(CFS_FAIL_GNI_SMSG_GETNEXT)) {
+ rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_NOT_DONE;
+ } else {
+ rrc = gni_smsg_getnext(
+ ep_hndl, header);
+ }
+
+ switch (rrc) {
+ /* all of these are OK, upper SW needs to handle */
+ case GNI_RC_SUCCESS:
+ case GNI_RC_NOT_DONE:
+ case GNI_RC_INVALID_STATE:
+ break;
+ case GNI_RC_INVALID_PARAM:
+ GNILND_API_SWBUG(
+ ep_hndl, header);
+ break;
+ default:
+ GNILND_API_RC_LBUG(
+ ep_hndl, header);
+
+ /* LBUG never returns, but just for style and consistency */
+ break;
+ }
+ RETURN(rrc);
+}
+#undef apick_fn
+#undef apick_fmt
+
+#define apick_fn "kgnilnd_smsg_release"
+#define apick_fmt "0x%p"
+static inline gni_return_t kgnilnd_smsg_release(
+ IN gni_ep_handle_t ep_hndl
+ )
+{
+ gni_return_t rrc;
+
+ /* Error injection */
+ if (CFS_FAIL_CHECK(CFS_FAIL_GNI_SMSG_RELEASE)) {
+ rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_INVALID_PARAM;
+ } else {
+ rrc = gni_smsg_release(
+ ep_hndl);
+ }
+
+ switch (rrc) {
+ /* both of these are OK, upper SW needs to handle */
+ case GNI_RC_SUCCESS:
+ case GNI_RC_NOT_DONE:
+ break;
+ case GNI_RC_INVALID_PARAM:
+ GNILND_API_SWBUG(
+ ep_hndl);
+ break;
+ default:
+ GNILND_API_RC_LBUG(
+ ep_hndl);
+
+ /* LBUG never returns, but just for style and consistency */
+ break;
+ }
+ RETURN(rrc);
+}
+#undef apick_fn
+#undef apick_fmt
+
+#define apick_fn "kgnilnd_ep_create"
+#define apick_fmt "0x%p, 0x%p, 0x%p"
+static inline gni_return_t kgnilnd_ep_create(
+ IN gni_nic_handle_t nic_hndl,
+ IN gni_cq_handle_t src_cq_hndl,
+ OUT gni_ep_handle_t *ep_hndl
+ )
+{
+ gni_return_t rrc;
+
+ /* error injection */
+ if (CFS_FAIL_CHECK(CFS_FAIL_GNI_EP_CREATE)) {
+ rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_ERROR_NOMEM;
+ } else {
+ rrc = gni_ep_create(
+ nic_hndl, src_cq_hndl, ep_hndl);
+ }
+
+ switch (rrc) {
+ case GNI_RC_SUCCESS:
+ break;
+ case GNI_RC_INVALID_PARAM:
+ GNILND_API_SWBUG(
+ nic_hndl, src_cq_hndl, ep_hndl);
+ break;
+ case GNI_RC_ERROR_NOMEM:
+ GNILND_API_RESOURCE(
+ nic_hndl, src_cq_hndl, ep_hndl);
+ break;
+ default:
+ GNILND_API_RC_LBUG(
+ nic_hndl, src_cq_hndl, ep_hndl);
+
+ /* lbug never returns, but just for style and consistency */
+ break;
+ }
+ RETURN(rrc);
+}
+#undef apick_fn
+#undef apick_fmt
+
+#define apick_fn "kgnilnd_ep_bind"
+#define apick_fmt "0x%p, %x, %x"
+static inline gni_return_t kgnilnd_ep_bind(
+ IN gni_ep_handle_t ep_hndl,
+ IN uint32_t remote_addr,
+ IN uint32_t remote_id
+ )
+{
+ gni_return_t rrc;
+
+ /* error injection */
+ if (CFS_FAIL_CHECK(CFS_FAIL_GNI_EP_BIND)) {
+ rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_NOT_DONE;
+ } else {
+ rrc = gni_ep_bind(
+ ep_hndl, remote_addr, remote_id);
+ }
+
+ switch (rrc) {
+ /* both of these are ok, upper sw needs to handle */
+ case GNI_RC_SUCCESS:
+ case GNI_RC_NOT_DONE:
+ break;
+ case GNI_RC_INVALID_PARAM:
+ GNILND_API_SWBUG(
+ ep_hndl, remote_addr, remote_id);
+ break;
+ default:
+ GNILND_API_RC_LBUG(
+ ep_hndl, remote_addr, remote_id);
+
+ /* lbug never returns, but just for style and consistency */
+ break;
+ }
+ RETURN(rrc);
+}
+#undef apick_fn
+#undef apick_fmt
+
+#define apick_fn "kgnilnd_ep_set_eventdata"
+#define apick_fmt "0x%p, %x, %x"
+static inline gni_return_t kgnilnd_ep_set_eventdata(
+ IN gni_ep_handle_t ep_hndl,
+ IN uint32_t local_event,
+ IN uint32_t remote_event
+ )
+{
+ gni_return_t rrc;
+
+ /* Error injection */
+ if (CFS_FAIL_CHECK(CFS_FAIL_GNI_EP_SET_EVDATA)) {
+ rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_INVALID_PARAM;
+ } else {
+ rrc = gni_ep_set_eventdata(
+ ep_hndl, local_event, remote_event);
+ }
+
+ switch (rrc) {
+ case GNI_RC_SUCCESS:
+ break;
+ case GNI_RC_INVALID_PARAM:
+ GNILND_API_SWBUG(
+ ep_hndl, local_event, remote_event);
+ break;
+ default:
+ GNILND_API_RC_LBUG(
+ ep_hndl, local_event, remote_event);
+
+ /* LBUG never returns, but just for style and consistency */
+ break;
+ }
+ RETURN(rrc);
+}
+#undef apick_fn
+#undef apick_fmt
+
+#define apick_fn "kgnilnd_ep_unbind"
+#define apick_fmt "0x%p"
+static inline gni_return_t kgnilnd_ep_unbind(
+ IN gni_ep_handle_t ep_hndl
+ )
+{
+ gni_return_t rrc;
+
+ /* Error injection */
+ if (CFS_FAIL_CHECK(CFS_FAIL_GNI_EP_UNBIND)) {
+ rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_NOT_DONE;
+ } else {
+ rrc = gni_ep_unbind(
+ ep_hndl);
+ }
+
+ switch (rrc) {
+ /* both of these are OK, upper SW needs to handle */
+ case GNI_RC_NOT_DONE:
+ case GNI_RC_SUCCESS:
+ break;
+ case GNI_RC_INVALID_PARAM:
+ GNILND_API_SWBUG(
+ ep_hndl);
+ break;
+ default:
+ GNILND_API_RC_LBUG(
+ ep_hndl);
+
+ /* LBUG never returns, but just for style and consistency */
+ break;
+ }
+ RETURN(rrc);
+}
+#undef apick_fn
+#undef apick_fmt
+
+#define apick_fn "kgnilnd_ep_destroy"
+#define apick_fmt "0x%p"
+static inline gni_return_t kgnilnd_ep_destroy(
+ IN gni_ep_handle_t ep_hndl
+ )
+{
+ gni_return_t rrc;
+
+ /* Error injection */
+ if (CFS_FAIL_CHECK(CFS_FAIL_GNI_EP_DESTROY)) {
+ rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_NOT_DONE;
+ } else {
+ rrc = gni_ep_destroy(
+ ep_hndl);
+ }
+
+ switch (rrc) {
+ case GNI_RC_SUCCESS:
+ break;
+ case GNI_RC_INVALID_PARAM:
+ GNILND_API_SWBUG(
+ ep_hndl);
+ break;
+ default:
+ GNILND_API_RC_LBUG(
+ ep_hndl);
+
+ /* LBUG never returns, but just for style and consistency */
+ break;
+ }
+ RETURN(rrc);
+}
+#undef apick_fn
+#undef apick_fmt
+
+#define apick_fn "kgnilnd_ep_postdata_w_id"
+#define apick_fmt "0x%p, 0x%p, %d, 0x%p, %d, "LPU64""
+static inline gni_return_t kgnilnd_ep_postdata_w_id(
+ IN gni_ep_handle_t ep_hndl,
+ IN void *in_data,
+ IN uint16_t data_len,
+ IN void *out_buf,
+ IN uint16_t buf_size,
+ IN uint64_t datagram_id
+ )
+{
+ gni_return_t rrc;
+
+ /* Error injection */
+ if (CFS_FAIL_CHECK(CFS_FAIL_GNI_EP_POST)) {
+ rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_SIZE_ERROR;
+ } else {
+ rrc = gni_ep_postdata_w_id(
+ ep_hndl, in_data, data_len, out_buf, buf_size,
+ datagram_id);
+ }
+
+ switch (rrc) {
+ case GNI_RC_SUCCESS:
+ case GNI_RC_ERROR_NOMEM:
+ case GNI_RC_ERROR_RESOURCE:
+ break;
+ case GNI_RC_INVALID_PARAM:
+ case GNI_RC_SIZE_ERROR:
+ GNILND_API_SWBUG(
+ ep_hndl, in_data, data_len, out_buf, buf_size,
+ datagram_id);
+ break;
+ default:
+ GNILND_API_RC_LBUG(
+ ep_hndl, in_data, data_len, out_buf, buf_size,
+ datagram_id);
+
+ /* LBUG never returns, but just for style and consistency */
+ break;
+ }
+ RETURN(rrc);
+}
+#undef apick_fn
+#undef apick_fmt
+
+#define apick_fn "kgnilnd_ep_postdata_test_by_id"
+#define apick_fmt "0x%p, "LPU64", 0x%p, 0x%p, 0x%p"
+static inline gni_return_t kgnilnd_ep_postdata_test_by_id(
+ IN gni_ep_handle_t ep_hndl,
+ IN uint64_t datagram_id,
+ OUT gni_post_state_t *post_state,
+ OUT uint32_t *remote_addr,
+ OUT uint32_t *remote_id
+ )
+{
+ gni_return_t rrc;
+
+ /* Error injection */
+ if (CFS_FAIL_CHECK(CFS_FAIL_GNI_EP_TEST)) {
+ rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_ERROR_NOMEM;
+ } else {
+ rrc = gni_ep_postdata_test_by_id(
+ ep_hndl, datagram_id, post_state, remote_addr,
+ remote_id);
+
+ /* we want to lie, but we need to do the actual work first
+ * so we don't keep getting the event saying a dgram is ready */
+ if (rrc == GNI_RC_SUCCESS && CFS_FAIL_CHECK(CFS_FAIL_GNI_DG_TERMINATE)) {
+ /* don't use fail_val, allows us to do FAIL_SOME */
+ *post_state = GNI_POST_TERMINATED;
+ }
+ }
+
+ switch (rrc) {
+ case GNI_RC_SUCCESS:
+ case GNI_RC_NO_MATCH:
+ break;
+ case GNI_RC_SIZE_ERROR:
+ case GNI_RC_INVALID_PARAM:
+ GNILND_API_SWBUG(
+ ep_hndl, datagram_id, post_state, remote_addr,
+ remote_id);
+ break;
+ case GNI_RC_ERROR_NOMEM:
+ GNILND_API_RESOURCE(
+ ep_hndl, datagram_id, post_state, remote_addr,
+ remote_id);
+ break;
+ default:
+ GNILND_API_RC_LBUG(
+ ep_hndl, datagram_id, post_state, remote_addr,
+ remote_id);
+
+ /* LBUG never returns, but just for style and consistency */
+ break;
+ }
+ RETURN(rrc);
+}
+#undef apick_fn
+#undef apick_fmt
+
+#define apick_fn "kgnilnd_ep_postdata_cancel_by_id"
+#define apick_fmt "0x%p, "LPU64""
+static inline gni_return_t kgnilnd_ep_postdata_cancel_by_id(
+ IN gni_ep_handle_t ep_hndl,
+ IN uint64_t datagram_id
+ )
+{
+ gni_return_t rrc;
+
+ /* no error injection as the only thing we'd do is LBUG */
+
+ rrc = gni_ep_postdata_cancel_by_id(
+ ep_hndl, datagram_id);
+
+ switch (rrc) {
+ case GNI_RC_SUCCESS:
+ case GNI_RC_NO_MATCH:
+ break;
+ case GNI_RC_INVALID_PARAM:
+ GNILND_API_SWBUG(
+ ep_hndl, datagram_id);
+ break;
+ default:
+ GNILND_API_RC_LBUG(
+ ep_hndl, datagram_id);
+
+ /* LBUG never returns, but just for style and consistency */
+ break;
+ }
+ RETURN(rrc);
+}
+#undef apick_fn
+#undef apick_fmt
+
+#define apick_fn "kgnilnd_postdata_probe_by_id"
+#define apick_fmt "0x%p, 0x%p"
+static inline gni_return_t kgnilnd_postdata_probe_by_id(
+ IN gni_nic_handle_t nic_hndl,
+ OUT uint64_t *datagram_id
+ )
+{
+ gni_return_t rrc;
+
+ /* Error injection */
+ if (CFS_FAIL_CHECK(CFS_FAIL_GNI_PROBE)) {
+ rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_NO_MATCH;
+ } else {
+ rrc = gni_postdata_probe_by_id(
+ nic_hndl, datagram_id);
+ }
+
+ switch (rrc) {
+ case GNI_RC_SUCCESS:
+ case GNI_RC_NO_MATCH:
+ break;
+ case GNI_RC_INVALID_PARAM:
+ GNILND_API_SWBUG(
+ nic_hndl, datagram_id);
+ break;
+ default:
+ GNILND_API_RC_LBUG(
+ nic_hndl, datagram_id);
+
+ /* LBUG never returns, but just for style and consistency */
+ break;
+ }
+ RETURN(rrc);
+}
+#undef apick_fn
+#undef apick_fmt
+
+#define apick_fn "kgnilnd_postdata_probe_wait_by_id"
+#define apick_fmt "0x%p, %d, 0x%p"
+static inline gni_return_t kgnilnd_postdata_probe_wait_by_id(
+ IN gni_nic_handle_t nic_hndl,
+ IN uint32_t timeout,
+ OUT uint64_t *datagram_id
+ )
+{
+ gni_return_t rrc;
+
+ /* Error injection */
+ if (CFS_FAIL_CHECK(CFS_FAIL_GNI_PROBE_WAIT)) {
+ rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_TIMEOUT;
+ } else {
+ rrc = gni_postdata_probe_wait_by_id(
+ nic_hndl, timeout, datagram_id);
+ }
+
+ switch (rrc) {
+ case GNI_RC_SUCCESS:
+ case GNI_RC_TIMEOUT:
+ break;
+ case GNI_RC_INVALID_PARAM:
+ GNILND_API_SWBUG(
+ nic_hndl, timeout, datagram_id);
+ break;
+ default:
+ GNILND_API_RC_LBUG(
+ nic_hndl, timeout, datagram_id);
+
+ /* LBUG never returns, but just for style and consistency */
+ break;
+ }
+ RETURN(rrc);
+}
+#undef apick_fn
+#undef apick_fmt
+
+#define apick_fn "kgnilnd_post_rdma"
+#define apick_fmt "0x%p, 0x%p"
+static inline gni_return_t kgnilnd_post_rdma(
+ IN gni_ep_handle_t ep_hndl,
+ IN gni_post_descriptor_t *post_descr
+ )
+{
+ gni_return_t rrc;
+
+ /* Error injection */
+ if (CFS_FAIL_CHECK(CFS_FAIL_GNI_POST_RDMA)) {
+ rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_INVALID_PARAM;
+ } else {
+ rrc = gni_post_rdma(
+ ep_hndl, post_descr);
+ }
+
+ switch (rrc) {
+ case GNI_RC_SUCCESS:
+ break;
+ case GNI_RC_ALIGNMENT_ERROR:
+ case GNI_RC_INVALID_PARAM:
+ GNILND_API_SWBUG(
+ ep_hndl, post_descr);
+ break;
+ case GNI_RC_ERROR_RESOURCE:
+ GNILND_API_RESOURCE(
+ ep_hndl, post_descr);
+ break;
+ default:
+ GNILND_API_RC_LBUG(
+ ep_hndl, post_descr);
+
+ /* LBUG never returns, but just for style and consistency */
+ break;
+ }
+ RETURN(rrc);
+}
+#undef apick_fn
+#undef apick_fmt
+
+#define apick_fn "kgnilnd_get_completed"
+#define apick_fmt "0x%p,"LPX64",0x%p"
+static inline gni_return_t kgnilnd_get_completed(
+ IN gni_cq_handle_t cq_hndl,
+ IN gni_cq_entry_t event_data,
+ OUT gni_post_descriptor_t **post_descr
+ )
+{
+ gni_return_t rrc;
+
+ rrc = gni_get_completed(cq_hndl, event_data, post_descr);
+
+ switch (rrc) {
+ case GNI_RC_TRANSACTION_ERROR:
+ case GNI_RC_SUCCESS:
+ break;
+ case GNI_RC_DESCRIPTOR_ERROR:
+ case GNI_RC_INVALID_PARAM:
+ GNILND_API_SWBUG(cq_hndl, event_data, post_descr);
+ break;
+ default:
+ GNILND_API_RC_LBUG(cq_hndl, event_data, post_descr);
+ /* LBUG never returns, but just for style and consistency */
+ break;
+ }
+
+ /* Error injection - we need a valid desc, so let kgni give us one
+ * - then we lie */
+ if (rrc == GNI_RC_SUCCESS &&
+ (CFS_FAIL_CHECK(CFS_FAIL_GNI_GET_COMPLETED))) {
+ /* We only trigger TRANSACTION_ERROR for now */
+ gni_post_descriptor_t *desc;
+ rrc = GNI_RC_TRANSACTION_ERROR;
+ desc = *post_descr;
+ desc->status = rrc;
+ /* recoverable decision made from cfs_fail_val in
+ * kgnilnd_cq_error_str and
+ * kgnilnd_cq_error_recoverable */
+ }
+ RETURN(rrc);
+}
+#undef apick_fn
+#undef apick_fmt
+
+#define apick_fn "kgnilnd_cq_error_str"
+#define apick_fmt LPX64",0x%p,%d"
+static inline gni_return_t kgnilnd_cq_error_str(
+ IN gni_cq_entry_t entry,
+ IN void *buffer,
+ IN uint32_t len
+ )
+{
+ gni_return_t rrc;
+
+ /* Error injection - set string if we injected a
+ * TRANSACTION_ERROR earlier */
+ if (CFS_FAIL_CHECK(CFS_FAIL_GNI_GET_COMPLETED)) {
+ /* if we just set persistent error, we can't ever
+ * break in via ssh to clear, so use a count > 10 to indicate fatal */
+ snprintf(buffer, len, "INJECT:%s", cfs_fail_val > 10 ?
+ "FATAL" : "RECOVERABLE");
+ rrc = GNI_RC_SUCCESS;
+ } else {
+ rrc = gni_cq_error_str(
+ entry, buffer, len);
+ }
+
+ switch (rrc) {
+ case GNI_RC_SUCCESS:
+ break;
+ case GNI_RC_SIZE_ERROR:
+ case GNI_RC_INVALID_PARAM:
+ GNILND_API_SWBUG(
+ entry, buffer, len);
+ /* give them something to use */
+ snprintf(buffer, len, "UNDEF:UNDEF");
+ break;
+ default:
+ GNILND_API_RC_LBUG(
+ entry, buffer, len);
+
+ /* LBUG never returns, but just for style and consistency */
+ break;
+ }
+ RETURN(rrc);
+}
+#undef apick_fn
+#undef apick_fmt
+
+#define apick_fn "kgnilnd_cq_error_recoverable"
+#define apick_fmt LPX64",0x%p"
+static inline gni_return_t kgnilnd_cq_error_recoverable(
+ IN gni_cq_entry_t entry,
+ IN uint32_t *recoverable
+ )
+{
+ gni_return_t rrc;
+
+ /* Error injection - set recoverable if we injected a
+ * TRANSACTION_ERROR earlier */
+ if (CFS_FAIL_CHECK(CFS_FAIL_GNI_GET_COMPLETED)) {
+ *recoverable = cfs_fail_val > 10 ? 0 : 1;
+ rrc = GNI_RC_SUCCESS;
+ } else {
+ rrc = gni_cq_error_recoverable(
+ entry, recoverable);
+ }
+
+ switch (rrc) {
+ case GNI_RC_SUCCESS:
+ break;
+ case GNI_RC_INVALID_STATE:
+ case GNI_RC_INVALID_PARAM:
+ GNILND_API_SWBUG(
+ entry, recoverable);
+ *recoverable = 0;
+ break;
+ default:
+ GNILND_API_RC_LBUG(
+ entry, recoverable);
+
+ /* LBUG never returns, but just for style and consistency */
+ break;
+ }
+ RETURN(rrc);
+}
+#undef apick_fn
+#undef apick_fmt
+
+#define apick_fn "kgnilnd_mem_register_segments"
+#define apick_fmt "0x%p,0x%p,%u,0x%p,%x,0x%p"
+static inline gni_return_t
+kgnilnd_mem_register_segments(
+ IN gni_nic_handle_t nic_hndl,
+ IN gni_mem_segment_t *mem_segments,
+ IN uint32_t segments_cnt,
+ IN gni_cq_handle_t dst_cq_hndl,
+ IN uint32_t flags,
+ OUT gni_mem_handle_t *mem_hndl
+ )
+{
+ gni_return_t rrc;
+
+ /* Error injection */
+ if (CFS_FAIL_CHECK(CFS_FAIL_GNI_PHYS_MAP)) {
+ rrc = GNI_RC_ERROR_RESOURCE;
+ } else {
+ rrc = gni_mem_register_segments(
+ nic_hndl, mem_segments, segments_cnt,
+ dst_cq_hndl, flags, mem_hndl);
+ }
+
+ switch (rrc) {
+ case GNI_RC_SUCCESS:
+ case GNI_RC_ERROR_RESOURCE:
+ break;
+ case GNI_RC_INVALID_PARAM:
+ GNILND_API_SWBUG(
+ nic_hndl, mem_segments, segments_cnt,
+ dst_cq_hndl, flags, mem_hndl);
+ break;
+ default:
+ GNILND_API_RC_LBUG(
+ nic_hndl, mem_segments, segments_cnt,
+ dst_cq_hndl, flags, mem_hndl);
+
+ /* LBUG never returns, but just for style and consistency */
+ break;
+ }
+ RETURN(rrc);
+}
+#undef apick_fn
+#undef apick_fmt
+
+#define apick_fn "kgnilnd_mem_register"
+#define apick_fmt "0x%p,"LPX64","LPX64"0x%p,%u,0x%p"
+static inline gni_return_t kgnilnd_mem_register(
+ IN gni_nic_handle_t nic_hndl,
+ IN uint64_t address,
+ IN uint64_t length,
+ IN gni_cq_handle_t dst_cq_hndl,
+ IN uint32_t flags,
+ OUT gni_mem_handle_t *mem_hndl
+ )
+{
+ gni_return_t rrc;
+
+ /* Error injection */
+ if (CFS_FAIL_CHECK(CFS_FAIL_GNI_VIRT_MAP)) {
+ rrc = GNI_RC_ERROR_RESOURCE;
+ } else if (CFS_FAIL_CHECK(CFS_FAIL_GNI_VIRT_SMALL_MAP) &&
+ length <= *kgnilnd_tunables.kgn_max_immediate) {
+ rrc = GNI_RC_INVALID_PARAM;
+ } else {
+ rrc = gni_mem_register(
+ nic_hndl, address, length,
+ dst_cq_hndl, flags, mem_hndl);
+ }
+
+ switch (rrc) {
+ case GNI_RC_SUCCESS:
+ case GNI_RC_ERROR_RESOURCE:
+ break;
+ case GNI_RC_INVALID_PARAM:
+ GNILND_API_SWBUG(
+ nic_hndl, address, length,
+ dst_cq_hndl, flags, mem_hndl);
+ break;
+ default:
+ GNILND_API_RC_LBUG(
+ nic_hndl, address, length,
+ dst_cq_hndl, flags, mem_hndl);
+
+ /* LBUG never returns, but just for style and consistency */
+ break;
+ }
+ RETURN(rrc);
+}
+#undef apick_fn
+#undef apick_fmt
+
+#define apick_fn "kgnilnd_mem_deregister"
+#define apick_fmt "0x%p,0x%p,%d"
+static inline gni_return_t kgnilnd_mem_deregister(
+ IN gni_nic_handle_t nic_hndl,
+ IN gni_mem_handle_t *mem_hndl,
+ IN int hold_timeout
+ )
+{
+ gni_return_t rrc;
+
+ /* Error injection */
+ if (CFS_FAIL_CHECK(CFS_FAIL_GNI_VIRT_UNMAP)) {
+ rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_INVALID_PARAM;
+ } else {
+ rrc = gni_mem_deregister(
+ nic_hndl, mem_hndl, hold_timeout);
+ }
+
+ switch (rrc) {
+ case GNI_RC_SUCCESS:
+ break;
+ case GNI_RC_INVALID_PARAM:
+ GNILND_API_SWBUG(
+ nic_hndl, mem_hndl, hold_timeout);
+ break;
+ default:
+ GNILND_API_RC_LBUG(
+ nic_hndl, mem_hndl, hold_timeout);
+
+ /* LBUG never returns, but just for style and consistency */
+ break;
+ }
+ RETURN(rrc);
+}
+#undef apick_fn
+#undef apick_fmt
+
+#define apick_fn "kgnilnd_mem_mdd_release"
+#define apick_fmt "0x%p,0x%p"
+static inline gni_return_t kgnilnd_mem_mdd_release(
+ IN gni_nic_handle_t nic_hndl,
+ IN gni_mem_handle_t *mem_hndl
+ )
+{
+ gni_return_t rrc;
+
+ /* Error injection */
+ if (CFS_FAIL_CHECK(CFS_FAIL_GNI_MDD_RELEASE)) {
+ rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_NO_MATCH;
+ } else {
+ rrc = gni_mem_mdd_release(
+ nic_hndl, mem_hndl);
+ }
+
+ switch (rrc) {
+ case GNI_RC_SUCCESS:
+ case GNI_RC_NO_MATCH:
+ break;
+ default:
+ GNILND_API_RC_LBUG(
+ nic_hndl, mem_hndl);
+
+ /* LBUG never returns, but just for style and consistency */
+ break;
+ }
+ RETURN(rrc);
+}
+#undef apick_fn
+#undef apick_fmt
+
+#endif /* _GNILND_API_WRAP_H */
--- /dev/null
+/*
+ * Copyright (C) 2004 Cluster File Systems, Inc.
+ *
+ * Copyright (C) 2009-2012 Cray, Inc.
+ *
+ * Derived from work by Eric Barton <eric@bartonsoftware.com>
+ * Author: Nic Henke <nic@cray.com>
+ *
+ * This file is part of Lustre, http://www.lustre.org.
+ *
+ * Lustre is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Lustre is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Lustre; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#include <linux/nmi.h>
+#include "gnilnd.h"
+
+/* this is useful when debugging wire corruption */
+static void
+kgnilnd_dump_blob(int level, char *prefix, void *buf, int len)
+{
+ __u64 *ptr;
+
+ ptr = (__u64 *) buf;
+
+ while (len > 0) {
+ if (len >= 32) {
+ CDEBUG(level,
+ "%s 0x%p: 0x%16.16llx 0x%16.16llx 0x%16.16llx 0x%16.16llx\n",
+ prefix, ptr, *(ptr), *(ptr + 1), *(ptr + 2), *(ptr + 3));
+ ptr += 4;
+ len -= 32;
+ } else if (len >= 16) {
+ CDEBUG(level,
+ "%s 0x%p: 0x%16.16llx 0x%16.16llx\n",
+ prefix, ptr, *(ptr), *(ptr + 1));
+ ptr += 2;
+ len -= 16;
+ } else {
+ CDEBUG(level, "%s 0x%p: 0x%16.16llx\n",
+ prefix, ptr, *(ptr));
+ ptr++;
+ len -= 8;
+ }
+ }
+}
+
+static void
+kgnilnd_dump_msg(int mask, kgn_msg_t *msg)
+{
+ CDEBUG(mask, "0x%8.8x 0x%4.4x 0x%4.4x 0x%16.16llx"
+ " 0x%16.16llx 0x%8.8x 0x%4.4x 0x%4.4x 0x%8.8x\n",
+ msg->gnm_magic, msg->gnm_version,
+ msg->gnm_type, msg->gnm_srcnid,
+ msg->gnm_connstamp, msg->gnm_seq,
+ msg->gnm_cksum, msg->gnm_payload_cksum,
+ msg->gnm_payload_len);
+}
+
+void
+kgnilnd_schedule_device(kgn_device_t *dev)
+{
+ short already_live = 0;
+
+ /* we'll only want to wake if the scheduler thread
+ * has come around and set ready to zero */
+ already_live = cmpxchg(&dev->gnd_ready, GNILND_DEV_IDLE, GNILND_DEV_IRQ);
+
+ if (!already_live) {
+ wake_up_all(&dev->gnd_waitq);
+ }
+ return;
+}
+
+void kgnilnd_schedule_device_timer(unsigned long arg)
+{
+ kgn_device_t *dev = (kgn_device_t *) arg;
+
+ kgnilnd_schedule_device(dev);
+}
+
+void
+kgnilnd_device_callback(__u32 devid, __u64 arg)
+{
+ kgn_device_t *dev;
+ int index = (int) arg;
+
+ if (index >= kgnilnd_data.kgn_ndevs) {
+ /* use _EMERG instead of an LBUG to prevent LBUG'ing in
+ * interrupt context. */
+ LCONSOLE_EMERG("callback for unknown device %d->%d\n",
+ devid, index);
+ return;
+ }
+
+ dev = &kgnilnd_data.kgn_devices[index];
+ /* just basic sanity */
+ if (dev->gnd_id == devid) {
+ kgnilnd_schedule_device(dev);
+ } else {
+ LCONSOLE_EMERG("callback for bad device %d devid %d\n",
+ dev->gnd_id, devid);
+ }
+}
+
+/* sched_intent values:
+ * < 0 : do not reschedule under any circumstances
+ * == 0: reschedule if someone marked the conn WANTS_SCHED
+ * > 0 : force a reschedule */
+
+void
+kgnilnd_schedule_process_conn(kgn_conn_t *conn, int sched_intent)
+{
+ int conn_sched;
+
+ /* move back to IDLE but save previous state.
+ * if we see WANTS_SCHED, we'll call kgnilnd_schedule_conn and
+ * let the xchg there handle any racing callers to get it
+ * onto gnd_ready_conns */
+
+ conn_sched = xchg(&conn->gnc_scheduled, GNILND_CONN_IDLE);
+ LASSERTF(conn_sched == GNILND_CONN_WANTS_SCHED ||
+ conn_sched == GNILND_CONN_PROCESS,
+ "conn %p after process in bad state: %d\n",
+ conn, conn_sched);
+
+ if (sched_intent >= 0) {
+ if (sched_intent > 0 || conn_sched == GNILND_CONN_WANTS_SCHED) {
+ kgnilnd_schedule_conn(conn);
+ }
+ }
+}
+
+void
+kgnilnd_schedule_conn(kgn_conn_t *conn)
+{
+ kgn_device_t *dev = conn->gnc_device;
+ int sched;
+
+ sched = xchg(&conn->gnc_scheduled, GNILND_CONN_WANTS_SCHED);
+
+ /* if we are IDLE, add to list - only one guy sees IDLE and "wins"
+ * the chance to put it onto gnd_ready_conns.
+ * otherwise, leave marked as WANTS_SCHED and the thread that "owns"
+ * the conn in process_conns will take care of moving it back to
+ * SCHED when it is done processing */
+
+ if (sched == GNILND_CONN_IDLE) {
+ /* if the conn is already scheduled, we've already requested
+ * the scheduler thread wakeup */
+ kgnilnd_conn_addref(conn); /* +1 ref for scheduler */
+
+ LASSERTF(list_empty(&conn->gnc_schedlist), "conn %p already sched state %d\n",
+ conn, sched);
+
+ CDEBUG(D_INFO, "scheduling conn 0x%p\n", conn);
+
+ spin_lock(&dev->gnd_lock);
+ list_add_tail(&conn->gnc_schedlist, &dev->gnd_ready_conns);
+ spin_unlock(&dev->gnd_lock);
+ set_mb(conn->gnc_last_sched_ask, jiffies);
+
+ } else {
+ CDEBUG(D_INFO, "not scheduling conn 0x%p: %d\n", conn, sched);
+ }
+
+ /* make sure thread(s) going to process conns - but let it make
+ * separate decision from conn schedule */
+ kgnilnd_schedule_device(dev);
+}
+
+void
+kgnilnd_schedule_dgram(kgn_device_t *dev)
+{
+ int wake;
+
+ wake = xchg(&dev->gnd_dgram_ready, GNILND_DGRAM_SCHED);
+ if (wake != GNILND_DGRAM_SCHED) {
+ wake_up(&dev->gnd_dgram_waitq);
+ } else {
+ CDEBUG(D_NETTRACE, "not waking: %d\n", wake);
+ }
+}
+
+void
+kgnilnd_free_tx(kgn_tx_t *tx)
+{
+ /* taken from kgnilnd_tx_add_state_locked */
+
+ LASSERTF((tx->tx_list_p == NULL &&
+ tx->tx_list_state == GNILND_TX_ALLOCD) &&
+ list_empty(&tx->tx_list),
+ "tx %p with bad state %s (list_p %p) tx_list %s\n",
+ tx, kgnilnd_tx_state2str(tx->tx_list_state), tx->tx_list_p,
+ list_empty(&tx->tx_list) ? "empty" : "not empty");
+
+ atomic_dec(&kgnilnd_data.kgn_ntx);
+
+ /* we only allocate this if we need to */
+ if (tx->tx_phys != NULL) {
+ cfs_mem_cache_free(kgnilnd_data.kgn_tx_phys_cache, tx->tx_phys);
+ CDEBUG(D_MALLOC, "slab-freed 'tx_phys': %lu at %p.\n",
+ LNET_MAX_IOV * sizeof(gni_mem_segment_t), tx->tx_phys);
+ }
+#if 0
+ KGNILND_POISON(tx, 0x5a, sizeof(kgn_tx_t));
+#endif
+ cfs_mem_cache_free(kgnilnd_data.kgn_tx_cache, tx);
+ CDEBUG(D_MALLOC, "slab-freed 'tx': %lu at %p.\n",
+ sizeof(*tx), tx);
+}
+
+kgn_tx_t *
+kgnilnd_alloc_tx(void)
+{
+ kgn_tx_t *tx = NULL;
+
+ if (CFS_FAIL_CHECK(CFS_FAIL_GNI_ALLOC_TX))
+ return tx;
+
+ tx = cfs_mem_cache_alloc(kgnilnd_data.kgn_tx_cache, CFS_ALLOC_ATOMIC);
+ if (tx == NULL) {
+ CERROR("failed to allocate tx\n");
+ return NULL;
+ }
+ CDEBUG(D_MALLOC, "slab-alloced 'tx': %lu at %p.\n",
+ sizeof(*tx), tx);
+
+ /* need this memset, cache alloc'd memory is not cleared */
+ memset(tx, 0, sizeof(*tx));
+
+ /* setup everything here to minimize time under the lock */
+ tx->tx_buftype = GNILND_BUF_NONE;
+ tx->tx_msg.gnm_type = GNILND_MSG_NONE;
+ INIT_LIST_HEAD(&tx->tx_list);
+ INIT_LIST_HEAD(&tx->tx_map_list);
+ tx->tx_list_state = GNILND_TX_ALLOCD;
+
+ atomic_inc(&kgnilnd_data.kgn_ntx);
+
+ return tx;
+}
+
+/* csum_fold needs to be run on the return value before shipping over the wire */
+#define _kgnilnd_cksum(seed, ptr, nob) csum_partial(ptr, nob, seed)
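+
+/* note: csum_partial takes a running sum as its seed, which is what lets
+ * kgnilnd_cksum_kiov below chain the checksum across page fragments */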
+
+/* we don't use offset as everyone is passing a buffer reference that already
+ * includes the offset into the base address -
+ * see kgnilnd_setup_virt_buffer and kgnilnd_setup_immediate_buffer */
+static inline __u16
+kgnilnd_cksum(void *ptr, size_t nob)
+{
+ __u16 sum;
+
+ sum = csum_fold(_kgnilnd_cksum(0, ptr, nob));
+
+ /* don't use magic 'no checksum' value */
+ if (sum == 0)
+ sum = 1;
+
+ CDEBUG(D_INFO, "cksum 0x%x for ptr 0x%p sz %zu\n",
+ sum, ptr, nob);
+
+ return sum;
+}
+
+inline __u16
+kgnilnd_cksum_kiov(unsigned int nkiov, lnet_kiov_t *kiov,
+ unsigned int offset, unsigned int nob, int dump_blob)
+{
+ __wsum cksum = 0;
+ __wsum tmpck;
+ __u16 retsum;
+ void *addr;
+ unsigned int fraglen;
+ int i, odd;
+
+ LASSERT(nkiov > 0);
+ LASSERT(nob > 0);
+
+ CDEBUG(D_BUFFS, "calc cksum for kiov 0x%p nkiov %u offset %u nob %u, dump %d\n",
+ kiov, nkiov, offset, nob, dump_blob);
+
+ /* if this loop changes, please change kgnilnd_setup_phys_buffer */
+
+ while (offset >= kiov->kiov_len) {
+ offset -= kiov->kiov_len;
+ nkiov--;
+ kiov++;
+ LASSERT(nkiov > 0);
+ }
+
+ /* ignore nob here; if nob < (kiov_len - offset), nkiov == 1 */
+ odd = (unsigned long) (kiov[0].kiov_len - offset) & 1;
+
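+ /* csum_partial only tolerates an odd length at the very end of the
+ * region being summed - an odd-length fragment mid-chain would shift
+ * the 16-bit pairing of every byte after it. So if any non-final
+ * fragment is odd (or the vmap_cksum tunable forces it), map all the
+ * pages into one contiguous virtual range and sum in a single pass. */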
+ if ((odd || *kgnilnd_tunables.kgn_vmap_cksum) && nkiov > 1) {
+ struct page **pages = kgnilnd_data.kgn_cksum_map_pages[get_cpu()];
+
+ LASSERTF(pages != NULL, "NULL pages for cpu %d map_pages 0x%p\n",
+ get_cpu(), kgnilnd_data.kgn_cksum_map_pages);
+
+ CDEBUG(D_BUFFS, "odd %d len %u offset %u nob %u\n",
+ odd, kiov[0].kiov_len, offset, nob);
+
+ for (i = 0; i < nkiov; i++) {
+ pages[i] = kiov[i].kiov_page;
+ }
+
+ addr = vmap(pages, nkiov, VM_MAP, PAGE_KERNEL);
+ if (addr == NULL) {
+ CNETERR("Couldn't vmap %d frags on %d bytes to avoid odd length fragment in cksum\n",
+ nkiov, nob);
+ /* return zero to avoid killing tx - we'll just get a warning on the
+ * console when the remote end sees a zero checksum */
+ RETURN(0);
+ }
+ atomic_inc(&kgnilnd_data.kgn_nvmap_cksum);
+
+ tmpck = _kgnilnd_cksum(0, (void *) addr + kiov[0].kiov_offset + offset, nob);
+ cksum = tmpck;
+
+ if (dump_blob) {
+ kgnilnd_dump_blob(D_BUFFS, "flat kiov RDMA payload",
+ (void *)addr + kiov[0].kiov_offset + offset, nob);
+ }
+ CDEBUG(D_BUFFS, "cksum 0x%x (+0x%x) for addr 0x%p+%u len %u offset %u\n",
+ cksum, tmpck, addr, kiov[0].kiov_offset, nob, offset);
+ vunmap(addr);
+ } else {
+ do {
+ fraglen = min(kiov->kiov_len - offset, nob);
+
+ /* make dang sure we don't send a bogus checksum if somehow we get
+ * an odd length fragment on anything but the last entry in a kiov -
+ * we know from kgnilnd_setup_rdma_buffer that we can't have non
+ * PAGE_SIZE pages in the middle, so if nob < PAGE_SIZE, it is the last one */
+ LASSERTF(!(fraglen&1) || (nob < PAGE_SIZE),
+ "odd fraglen %u on nkiov %d, nob %u kiov_len %u offset %u kiov 0x%p\n",
+ fraglen, nkiov, nob, kiov->kiov_len, offset, kiov);
+
+ addr = (void *)kmap(kiov->kiov_page) + kiov->kiov_offset + offset;
+ tmpck = _kgnilnd_cksum(cksum, addr, fraglen);
+
+ CDEBUG(D_BUFFS,
+ "cksum 0x%x (+0x%x) for page 0x%p+%u (0x%p) len %u offset %u\n",
+ cksum, tmpck, kiov->kiov_page, kiov->kiov_offset, addr,
+ fraglen, offset);
+
+ cksum = tmpck;
+
+ if (dump_blob)
+ kgnilnd_dump_blob(D_BUFFS, "kiov cksum", addr, fraglen);
+
+ kunmap(kiov->kiov_page);
+
+ kiov++;
+ nkiov--;
+ nob -= fraglen;
+ offset = 0;
+
+ /* iov must not run out before end of data */
+ LASSERTF(nob == 0 || nkiov > 0, "nob %u nkiov %u\n", nob, nkiov);
+
+ } while (nob > 0);
+ }
+
+ retsum = csum_fold(cksum);
+
+ /* don't use magic 'no checksum' value */
+ if (retsum == 0)
+ retsum = 1;
+
+ CDEBUG(D_BUFFS, "retsum 0x%x from cksum 0x%x\n", retsum, cksum);
+
+ return retsum;
+}
+
+void
+kgnilnd_init_msg(kgn_msg_t *msg, int type, lnet_nid_t source)
+{
+ msg->gnm_magic = GNILND_MSG_MAGIC;
+ msg->gnm_version = GNILND_MSG_VERSION;
+ msg->gnm_type = type;
+ msg->gnm_payload_len = 0;
+ msg->gnm_srcnid = source;
+ /* gnm_connstamp gets set when FMA is sent */
+ /* gnm_srcnid is set on creation via function argument
+ * The right interface/net and nid is passed in when the message
+ * is created.
+ */
+}
+
+kgn_tx_t *
+kgnilnd_new_tx_msg(int type, lnet_nid_t source)
+{
+ kgn_tx_t *tx = kgnilnd_alloc_tx();
+
+ if (tx != NULL) {
+ kgnilnd_init_msg(&tx->tx_msg, type, source);
+ } else {
+ CERROR("couldn't allocate new tx type %s!\n",
+ kgnilnd_msgtype2str(type));
+ }
+
+ return tx;
+}
+
+static void
+kgnilnd_nak_rdma(kgn_conn_t *conn, int type, int error, __u64 cookie, lnet_nid_t source)
+{
+ kgn_tx_t *tx;
+
+ /* only allow NAK on error and truncate to zero */
+ LASSERTF(error <= 0, "error %d conn 0x%p, cookie "LPU64"\n",
+ error, conn, cookie);
+
+ tx = kgnilnd_new_tx_msg(type, source);
+ if (tx == NULL) {
+ CNETERR("can't get TX to NAK RDMA to %s\n",
+ libcfs_nid2str(conn->gnc_peer->gnp_nid));
+ return;
+ }
+
+ tx->tx_msg.gnm_u.completion.gncm_retval = error;
+ tx->tx_msg.gnm_u.completion.gncm_cookie = cookie;
+ kgnilnd_queue_tx(conn, tx);
+}
+
+int
+kgnilnd_setup_immediate_buffer(kgn_tx_t *tx, unsigned int niov, struct iovec *iov,
+ lnet_kiov_t *kiov, unsigned int offset, unsigned int nob)
+{
+ kgn_msg_t *msg = &tx->tx_msg;
+ int i;
+
+ /* To help save on MDDs for short messages, we'll vmap a kiov to allow
+ * gni_smsg_send to send that as the payload */
+
+ LASSERT(tx->tx_buftype == GNILND_BUF_NONE);
+ LASSERT(nob >= 0);
+
+ if (nob == 0) {
+ tx->tx_buffer = NULL;
+ } else if (kiov != NULL) {
+ LASSERTF(niov > 0 && niov < GNILND_MAX_IMMEDIATE/PAGE_SIZE,
+ "bad niov %d\n", niov);
+
+ while (offset >= kiov->kiov_len) {
+ offset -= kiov->kiov_len;
+ niov--;
+ kiov++;
+ LASSERT(niov > 0);
+ }
+ for (i = 0; i < niov; i++) {
+ /* We can't have a kiov_offset on anything but the first entry,
+ * otherwise we'll have a hole at the end of the mapping as we only map
+ * whole pages.
+ * Also, if we have a kiov_len < PAGE_SIZE but we need to map more
+ * than kiov_len, we will also have a hole at the end of that page
+ * which isn't allowed */
+ if ((kiov[i].kiov_offset != 0 && i > 0) ||
+ (kiov[i].kiov_offset + kiov[i].kiov_len != CFS_PAGE_SIZE && i < niov - 1)) {
+ CNETERR("Can't make payload contiguous in I/O VM:"
+ "page %d, offset %u, nob %u, kiov_offset %u kiov_len %u \n",
+ i, offset, nob, kiov->kiov_offset, kiov->kiov_len);
+ RETURN(-EINVAL);
+ }
+ tx->tx_imm_pages[i] = kiov[i].kiov_page;
+ }
+
+ /* hijack tx_phys for the later unmap */
+ if (niov == 1) {
+ /* tx->tx_phys being NULL is the signal for unmap to discern between kmap and vmap */
+ tx->tx_phys = NULL;
+ tx->tx_buffer = (void *)kmap(tx->tx_imm_pages[0]) + kiov[0].kiov_offset + offset;
+ atomic_inc(&kgnilnd_data.kgn_nkmap_short);
+ GNIDBG_TX(D_NET, tx, "kmapped page for %d bytes for kiov 0x%p, buffer 0x%p",
+ nob, kiov, tx->tx_buffer);
+ } else {
+ tx->tx_phys = vmap(tx->tx_imm_pages, niov, VM_MAP, PAGE_KERNEL);
+ if (tx->tx_phys == NULL) {
+ CNETERR("Couldn't vmap %d frags on %d bytes\n", niov, nob);
+ RETURN(-ENOMEM);
+
+ }
+ atomic_inc(&kgnilnd_data.kgn_nvmap_short);
+ /* make sure we take into account the kiov offset as the start of the buffer */
+ tx->tx_buffer = (void *)tx->tx_phys + kiov[0].kiov_offset + offset;
+ GNIDBG_TX(D_NET, tx, "mapped %d pages for %d bytes from kiov 0x%p to 0x%p, buffer 0x%p",
+ niov, nob, kiov, tx->tx_phys, tx->tx_buffer);
+ }
+ tx->tx_buftype = GNILND_BUF_IMMEDIATE_KIOV;
+ tx->tx_nob = nob;
+
+ } else {
+ /* For now this is almost identical to kgnilnd_setup_virt_buffer, but we
+ * could "flatten" the payload into a single contiguous buffer ready
+ * for sending direct over an FMA if we ever needed to. */
+
+ LASSERT(niov > 0);
+
+ while (offset >= iov->iov_len) {
+ offset -= iov->iov_len;
+ niov--;
+ iov++;
+ LASSERT(niov > 0);
+ }
+
+ if (nob > iov->iov_len - offset) {
+ CERROR("Can't handle multiple vaddr fragments\n");
+ return -EMSGSIZE;
+ }
+
+ tx->tx_buffer = (void *)(((unsigned long)iov->iov_base) + offset);
+
+ tx->tx_buftype = GNILND_BUF_IMMEDIATE;
+ tx->tx_nob = nob;
+ }
+
+ /* checksum payload early - it shouldn't be changing after lnd_send */
+ if (*kgnilnd_tunables.kgn_checksum >= 2) {
+ msg->gnm_payload_cksum = kgnilnd_cksum(tx->tx_buffer, nob);
+ if (CFS_FAIL_CHECK(CFS_FAIL_GNI_SMSG_CKSUM2)) {
+ msg->gnm_payload_cksum += 0xe00e;
+ }
+ if (*kgnilnd_tunables.kgn_checksum_dump > 1) {
+ kgnilnd_dump_blob(D_BUFFS, "payload checksum",
+ tx->tx_buffer, nob);
+ }
+ } else {
+ msg->gnm_payload_cksum = 0;
+ }
+
+ return 0;
+}
+
+int
+kgnilnd_setup_virt_buffer(kgn_tx_t *tx,
+ unsigned int niov, struct iovec *iov,
+ unsigned int offset, unsigned int nob)
+{
+ LASSERT(nob > 0);
+ LASSERT(niov > 0);
+ LASSERT(tx->tx_buftype == GNILND_BUF_NONE);
+
+ while (offset >= iov->iov_len) {
+ offset -= iov->iov_len;
+ niov--;
+ iov++;
+ LASSERT(niov > 0);
+ }
+
+ if (nob > iov->iov_len - offset) {
+ CERROR("Can't handle multiple vaddr fragments\n");
+ return -EMSGSIZE;
+ }
+
+ tx->tx_buftype = GNILND_BUF_VIRT_UNMAPPED;
+ tx->tx_nob = nob;
+ tx->tx_buffer = (void *)(((unsigned long)iov->iov_base) + offset);
+ return 0;
+}
+
+int
+kgnilnd_setup_phys_buffer(kgn_tx_t *tx, int nkiov, lnet_kiov_t *kiov,
+ unsigned int offset, unsigned int nob)
+{
+ gni_mem_segment_t *phys;
+ int rc = 0;
+ unsigned int fraglen;
+
+ GNIDBG_TX(D_NET, tx, "niov %d kiov 0x%p offset %u nob %u", nkiov, kiov, offset, nob);
+
+ LASSERT(nob > 0);
+ LASSERT(nkiov > 0);
+ LASSERT(tx->tx_buftype == GNILND_BUF_NONE);
+
+ /* only allocate this if we are going to use it */
+ tx->tx_phys = cfs_mem_cache_alloc(kgnilnd_data.kgn_tx_phys_cache,
+ CFS_ALLOC_ATOMIC);
+ if (tx->tx_phys == NULL) {
+ CERROR("failed to allocate tx_phys\n");
+ rc = -ENOMEM;
+ GOTO(error, rc);
+ }
+
+ CDEBUG(D_MALLOC, "slab-alloced 'tx->tx_phys': %lu at %p.\n",
+ LNET_MAX_IOV * sizeof(gni_mem_segment_t), tx->tx_phys);
+
+ /* if this loop changes, please change kgnilnd_cksum_kiov
+ * and kgnilnd_setup_immediate_buffer */
+
+ while (offset >= kiov->kiov_len) {
+ offset -= kiov->kiov_len;
+ nkiov--;
+ kiov++;
+ LASSERT(nkiov > 0);
+ }
+
+ /* at this point, kiov points to the first page that we'll actually map
+ * now that we've advanced past offset and dropped any leading pages
+ * that fall entirely within the offset */
+ tx->tx_buftype = GNILND_BUF_PHYS_UNMAPPED;
+ tx->tx_nob = nob;
+
+ /* kiov_offset is start of 'valid' buffer, so index offset past that */
+ tx->tx_buffer = (void *)((unsigned long)(kiov->kiov_offset + offset));
+ phys = tx->tx_phys;
+
+ CDEBUG(D_NET, "tx 0x%p buffer 0x%p map start kiov 0x%p+%u niov %d offset %u\n",
+ tx, tx->tx_buffer, kiov, kiov->kiov_offset, nkiov, offset);
+
+ do {
+ fraglen = min(kiov->kiov_len - offset, nob);
+
+ /* We can't have a kiov_offset on anything but the first entry,
+ * otherwise we'll have a hole at the end of the mapping as we only map
+ * whole pages. Only the first page is allowed to have an offset -
+ * we'll add that into tx->tx_buffer and that will get used when we
+ * map in the segments (see kgnilnd_map_buffer).
+ * Also, if we have a kiov_len < PAGE_SIZE but we need to map more
+ * than kiov_len, we will also have a hole at the end of that page
+ * which isn't allowed */
+ if ((phys != tx->tx_phys) &&
+ ((kiov->kiov_offset != 0) ||
+ ((kiov->kiov_len < PAGE_SIZE) && (nob > kiov->kiov_len)))) {
+ CERROR("Can't make payload contiguous in I/O VM:"
+ "page %d, offset %u, nob %u, kiov_offset %u kiov_len %u \n",
+ (int)(phys - tx->tx_phys),
+ offset, nob, kiov->kiov_offset, kiov->kiov_len);
+ rc = -EINVAL;
+ GOTO(error, rc);
+ }
+
+ if ((phys - tx->tx_phys) == LNET_MAX_IOV) {
+ CERROR ("payload too big (%d)\n", (int)(phys - tx->tx_phys));
+ rc = -EMSGSIZE;
+ GOTO(error, rc);
+ }
+
+ if (CFS_FAIL_CHECK(CFS_FAIL_GNI_PHYS_SETUP)) {
+ rc = -EINVAL;
+ GOTO(error, rc);
+ }
+
+ CDEBUG(D_BUFFS, "page 0x%p kiov_offset %u kiov_len %u nob %u "
+ "nkiov %u offset %u\n",
+ kiov->kiov_page, kiov->kiov_offset, kiov->kiov_len, nob, nkiov, offset);
+
+ phys->address = lnet_page2phys(kiov->kiov_page);
+ phys++;
+ kiov++;
+ nkiov--;
+ nob -= fraglen;
+ offset = 0;
+
+ /* iov must not run out before end of data */
+ LASSERTF(nob == 0 || nkiov > 0, "nob %u nkiov %u\n", nob, nkiov);
+
+ } while (nob > 0);
+
+ tx->tx_phys_npages = phys - tx->tx_phys;
+
+ return 0;
+
+error:
+ if (tx->tx_phys != NULL) {
+ cfs_mem_cache_free(kgnilnd_data.kgn_tx_phys_cache, tx->tx_phys);
+ CDEBUG(D_MALLOC, "slab-freed 'tx_phys': %lu at %p.\n",
+ LNET_MAX_IOV * sizeof(gni_mem_segment_t), tx->tx_phys);
+ tx->tx_phys = NULL;
+ }
+ return rc;
+}
+
+static inline int
+kgnilnd_setup_rdma_buffer(kgn_tx_t *tx, unsigned int niov,
+ struct iovec *iov, lnet_kiov_t *kiov,
+ unsigned int offset, unsigned int nob)
+{
+ int rc;
+
+ LASSERT((iov == NULL) != (kiov == NULL));
+
+ if (kiov != NULL) {
+ rc = kgnilnd_setup_phys_buffer(tx, niov, kiov, offset, nob);
+ } else {
+ rc = kgnilnd_setup_virt_buffer(tx, niov, iov, offset, nob);
+ }
+ return rc;
+}
+
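+/* For a GET, the buffer we RDMA is described by the local memory descriptor
+ * that the eventual REPLY will land in (lntmsg->msg_md), not by the
+ * msg_kiov/msg_niov of the GET message itself - hence the special case
+ * below. */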
+static void
+kgnilnd_parse_lnet_rdma(lnet_msg_t *lntmsg, unsigned int *niov, unsigned int *offset,
+ unsigned int *nob, lnet_kiov_t **kiov)
+{
+ /* GETs are weird, see kgnilnd_send */
+ if (lntmsg->msg_type == LNET_MSG_GET) {
+ if ((lntmsg->msg_md->md_options & LNET_MD_KIOV) == 0) {
+ *kiov = NULL;
+ } else {
+ *kiov = lntmsg->msg_md->md_iov.kiov;
+ }
+ *niov = lntmsg->msg_md->md_niov;
+ *nob = lntmsg->msg_md->md_length;
+ *offset = 0;
+ } else {
+ *kiov = lntmsg->msg_kiov;
+ *niov = lntmsg->msg_niov;
+ *nob = lntmsg->msg_len;
+ *offset = lntmsg->msg_offset;
+ }
+}
+
+static inline void
+kgnilnd_compute_rdma_cksum(kgn_tx_t *tx)
+{
+ unsigned int niov, offset, nob;
+ lnet_kiov_t *kiov;
+ lnet_msg_t *lntmsg = tx->tx_lntmsg[0];
+ int dump_cksum = (*kgnilnd_tunables.kgn_checksum_dump > 1);
+
+ GNITX_ASSERTF(tx, ((tx->tx_msg.gnm_type == GNILND_MSG_PUT_DONE) ||
+ (tx->tx_msg.gnm_type == GNILND_MSG_GET_DONE)),
+ "bad type %s", kgnilnd_msgtype2str(tx->tx_msg.gnm_type));
+
+ if (*kgnilnd_tunables.kgn_checksum < 3) {
+ tx->tx_msg.gnm_payload_cksum = 0;
+ return;
+ }
+
+ GNITX_ASSERTF(tx, lntmsg, "no LNet message!", NULL);
+
+ kgnilnd_parse_lnet_rdma(lntmsg, &niov, &offset, &nob, &kiov);
+
+ if (kiov != NULL) {
+ tx->tx_msg.gnm_payload_cksum = kgnilnd_cksum_kiov(niov, kiov, offset, nob, dump_cksum);
+ } else {
+ tx->tx_msg.gnm_payload_cksum = kgnilnd_cksum(tx->tx_buffer, nob);
+ if (dump_cksum) {
+ kgnilnd_dump_blob(D_BUFFS, "peer RDMA payload", tx->tx_buffer, nob);
+ }
+ }
+
+ if (CFS_FAIL_CHECK(CFS_FAIL_GNI_SMSG_CKSUM3)) {
+ tx->tx_msg.gnm_payload_cksum += 0xd00d;
+ }
+}
+
+static inline int
+kgnilnd_verify_rdma_cksum(kgn_tx_t *tx, __u16 rx_cksum)
+{
+ int rc = 0;
+ __u16 cksum;
+ unsigned int niov, offset, nob;
+ lnet_kiov_t *kiov;
+ lnet_msg_t *lntmsg = tx->tx_lntmsg[0];
+ int dump_on_err = *kgnilnd_tunables.kgn_checksum_dump;
+
+ /* we can only match certain requests */
+ GNITX_ASSERTF(tx, ((tx->tx_msg.gnm_type == GNILND_MSG_GET_REQ) ||
+ (tx->tx_msg.gnm_type == GNILND_MSG_PUT_ACK)),
+ "bad type %s", kgnilnd_msgtype2str(tx->tx_msg.gnm_type));
+
+ if (rx_cksum == 0) {
+ if (*kgnilnd_tunables.kgn_checksum >= 3) {
+ GNIDBG_MSG(D_WARNING, &tx->tx_msg,
+ "no RDMA payload checksum when enabled");
+ }
+ return 0;
+ }
+
+ GNITX_ASSERTF(tx, lntmsg, "no LNet message!", NULL);
+
+ kgnilnd_parse_lnet_rdma(lntmsg, &niov, &offset, &nob, &kiov);
+
+ if (kiov != NULL) {
+ cksum = kgnilnd_cksum_kiov(niov, kiov, offset, nob, 0);
+ } else {
+ cksum = kgnilnd_cksum(tx->tx_buffer, nob);
+ }
+
+ if (cksum != rx_cksum) {
+ GNIDBG_MSG(D_NETERROR, &tx->tx_msg,
+ "Bad RDMA payload checksum (%x expected %x); "
+ "kiov 0x%p niov %d nob %u offset %u",
+ cksum, rx_cksum, kiov, niov, nob, offset);
+ switch (dump_on_err) {
+ case 2:
+ if (kiov != NULL) {
+ kgnilnd_cksum_kiov(niov, kiov, offset, nob, 1);
+ } else {
+ kgnilnd_dump_blob(D_BUFFS, "RDMA payload",
+ tx->tx_buffer, nob);
+ }
+ /* fall through to dump log */
+ case 1:
+ libcfs_debug_dumplog();
+ break;
+ default:
+ break;
+ }
+ rc = -ENOKEY;
+ /* kgnilnd_check_fma_rx will close conn, kill tx with error */
+ }
+ return rc;
+}
+
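+/* Track a newly registered buffer on the device map list. gnd_map_lock
+ * protects the list and the per-type counters; for PUT_ACK/GET_REQ we also
+ * charge gnd_rdmaq_bytes_out so outstanding RDMA can be throttled. */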
+void
+kgnilnd_mem_add_map_list(kgn_device_t *dev, kgn_tx_t *tx)
+{
+ int bytes;
+
+ GNITX_ASSERTF(tx, list_empty(&tx->tx_map_list),
+ "already mapped!", NULL);
+
+ spin_lock(&dev->gnd_map_lock);
+ switch (tx->tx_buftype) {
+ default:
+ GNIDBG_TX(D_EMERG, tx,
+ "SOFTWARE BUG: invalid mapping %d", tx->tx_buftype);
+ spin_unlock(&dev->gnd_map_lock);
+ LBUG();
+ break;
+
+ case GNILND_BUF_PHYS_MAPPED:
+ bytes = tx->tx_phys_npages * PAGE_SIZE;
+ dev->gnd_map_nphys++;
+ dev->gnd_map_physnop += tx->tx_phys_npages;
+ break;
+
+ case GNILND_BUF_VIRT_MAPPED:
+ bytes = tx->tx_nob;
+ dev->gnd_map_nvirt++;
+ dev->gnd_map_virtnob += tx->tx_nob;
+ break;
+ }
+
+ if (tx->tx_msg.gnm_type == GNILND_MSG_PUT_ACK ||
+ tx->tx_msg.gnm_type == GNILND_MSG_GET_REQ) {
+ atomic64_add(bytes, &dev->gnd_rdmaq_bytes_out);
+ GNIDBG_TX(D_NETTRACE, tx, "rdma ++ %d to "LPD64"",
+ bytes, atomic64_read(&dev->gnd_rdmaq_bytes_out));
+ }
+
+ atomic_inc(&dev->gnd_n_mdd);
+ atomic64_add(bytes, &dev->gnd_nbytes_map);
+
+ /* clear retrans to prevent any SMSG goofiness as that code uses the same counter */
+ tx->tx_retrans = 0;
+
+ /* we only get here in the valid cases */
+ list_add_tail(&tx->tx_map_list, &dev->gnd_map_list);
+ dev->gnd_map_version++;
+ spin_unlock(&dev->gnd_map_lock);
+}
+
+void
+kgnilnd_mem_del_map_list(kgn_device_t *dev, kgn_tx_t *tx)
+{
+ int bytes;
+
+ GNITX_ASSERTF(tx, !list_empty(&tx->tx_map_list),
+ "not mapped!", NULL);
+ spin_lock(&dev->gnd_map_lock);
+
+ switch (tx->tx_buftype) {
+ default:
+ GNIDBG_TX(D_EMERG, tx,
+ "SOFTWARE BUG: invalid mapping %d", tx->tx_buftype);
+ spin_unlock(&dev->gnd_map_lock);
+ LBUG();
+ break;
+
+ case GNILND_BUF_PHYS_UNMAPPED:
+ bytes = tx->tx_phys_npages * PAGE_SIZE;
+ dev->gnd_map_nphys--;
+ dev->gnd_map_physnop -= tx->tx_phys_npages;
+ break;
+
+ case GNILND_BUF_VIRT_UNMAPPED:
+ bytes = tx->tx_nob;
+ dev->gnd_map_nvirt--;
+ dev->gnd_map_virtnob -= tx->tx_nob;
+ break;
+ }
+
+ if (tx->tx_msg.gnm_type == GNILND_MSG_PUT_ACK ||
+ tx->tx_msg.gnm_type == GNILND_MSG_GET_REQ) {
+ atomic64_sub(bytes, &dev->gnd_rdmaq_bytes_out);
+ LASSERTF(atomic64_read(&dev->gnd_rdmaq_bytes_out) >= 0,
+ "bytes_out negative! %ld\n", atomic64_read(&dev->gnd_rdmaq_bytes_out));
+ GNIDBG_TX(D_NETTRACE, tx, "rdma -- %d to "LPD64"",
+ bytes, atomic64_read(&dev->gnd_rdmaq_bytes_out));
+ }
+
+ atomic_dec(&dev->gnd_n_mdd);
+ atomic64_sub(bytes, &dev->gnd_nbytes_map);
+
+ /* we only get here in the valid cases */
+ list_del_init(&tx->tx_map_list);
+ dev->gnd_map_version++;
+ spin_unlock(&dev->gnd_map_lock);
+}
+
+int
+kgnilnd_map_buffer(kgn_tx_t *tx)
+{
+ kgn_conn_t *conn = tx->tx_conn;
+ kgn_device_t *dev;
+ __u32 flags = GNI_MEM_READWRITE;
+ gni_return_t rrc;
+
+ /* The kgnilnd_mem_register(_segments) Gemini Driver functions can
+ * be called concurrently as there are internal locks that protect
+ * any data structures or HW resources. We just need to ensure
+ * that our concurrency doesn't result in the kgn_device_t
+ * getting nuked while we are in here */
+
+ LASSERTF(conn != NULL, "tx %p with NULL conn, someone forgot"
+ " to set tx_conn before calling %s\n", tx, __FUNCTION__);
+ dev = conn->gnc_device;
+
+ if (unlikely(CFS_FAIL_CHECK(CFS_FAIL_GNI_MAP_TX)))
+ RETURN(-ENOMEM);
+
+ if (*kgnilnd_tunables.kgn_bte_relaxed_ordering) {
+ flags |= GNI_MEM_RELAXED_PI_ORDERING;
+ }
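+ /* relaxed PI ordering lets the HW complete RDMA writes out of order
+ * for throughput; presumably safe here because we act only on the
+ * completion event, never on payload arrival order */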
+
+ switch (tx->tx_buftype) {
+ default:
+ LBUG();
+
+ case GNILND_BUF_NONE:
+ case GNILND_BUF_IMMEDIATE:
+ case GNILND_BUF_IMMEDIATE_KIOV:
+ case GNILND_BUF_PHYS_MAPPED:
+ case GNILND_BUF_VIRT_MAPPED:
+ return 0;
+
+ case GNILND_BUF_PHYS_UNMAPPED:
+ GNITX_ASSERTF(tx, tx->tx_phys != NULL, "physical buffer not there!", NULL);
+ rrc = kgnilnd_mem_register_segments(dev->gnd_handle,
+ tx->tx_phys, tx->tx_phys_npages, NULL,
+ GNI_MEM_PHYS_SEGMENTS | flags,
+ &tx->tx_map_key);
+ /* could race with other uses of the map counts, but this is ok
+ * - this needs to turn into a non-fatal error soon to allow
+ * handling of GART resource starvation, etc. */
+ if (rrc != GNI_RC_SUCCESS) {
+ GNIDBG_TX(D_NET, tx, "Can't map %d pages: dev %d "
+ "phys %u pp %u, virt %u nob "LPU64"",
+ tx->tx_phys_npages, dev->gnd_id,
+ dev->gnd_map_nphys, dev->gnd_map_physnop,
+ dev->gnd_map_nvirt, dev->gnd_map_virtnob);
+ RETURN(rrc == GNI_RC_ERROR_RESOURCE ? -ENOMEM : -EINVAL);
+ }
+
+ tx->tx_buftype = GNILND_BUF_PHYS_MAPPED;
+ kgnilnd_mem_add_map_list(dev, tx);
+ return 0;
+
+ case GNILND_BUF_VIRT_UNMAPPED:
+ rrc = kgnilnd_mem_register(dev->gnd_handle,
+ (__u64)tx->tx_buffer, tx->tx_nob,
+ NULL, flags, &tx->tx_map_key);
+ if (rrc != GNI_RC_SUCCESS) {
+ GNIDBG_TX(D_NET, tx, "Can't map %u bytes: dev %d "
+ "phys %u pp %u, virt %u nob "LPU64"",
+ tx->tx_nob, dev->gnd_id,
+ dev->gnd_map_nphys, dev->gnd_map_physnop,
+ dev->gnd_map_nvirt, dev->gnd_map_virtnob);
+ RETURN(rrc == GNI_RC_ERROR_RESOURCE ? -ENOMEM : -EINVAL);
+ }
+
+ tx->tx_buftype = GNILND_BUF_VIRT_MAPPED;
+ kgnilnd_mem_add_map_list(dev, tx);
+ if (tx->tx_msg.gnm_type == GNILND_MSG_PUT_ACK ||
+ tx->tx_msg.gnm_type == GNILND_MSG_GET_REQ) {
+ atomic64_add(tx->tx_nob, &dev->gnd_rdmaq_bytes_out);
+ GNIDBG_TX(D_NETTRACE, tx, "rdma ++ %d to %ld\n",
+ tx->tx_nob, atomic64_read(&dev->gnd_rdmaq_bytes_out));
+ }
+
+ return 0;
+ }
+}
+
+void
+kgnilnd_add_purgatory_tx(kgn_tx_t *tx)
+{
+ kgn_conn_t *conn = tx->tx_conn;
+ kgn_mdd_purgatory_t *gmp;
+
+ LIBCFS_ALLOC(gmp, sizeof(*gmp));
+ LASSERTF(gmp != NULL, "couldn't allocate MDD purgatory member;"
+ " asserting to avoid data corruption\n");
+
+ gmp->gmp_map_key = tx->tx_map_key;
+ atomic_inc(&conn->gnc_device->gnd_n_mdd_held);
+
+ /* ensure that we don't have a blank purgatory - indicating the
+ * conn is not already on purgatory lists - we'd never recover these
+ * MDD if that were the case */
+ GNITX_ASSERTF(tx, conn->gnc_in_purgatory,
+ "conn 0x%p->%s with NULL purgatory",
+ conn, libcfs_nid2str(conn->gnc_peer->gnp_nid));
+
+ /* link 'er up! - only place we really need to lock for
+ * concurrent access */
+ spin_lock(&conn->gnc_list_lock);
+ list_add_tail(&gmp->gmp_list, &conn->gnc_mdd_list);
+ spin_unlock(&conn->gnc_list_lock);
+}
+
+void
+kgnilnd_unmap_buffer(kgn_tx_t *tx, int error)
+{
+ kgn_device_t *dev;
+ gni_return_t rrc;
+ int hold_timeout = 0;
+
+ /* code below relies on +1 relationship ... */
+ CLASSERT(GNILND_BUF_PHYS_MAPPED == (GNILND_BUF_PHYS_UNMAPPED + 1));
+ CLASSERT(GNILND_BUF_VIRT_MAPPED == (GNILND_BUF_VIRT_UNMAPPED + 1));
+
+ switch (tx->tx_buftype) {
+ default:
+ LBUG();
+
+ case GNILND_BUF_NONE:
+ case GNILND_BUF_IMMEDIATE:
+ case GNILND_BUF_PHYS_UNMAPPED:
+ case GNILND_BUF_VIRT_UNMAPPED:
+ break;
+ case GNILND_BUF_IMMEDIATE_KIOV:
+ if (tx->tx_phys != NULL) {
+ vunmap(tx->tx_phys);
+ } else if (tx->tx_phys == NULL && tx->tx_buffer != NULL) {
+ kunmap(tx->tx_imm_pages[0]);
+ }
+ /* clear to prevent kgnilnd_free_tx from thinking
+ * this is a RDMA descriptor */
+ tx->tx_phys = NULL;
+ break;
+
+ case GNILND_BUF_PHYS_MAPPED:
+ case GNILND_BUF_VIRT_MAPPED:
+ LASSERT(tx->tx_conn != NULL);
+
+ dev = tx->tx_conn->gnc_device;
+
+ /* only want to hold if we are closing conn without
+ * verified peer notification - the theory is that
+ * a TX error can be communicated in all other cases */
+ if (tx->tx_conn->gnc_state != GNILND_CONN_ESTABLISHED &&
+ kgnilnd_check_purgatory_conn(tx->tx_conn)) {
+ kgnilnd_add_purgatory_tx(tx);
+
+ /* The timeout we give to kgni is a deadman stop only.
+ * we are setting it high to ensure we don't have the kgni timer
+ * fire before ours fires _and_ is handled */
+ hold_timeout = GNILND_TIMEOUT2DEADMAN;
+
+ GNIDBG_TX(D_NET, tx,
+ "dev %p delaying MDD release for %dms key "LPX64"."LPX64"",
+ tx->tx_conn->gnc_device, hold_timeout,
+ tx->tx_map_key.qword1, tx->tx_map_key.qword2);
+ }
+
+ rrc = kgnilnd_mem_deregister(dev->gnd_handle, &tx->tx_map_key, hold_timeout);
+
+ LASSERTF(rrc == GNI_RC_SUCCESS, "rrc %d\n", rrc);
+
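+ /* the CLASSERTs above guarantee MAPPED == UNMAPPED + 1, so this
+ * decrement flips the buffer back to its unmapped state */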
+ tx->tx_buftype--;
+ kgnilnd_mem_del_map_list(dev, tx);
+ break;
+ }
+}
+
+void
+kgnilnd_tx_done(kgn_tx_t *tx, int completion)
+{
+ lnet_msg_t *lntmsg0, *lntmsg1;
+ int status0, status1;
+ lnet_ni_t *ni = NULL;
+ kgn_conn_t *conn = tx->tx_conn;
+
+ LASSERT(!in_interrupt());
+
+ lntmsg0 = tx->tx_lntmsg[0]; tx->tx_lntmsg[0] = NULL;
+ lntmsg1 = tx->tx_lntmsg[1]; tx->tx_lntmsg[1] = NULL;
+
+ if (completion &&
+ !(tx->tx_state & GNILND_TX_QUIET_ERROR) &&
+ !kgnilnd_conn_clean_errno(completion)) {
+ GNIDBG_TOMSG(D_NETERROR, &tx->tx_msg,
+ "error %d on tx 0x%p->%s id %u/%d state %s age %ds",
+ completion, tx, conn ?
+ libcfs_nid2str(conn->gnc_peer->gnp_nid) : "<?>",
+ tx->tx_id.txe_smsg_id, tx->tx_id.txe_idx,
+ kgnilnd_tx_state2str(tx->tx_list_state),
+ cfs_duration_sec((long)jiffies - tx->tx_qtime));
+ }
+
+ /* The error codes determine if we hold onto the MDD */
+ kgnilnd_unmap_buffer(tx, completion);
+
+ /* we have to deliver a reply on lntmsg[1] for the GET, so make sure
+ * we play nice with the error codes to avoid delivering a failed
+ * REQUEST and then a REPLY event as well */
+
+ /* return -EIO to lnet - it is the magic value for failed sends */
+ if (tx->tx_msg.gnm_type == GNILND_MSG_GET_REQ) {
+ status0 = 0;
+ status1 = completion;
+ } else {
+ status0 = status1 = completion;
+ }
+
+ tx->tx_buftype = GNILND_BUF_NONE;
+ tx->tx_msg.gnm_type = GNILND_MSG_NONE;
+
+ /* lnet_finalize doesn't do anything with the *ni, so ok for us to
+ * set NULL when we are a tx without a conn */
+ if (conn != NULL) {
+ ni = conn->gnc_peer->gnp_net->gnn_ni;
+
+ spin_lock(&conn->gnc_tx_lock);
+
+ LASSERTF(test_and_clear_bit(tx->tx_id.txe_idx,
+ (volatile unsigned long *)&conn->gnc_tx_bits),
+ "conn %p tx %p bit %d already cleared\n",
+ conn, tx, tx->tx_id.txe_idx);
+
+ LASSERTF(conn->gnc_tx_ref_table[tx->tx_id.txe_idx] != NULL,
+ "msg_id %d already NULL\n", tx->tx_id.txe_idx);
+
+ conn->gnc_tx_ref_table[tx->tx_id.txe_idx] = NULL;
+ spin_unlock(&conn->gnc_tx_lock);
+ }
+
+ kgnilnd_free_tx(tx);
+
+ /* finalize AFTER freeing lnet msgs */
+
+ /* warning - we should hold no locks here - calling lnet_finalize
+ * could free up lnet credits, resulting in a call chain back into
+ * the LND via kgnilnd_send and friends */
+ lnet_finalize(ni, lntmsg0, status0);
+
+ if (lntmsg1 != NULL) {
+ lnet_finalize(ni, lntmsg1, status1);
+ }
+}
+
+void
+kgnilnd_txlist_done(struct list_head *txlist, int error)
+{
+ kgn_tx_t *tx, *txn;
+ int err_printed = 0;
+
+ if (list_empty(txlist))
+ return;
+
+ list_for_each_entry_safe(tx, txn, txlist, tx_list) {
+ /* only print the first error */
+ if (err_printed)
+ tx->tx_state |= GNILND_TX_QUIET_ERROR;
+ list_del_init(&tx->tx_list);
+ kgnilnd_tx_done(tx, error);
+ err_printed++;
+ }
+}
+
+int
+kgnilnd_set_tx_id(kgn_tx_t *tx, kgn_conn_t *conn)
+{
+ int id;
+
+ spin_lock(&conn->gnc_tx_lock);
+
+ /* ID zero is NOT ALLOWED!!! */
+
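+ /* search [gnc_next_tx, GNILND_MAX_MSG_ID) for a free slot; if none is
+ * found and we didn't already start from the bottom, wrap to 1
+ * (id 0 is reserved) and search the full range once more */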
+search_again:
+ id = find_next_zero_bit((unsigned long *)&conn->gnc_tx_bits,
+ GNILND_MAX_MSG_ID, conn->gnc_next_tx);
+ if (id == GNILND_MAX_MSG_ID) {
+ if (conn->gnc_next_tx != 1) {
+ /* we only searched from next_tx to end and didn't find
+ * one, so search again from start */
+ conn->gnc_next_tx = 1;
+ goto search_again;
+ }
+ /* couldn't find one! */
+ spin_unlock(&conn->gnc_tx_lock);
+ return -E2BIG;
+ }
+
+ /* bump next_tx to prevent immediate reuse */
+ conn->gnc_next_tx = id + 1;
+
+ set_bit(id, (volatile unsigned long *)&conn->gnc_tx_bits);
+ LASSERTF(conn->gnc_tx_ref_table[id] == NULL,
+ "tx 0x%p already at id %d\n",
+ conn->gnc_tx_ref_table[id], id);
+
+ /* delay these until we have a valid ID - prevents bad clear of the bit
+ * in kgnilnd_tx_done */
+ tx->tx_conn = conn;
+ tx->tx_id.txe_cqid = conn->gnc_cqid;
+
+ tx->tx_id.txe_idx = id;
+ conn->gnc_tx_ref_table[id] = tx;
+
+ /* Using jiffies to help differentiate against TX reuse - with
+ * the usual minimum of a 250HZ clock, we wrap jiffies on the same TX
+ * if we are sending to the same node faster than 256000/sec.
+ * To help guard against this, we OR in the tx_seq - that is 32 bits */
+
+ tx->tx_id.txe_chips = (__u32)(jiffies | conn->gnc_tx_seq);
+
+ GNIDBG_TX(D_NET, tx, "set cookie/id/bits", NULL);
+
+ spin_unlock(&conn->gnc_tx_lock);
+ return 0;
+}
+
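+/* decide the fate of a tx that got GNI_RC_NOT_DONE: returns 1 if the caller
+ * should requeue and retry the send, 0 if we gave up and closed the conn,
+ * in which case the caller must terminate the tx */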
+static inline int
+kgnilnd_tx_should_retry(kgn_conn_t *conn, kgn_tx_t *tx)
+{
+ int max_retrans = *kgnilnd_tunables.kgn_max_retransmits;
+ int log_retrans;
+ int log_retrans_level;
+
+ /* I need kgni credits to send this. Replace tx at the head of the
+ * fmaq and I'll get rescheduled when credits appear */
+ tx->tx_state = 0;
+ tx->tx_retrans++;
+ conn->gnc_tx_retrans++;
+ log_retrans = ((tx->tx_retrans < 25) || ((tx->tx_retrans % 25) == 0) ||
+ (tx->tx_retrans > (max_retrans / 2)));
+ log_retrans_level = tx->tx_retrans < (max_retrans / 2) ? D_NET : D_NETERROR;
+
+ /* Decision time - either error, warn or just retransmit */
+
+ /* we don't care about TX timeout - it could be that the network is slower
+ * or throttled. We'll keep retransmitting - so if the network is so slow
+ * that we fill up our mailbox, we'll keep trying to resend that msg
+ * until we exceed the max_retrans _or_ gnc_last_rx expires, indicating
+ * that the peer hasn't sent us any traffic in return */
+
+ if (tx->tx_retrans > max_retrans) {
+ /* this means we are not backing off the retransmits
+ * in a healthy manner and are likely chewing up the
+ * CPU cycles quite badly */
+ GNIDBG_TOMSG(D_ERROR, &tx->tx_msg,
+ "SOFTWARE BUG: too many retransmits (%d) for tx id %x "
+ "conn 0x%p->%s\n",
+ tx->tx_retrans, tx->tx_id, conn,
+ libcfs_nid2str(conn->gnc_peer->gnp_nid));
+
+ /* yes - double errors to help debug this condition */
+ GNIDBG_TOMSG(D_NETERROR, &tx->tx_msg, "connection dead. "
+ "unable to send to %s for %lu secs (%d tries)",
+ libcfs_nid2str(tx->tx_conn->gnc_peer->gnp_nid),
+ cfs_duration_sec(jiffies - tx->tx_cred_wait),
+ tx->tx_retrans);
+
+ kgnilnd_close_conn(conn, -ETIMEDOUT);
+
+ /* caller should terminate */
+ RETURN(0);
+ } else {
+ /* some reasonable throttling of the debug message */
+ if (log_retrans) {
+ unsigned long now = jiffies;
+ /* XXX Nic: Mystical TX debug here... */
+ GNIDBG_SMSG_CREDS(log_retrans_level, conn);
+ GNIDBG_TOMSG(log_retrans_level, &tx->tx_msg,
+ "NOT_DONE on conn 0x%p->%s id %x retrans %d wait %dus"
+ " last_msg %uus/%uus last_cq %uus/%uus",
+ conn, libcfs_nid2str(conn->gnc_peer->gnp_nid),
+ tx->tx_id, tx->tx_retrans,
+ jiffies_to_usecs(now - tx->tx_cred_wait),
+ jiffies_to_usecs(now - conn->gnc_last_tx),
+ jiffies_to_usecs(now - conn->gnc_last_rx),
+ jiffies_to_usecs(now - conn->gnc_last_tx_cq),
+ jiffies_to_usecs(now - conn->gnc_last_rx_cq));
+ }
+ /* caller should retry */
+ RETURN(1);
+ }
+}
+
+/* caller must be holding gnd_cq_mutex and not unlock it afterwards, as we need to drop it
+ * to avoid bad ordering with state_lock */
+
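+/* Return conventions: 0 - sent (or requeued pending reschedule); -ETIME -
+ * conn timed out or is being torn down, caller kills the tx; -EAGAIN -
+ * NOT_DONE when the caller forbade retries (state_lock == NULL) or the conn
+ * was closed by kgnilnd_tx_should_retry; +EAGAIN - tx requeued, caller
+ * should stop looping and wait for a CQ event to free SMSG credits;
+ * -EIO - unexpected hard failure from gni_smsg_send */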
+static inline int
+kgnilnd_sendmsg_nolock(kgn_tx_t *tx, void *immediate, unsigned int immediatenob,
+ spinlock_t *state_lock, kgn_tx_list_state_t state)
+{
+ kgn_conn_t *conn = tx->tx_conn;
+ kgn_msg_t *msg = &tx->tx_msg;
+ int retry_send;
+ gni_return_t rrc;
+ unsigned long newest_last_rx, timeout;
+ unsigned long now;
+
+ LASSERTF((msg->gnm_type == GNILND_MSG_IMMEDIATE) ?
+ immediatenob <= *kgnilnd_tunables.kgn_max_immediate :
+ immediatenob == 0,
+ "msg 0x%p type %d wrong payload size %d\n",
+ msg, msg->gnm_type, immediatenob);
+
+ /* make sure we catch all the cases where we'd send on a dirty old mbox
+ * but allow case for sending CLOSE. Since this check is within the CQ
+ * mutex barrier and the close message is only sent through
+ * kgnilnd_send_conn_close the last message out the door will be the
+ * close message.
+ */
+ if (atomic_read(&conn->gnc_peer->gnp_dirty_eps) != 0 && msg->gnm_type != GNILND_MSG_CLOSE) {
+ mutex_unlock(&conn->gnc_device->gnd_cq_mutex);
+ /* Return -ETIME, we are closing the connection already so we don't want to
+ * have this tx hit the wire. The tx will be killed by the calling function.
+ * Once the EP is marked dirty the close message will be the last
+ * thing to hit the wire */
+ return -ETIME;
+ }
+
+ now = jiffies;
+ timeout = cfs_time_seconds(conn->gnc_timeout);
+
+ newest_last_rx = GNILND_LASTRX(conn);
+
+ if (CFS_FAIL_CHECK(CFS_FAIL_GNI_SEND_TIMEOUT)) {
+ now = now + (GNILND_TIMEOUTRX(timeout) * 2);
+ }
+
+ if (time_after_eq(now, newest_last_rx + GNILND_TIMEOUTRX(timeout))) {
+ GNIDBG_CONN(D_NETERROR|D_CONSOLE, conn, "Cant send to %s after timeout lapse of %lu; TO %lu",
+ libcfs_nid2str(conn->gnc_peer->gnp_nid),
+ cfs_duration_sec(now - newest_last_rx),
+ cfs_duration_sec(GNILND_TIMEOUTRX(timeout)));
+ mutex_unlock(&conn->gnc_device->gnd_cq_mutex);
+ return -ETIME;
+ }
+
+ GNITX_ASSERTF(tx, (conn != NULL) && (tx->tx_id.txe_idx != 0), "tx id unset!", NULL);
+ /* msg->gnm_srcnid is set when the message is initialized by whatever function is
+ * creating the message this allows the message to contain the correct LNET NID/NET needed
+ * instead of the one that the peer/conn uses for sending the data.
+ */
+ msg->gnm_connstamp = conn->gnc_my_connstamp;
+ msg->gnm_payload_len = immediatenob;
+ msg->gnm_seq = conn->gnc_tx_seq;
+
+ /* always init here - kgn_checksum is a /sys module tunable
+ * and can be flipped at any point, even between msg init and sending */
+ msg->gnm_cksum = 0;
+ if (*kgnilnd_tunables.kgn_checksum) {
+ /* We must set here and not in kgnilnd_init_msg,
+ * we could resend this msg many times
+ * (NOT_DONE from gni_smsg_send below) and wouldn't pass
+ * through init_msg again */
+ msg->gnm_cksum = kgnilnd_cksum(msg, sizeof(kgn_msg_t));
+ if (CFS_FAIL_CHECK(CFS_FAIL_GNI_SMSG_CKSUM1)) {
+ msg->gnm_cksum += 0xf00f;
+ }
+ }
+
+ GNIDBG_TOMSG(D_NET, msg, "tx 0x%p conn 0x%p->%s sending SMSG sz %u id %x/%d [%p for %u]",
+ tx, conn, libcfs_nid2str(conn->gnc_peer->gnp_nid),
+ sizeof(kgn_msg_t), tx->tx_id.txe_smsg_id,
+ tx->tx_id.txe_idx, immediate, immediatenob);
+
+ if (unlikely(tx->tx_state & GNILND_TX_FAIL_SMSG)) {
+ rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_NOT_DONE;
+ } else {
+ rrc = kgnilnd_smsg_send(conn->gnc_ephandle,
+ msg, sizeof(*msg), immediate, immediatenob,
+ tx->tx_id.txe_smsg_id);
+ }
+
+ switch (rrc) {
+ case GNI_RC_SUCCESS:
+ conn->gnc_tx_seq++;
+ conn->gnc_last_tx = jiffies;
+ /* no locking here as LIVE isn't a list */
+ kgnilnd_tx_add_state_locked(tx, NULL, conn, GNILND_TX_LIVE_FMAQ, 1);
+
+ /* this needs to be checked under lock as it might be freed from a completion
+ * event.
+ */
+ if (msg->gnm_type == GNILND_MSG_NOOP) {
+ set_mb(conn->gnc_last_noop_sent, jiffies);
+ }
+
+ /* serialize with seeing CQ events for completion on this, as well as
+ * tx_seq */
+ mutex_unlock(&conn->gnc_device->gnd_cq_mutex);
+
+ atomic_inc(&conn->gnc_device->gnd_short_ntx);
+ atomic64_add(immediatenob, &conn->gnc_device->gnd_short_txbytes);
+ kgnilnd_peer_alive(conn->gnc_peer);
+ GNIDBG_SMSG_CREDS(D_NET, conn);
+ return 0;
+
+ case GNI_RC_NOT_DONE:
+ /* XXX Nic: We need to figure out how to track this
+ * - there are bound to be good reasons for it,
+ * but we want to know when it happens */
+
+ mutex_unlock(&conn->gnc_device->gnd_cq_mutex);
+ /* We'll handle this error inline - makes the calling logic much more
+ * clean */
+
+ /* If no lock, caller doesn't want us to retry */
+ if (state_lock == NULL) {
+ return -EAGAIN;
+ }
+
+ retry_send = kgnilnd_tx_should_retry(conn, tx);
+ if (retry_send) {
+ /* add to head of list for the state and retries */
+ spin_lock(state_lock);
+ kgnilnd_tx_add_state_locked(tx, conn->gnc_peer, conn, state, 0);
+ spin_unlock(state_lock);
+
+ /* We only reschedule for a certain number of retries, then
+ * we will wait for the CQ events indicating a release of SMSG
+ * credits */
+ if (tx->tx_retrans < (*kgnilnd_tunables.kgn_max_retransmits/4)) {
+ kgnilnd_schedule_conn(conn);
+ return 0;
+ } else {
+				/* an incoming CQ event signifies either a TX completion
+				 * or an RX arrival. Either of these *could* free up credits
+				 * in the SMSG mbox, so we should try sending again */
+ GNIDBG_TX(D_NET, tx, "waiting for CQID %u event to resend",
+ tx->tx_conn->gnc_cqid);
+ /* use +ve return code to let upper layers know they
+ * should stop looping on sends */
+ return EAGAIN;
+ }
+ } else {
+ return -EAGAIN;
+ }
+ default:
+ /* handle bad retcode gracefully */
+ mutex_unlock(&conn->gnc_device->gnd_cq_mutex);
+ return -EIO;
+ }
+}
+
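+/* NB: a summary of the return contract implemented above, shared by the
+ * wrappers below: 0 means the message was sent (or queued with the conn
+ * rescheduled), -ETIME means the conn's RX window lapsed, -EAGAIN means the
+ * caller must retry/requeue, a positive EAGAIN means the TX was queued but
+ * callers should stop looping on sends until a CQ event frees SMSG credits,
+ * and -EIO is a hard send failure. */
+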
+/* kgnilnd_sendmsg has hard wait on gnd_cq_mutex */
+static inline int
+kgnilnd_sendmsg(kgn_tx_t *tx, void *immediate, unsigned int immediatenob,
+ spinlock_t *state_lock, kgn_tx_list_state_t state)
+{
+ kgn_device_t *dev = tx->tx_conn->gnc_device;
+ unsigned long timestamp;
+ int rc;
+
+ timestamp = jiffies;
+ mutex_lock(&dev->gnd_cq_mutex);
+ /* delay in jiffies - we are really concerned only with things that
+	 * result in a schedule() or really holding this off for long times.
+ * NB - mutex_lock could spin for 2 jiffies before going to sleep to wait */
+ dev->gnd_mutex_delay += (long) jiffies - timestamp;
+
+ rc = kgnilnd_sendmsg_nolock(tx, immediate, immediatenob, state_lock, state);
+
+ RETURN(rc);
+}
+
+
+/* returns -EAGAIN for lock miss, anything else < 0 is hard error, >=0 for success */
+static inline int
+kgnilnd_sendmsg_trylock(kgn_tx_t *tx, void *immediate, unsigned int immediatenob,
+ spinlock_t *state_lock, kgn_tx_list_state_t state)
+{
+ kgn_conn_t *conn = tx->tx_conn;
+ kgn_device_t *dev = conn->gnc_device;
+ unsigned long timestamp;
+ int rc;
+
+ timestamp = jiffies;
+
+ /* technically we are doing bad things with the read_lock on the peer_conn
+ * table, but we shouldn't be sleeping inside here - and we don't sleep/block
+ * for the mutex. I bet lockdep is gonna flag this one though... */
+
+ /* there are a few cases where we don't want the immediate send - like
+ * when we are in the scheduler thread and it'd harm the latency of
+ * getting messages up to LNet */
+
+ /* rmb for gnd_ready */
+ smp_rmb();
+ if (conn->gnc_device->gnd_ready == GNILND_DEV_LOOP) {
+ rc = 0;
+ atomic_inc(&conn->gnc_device->gnd_fast_block);
+ } else if (unlikely(kgnilnd_data.kgn_quiesce_trigger)) {
+		/* don't hit HW during quiesce */
+ rc = 0;
+ } else if (unlikely(atomic_read(&conn->gnc_peer->gnp_dirty_eps))) {
+		/* don't hit HW if stale EPs and conns left to close */
+ rc = 0;
+ } else {
+ atomic_inc(&conn->gnc_device->gnd_fast_try);
+ rc = mutex_trylock(&conn->gnc_device->gnd_cq_mutex);
+ }
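+	/* NB: the rc = 0 cases above deliberately look like a failed trylock -
+	 * we fall into the -EAGAIN path below so the TX takes the queued route
+	 * instead of touching the HW */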
+ if (!rc) {
+ rc = -EAGAIN;
+ } else {
+ /* we got the mutex and weren't blocked */
+
+ /* delay in jiffies - we are really concerned only with things that
+		 * result in a schedule() or really holding this off for long times.
+ * NB - mutex_lock could spin for 2 jiffies before going to sleep to wait */
+ dev->gnd_mutex_delay += (long) jiffies - timestamp;
+
+ atomic_inc(&conn->gnc_device->gnd_fast_ok);
+ tx->tx_qtime = jiffies;
+ tx->tx_state = GNILND_TX_WAITING_COMPLETION;
+ rc = kgnilnd_sendmsg_nolock(tx, tx->tx_buffer, tx->tx_nob, &conn->gnc_list_lock, GNILND_TX_FMAQ);
+ /* _nolock unlocks the mutex for us */
+ }
+
+ RETURN(rc);
+}
+
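+/* RDMA throttling is a simple token bucket: gnd_rdmaq_bytes_ok holds the byte
+ * budget for the current interval (refilled by kgnilnd_process_rdmaq). Debiting
+ * below zero means no budget - we refund the bytes, count the stall and arm the
+ * timer so the scheduler retries when the bucket rolls forward. */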
+/* lets us know if we can push this RDMA through now */
+inline int
+kgnilnd_auth_rdma_bytes(kgn_device_t *dev, kgn_tx_t *tx)
+{
+ long bytes_left;
+
+ bytes_left = atomic64_sub_return(tx->tx_nob, &dev->gnd_rdmaq_bytes_ok);
+
+ if (bytes_left < 0) {
+ atomic64_add(tx->tx_nob, &dev->gnd_rdmaq_bytes_ok);
+ atomic_inc(&dev->gnd_rdmaq_nstalls);
+ smp_wmb();
+
+ CDEBUG(D_NET, "no bytes to send, turning on timer for %lu\n",
+ dev->gnd_rdmaq_deadline);
+ mod_timer(&dev->gnd_rdmaq_timer, dev->gnd_rdmaq_deadline);
+ /* we never del this timer - at worst it schedules us.. */
+ return -EAGAIN;
+ } else {
+ return 0;
+ }
+}
+
+/* this adds a TX to the queue pending throttling authorization before
+ * we allow our remote peer to launch a PUT at us */
+void
+kgnilnd_queue_rdma(kgn_conn_t *conn, kgn_tx_t *tx)
+{
+ int rc;
+
+ /* we cannot go into send_mapped_tx from here as we are holding locks
+ * and mem registration might end up allocating memory in kgni.
+ * That said, we'll push this as far as we can into the queue process */
+ rc = kgnilnd_auth_rdma_bytes(conn->gnc_device, tx);
+
+ if (rc < 0) {
+ spin_lock(&conn->gnc_device->gnd_rdmaq_lock);
+ kgnilnd_tx_add_state_locked(tx, NULL, conn, GNILND_TX_RDMAQ, 0);
+ /* lets us know how delayed RDMA is */
+ tx->tx_qtime = jiffies;
+ spin_unlock(&conn->gnc_device->gnd_rdmaq_lock);
+ } else {
+ /* we have RDMA authorized, now it just needs a MDD and to hit the wire */
+ spin_lock(&tx->tx_conn->gnc_device->gnd_lock);
+ kgnilnd_tx_add_state_locked(tx, NULL, tx->tx_conn, GNILND_TX_MAPQ, 0);
+ /* lets us know how delayed mapping is */
+ tx->tx_qtime = jiffies;
+ spin_unlock(&tx->tx_conn->gnc_device->gnd_lock);
+ }
+
+ /* make sure we wake up sched to run this */
+ kgnilnd_schedule_device(tx->tx_conn->gnc_device);
+}
+
+/* push TX through state machine */
+void
+kgnilnd_queue_tx(kgn_conn_t *conn, kgn_tx_t *tx)
+{
+ int rc;
+ int add_tail = 1;
+
+	/* set the tx_id here; we delay it until we have an actual conn
+	 * to fiddle with.
+	 * In some cases the tx_id is already set, to provide for things
+	 * like RDMA completion cookies, etc */
+ if (tx->tx_id.txe_idx == 0) {
+ rc = kgnilnd_set_tx_id(tx, conn);
+ if (rc != 0) {
+ kgnilnd_tx_done(tx, rc);
+ return;
+ }
+ }
+
+ CDEBUG(D_NET, "%s to conn %p for %s\n", kgnilnd_msgtype2str(tx->tx_msg.gnm_type),
+ conn, libcfs_nid2str(conn->gnc_peer->gnp_nid));
+
+	/* Only let NOOPs be sent while the fail loc is set, otherwise kill the tx.
+	 * NB: rc is not set on this path, so fail the tx with a definite error code
+	 * rather than an uninitialized value */
+	if (CFS_FAIL_CHECK(CFS_FAIL_GNI_ONLY_NOOP) && (tx->tx_msg.gnm_type != GNILND_MSG_NOOP)) {
+		kgnilnd_tx_done(tx, -ECOMM);
+ return;
+ }
+
+ switch (tx->tx_msg.gnm_type) {
+ case GNILND_MSG_PUT_ACK:
+ case GNILND_MSG_GET_REQ:
+		/* hijacking time! If this message will authorize our peer to
+ * send his dirty little bytes in an RDMA, we need to get permission */
+ kgnilnd_queue_rdma(conn, tx);
+ break;
+ case GNILND_MSG_IMMEDIATE:
+ /* try to send right now, can help reduce latency */
+ rc = kgnilnd_sendmsg_trylock(tx, tx->tx_buffer, tx->tx_nob, &conn->gnc_list_lock, GNILND_TX_FMAQ);
+
+ if (rc >= 0) {
+ /* it was sent, break out of switch to avoid default case of queueing */
+ break;
+		} else if (rc == -EAGAIN) {
+			/* needs to queue to try again, so fall through (via the NOOP
+			 * case, which queues at the head of the fmaq) to the default case */
+ } else {
+			/* bail: it wasn't sent and we didn't get EAGAIN indicating
+			 * we should retrans - we do not close the conn due to locking;
+ * we let the reaper thread take care of it. There are no hard
+ * errors from send_msg that would require close to be called
+ */
+ kgnilnd_tx_done(tx, rc);
+ break;
+ }
+ case GNILND_MSG_NOOP:
+ /* Just make sure this goes out first for this conn */
+ add_tail = 0;
+ /* fall through... */
+ default:
+ spin_lock(&conn->gnc_list_lock);
+ kgnilnd_tx_add_state_locked(tx, conn->gnc_peer, conn, GNILND_TX_FMAQ, add_tail);
+ tx->tx_qtime = jiffies;
+ spin_unlock(&conn->gnc_list_lock);
+ kgnilnd_schedule_conn(conn);
+ }
+}
+
+void
+kgnilnd_launch_tx(kgn_tx_t *tx, kgn_net_t *net, lnet_process_id_t *target)
+{
+ kgn_peer_t *peer;
+ kgn_peer_t *new_peer = NULL;
+ kgn_conn_t *conn = NULL;
+ int rc;
+
+ ENTRY;
+
+ /* If I get here, I've committed to send, so I complete the tx with
+ * failure on any problems */
+
+ GNITX_ASSERTF(tx, tx->tx_conn == NULL,
+ "tx already has connection %p", tx->tx_conn);
+
+ /* do all of the peer & conn searching in one swoop - this avoids
+ * nastiness when dropping locks and needing to maintain a sane state
+ * in the face of stack reset or something else nuking peers & conns */
+
+ /* I expect to find him, so only take a read lock */
+ read_lock(&kgnilnd_data.kgn_peer_conn_lock);
+
+ peer = kgnilnd_find_peer_locked(target->nid);
+ if (peer != NULL) {
+ conn = kgnilnd_find_conn_locked(peer);
+ /* this could be NULL during quiesce */
+ if (conn != NULL) {
+ /* Connection exists; queue message on it */
+ kgnilnd_queue_tx(conn, tx);
+ read_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+ RETURN_EXIT;
+ }
+ }
+
+ /* creating peer or conn; I'll need a write lock... */
+ read_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+
+ CFS_RACE(CFS_FAIL_GNI_FIND_TARGET);
+
+ /* NB - this will not block during normal operations -
+ * the only writer of this is in the startup/shutdown path. */
+ rc = down_read_trylock(&kgnilnd_data.kgn_net_rw_sem);
+ if (!rc) {
+ rc = -ESHUTDOWN;
+ GOTO(no_peer, rc);
+ }
+
+ /* ignore previous peer entirely - we cycled the lock, so we
+ * will create new peer and at worst drop it if peer is still
+ * in the tables */
+ rc = kgnilnd_create_peer_safe(&new_peer, target->nid, net);
+ if (rc != 0) {
+ up_read(&kgnilnd_data.kgn_net_rw_sem);
+ GOTO(no_peer, rc);
+ }
+
+ write_lock(&kgnilnd_data.kgn_peer_conn_lock);
+ up_read(&kgnilnd_data.kgn_net_rw_sem);
+
+ /* search for peer again now that we have the lock
+ * if we don't find it, add our new one to the list */
+ kgnilnd_add_peer_locked(target->nid, new_peer, &peer);
+
+ conn = kgnilnd_find_or_create_conn_locked(peer);
+ if (conn != NULL) {
+ /* oh hey, found a conn now... magical */
+ kgnilnd_queue_tx(conn, tx);
+ } else {
+ /* no conn, must be trying to connect - so we queue for now */
+ tx->tx_qtime = jiffies;
+ kgnilnd_tx_add_state_locked(tx, peer, NULL, GNILND_TX_PEERQ, 1);
+ }
+ write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+ RETURN_EXIT;
+no_peer:
+ kgnilnd_tx_done(tx, rc);
+ RETURN_EXIT;
+}
+
+void
+kgnilnd_rdma(kgn_tx_t *tx, int type,
+ kgn_rdma_desc_t *sink, unsigned int nob, __u64 cookie)
+{
+ kgn_conn_t *conn = tx->tx_conn;
+ unsigned long timestamp;
+ gni_return_t rrc;
+
+ LASSERTF(kgnilnd_tx_mapped(tx),
+ "unmapped tx %p\n", tx);
+ LASSERTF(conn != NULL,
+ "NULL conn on tx %p, naughty, naughty\n", tx);
+ LASSERTF(nob <= sink->gnrd_nob,
+ "nob %u > sink->gnrd_nob %d (%p)\n",
+ nob, sink->gnrd_nob, sink);
+ LASSERTF(nob <= tx->tx_nob,
+ "nob %d > tx(%p)->tx_nob %d\n",
+ nob, tx, tx->tx_nob);
+
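+	/* NB: the descriptor is always posted as an RDMA PUT - for GETs, the side
+	 * holding the data calls kgnilnd_rdma() with GNILND_MSG_GET_DONE and pushes
+	 * the payload back to the requester, so a PUT covers both directions */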
+ memset(&tx->tx_rdma_desc, 0, sizeof(tx->tx_rdma_desc));
+ tx->tx_rdma_desc.post_id = tx->tx_id.txe_cookie;
+ tx->tx_rdma_desc.type = GNI_POST_RDMA_PUT;
+ tx->tx_rdma_desc.cq_mode = GNI_CQMODE_GLOBAL_EVENT;
+ tx->tx_rdma_desc.local_addr = (__u64)((unsigned long)tx->tx_buffer);
+ tx->tx_rdma_desc.local_mem_hndl = tx->tx_map_key;
+ tx->tx_rdma_desc.remote_addr = sink->gnrd_addr;
+ tx->tx_rdma_desc.remote_mem_hndl = sink->gnrd_key;
+ tx->tx_rdma_desc.length = nob;
+ if (!*kgnilnd_tunables.kgn_bte_hash)
+ tx->tx_rdma_desc.dlvr_mode |= GNI_DLVMODE_NO_HASH;
+ if (!*kgnilnd_tunables.kgn_bte_adapt)
+ tx->tx_rdma_desc.dlvr_mode |= (GNI_DLVMODE_NO_ADAPT | GNI_DLVMODE_NO_RADAPT);
+
+ /* prep final completion message */
+ kgnilnd_init_msg(&tx->tx_msg, type, tx->tx_msg.gnm_srcnid);
+ tx->tx_msg.gnm_u.completion.gncm_cookie = cookie;
+ /* send actual size RDMA'd in retval */
+ tx->tx_msg.gnm_u.completion.gncm_retval = nob;
+
+ kgnilnd_compute_rdma_cksum(tx);
+
+ if (nob == 0) {
+ kgnilnd_queue_tx(conn, tx);
+ return;
+ }
+
+ /* Don't lie (CLOSE == RDMA idle) */
+ LASSERTF(!conn->gnc_close_sent, "tx %p on conn %p after close sent %d\n",
+ tx, conn, conn->gnc_close_sent);
+
+ GNIDBG_TX(D_NET, tx, "Post RDMA type 0x%02x dlvr_mode 0x%x",
+ type, tx->tx_rdma_desc.dlvr_mode);
+
+ /* set CQ dedicated for RDMA */
+ tx->tx_rdma_desc.src_cq_hndl = conn->gnc_device->gnd_snd_rdma_cqh;
+
+ timestamp = jiffies;
+ mutex_lock(&conn->gnc_device->gnd_cq_mutex);
+ /* delay in jiffies - we are really concerned only with things that
+	 * result in a schedule() or really holding this off for long times.
+ * NB - mutex_lock could spin for 2 jiffies before going to sleep to wait */
+ conn->gnc_device->gnd_mutex_delay += (long) jiffies - timestamp;
+
+ rrc = kgnilnd_post_rdma(conn->gnc_ephandle, &tx->tx_rdma_desc);
+
+ spin_lock(&conn->gnc_list_lock);
+ kgnilnd_tx_add_state_locked(tx, conn->gnc_peer, conn, GNILND_TX_LIVE_RDMAQ, 1);
+ tx->tx_qtime = jiffies;
+ spin_unlock(&conn->gnc_list_lock);
+
+ mutex_unlock(&conn->gnc_device->gnd_cq_mutex);
+
+ /* XXX Nic: is this a place we should handle more errors for
+ * robustness sake */
+ LASSERT(rrc == GNI_RC_SUCCESS);
+
+}
+
+kgn_rx_t *
+kgnilnd_alloc_rx(void)
+{
+ kgn_rx_t *rx;
+
+ rx = cfs_mem_cache_alloc(kgnilnd_data.kgn_rx_cache, CFS_ALLOC_ATOMIC);
+ if (rx == NULL) {
+ CERROR("failed to allocate rx\n");
+ return NULL;
+ }
+ CDEBUG(D_MALLOC, "slab-alloced 'rx': %lu at %p.\n",
+ sizeof(*rx), rx);
+
+ /* no memset to zero, we'll always fill all members */
+ return rx;
+}
+
+/* release just frees the connection resources -
+ * we use this for the eager path after copying */
+void
+kgnilnd_release_msg(kgn_conn_t *conn)
+{
+ gni_return_t rrc;
+ unsigned long timestamp;
+
+ CDEBUG(D_NET, "consuming %p\n", conn);
+
+ timestamp = jiffies;
+ mutex_lock(&conn->gnc_device->gnd_cq_mutex);
+ /* delay in jiffies - we are really concerned only with things that
+	 * result in a schedule() or really holding this off for long times.
+ * NB - mutex_lock could spin for 2 jiffies before going to sleep to wait */
+ conn->gnc_device->gnd_mutex_delay += (long) jiffies - timestamp;
+
+ rrc = kgnilnd_smsg_release(conn->gnc_ephandle);
+ mutex_unlock(&conn->gnc_device->gnd_cq_mutex);
+
+ LASSERTF(rrc == GNI_RC_SUCCESS, "bad rrc %d\n", rrc);
+ GNIDBG_SMSG_CREDS(D_NET, conn);
+
+ return;
+}
+
+void
+kgnilnd_consume_rx(kgn_rx_t *rx)
+{
+ kgn_conn_t *conn = rx->grx_conn;
+ kgn_msg_t *rxmsg = rx->grx_msg;
+
+ /* if we are eager, free the cache alloc'd msg */
+ if (unlikely(rx->grx_eager)) {
+ LIBCFS_FREE(rxmsg, sizeof(*rxmsg) + *kgnilnd_tunables.kgn_max_immediate);
+
+ /* release ref from eager_recv */
+ kgnilnd_conn_decref(conn);
+ } else {
+ GNIDBG_MSG(D_NET, rxmsg, "rx %p processed", rx);
+ kgnilnd_release_msg(conn);
+ }
+
+ cfs_mem_cache_free(kgnilnd_data.kgn_rx_cache, rx);
+ CDEBUG(D_MALLOC, "slab-freed 'rx': %lu at %p.\n",
+ sizeof(*rx), rx);
+
+ return;
+}
+
+int
+kgnilnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg)
+{
+ lnet_hdr_t *hdr = &lntmsg->msg_hdr;
+ int type = lntmsg->msg_type;
+ lnet_process_id_t target = lntmsg->msg_target;
+ int target_is_router = lntmsg->msg_target_is_router;
+ int routing = lntmsg->msg_routing;
+ unsigned int niov = lntmsg->msg_niov;
+ struct iovec *iov = lntmsg->msg_iov;
+ lnet_kiov_t *kiov = lntmsg->msg_kiov;
+ unsigned int offset = lntmsg->msg_offset;
+ unsigned int nob = lntmsg->msg_len;
+ unsigned int msg_vmflush = lntmsg->msg_vmflush;
+ kgn_net_t *net = ni->ni_data;
+ kgn_tx_t *tx;
+ int rc = 0;
+ int mpflag = 0;
+
+ /* NB 'private' is different depending on what we're sending.... */
+ LASSERT(!in_interrupt());
+
+ CDEBUG(D_NET, "sending msg type %d with %d bytes in %d frags to %s\n",
+ type, nob, niov, libcfs_id2str(target));
+
+ LASSERTF(nob == 0 || niov > 0,
+ "lntmsg %p nob %d niov %d\n", lntmsg, nob, niov);
+ LASSERTF(niov <= LNET_MAX_IOV,
+ "lntmsg %p niov %d\n", lntmsg, niov);
+
+ /* payload is either all vaddrs or all pages */
+ LASSERTF(!(kiov != NULL && iov != NULL),
+ "lntmsg %p kiov %p iov %p\n", lntmsg, kiov, iov);
+
+ if (msg_vmflush)
+ mpflag = cfs_memory_pressure_get_and_set();
+
+ switch (type) {
+ default:
+ CERROR("lntmsg %p with unexpected type %d\n",
+ lntmsg, type);
+ LBUG();
+
+ case LNET_MSG_ACK:
+ LASSERTF(nob == 0, "lntmsg %p nob %d\n",
+ lntmsg, nob);
+ break;
+
+ case LNET_MSG_GET:
+ LASSERT(niov == 0);
+ LASSERT(nob == 0);
+
+ if (routing || target_is_router)
+ break; /* send IMMEDIATE */
+
+		/* it is safe to do a direct GET without mapping a buffer for RDMA, as we
+		 * check the eventual sink buffer here - if it is small enough, the remote
+		 * end is perfectly capable of returning the data in a short message.
+		 * The magic is that we call lnet_parse in kgnilnd_recv with rdma_req=0
+		 * for IMMEDIATE messages, which has it send a real reply instead
+		 * of continuing the RDMA through kgnilnd_recv */
+ if (lntmsg->msg_md->md_length <= *kgnilnd_tunables.kgn_max_immediate)
+ break;
+
+ tx = kgnilnd_new_tx_msg(GNILND_MSG_GET_REQ, ni->ni_nid);
+ if (tx == NULL) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ /* slightly different options as we might actually have a GET with a
+ * MD_KIOV set but a non-NULL md_iov.iov */
+ if ((lntmsg->msg_md->md_options & LNET_MD_KIOV) == 0)
+ rc = kgnilnd_setup_rdma_buffer(tx, lntmsg->msg_md->md_niov,
+ lntmsg->msg_md->md_iov.iov, NULL,
+ 0, lntmsg->msg_md->md_length);
+ else
+ rc = kgnilnd_setup_rdma_buffer(tx, lntmsg->msg_md->md_niov,
+ NULL, lntmsg->msg_md->md_iov.kiov,
+ 0, lntmsg->msg_md->md_length);
+ if (rc != 0) {
+ CERROR("unable to setup buffer: %d\n", rc);
+ kgnilnd_tx_done(tx, rc);
+ rc = -EIO;
+ goto out;
+ }
+
+ tx->tx_lntmsg[1] = lnet_create_reply_msg(ni, lntmsg);
+ if (tx->tx_lntmsg[1] == NULL) {
+ CERROR("Can't create reply for GET to %s\n",
+ libcfs_nid2str(target.nid));
+ kgnilnd_tx_done(tx, rc);
+ rc = -EIO;
+ goto out;
+ }
+
+ tx->tx_lntmsg[0] = lntmsg;
+ tx->tx_msg.gnm_u.get.gngm_hdr = *hdr;
+ /* rest of tx_msg is setup just before it is sent */
+ kgnilnd_launch_tx(tx, net, &target);
+ goto out;
+
+ case LNET_MSG_REPLY:
+ case LNET_MSG_PUT:
+ /* to save on MDDs, we'll handle short kiov by vmap'ing
+ * and sending via SMSG */
+ if (nob <= *kgnilnd_tunables.kgn_max_immediate)
+ break;
+
+ tx = kgnilnd_new_tx_msg(GNILND_MSG_PUT_REQ, ni->ni_nid);
+ if (tx == NULL) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ rc = kgnilnd_setup_rdma_buffer(tx, niov, iov, kiov, offset, nob);
+ if (rc != 0) {
+ kgnilnd_tx_done(tx, rc);
+ rc = -EIO;
+ goto out;
+ }
+
+ tx->tx_lntmsg[0] = lntmsg;
+ tx->tx_msg.gnm_u.putreq.gnprm_hdr = *hdr;
+ /* rest of tx_msg is setup just before it is sent */
+ kgnilnd_launch_tx(tx, net, &target);
+ goto out;
+ }
+
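+	/* anything that falls out of the switch is small enough to ride inline:
+	 * ACKs, routed GETs, and PUT/REPLY payloads up to kgn_max_immediate are
+	 * copied into the SMSG message itself rather than registered for RDMA */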
+ /* send IMMEDIATE */
+
+ LASSERTF(nob <= *kgnilnd_tunables.kgn_max_immediate,
+ "lntmsg 0x%p too large %d\n", lntmsg, nob);
+
+ tx = kgnilnd_new_tx_msg(GNILND_MSG_IMMEDIATE, ni->ni_nid);
+ if (tx == NULL) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ rc = kgnilnd_setup_immediate_buffer(tx, niov, iov, kiov, offset, nob);
+ if (rc != 0) {
+ kgnilnd_tx_done(tx, rc);
+ goto out;
+ }
+
+ tx->tx_msg.gnm_u.immediate.gnim_hdr = *hdr;
+ tx->tx_lntmsg[0] = lntmsg;
+ kgnilnd_launch_tx(tx, net, &target);
+
+out:
+ /* use stored value as we could have already finalized lntmsg here from a failed launch */
+ if (msg_vmflush)
+ cfs_memory_pressure_restore(mpflag);
+ return rc;
+}
+
+void
+kgnilnd_reply(lnet_ni_t *ni, kgn_rx_t *rx, lnet_msg_t *lntmsg)
+{
+ kgn_conn_t *conn = rx->grx_conn;
+ kgn_msg_t *rxmsg = rx->grx_msg;
+ unsigned int niov = lntmsg->msg_niov;
+ struct iovec *iov = lntmsg->msg_iov;
+ lnet_kiov_t *kiov = lntmsg->msg_kiov;
+ unsigned int offset = lntmsg->msg_offset;
+ unsigned int nob = lntmsg->msg_len;
+ kgn_tx_t *tx;
+ int rc = 0;
+
+ tx = kgnilnd_new_tx_msg(GNILND_MSG_GET_DONE, ni->ni_nid);
+ if (tx == NULL)
+ goto failed_0;
+
+ rc = kgnilnd_set_tx_id(tx, conn);
+ if (rc != 0)
+ goto failed_1;
+
+ rc = kgnilnd_setup_rdma_buffer(tx, niov, iov, kiov, offset, nob);
+ if (rc != 0)
+ goto failed_1;
+
+ tx->tx_lntmsg[0] = lntmsg;
+ tx->tx_getinfo = rxmsg->gnm_u.get;
+
+ /* we only queue from kgnilnd_recv - we might get called from other contexts
+	 * and we don't want to block on the mutex in those cases */
+
+ spin_lock(&tx->tx_conn->gnc_device->gnd_lock);
+ kgnilnd_tx_add_state_locked(tx, NULL, tx->tx_conn, GNILND_TX_MAPQ, 1);
+ spin_unlock(&tx->tx_conn->gnc_device->gnd_lock);
+ kgnilnd_schedule_device(tx->tx_conn->gnc_device);
+
+ return;
+
+ failed_1:
+ kgnilnd_tx_done(tx, rc);
+ kgnilnd_nak_rdma(conn, GNILND_MSG_GET_NAK, rc, rxmsg->gnm_u.get.gngm_cookie, ni->ni_nid);
+ failed_0:
+ lnet_finalize(ni, lntmsg, rc);
+}
+
+int
+kgnilnd_eager_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg,
+ void **new_private)
+{
+ kgn_rx_t *rx = private;
+ kgn_conn_t *conn = rx->grx_conn;
+ kgn_msg_t *rxmsg = rx->grx_msg;
+ kgn_msg_t *eagermsg = NULL;
+
+ GNIDBG_MSG(D_NET, rxmsg, "eager recv for conn %p, rxmsg %p, lntmsg %p",
+ conn, rxmsg, lntmsg);
+
+ if (rxmsg->gnm_payload_len > *kgnilnd_tunables.kgn_max_immediate) {
+ GNIDBG_MSG(D_ERROR, rxmsg, "payload too large %d",
+ rxmsg->gnm_payload_len);
+ return -EPROTO;
+ }
+
+ /* we have no credits or buffers for this message, so copy it
+ * somewhere for a later kgnilnd_recv */
+ LIBCFS_ALLOC(eagermsg, sizeof(*eagermsg) + *kgnilnd_tunables.kgn_max_immediate);
+ if (eagermsg == NULL) {
+ CERROR("couldn't allocate eager rx message for conn %p to %s\n",
+ conn, libcfs_nid2str(conn->gnc_peer->gnp_nid));
+ return -ENOMEM;
+ }
+
+ /* copy msg and payload */
+ memcpy(eagermsg, rxmsg, sizeof(*rxmsg) + rxmsg->gnm_payload_len);
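+	/* note the buffer was sized with kgn_max_immediate (the upper bound
+	 * enforced above) while the copy uses the actual payload length */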
+ rx->grx_msg = eagermsg;
+ rx->grx_eager = 1;
+
+ /* stash this for lnet_finalize on cancel-on-conn-close */
+ rx->grx_lntmsg = lntmsg;
+
+ /* add conn ref to ensure it doesn't go away until all eager messages processed */
+ kgnilnd_conn_addref(conn);
+
+ /* keep the same rx_t, it just has a new grx_msg now */
+ *new_private = private;
+
+ /* release SMSG buffer */
+ kgnilnd_release_msg(conn);
+
+ return 0;
+}
+
+int
+kgnilnd_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg,
+ int delayed, unsigned int niov,
+ struct iovec *iov, lnet_kiov_t *kiov,
+ unsigned int offset, unsigned int mlen, unsigned int rlen)
+{
+ kgn_rx_t *rx = private;
+ kgn_conn_t *conn = rx->grx_conn;
+ kgn_msg_t *rxmsg = rx->grx_msg;
+ kgn_tx_t *tx;
+ int rc = 0;
+ __u32 pload_cksum;
+ ENTRY;
+
+ LASSERT(!in_interrupt());
+ LASSERTF(mlen <= rlen, "%d <= %d\n", mlen, rlen);
+ /* Either all pages or all vaddrs */
+ LASSERTF(!(kiov != NULL && iov != NULL), "kiov %p iov %p\n",
+ kiov, iov);
+
+ GNIDBG_MSG(D_NET, rxmsg, "conn %p, rxmsg %p, lntmsg %p"
+ " niov=%d kiov=%p iov=%p offset=%d mlen=%d rlen=%d",
+ conn, rxmsg, lntmsg,
+ niov, kiov, iov, offset, mlen, rlen);
+
+ /* we need to lock here as recv can be called from any context */
+ read_lock(&kgnilnd_data.kgn_peer_conn_lock);
+ if (rx->grx_eager && conn->gnc_state != GNILND_CONN_ESTABLISHED) {
+ read_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+
+ /* someone closed the conn after we copied this out, nuke it */
+ kgnilnd_consume_rx(rx);
+ lnet_finalize(ni, lntmsg, conn->gnc_error);
+ RETURN(0);
+ }
+ read_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+
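+	/* NB: rxmsg points either at the live SMSG mailbox (normal path) or at the
+	 * eager copy made in kgnilnd_eager_recv - kgnilnd_consume_rx releases
+	 * whichever one we hold */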
+ switch (rxmsg->gnm_type) {
+ default:
+ LBUG();
+
+ case GNILND_MSG_IMMEDIATE:
+ if (mlen > rxmsg->gnm_payload_len) {
+ GNIDBG_MSG(D_ERROR, rxmsg,
+ "Immediate message from %s too big: %d > %d",
+ libcfs_nid2str(conn->gnc_peer->gnp_nid), mlen,
+ rxmsg->gnm_payload_len);
+ rc = -EINVAL;
+ kgnilnd_consume_rx(rx);
+ RETURN(rc);
+ }
+
+ /* rxmsg[1] is a pointer to the payload, sitting in the buffer
+		 * right after the kgn_msg_t header - so it's just a 'cute' way of
+		 * saying rxmsg + sizeof(kgn_msg_t) */
+
+ /* check payload checksum if sent */
+
+ if (*kgnilnd_tunables.kgn_checksum >= 2 &&
+ !rxmsg->gnm_payload_cksum &&
+ rxmsg->gnm_payload_len != 0)
+ GNIDBG_MSG(D_WARNING, rxmsg, "no msg payload checksum when enabled");
+
+ if (rxmsg->gnm_payload_cksum != 0) {
+ /* gnm_payload_len set in kgnilnd_sendmsg from tx->tx_nob,
+ * which is what is used to calculate the cksum on the TX side */
+ pload_cksum = kgnilnd_cksum(&rxmsg[1], rxmsg->gnm_payload_len);
+
+ if (rxmsg->gnm_payload_cksum != pload_cksum) {
+ GNIDBG_MSG(D_NETERROR, rxmsg,
+ "Bad payload checksum (%x expected %x)",
+ pload_cksum, rxmsg->gnm_payload_cksum);
+ switch (*kgnilnd_tunables.kgn_checksum_dump) {
+ case 2:
+ kgnilnd_dump_blob(D_BUFFS, "bad payload checksum",
+ &rxmsg[1], rxmsg->gnm_payload_len);
+ /* fall through to dump */
+ case 1:
+ libcfs_debug_dumplog();
+ break;
+ default:
+ break;
+ }
+ rc = -ENOKEY;
+ /* checksum problems are fatal, kill the conn */
+ kgnilnd_consume_rx(rx);
+ kgnilnd_close_conn(conn, rc);
+ RETURN(rc);
+ }
+ }
+
+ if (kiov != NULL)
+ lnet_copy_flat2kiov(
+ niov, kiov, offset,
+ *kgnilnd_tunables.kgn_max_immediate,
+ &rxmsg[1], 0, mlen);
+ else
+ lnet_copy_flat2iov(
+ niov, iov, offset,
+ *kgnilnd_tunables.kgn_max_immediate,
+ &rxmsg[1], 0, mlen);
+
+ kgnilnd_consume_rx(rx);
+ lnet_finalize(ni, lntmsg, 0);
+ RETURN(0);
+
+ case GNILND_MSG_PUT_REQ:
+ /* LNET wants to truncate or drop transaction, sending NAK */
+ if (mlen == 0) {
+ kgnilnd_consume_rx(rx);
+ lnet_finalize(ni, lntmsg, 0);
+
+ /* only error if lntmsg == NULL, otherwise we are just
+ * short circuiting the rdma process of 0 bytes */
+ kgnilnd_nak_rdma(conn, GNILND_MSG_PUT_NAK,
+ lntmsg == NULL ? -ENOENT : 0,
+ rxmsg->gnm_u.get.gngm_cookie,
+ ni->ni_nid);
+ RETURN(0);
+ }
+		/* sending ACK with sink buffer info */
+ tx = kgnilnd_new_tx_msg(GNILND_MSG_PUT_ACK, ni->ni_nid);
+ if (tx == NULL) {
+ kgnilnd_consume_rx(rx);
+ RETURN(-ENOMEM);
+ }
+
+ rc = kgnilnd_set_tx_id(tx, conn);
+ if (rc != 0) {
+ GOTO(nak_put_req, rc);
+ }
+
+ rc = kgnilnd_setup_rdma_buffer(tx, niov, iov, kiov, offset, mlen);
+ if (rc != 0) {
+ GOTO(nak_put_req, rc);
+ }
+
+ tx->tx_msg.gnm_u.putack.gnpam_src_cookie =
+ rxmsg->gnm_u.putreq.gnprm_cookie;
+ tx->tx_msg.gnm_u.putack.gnpam_dst_cookie = tx->tx_id.txe_cookie;
+ tx->tx_msg.gnm_u.putack.gnpam_desc.gnrd_addr =
+ (__u64)((unsigned long)tx->tx_buffer);
+ tx->tx_msg.gnm_u.putack.gnpam_desc.gnrd_nob = mlen;
+
+ tx->tx_lntmsg[0] = lntmsg; /* finalize this on RDMA_DONE */
+
+ /* we only queue from kgnilnd_recv - we might get called from other contexts
+	 * and we don't want to block on the mutex in those cases */
+
+ spin_lock(&tx->tx_conn->gnc_device->gnd_lock);
+ kgnilnd_tx_add_state_locked(tx, NULL, tx->tx_conn, GNILND_TX_MAPQ, 1);
+ spin_unlock(&tx->tx_conn->gnc_device->gnd_lock);
+ kgnilnd_schedule_device(tx->tx_conn->gnc_device);
+
+ kgnilnd_consume_rx(rx);
+ RETURN(0);
+
+nak_put_req:
+ /* make sure we send an error back when the PUT fails */
+ kgnilnd_nak_rdma(conn, GNILND_MSG_PUT_NAK, rc, rxmsg->gnm_u.get.gngm_cookie, ni->ni_nid);
+ kgnilnd_tx_done(tx, rc);
+ kgnilnd_consume_rx(rx);
+
+ /* return magic LNet network error */
+ RETURN(-EIO);
+
+ case GNILND_MSG_GET_REQ:
+ if (lntmsg != NULL) {
+ /* Matched! */
+ kgnilnd_reply(ni, rx, lntmsg);
+ } else {
+ /* No match */
+ kgnilnd_nak_rdma(conn, GNILND_MSG_GET_NAK,
+ -ENOENT,
+ rxmsg->gnm_u.get.gngm_cookie,
+ ni->ni_nid);
+ }
+ kgnilnd_consume_rx(rx);
+ RETURN(0);
+ }
+ RETURN(0);
+}
+
+/* needs write_lock on kgn_peer_conn_lock held */
+int
+kgnilnd_check_conn_timeouts_locked(kgn_conn_t *conn)
+{
+ unsigned long timeout, keepalive;
+ unsigned long now = jiffies;
+ unsigned long newest_last_rx;
+ kgn_tx_t *tx;
+
+ /* given that we found this conn hanging off a peer, it better damned
+ * well be connected */
+ LASSERTF(conn->gnc_state == GNILND_CONN_ESTABLISHED,
+ "conn 0x%p->%s with bad state%s\n", conn,
+ conn->gnc_peer ? libcfs_nid2str(conn->gnc_peer->gnp_nid)
+ : "<?>",
+ kgnilnd_conn_state2str(conn));
+
+ CDEBUG(D_NET, "checking conn %p->%s timeout %d keepalive %d "
+ "rx_diff %lu tx_diff %lu\n",
+ conn, libcfs_nid2str(conn->gnc_peer->gnp_nid),
+ conn->gnc_timeout, GNILND_TO2KA(conn->gnc_timeout),
+ cfs_duration_sec(now - conn->gnc_last_rx_cq),
+ cfs_duration_sec(now - conn->gnc_last_tx));
+
+ timeout = cfs_time_seconds(conn->gnc_timeout);
+ keepalive = cfs_time_seconds(GNILND_TO2KA(conn->gnc_timeout));
+
+ /* just in case our lack of RX msg processing is gumming up the works - give the
+	 * remote an extra chance */
+
+ newest_last_rx = GNILND_LASTRX(conn);
+
+ if (time_after_eq(now, newest_last_rx + timeout)) {
+ GNIDBG_CONN(D_CONSOLE|D_NETERROR, conn, "No gnilnd traffic received from %s for %lu "
+			"seconds, terminating connection. Is node down?",
+ libcfs_nid2str(conn->gnc_peer->gnp_nid),
+ cfs_duration_sec(now - newest_last_rx));
+ return -ETIMEDOUT;
+ }
+
+ /* we don't timeout on last_tx stalls - we are going to trust the
+ * underlying network to let us know when sends are failing.
+ * At worst, the peer will timeout our RX stamp and drop the connection
+ * at that point. We'll then see his CLOSE or at worst his RX
+ * stamp stop and drop the connection on our end */
+
+ if (time_after_eq(now, conn->gnc_last_tx + keepalive)) {
+ CDEBUG(D_NET, "sending NOOP -> %s (%p idle %lu(%lu)) "
+ "last %lu/%lu/%lu %lus/%lus/%lus\n",
+ libcfs_nid2str(conn->gnc_peer->gnp_nid), conn,
+ cfs_duration_sec(jiffies - conn->gnc_last_tx),
+ keepalive,
+ conn->gnc_last_noop_want, conn->gnc_last_noop_sent,
+ conn->gnc_last_noop_cq,
+ cfs_duration_sec(jiffies - conn->gnc_last_noop_want),
+ cfs_duration_sec(jiffies - conn->gnc_last_noop_sent),
+ cfs_duration_sec(jiffies - conn->gnc_last_noop_cq));
+ set_mb(conn->gnc_last_noop_want, jiffies);
+ atomic_inc(&conn->gnc_reaper_noop);
+ if (CFS_FAIL_CHECK(CFS_FAIL_GNI_NOOP_SEND))
+ return 0;
+
+ tx = kgnilnd_new_tx_msg(GNILND_MSG_NOOP, conn->gnc_peer->gnp_net->gnn_ni->ni_nid);
+ if (tx == NULL)
+ return 0;
+ kgnilnd_queue_tx(conn, tx);
+ }
+
+ return 0;
+}
+
+/* needs write_lock on kgn_peer_conn_lock held */
+void
+kgnilnd_check_peer_timeouts_locked(kgn_peer_t *peer, struct list_head *todie,
+ struct list_head *souls)
+{
+ unsigned long timeout;
+ kgn_conn_t *conn, *connN = NULL;
+ kgn_tx_t *tx, *txN;
+ int rc = 0;
+ int count = 0;
+ int reconnect;
+ short releaseconn = 0;
+ unsigned long first_rx = 0;
+
+ CDEBUG(D_NET, "checking peer 0x%p->%s for timeouts; interval %lus\n",
+ peer, libcfs_nid2str(peer->gnp_nid),
+ peer->gnp_reconnect_interval);
+
+ timeout = cfs_time_seconds(MAX(*kgnilnd_tunables.kgn_timeout,
+ GNILND_MIN_TIMEOUT));
+
+ conn = kgnilnd_find_conn_locked(peer);
+ if (conn) {
+ /* if there is a valid conn, check the queues for timeouts */
+ rc = kgnilnd_check_conn_timeouts_locked(conn);
+ if (rc) {
+ if (CFS_FAIL_CHECK(CFS_FAIL_GNI_RX_CLOSE_CLOSING)) {
+ /* simulate a RX CLOSE after the timeout but before
+ * the scheduler thread gets it */
+ conn->gnc_close_recvd = GNILND_CLOSE_INJECT1;
+ conn->gnc_peer_error = -ETIMEDOUT;
+ }
+ /* Once we mark closed, any of the scheduler threads could
+ * get it and move through before we hit the fail loc code */
+ kgnilnd_close_conn_locked(conn, rc);
+ } else {
+ /* first_rx is used to decide when to release a conn from purgatory.
+ */
+ first_rx = conn->gnc_first_rx;
+ }
+ }
+
+	/* now, regardless of starting a new conn, find TXs on the peer queue that
+	 * are old and smell bad - do this first so we don't trigger
+	 * reconnect on an empty queue if we timeout them all */
+ list_for_each_entry_safe(tx, txN, &peer->gnp_tx_queue, tx_list) {
+ if (time_after_eq(jiffies, tx->tx_qtime + timeout)) {
+ if (count == 0) {
+ LCONSOLE_INFO("could not send to %s due to connection"
+ " setup failure after %lu seconds\n",
+ libcfs_nid2str(peer->gnp_nid),
+ cfs_duration_sec(jiffies - tx->tx_qtime));
+ }
+ kgnilnd_tx_del_state_locked(tx, peer, NULL,
+ GNILND_TX_ALLOCD);
+ list_add_tail(&tx->tx_list, todie);
+ count++;
+ }
+ }
+
+ if (count || peer->gnp_connecting == GNILND_PEER_KILL) {
+ CDEBUG(D_NET, "canceling %d tx for peer 0x%p->%s\n",
+ count, peer, libcfs_nid2str(peer->gnp_nid));
+ /* if we nuked all the TX, stop peer connection attempt (if there is one..) */
+ if (list_empty(&peer->gnp_tx_queue) ||
+ peer->gnp_connecting == GNILND_PEER_KILL) {
+ /* we pass down todie to use a common function - but we know there are
+ * no TX to add */
+ kgnilnd_cancel_peer_connect_locked(peer, todie);
+ }
+ }
+
+ /* Don't reconnect if we are still trying to clear out old conns.
+ * This prevents us sending traffic on the new mbox before ensuring we are done
+ * with the old one */
+ reconnect = (atomic_read(&peer->gnp_dirty_eps) == 0);
+
+ /* if we are not connected and there are tx on the gnp_tx_queue waiting
+ * to be sent, we'll check the reconnect interval and fire up a new
+ * connection request */
+
+ if ((peer->gnp_connecting == GNILND_PEER_IDLE) &&
+ (time_after_eq(jiffies, peer->gnp_reconnect_time)) &&
+ !list_empty(&peer->gnp_tx_queue) && reconnect) {
+
+ CDEBUG(D_NET, "starting connect to %s\n",
+ libcfs_nid2str(peer->gnp_nid));
+		LASSERTF(peer->gnp_connecting == GNILND_PEER_IDLE, "Peer was idle and we "
+			"have a write_lock, state issue %d\n", peer->gnp_connecting);
+
+ peer->gnp_connecting = GNILND_PEER_CONNECT;
+ kgnilnd_peer_addref(peer); /* extra ref for connd */
+
+ spin_lock(&peer->gnp_net->gnn_dev->gnd_connd_lock);
+ list_add_tail(&peer->gnp_connd_list,
+ &peer->gnp_net->gnn_dev->gnd_connd_peers);
+ spin_unlock(&peer->gnp_net->gnn_dev->gnd_connd_lock);
+
+ kgnilnd_schedule_dgram(peer->gnp_net->gnn_dev);
+ }
+
+ /* fail_loc to allow us to delay release of purgatory */
+ if (CFS_FAIL_CHECK(CFS_FAIL_GNI_PURG_REL_DELAY))
+ return;
+
+ /* This check allows us to verify that the new conn is actually being used. This allows us to
+ * pull the old conns out of purgatory if they have actually seen traffic.
+ * We only release a conn from purgatory during stack reset, admin command, or when a peer reconnects
+ */
+ if (first_rx &&
+ time_after(jiffies, first_rx + cfs_time_seconds(*kgnilnd_tunables.kgn_hardware_timeout))) {
+ CDEBUG(D_NET,"We can release conn %p from purgatory %lu\n",
+ conn, first_rx + cfs_time_seconds(*kgnilnd_tunables.kgn_hardware_timeout));
+ releaseconn = 1;
+ }
+
+ list_for_each_entry_safe (conn, connN, &peer->gnp_conns, gnc_list) {
+ /* check for purgatory timeouts */
+ if (conn->gnc_in_purgatory) {
+			/* We cannot detach this conn from purgatory if it has not been
+			 * closed, so we leave it for a later pass - that way, the next
+			 * time we check it, we can detach it from purgatory
+			 */
+
+ if (conn->gnc_state != GNILND_CONN_DONE) {
+				/* Skip over conns that are currently not DONE. If they aren't already scheduled
+				 * for completion, something in the state machine is broken.
+				 */
+ continue;
+ }
+
+ /* We only detach a conn that is in purgatory if we have received a close message,
+ * we have a new valid connection that has successfully received data, or an admin
+ * command tells us we need to detach.
+ */
+
+ if (conn->gnc_close_recvd || releaseconn || conn->gnc_needs_detach) {
+ unsigned long waiting;
+
+ waiting = (long) jiffies - conn->gnc_last_rx_cq;
+
+ /* C.E: The remote peer is expected to close the
+ * connection (see kgnilnd_check_conn_timeouts)
+ * via the reaper thread and nuke out the MDD and
+ * FMA resources after conn->gnc_timeout has expired
+ * without an FMA RX */
+ CDEBUG(D_NET, "Reconnected to %s in %lds or admin forced detach, dropping "
+				"held resources\n",
+ libcfs_nid2str(conn->gnc_peer->gnp_nid),
+ cfs_duration_sec(waiting));
+
+ kgnilnd_detach_purgatory_locked(conn, souls);
+ }
+ }
+ }
+
+ return;
+}
+
+void
+kgnilnd_reaper_check(int idx)
+{
+ struct list_head *peers = &kgnilnd_data.kgn_peers[idx];
+ struct list_head *ctmp, *ctmpN;
+ struct list_head geriatrics;
+ struct list_head souls;
+
+ INIT_LIST_HEAD(&geriatrics);
+ INIT_LIST_HEAD(&souls);
+
+ write_lock(&kgnilnd_data.kgn_peer_conn_lock);
+
+ list_for_each_safe(ctmp, ctmpN, peers) {
+ kgn_peer_t *peer = NULL;
+
+ /* don't timeout stuff if the network is mucked or shutting down */
+ if (kgnilnd_check_hw_quiesce()) {
+ break;
+ }
+ peer = list_entry(ctmp, kgn_peer_t, gnp_list);
+
+ kgnilnd_check_peer_timeouts_locked(peer, &geriatrics, &souls);
+ }
+
+ write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+
+ kgnilnd_txlist_done(&geriatrics, -EHOSTUNREACH);
+ kgnilnd_release_purgatory_list(&souls);
+}
+
+void
+kgnilnd_update_reaper_timeout(long timeout)
+{
+ LASSERT(timeout > 0);
+
+ spin_lock(&kgnilnd_data.kgn_reaper_lock);
+
+ if (timeout < kgnilnd_data.kgn_new_min_timeout)
+ kgnilnd_data.kgn_new_min_timeout = timeout;
+
+ spin_unlock(&kgnilnd_data.kgn_reaper_lock);
+}
+
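+/* timer callback: just wake the reaper - the 'arg' cookie is unused */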
+static void
+kgnilnd_reaper_poke_with_stick(unsigned long arg)
+{
+ wake_up(&kgnilnd_data.kgn_reaper_waitq);
+}
+
+int
+kgnilnd_reaper(void *arg)
+{
+ long timeout;
+ int i;
+ int hash_index = 0;
+ unsigned long next_check_time = jiffies;
+ long current_min_timeout = MAX_SCHEDULE_TIMEOUT;
+ struct timer_list timer;
+ DEFINE_WAIT(wait);
+
+ cfs_daemonize("kgnilnd_rpr");
+ cfs_block_allsigs();
+
+ /* all gnilnd threads need to run fairly urgently */
+ set_user_nice(current, *kgnilnd_tunables.kgn_nice);
+ spin_lock(&kgnilnd_data.kgn_reaper_lock);
+
+ while (!kgnilnd_data.kgn_shutdown) {
+ /* I wake up every 'p' seconds to check for timeouts on some
+ * more peers. I try to check every connection 'n' times
+ * within the global minimum of all keepalive and timeout
+ * intervals, to ensure I attend to every connection within
+ * (n+1)/n times its timeout intervals. */
+ const int p = GNILND_REAPER_THREAD_WAKE;
+ const int n = GNILND_REAPER_NCHECKS;
+ int chunk;
+ /* to quiesce or to not quiesce, that is the question */
+ if (unlikely(kgnilnd_data.kgn_quiesce_trigger)) {
+ spin_unlock(&kgnilnd_data.kgn_reaper_lock);
+ KGNILND_SPIN_QUIESCE;
+ spin_lock(&kgnilnd_data.kgn_reaper_lock);
+ }
+
+ /* careful with the jiffy wrap... */
+ timeout = (long)(next_check_time - jiffies);
+
+ if (timeout > 0) {
+ prepare_to_wait(&kgnilnd_data.kgn_reaper_waitq, &wait,
+ TASK_INTERRUPTIBLE);
+ spin_unlock(&kgnilnd_data.kgn_reaper_lock);
+ setup_timer(&timer, kgnilnd_reaper_poke_with_stick,
+ next_check_time);
+ mod_timer(&timer, (long) jiffies + timeout);
+
+			/* check flag variables before committing */
+ if (!kgnilnd_data.kgn_shutdown &&
+ !kgnilnd_data.kgn_quiesce_trigger) {
+ CDEBUG(D_INFO, "schedule timeout %ld (%lu sec)\n",
+ timeout, cfs_duration_sec(timeout));
+ schedule();
+ CDEBUG(D_INFO, "awake after schedule\n");
+ }
+
+ del_singleshot_timer_sync(&timer);
+ spin_lock(&kgnilnd_data.kgn_reaper_lock);
+ finish_wait(&kgnilnd_data.kgn_reaper_waitq, &wait);
+ continue;
+ }
+
+ /* new_min_timeout is set from the conn timeouts and keepalive
+ * this should end up with a min timeout of
+ * GNILND_TIMEOUT2KEEPALIVE(t) or roughly LND_TIMEOUT/2 */
+ if (kgnilnd_data.kgn_new_min_timeout < current_min_timeout) {
+ current_min_timeout = kgnilnd_data.kgn_new_min_timeout;
+ CDEBUG(D_NET, "Set new min timeout %ld\n",
+ current_min_timeout);
+ }
+
+ spin_unlock(&kgnilnd_data.kgn_reaper_lock);
+
+ /* Compute how many table entries to check now so I get round
+		 * the whole table fast enough (given that I do this at fixed
+		 * intervals of 'p' seconds) */
+ chunk = *kgnilnd_tunables.kgn_peer_hash_size;
+ if (kgnilnd_data.kgn_new_min_timeout > n * p)
+ chunk = (chunk * n * p) /
+ kgnilnd_data.kgn_new_min_timeout;
+ if (chunk == 0)
+ chunk = 1;
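+		/* e.g. (hypothetical numbers): with a 1024-bucket peer hash,
+		 * n * p == 12 and a 60s minimum timeout, chunk would be
+		 * 1024 * 12 / 60 = ~204 buckets per wakeup, covering the whole
+		 * table roughly n times per timeout interval */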
+ for (i = 0; i < chunk; i++) {
+ kgnilnd_reaper_check(hash_index);
+ hash_index = (hash_index + 1) %
+ *kgnilnd_tunables.kgn_peer_hash_size;
+ }
+ next_check_time = (long) jiffies + cfs_time_seconds(p);
+ CDEBUG(D_INFO, "next check at %lu or in %d sec\n", next_check_time, p);
+
+ spin_lock(&kgnilnd_data.kgn_reaper_lock);
+ }
+
+ spin_unlock(&kgnilnd_data.kgn_reaper_lock);
+
+ kgnilnd_thread_fini();
+ return 0;
+}
+
+int
+kgnilnd_check_rdma_cq(kgn_device_t *dev)
+{
+ gni_return_t rrc;
+ gni_post_descriptor_t *desc;
+ __u64 event_data;
+ kgn_tx_ev_id_t ev_id;
+ char err_str[256];
+ int should_retry, rc;
+ long num_processed = 0;
+ kgn_conn_t *conn = NULL;
+ kgn_tx_t *tx = NULL;
+
+ for (;;) {
+ /* make sure we don't keep looping if we need to reset */
+ if (unlikely(kgnilnd_data.kgn_quiesce_trigger)) {
+ return num_processed;
+ }
+ rc = kgnilnd_mutex_trylock(&dev->gnd_cq_mutex);
+ if (!rc) {
+ /* we didn't get the mutex, so return that there is still work
+ * to be done */
+ return 1;
+ }
+ if (CFS_FAIL_CHECK(CFS_FAIL_GNI_DELAY_RDMA)) {
+ /* a bit gross - but we need a good way to test for
+ * delayed RDMA completions and the easiest way to do
+ * that is to delay the RDMA CQ events */
+ rrc = GNI_RC_NOT_DONE;
+ } else {
+ rrc = kgnilnd_cq_get_event(dev->gnd_snd_rdma_cqh, &event_data);
+ }
+
+ if (rrc == GNI_RC_NOT_DONE) {
+ mutex_unlock(&dev->gnd_cq_mutex);
+ CDEBUG(D_INFO, "SEND RDMA CQ %d empty processed %ld\n",
+ dev->gnd_id, num_processed);
+ return num_processed;
+ }
+ dev->gnd_sched_alive = jiffies;
+ num_processed++;
+
+ LASSERTF(!GNI_CQ_OVERRUN(event_data),
+ "this is bad, somehow our credits didn't protect us"
+ " from CQ overrun\n");
+ LASSERTF(GNI_CQ_GET_TYPE(event_data) == GNI_CQ_EVENT_TYPE_POST,
+ "rrc %d, GNI_CQ_GET_TYPE("LPX64") = "LPX64"\n", rrc,
+ event_data, GNI_CQ_GET_TYPE(event_data));
+
+ rrc = kgnilnd_get_completed(dev->gnd_snd_rdma_cqh, event_data,
+ &desc);
+ mutex_unlock(&dev->gnd_cq_mutex);
+
+ /* XXX Nic: Need better error handling here... */
+ LASSERTF((rrc == GNI_RC_SUCCESS) ||
+ (rrc == GNI_RC_TRANSACTION_ERROR),
+ "rrc %d\n", rrc);
+
+ ev_id.txe_cookie = desc->post_id;
+
+ kgnilnd_validate_tx_ev_id(&ev_id, &tx, &conn);
+
+ if (conn == NULL || tx == NULL) {
+ /* either conn or tx was already nuked and this is a "late"
+ * completion, so drop it */
+ continue;
+ }
+
+ GNITX_ASSERTF(tx, tx->tx_msg.gnm_type == GNILND_MSG_PUT_DONE ||
+ tx->tx_msg.gnm_type == GNILND_MSG_GET_DONE,
+ "tx %p with type %d\n", tx, tx->tx_msg.gnm_type);
+
+ GNIDBG_TX(D_NET, tx, "RDMA completion for %d bytes", tx->tx_nob);
+
+ /* remove from rdmaq */
+ spin_lock(&conn->gnc_list_lock);
+ kgnilnd_tx_del_state_locked(tx, NULL, conn, GNILND_TX_ALLOCD);
+ spin_unlock(&conn->gnc_list_lock);
+
+ if (likely(desc->status == GNI_RC_SUCCESS)) {
+ atomic_inc(&dev->gnd_rdma_ntx);
+ atomic64_add(tx->tx_nob, &dev->gnd_rdma_txbytes);
+ /* transaction succeeded, add into fmaq */
+ kgnilnd_queue_tx(conn, tx);
+ kgnilnd_peer_alive(conn->gnc_peer);
+
+ /* drop ref from kgnilnd_validate_tx_ev_id */
+ kgnilnd_conn_decref(conn);
+ continue;
+ }
+
+ /* fall through to the TRANSACTION_ERROR case */
+ tx->tx_retrans++;
+
+ /* get stringified version for log messages */
+ kgnilnd_cq_error_str(event_data, &err_str, 256);
+ kgnilnd_cq_error_recoverable(event_data, &should_retry);
+
+ /* make sure we are not off in the weeds with this tx */
+ if (tx->tx_retrans >
+ *kgnilnd_tunables.kgn_max_retransmits) {
+ GNIDBG_TX(D_NETERROR, tx,
+ "giving up on TX, too many retries", NULL);
+ should_retry = 0;
+ }
+
+ GNIDBG_TX(D_NETERROR, tx, "RDMA %s error (%s)",
+ should_retry ? "transient" : "unrecoverable", err_str);
+
+ if (tx->tx_msg.gnm_type == GNILND_MSG_PUT_DONE) {
+ if (should_retry) {
+ kgnilnd_rdma(tx, GNILND_MSG_PUT_DONE,
+ &tx->tx_putinfo.gnpam_desc,
+ tx->tx_putinfo.gnpam_desc.gnrd_nob,
+ tx->tx_putinfo.gnpam_dst_cookie);
+ } else {
+ kgnilnd_nak_rdma(conn, GNILND_MSG_PUT_NAK,
+ -EFAULT,
+ tx->tx_putinfo.gnpam_dst_cookie,
+ tx->tx_msg.gnm_srcnid);
+ kgnilnd_tx_done(tx, -EFAULT);
+ }
+ } else {
+ if (should_retry) {
+ kgnilnd_rdma(tx, GNILND_MSG_GET_DONE,
+ &tx->tx_getinfo.gngm_desc,
+ tx->tx_lntmsg[0]->msg_len,
+ tx->tx_getinfo.gngm_cookie);
+ } else {
+ kgnilnd_nak_rdma(conn, GNILND_MSG_GET_NAK,
+ -EFAULT,
+ tx->tx_getinfo.gngm_cookie,
+ tx->tx_msg.gnm_srcnid);
+ kgnilnd_tx_done(tx, -EFAULT);
+ }
+ }
+
+ /* drop ref from kgnilnd_validate_tx_ev_id */
+ kgnilnd_conn_decref(conn);
+ }
+}
+
+int
+kgnilnd_check_fma_send_cq(kgn_device_t *dev)
+{
+ gni_return_t rrc;
+ __u64 event_data;
+ kgn_tx_ev_id_t ev_id;
+ kgn_tx_t *tx = NULL;
+ kgn_conn_t *conn = NULL;
+ int queued_fma, saw_reply, rc;
+ long num_processed = 0;
+
+ for (;;) {
+ /* make sure we don't keep looping if we need to reset */
+ if (unlikely(kgnilnd_data.kgn_quiesce_trigger)) {
+ return num_processed;
+ }
+
+ rc = kgnilnd_mutex_trylock(&dev->gnd_cq_mutex);
+ if (!rc) {
+ /* we didn't get the mutex, so return that there is still work
+ * to be done */
+ return 1;
+ }
+
+ rrc = kgnilnd_cq_get_event(dev->gnd_snd_fma_cqh, &event_data);
+ mutex_unlock(&dev->gnd_cq_mutex);
+
+ if (rrc == GNI_RC_NOT_DONE) {
+ CDEBUG(D_INFO,
+ "SMSG send CQ %d not ready (data "LPX64") "
+ "processed %ld\n", dev->gnd_id, event_data,
+ num_processed);
+ return num_processed;
+ }
+
+ dev->gnd_sched_alive = jiffies;
+ num_processed++;
+
+ LASSERTF(!GNI_CQ_OVERRUN(event_data),
+ "this is bad, somehow our credits didn't "
+ "protect us from CQ overrun\n");
+ LASSERTF(GNI_CQ_GET_TYPE(event_data) == GNI_CQ_EVENT_TYPE_SMSG,
+ "rrc %d, GNI_CQ_GET_TYPE("LPX64") = "LPX64"\n", rrc,
+ event_data, GNI_CQ_GET_TYPE(event_data));
+
+ /* if SMSG couldn't handle an error, time for conn to die */
+ if (unlikely(rrc == GNI_RC_TRANSACTION_ERROR)) {
+ char err_str[256];
+
+ /* need to take the write_lock to ensure atomicity
+ * on the conn state if we need to close it */
+ write_lock(&kgnilnd_data.kgn_peer_conn_lock);
+ conn = kgnilnd_cqid2conn_locked(GNI_CQ_GET_INST_ID(event_data));
+ if (conn == NULL) {
+ /* Conn was destroyed? */
+ CDEBUG(D_NET,
+ "SMSG CQID lookup "LPX64" failed\n",
+ GNI_CQ_GET_INST_ID(event_data));
+ write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+ continue;
+ }
+
+ kgnilnd_cq_error_str(event_data, &err_str, 256);
+ CNETERR("SMSG send error to %s: rc %d (%s)\n",
+ libcfs_nid2str(conn->gnc_peer->gnp_nid),
+ rrc, err_str);
+ kgnilnd_close_conn_locked(conn, -ECOMM);
+
+ write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+
+ /* no need to process rest of this tx -
+ * it is getting canceled */
+ continue;
+ }
+
+ /* fall through to GNI_RC_SUCCESS case */
+ ev_id.txe_smsg_id = GNI_CQ_GET_MSG_ID(event_data);
+
+ kgnilnd_validate_tx_ev_id(&ev_id, &tx, &conn);
+ if (conn == NULL || tx == NULL) {
+ /* either conn or tx was already nuked and this is a "late"
+ * completion, so drop it */
+ continue;
+ }
+
+ tx->tx_conn->gnc_last_tx_cq = jiffies;
+ if (tx->tx_msg.gnm_type == GNILND_MSG_NOOP) {
+ set_mb(conn->gnc_last_noop_cq, jiffies);
+ }
+
+ /* lock tx_list_state and tx_state */
+ spin_lock(&tx->tx_conn->gnc_list_lock);
+
+ GNITX_ASSERTF(tx, tx->tx_list_state == GNILND_TX_LIVE_FMAQ,
+ "state not GNILND_TX_LIVE_FMAQ", NULL);
+ GNITX_ASSERTF(tx, tx->tx_state & GNILND_TX_WAITING_COMPLETION,
+ "not waiting for completion", NULL);
+
+ GNIDBG_TX(D_NET, tx, "SMSG complete tx_state %x rc %d",
+ tx->tx_state, rrc);
+
+ tx->tx_state &= ~GNILND_TX_WAITING_COMPLETION;
+
+ /* This will trigger other FMA sends that were
+ * pending this completion */
+ queued_fma = !list_empty(&tx->tx_conn->gnc_fmaq);
+
+ /* we either did not expect reply or we already got it */
+ saw_reply = !(tx->tx_state & GNILND_TX_WAITING_REPLY);
+
+ spin_unlock(&tx->tx_conn->gnc_list_lock);
+
+ if (queued_fma) {
+ CDEBUG(D_NET, "scheduling conn 0x%p->%s for fmaq\n",
+ conn,
+ libcfs_nid2str(conn->gnc_peer->gnp_nid));
+ kgnilnd_schedule_conn(conn);
+ }
+
+		/* If saw_reply is false, the tx could be nuked by the reply path as soon
+		 * as gnc_list_lock is dropped, so we must not touch it here.
+		 * If saw_reply is true, we know the tx is safe to use, as the other thread
+		 * is already finished with it.
+		 */
+
+ if (saw_reply) {
+ /* no longer need to track on the live_fmaq */
+ kgnilnd_tx_del_state_locked(tx, NULL, tx->tx_conn, GNILND_TX_ALLOCD);
+
+ if (tx->tx_state & GNILND_TX_PENDING_RDMA) {
+				/* we already got the reply and were waiting for
+				 * completion of the initial send before initiating
+				 * the RDMA transaction */
+				GNIDBG_TX(D_NET, tx,
+					"Pending RDMA type 0x%02x",
+					tx->tx_msg.gnm_type);
+ tx->tx_state &= ~GNILND_TX_PENDING_RDMA;
+ rc = kgnilnd_send_mapped_tx(tx, 0);
+ GNITX_ASSERTF(tx, rc == 0, "RDMA send failed: %d\n", rc);
+ } else {
+ /* we are done with this tx */
+ GNIDBG_TX(D_NET, tx,
+ "Done with tx type 0x%02x",
+ tx->tx_msg.gnm_type);
+ kgnilnd_tx_done(tx, tx->tx_rc);
+ }
+ }
+
+ /* drop ref from kgnilnd_validate_tx_ev_id */
+ kgnilnd_conn_decref(conn);
+
+ /* if we are waiting for a REPLY, we'll handle the tx then */
+ } /* end for loop */
+}
+
+int
+kgnilnd_check_fma_rcv_cq(kgn_device_t *dev)
+{
+ kgn_conn_t *conn;
+ gni_return_t rrc;
+ __u64 event_data;
+ long num_processed = 0;
+ struct list_head *conns;
+ struct list_head *tmp;
+ int rc;
+
+ for (;;) {
+ /* make sure we don't keep looping if we need to reset */
+ if (unlikely(kgnilnd_data.kgn_quiesce_trigger)) {
+ return num_processed;
+ }
+
+ rc = kgnilnd_mutex_trylock(&dev->gnd_cq_mutex);
+ if (!rc) {
+ /* we didn't get the mutex, so return that there is still work
+ * to be done */
+ return 1;
+ }
+ rrc = kgnilnd_cq_get_event(dev->gnd_rcv_fma_cqh, &event_data);
+ mutex_unlock(&dev->gnd_cq_mutex);
+
+ if (rrc == GNI_RC_NOT_DONE) {
+ CDEBUG(D_INFO, "SMSG RX CQ %d empty data "LPX64" "
+ "processed %ld\n",
+ dev->gnd_id, event_data, num_processed);
+ return num_processed;
+ }
+ dev->gnd_sched_alive = jiffies;
+ num_processed++;
+
+ /* this is the only CQ that can really handle transient
+ * CQ errors */
+ if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CQ_GET_EVENT)) {
+ rrc = cfs_fail_val ? cfs_fail_val
+ : GNI_RC_ERROR_RESOURCE;
+ if (rrc == GNI_RC_ERROR_RESOURCE) {
+ /* set overrun too */
+ event_data |= (1UL << 63);
+ LASSERTF(GNI_CQ_OVERRUN(event_data),
+					"(1UL << 63) is no longer the bit to "
+					"set to indicate CQ_OVERRUN\n");
+ }
+ }
+		/* sender should get the error event too and take care
+		 * of the failed transaction by re-transmitting */
+ if (rrc == GNI_RC_TRANSACTION_ERROR) {
+ CDEBUG(D_NET, "SMSG RX CQ error "LPX64"\n", event_data);
+ continue;
+ }
+
+ if (likely(!GNI_CQ_OVERRUN(event_data))) {
+ read_lock(&kgnilnd_data.kgn_peer_conn_lock);
+ conn = kgnilnd_cqid2conn_locked(
+ GNI_CQ_GET_INST_ID(event_data));
+ if (conn == NULL) {
+ CDEBUG(D_NET, "SMSG RX CQID lookup "LPU64" "
+ "failed, dropping event "LPX64"\n",
+ GNI_CQ_GET_INST_ID(event_data),
+ event_data);
+ } else {
+ CDEBUG(D_NET, "SMSG RX: CQID "LPU64" "
+ "conn %p->%s\n",
+ GNI_CQ_GET_INST_ID(event_data),
+ conn, conn->gnc_peer ?
+ libcfs_nid2str(conn->gnc_peer->gnp_nid) :
+ "<?>");
+
+ conn->gnc_last_rx_cq = jiffies;
+
+ /* stash first rx so we can clear out purgatory.
+ */
+ if (conn->gnc_first_rx == 0) {
+ conn->gnc_first_rx = jiffies;
+ }
+ kgnilnd_peer_alive(conn->gnc_peer);
+ kgnilnd_schedule_conn(conn);
+ }
+ read_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+ continue;
+ }
+
+ /* FMA CQ has overflowed: check ALL conns */
+ CNETERR("SMSG RX CQ overflow: scheduling ALL "
+ "conns on device %d\n", dev->gnd_id);
+
+ for (rc = 0; rc < *kgnilnd_tunables.kgn_peer_hash_size; rc++) {
+
+ read_lock(&kgnilnd_data.kgn_peer_conn_lock);
+ conns = &kgnilnd_data.kgn_conns[rc];
+
+ list_for_each(tmp, conns) {
+ conn = list_entry(tmp, kgn_conn_t,
+ gnc_hashlist);
+
+ if (conn->gnc_device == dev) {
+ kgnilnd_schedule_conn(conn);
+ conn->gnc_last_rx_cq = jiffies;
+ }
+ }
+
+ /* don't block write lockers for too long... */
+ read_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+ }
+ }
+}
+
+/* try_map_if_full should only be used when processing TX from list of
+ * backlog TX waiting on mappings to free up
+ *
+ * Return Codes:
+ * try_map_if_full = 0: 0 (sent or queued), (-|+)errno failure of kgnilnd_sendmsg
+ * try_map_if_full = 1: 0 (sent), -ENOMEM for caller to requeue, (-|+)errno failure of kgnilnd_sendmsg */
+
+int
+kgnilnd_send_mapped_tx(kgn_tx_t *tx, int try_map_if_full)
+{
+	/* slight bit of a race if multiple threads are calling, but at worst we'll
+	 * have the order altered just a bit... which would not be deterministic anyway */
+ int rc = atomic_read(&tx->tx_conn->gnc_device->gnd_nq_map);
+
+ GNIDBG_TX(D_NET, tx, "try %d nq_map %d", try_map_if_full, rc);
+
+ /* We know that we have a GART reservation that should guarantee forward progress.
+	 * This means we don't need to make any extraordinary efforts if we are failing
+ * mappings here - even if we are holding a very small number of these. */
+
+ if (try_map_if_full || (rc == 0)) {
+ rc = kgnilnd_map_buffer(tx);
+ }
+
+	/* rc should be 0 if we mapped successfully here; if non-zero we are queueing */
+ if (rc != 0) {
+ /* if try_map_if_full set, they handle requeuing */
+ if (unlikely(try_map_if_full)) {
+ RETURN(rc);
+ } else {
+ spin_lock(&tx->tx_conn->gnc_device->gnd_lock);
+ kgnilnd_tx_add_state_locked(tx, NULL, tx->tx_conn, GNILND_TX_MAPQ, 1);
+ spin_unlock(&tx->tx_conn->gnc_device->gnd_lock);
+ /* make sure we wake up sched to run this */
+ kgnilnd_schedule_device(tx->tx_conn->gnc_device);
+ /* return 0 as this is now queued for later sending */
+ RETURN(0);
+ }
+ }
+
+ switch (tx->tx_msg.gnm_type) {
+ default:
+ LBUG();
+ break;
+	/* GET_REQ and PUT_ACK are outbound messages sending our mapping key to the
+	 * remote node where the RDMA will be started.
+	 * Special case -EAGAIN logic - this should just be queued as if the mapping couldn't
+	 * be satisfied. The rest of the errors are "hard" errors that upper
+	 * layers must handle themselves */
+ case GNILND_MSG_GET_REQ:
+ tx->tx_msg.gnm_u.get.gngm_desc.gnrd_key = tx->tx_map_key;
+ tx->tx_msg.gnm_u.get.gngm_cookie = tx->tx_id.txe_cookie;
+ tx->tx_msg.gnm_u.get.gngm_desc.gnrd_addr = (__u64)((unsigned long)tx->tx_buffer);
+ tx->tx_msg.gnm_u.get.gngm_desc.gnrd_nob = tx->tx_nob;
+ tx->tx_state = GNILND_TX_WAITING_COMPLETION | GNILND_TX_WAITING_REPLY;
+ if (CFS_FAIL_CHECK(CFS_FAIL_GNI_GET_REQ_AGAIN)) {
+ tx->tx_state |= GNILND_TX_FAIL_SMSG;
+ }
+ /* redirect to FMAQ on failure, no need to infinite loop here in MAPQ */
+ rc = kgnilnd_sendmsg(tx, NULL, 0, &tx->tx_conn->gnc_list_lock, GNILND_TX_FMAQ);
+ break;
+ case GNILND_MSG_PUT_ACK:
+ tx->tx_msg.gnm_u.putack.gnpam_desc.gnrd_key = tx->tx_map_key;
+ tx->tx_state = GNILND_TX_WAITING_COMPLETION | GNILND_TX_WAITING_REPLY;
+ if (CFS_FAIL_CHECK(CFS_FAIL_GNI_PUT_ACK_AGAIN)) {
+ tx->tx_state |= GNILND_TX_FAIL_SMSG;
+ }
+ /* redirect to FMAQ on failure, no need to infinite loop here in MAPQ */
+ rc = kgnilnd_sendmsg(tx, NULL, 0, &tx->tx_conn->gnc_list_lock, GNILND_TX_FMAQ);
+ break;
+
+ /* PUT_REQ and GET_DONE are where we do the actual RDMA */
+ case GNILND_MSG_PUT_REQ:
+ kgnilnd_rdma(tx, GNILND_MSG_PUT_DONE,
+ &tx->tx_putinfo.gnpam_desc,
+ tx->tx_putinfo.gnpam_desc.gnrd_nob,
+ tx->tx_putinfo.gnpam_dst_cookie);
+ break;
+ case GNILND_MSG_GET_DONE:
+ kgnilnd_rdma(tx, GNILND_MSG_GET_DONE,
+ &tx->tx_getinfo.gngm_desc,
+ tx->tx_lntmsg[0]->msg_len,
+ tx->tx_getinfo.gngm_cookie);
+
+ break;
+ }
+
+ RETURN(rc);
+}
+
+void
+kgnilnd_process_fmaq(kgn_conn_t *conn)
+{
+ int more_to_do = 0;
+ kgn_tx_t *tx = NULL;
+ void *buffer = NULL;
+ unsigned int nob = 0;
+ int rc;
+
+ /* NB 1. kgnilnd_sendmsg() may fail if I'm out of credits right now.
+ * However I will be rescheduled by an FMA completion event
+ * when I eventually get some.
+ * NB 2. Sampling gnc_state here races with setting it elsewhere.
+ * But it doesn't matter if I try to send a "real" message just
+ * as I start closing because I'll get scheduled to send the
+ * close anyway. */
+
+	/* Short circuit if the ep_handle is NULL; we can't send anyway. */
+ if (conn->gnc_ephandle == NULL)
+ return;
+
+ LASSERTF(!conn->gnc_close_sent, "Conn %p close was sent\n", conn);
+
+ spin_lock(&conn->gnc_list_lock);
+
+ if (list_empty(&conn->gnc_fmaq)) {
+ int keepalive = GNILND_TO2KA(conn->gnc_timeout);
+
+ spin_unlock(&conn->gnc_list_lock);
+
+ if (time_after_eq(jiffies, conn->gnc_last_tx + cfs_time_seconds(keepalive))) {
+ CDEBUG(D_NET, "sending NOOP -> %s (%p idle %lu(%d)) "
+ "last %lu/%lu/%lu %lus/%lus/%lus\n",
+ libcfs_nid2str(conn->gnc_peer->gnp_nid), conn,
+ cfs_duration_sec(jiffies - conn->gnc_last_tx),
+ keepalive,
+ conn->gnc_last_noop_want, conn->gnc_last_noop_sent,
+ conn->gnc_last_noop_cq,
+ cfs_duration_sec(jiffies - conn->gnc_last_noop_want),
+ cfs_duration_sec(jiffies - conn->gnc_last_noop_sent),
+ cfs_duration_sec(jiffies - conn->gnc_last_noop_cq));
+ atomic_inc(&conn->gnc_sched_noop);
+ set_mb(conn->gnc_last_noop_want, jiffies);
+
+ if (CFS_FAIL_CHECK(CFS_FAIL_GNI_NOOP_SEND))
+ return;
+
+ tx = kgnilnd_new_tx_msg(GNILND_MSG_NOOP, conn->gnc_peer->gnp_net->gnn_ni->ni_nid);
+ if (tx != NULL) {
+ int rc;
+
+ rc = kgnilnd_set_tx_id(tx, conn);
+ if (rc != 0) {
+ kgnilnd_tx_done(tx, rc);
+ return;
+ }
+ }
+ }
+ } else {
+ tx = list_first_entry(&conn->gnc_fmaq, kgn_tx_t, tx_list);
+ /* move from fmaq to allocd, kgnilnd_sendmsg will move to live_fmaq */
+ kgnilnd_tx_del_state_locked(tx, NULL, conn, GNILND_TX_ALLOCD);
+ more_to_do = !list_empty(&conn->gnc_fmaq);
+ spin_unlock(&conn->gnc_list_lock);
+ }
+
+ /* if there is no real TX or no NOOP to send, bail */
+ if (tx == NULL) {
+ return;
+ }
+
+ if (!tx->tx_retrans)
+ tx->tx_cred_wait = jiffies;
+
+ GNITX_ASSERTF(tx, tx->tx_id.txe_smsg_id != 0,
+ "tx with zero id", NULL);
+
+ CDEBUG(D_NET, "sending regular msg: %p, type %s(0x%02x), cookie "LPX64"\n",
+ tx, kgnilnd_msgtype2str(tx->tx_msg.gnm_type),
+ tx->tx_msg.gnm_type, tx->tx_id.txe_cookie);
+
+ rc = 0;
+
+ switch (tx->tx_msg.gnm_type) {
+ default:
+ LBUG();
+
+ case GNILND_MSG_NOOP:
+ case GNILND_MSG_CLOSE:
+ case GNILND_MSG_IMMEDIATE:
+ tx->tx_state = GNILND_TX_WAITING_COMPLETION;
+ buffer = tx->tx_buffer;
+ nob = tx->tx_nob;
+ break;
+
+ case GNILND_MSG_GET_DONE:
+ case GNILND_MSG_PUT_DONE:
+ case GNILND_MSG_PUT_NAK:
+ case GNILND_MSG_GET_NAK:
+ tx->tx_state = GNILND_TX_WAITING_COMPLETION;
+ break;
+
+ case GNILND_MSG_PUT_REQ:
+ tx->tx_msg.gnm_u.putreq.gnprm_cookie = tx->tx_id.txe_cookie;
+
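+		/* fall through - PUT_REQ shares the WAITING flags setup below */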
+ case GNILND_MSG_PUT_ACK:
+ case GNILND_MSG_GET_REQ:
+ /* This is really only to handle the retransmit of SMSG once these
+		 * two messages are set up in send_mapped_tx */
+ tx->tx_state = GNILND_TX_WAITING_COMPLETION | GNILND_TX_WAITING_REPLY;
+ break;
+ }
+
+ if (likely(rc == 0)) {
+ rc = kgnilnd_sendmsg(tx, buffer, nob, &conn->gnc_list_lock, GNILND_TX_FMAQ);
+ }
+
+ if (rc > 0) {
+ /* don't explicitly reschedule here - we are short credits and will rely on
+ * kgnilnd_sendmsg to resched the conn if need be */
+ more_to_do = 0;
+ } else if (rc < 0) {
+		/* bail: it wasn't sent and we didn't get EAGAIN indicating we should retrans;
+		 * almost certainly a software bug, but let's play nice with the other kids */
+ kgnilnd_tx_done(tx, rc);
+		/* just for fun, kick peer in arse - resetting conn might help to correct
+		 * this return code, which was almost certainly caused by buggy software */
+ kgnilnd_close_conn(conn, rc);
+ }
+
+ if (more_to_do) {
+ CDEBUG(D_NET, "Rescheduling %p (more to do)\n", conn);
+ kgnilnd_schedule_conn(conn);
+ }
+}
+
+int
+kgnilnd_process_rdmaq(kgn_device_t *dev)
+{
+ int found_work = 0;
+ kgn_tx_t *tx;
+
+ if (CFS_FAIL_CHECK(CFS_FAIL_GNI_DELAY_RDMAQ)) {
+ RETURN(found_work);
+ }
+
+ if (time_after_eq(jiffies, dev->gnd_rdmaq_deadline)) {
+ unsigned long dead_bump;
+ long new_ok;
+
+ /* if we think we need to adjust, take lock to serialize and recheck */
+ spin_lock(&dev->gnd_rdmaq_lock);
+ if (time_after_eq(jiffies, dev->gnd_rdmaq_deadline)) {
+ del_singleshot_timer_sync(&dev->gnd_rdmaq_timer);
+
+ dead_bump = cfs_time_seconds(1) / *kgnilnd_tunables.kgn_rdmaq_intervals;
+
+ /* roll the bucket forward */
+ dev->gnd_rdmaq_deadline = jiffies + dead_bump;
+
+ if (kgnilnd_data.kgn_rdmaq_override &&
+ (*kgnilnd_tunables.kgn_rdmaq_intervals != 0)) {
+ new_ok = kgnilnd_data.kgn_rdmaq_override / *kgnilnd_tunables.kgn_rdmaq_intervals;
+ } else {
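+				/* no override configured - effectively unlimited (LONG_MAX) */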
+ new_ok = ~0UL >> 1;
+ }
+
+ /* roll current outstanding forward to make sure we carry outstanding
+			 * commitment forward
+ * new_ok starts out as the whole interval value
+ * - first subtract bytes_out from last interval, as that would push us over
+ * strict limits for this interval
+ * - second, set bytes_ok to new_ok to ensure it doesn't exceed the current auth
+ *
+ * there is a small race here if someone is actively processing mappings and
+ * adding to rdmaq_bytes_out, but it should be small as the mappings are triggered
+ * quite quickly after kgnilnd_auth_rdma_bytes gives us the go-ahead
+ * - if this gives us problems in the future, we could use a read/write lock
+ * to protect the resetting of these values */
+ new_ok -= atomic64_read(&dev->gnd_rdmaq_bytes_out);
+ atomic64_set(&dev->gnd_rdmaq_bytes_ok, new_ok);
+
+ CDEBUG(D_NET, "resetting rdmaq bytes to %ld, deadline +%lu -> %lu, "
+ "current out %ld\n",
+ atomic64_read(&dev->gnd_rdmaq_bytes_ok), dead_bump, dev->gnd_rdmaq_deadline,
+ atomic64_read(&dev->gnd_rdmaq_bytes_out));
+ }
+ spin_unlock(&dev->gnd_rdmaq_lock);
+ }
+
+ spin_lock(&dev->gnd_rdmaq_lock);
+ while (!list_empty(&dev->gnd_rdmaq)) {
+ int rc;
+
+ /* make sure we break out early on quiesce */
+ if (unlikely(kgnilnd_data.kgn_quiesce_trigger)) {
+ /* always break with lock held - we unlock outside loop */
+ break;
+ }
+
+ tx = list_first_entry(&dev->gnd_rdmaq, kgn_tx_t, tx_list);
+ kgnilnd_tx_del_state_locked(tx, NULL, tx->tx_conn, GNILND_TX_ALLOCD);
+ found_work++;
+
+ /* sample with lock held, serializing with kgnilnd_complete_closed_conn */
+ if (tx->tx_conn->gnc_state != GNILND_CONN_ESTABLISHED) {
+ /* if conn is dying, mark tx in tx_ref_table for
+ * kgnilnd_complete_closed_conn to finish up */
+ kgnilnd_tx_add_state_locked(tx, NULL, tx->tx_conn, GNILND_TX_DYING, 1);
+
+ /* tx was moved to DYING, get next */
+ continue;
+ }
+ spin_unlock(&dev->gnd_rdmaq_lock);
+
+ rc = kgnilnd_auth_rdma_bytes(dev, tx);
+ spin_lock(&dev->gnd_rdmaq_lock);
+
+ if (rc < 0) {
+ /* no ticket! add back to head */
+ kgnilnd_tx_add_state_locked(tx, NULL, tx->tx_conn, GNILND_TX_RDMAQ, 0);
+ /* clear found_work so scheduler threads wait for timer */
+ found_work = 0;
+ break;
+ } else {
+ /* TX is GO for launch */
+ tx->tx_qtime = jiffies;
+ kgnilnd_send_mapped_tx(tx, 0);
+ found_work++;
+ }
+ }
+ spin_unlock(&dev->gnd_rdmaq_lock);
+
+ RETURN(found_work);
+}
+
+static inline void
+kgnilnd_swab_rdma_desc(kgn_rdma_desc_t *d)
+{
+ __swab64s(&d->gnrd_key.qword1);
+ __swab64s(&d->gnrd_key.qword2);
+ __swab64s(&d->gnrd_addr);
+ __swab32s(&d->gnrd_nob);
+}
+
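+/* Reply matching: a reply carries the original TX's 64-bit cookie, which
+ * embeds the tx_ref_table index (txe_idx), so the TX is found by direct
+ * table lookup rather than a list walk. The "either" variant accepts two
+ * candidate message types (e.g. PUT_REQ or PUT_ACK when matching a PUT_NAK). */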
+#define kgnilnd_match_reply_either(w, x, y, z) _kgnilnd_match_reply(w, x, y, z)
+#define kgnilnd_match_reply(x, y, z) _kgnilnd_match_reply(x, y, GNILND_MSG_NONE, z)
+
+kgn_tx_t *
+_kgnilnd_match_reply(kgn_conn_t *conn, int type1, int type2, __u64 cookie)
+{
+ kgn_tx_ev_id_t ev_id;
+ kgn_tx_t *tx;
+
+ /* we use the cookie from the original TX, so we can find the match
+ * by parsing that and using the txe_idx */
+ ev_id.txe_cookie = cookie;
+
+ tx = conn->gnc_tx_ref_table[ev_id.txe_idx];
+
+ if (tx != NULL) {
+ /* check tx to make sure kgni didn't eat it */
+ GNITX_ASSERTF(tx, tx->tx_msg.gnm_magic == GNILND_MSG_MAGIC,
+ "came back from kgni with bad magic %x\n", tx->tx_msg.gnm_magic);
+
+ GNITX_ASSERTF(tx, ((tx->tx_id.txe_idx == ev_id.txe_idx) &&
+				  (tx->tx_id.txe_cookie == cookie)),
+ "conn 0x%p->%s tx_ref_table hosed: wanted "
+ "txe_cookie "LPX64" txe_idx %d "
+ "found tx %p cookie "LPX64" txe_idx %d\n",
+ conn, libcfs_nid2str(conn->gnc_peer->gnp_nid),
+ cookie, ev_id.txe_idx,
+ tx, tx->tx_id.txe_cookie, tx->tx_id.txe_idx);
+
+ LASSERTF((((tx->tx_msg.gnm_type == type1) || (tx->tx_msg.gnm_type == type2)) &&
+ (tx->tx_state & GNILND_TX_WAITING_REPLY)),
+ "Unexpected TX type (%x, %x or %x) "
+ "or state (%x, expected +%x) "
+ "matched reply from %s\n",
+ tx->tx_msg.gnm_type, type1, type2,
+ tx->tx_state, GNILND_TX_WAITING_REPLY,
+ libcfs_nid2str(conn->gnc_peer->gnp_nid));
+ } else {
+ CWARN("Unmatched reply %02x, or %02x/"LPX64" from %s\n",
+ type1, type2, cookie, libcfs_nid2str(conn->gnc_peer->gnp_nid));
+ }
+ return tx;
+}
+
+static inline void
+kgnilnd_complete_tx(kgn_tx_t *tx, int rc)
+{
+ int complete = 0;
+ kgn_conn_t *conn = tx->tx_conn;
+
+ spin_lock(&conn->gnc_list_lock);
+
+ GNITX_ASSERTF(tx, tx->tx_state & GNILND_TX_WAITING_REPLY,
+ "not waiting for reply", NULL);
+
+ tx->tx_rc = rc;
+ tx->tx_state &= ~GNILND_TX_WAITING_REPLY;
+
+ if (!(tx->tx_state & GNILND_TX_WAITING_COMPLETION)) {
+ kgnilnd_tx_del_state_locked(tx, NULL, conn, GNILND_TX_ALLOCD);
+ /* sample under lock as follow on steps require gnc_list_lock
+ * - or call kgnilnd_tx_done which requires no locks held over
+ * call to lnet_finalize */
+ complete = 1;
+ }
+ spin_unlock(&conn->gnc_list_lock);
+
+ if (complete) {
+ kgnilnd_tx_done(tx, tx->tx_rc);
+ }
+}
+
+static inline void
+kgnilnd_finalize_rx_done(kgn_tx_t *tx, kgn_msg_t *msg)
+{
+ int rc;
+ kgn_conn_t *conn = tx->tx_conn;
+
+ atomic_inc(&conn->gnc_device->gnd_rdma_nrx);
+ atomic64_add(tx->tx_nob, &conn->gnc_device->gnd_rdma_rxbytes);
+
+ rc = kgnilnd_verify_rdma_cksum(tx, msg->gnm_payload_cksum);
+
+ kgnilnd_complete_tx(tx, rc);
+}
+
+void
+kgnilnd_check_fma_rx(kgn_conn_t *conn)
+{
+ __u32 seq;
+ kgn_tx_t *tx;
+ kgn_rx_t *rx;
+ kgn_msg_t *msg;
+ void *prefix;
+ gni_return_t rrc;
+ kgn_peer_t *peer = conn->gnc_peer;
+ kgn_net_t *net;
+ int rc = 0;
+ __u16 tmp_cksum = 0, msg_cksum = 0;
+ int repost = 1, saw_complete;
+ unsigned long timestamp, newest_last_rx, timeout;
+ int last_seq;
+ void *memory = NULL;
+ ENTRY;
+
+ /* Short circuit if the ep_handle is null.
+	 * It's likely that it's about to be closed as stale.
+ */
+ if (conn->gnc_ephandle == NULL)
+ RETURN_EXIT;
+
+ timestamp = jiffies;
+ mutex_lock(&conn->gnc_device->gnd_cq_mutex);
+ /* delay in jiffies - we are really concerned only with things that
+	 * result in a schedule() or really holding this off for long times.
+ * NB - mutex_lock could spin for 2 jiffies before going to sleep to wait */
+ conn->gnc_device->gnd_mutex_delay += (long) jiffies - timestamp;
+
+ /* Resample current time as we have no idea how long it took to get the mutex */
+ timestamp = jiffies;
+
+	/* Check how long it has been since we last received an rx; we do this
+	 * before we call getnext in case the thread has been blocked for a while.
+	 * If we haven't received an rx within our timeout value, we close the connection
+ * as we should assume the other side has closed the connection. This will
+ * stop us from sending replies to a mailbox that is already in purgatory.
+ */
+
+ timeout = cfs_time_seconds(conn->gnc_timeout);
+ newest_last_rx = GNILND_LASTRX(conn);
+
+ /* Error injection to validate that timestamp checking works and closing the conn */
+ if (CFS_FAIL_CHECK(CFS_FAIL_GNI_RECV_TIMEOUT)) {
+ timestamp = timestamp + (GNILND_TIMEOUTRX(timeout) * 2);
+ }
+
+ if (time_after_eq(timestamp, newest_last_rx + (GNILND_TIMEOUTRX(timeout)))) {
+		GNIDBG_CONN(D_NETERROR|D_CONSOLE, conn, "Can't receive from %s after timeout lapse of %lu; TO %lu",
+ libcfs_nid2str(conn->gnc_peer->gnp_nid),
+ cfs_duration_sec(timestamp - newest_last_rx),
+ cfs_duration_sec(GNILND_TIMEOUTRX(timeout)));
+ mutex_unlock(&conn->gnc_device->gnd_cq_mutex);
+ rc = -ETIME;
+ kgnilnd_close_conn(conn, rc);
+ RETURN_EXIT;
+ }
+
+ rrc = kgnilnd_smsg_getnext(conn->gnc_ephandle, &prefix);
+
+ if (rrc == GNI_RC_NOT_DONE) {
+ mutex_unlock(&conn->gnc_device->gnd_cq_mutex);
+ CDEBUG(D_INFO, "SMSG RX empty\n");
+ RETURN_EXIT;
+ }
+
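+	/* on INVALID_STATE, snapshot the mailbox contents so the LASSERTF below
+	 * has evidence to dump; 0xdeadbeef marks a failed snapshot allocation */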
+ if (rrc == GNI_RC_INVALID_STATE) {
+ LIBCFS_ALLOC(memory, conn->gnpr_smsg_attr.buff_size);
+ if (memory == NULL) {
+ memory = (void *)0xdeadbeef;
+ } else {
+ memcpy(memory, conn->gnpr_smsg_attr.msg_buffer + conn->gnpr_smsg_attr.mbox_offset, conn->gnpr_smsg_attr.buff_size);
+ }
+ }
+
+ LASSERTF(rrc == GNI_RC_SUCCESS,
+ "bad rc %d on conn %p from peer %s mailbox copy %p\n",
+ rrc, conn, libcfs_nid2str(peer->gnp_nid), memory);
+
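+	/* the SMSG prefix returned by getnext is our message header */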
+ msg = (kgn_msg_t *)prefix;
+
+ rx = kgnilnd_alloc_rx();
+ if (rx == NULL) {
+ mutex_unlock(&conn->gnc_device->gnd_cq_mutex);
+ kgnilnd_release_msg(conn);
+ GNIDBG_MSG(D_NETERROR, msg, "Dropping SMSG RX from 0x%p->%s, no RX memory",
+ conn, libcfs_nid2str(peer->gnp_nid));
+ RETURN_EXIT;
+ }
+
+ GNIDBG_MSG(D_INFO, msg, "SMSG RX on %p from %s",
+ conn, libcfs_nid2str(peer->gnp_nid));
+
+ timestamp = conn->gnc_last_rx;
+ last_seq = conn->gnc_rx_seq;
+
+ conn->gnc_last_rx = jiffies;
+ /* stash first rx so we can clear out purgatory
+ */
+ if (conn->gnc_first_rx == 0)
+ conn->gnc_first_rx = jiffies;
+
+ seq = conn->gnc_rx_seq++;
+
+ /* needs to linger to protect gnc_rx_seq like we do with gnc_tx_seq */
+ mutex_unlock(&conn->gnc_device->gnd_cq_mutex);
+ kgnilnd_peer_alive(conn->gnc_peer);
+
+ rx->grx_msg = msg;
+ rx->grx_conn = conn;
+ rx->grx_eager = 0;
+ rx->grx_received = current_kernel_time();
+
+ if (CFS_FAIL_CHECK(CFS_FAIL_GNI_NET_LOOKUP)) {
+ rc = -ENONET;
+ } else {
+ rc = kgnilnd_find_net(msg->gnm_srcnid, &net);
+ }
+
+ if (rc < 0) {
+ GOTO(out, rc);
+ } else {
+ kgnilnd_net_decref(net);
+ }
+
+ if (*kgnilnd_tunables.kgn_checksum && !msg->gnm_cksum)
+ GNIDBG_MSG(D_WARNING, msg, "no msg header checksum when enabled");
+
+ /* XXX Nic: Do we need to swab cksum */
+ if (msg->gnm_cksum != 0) {
+ msg_cksum = msg->gnm_cksum;
+ msg->gnm_cksum = 0;
+ tmp_cksum = kgnilnd_cksum(msg, sizeof(kgn_msg_t));
+
+ if (tmp_cksum != msg_cksum) {
+ GNIDBG_MSG(D_NETERROR, msg, "Bad hdr checksum (%x expected %x)",
+ tmp_cksum, msg_cksum);
+ kgnilnd_dump_msg(D_BUFFS, msg);
+ rc = -ENOKEY;
+ GOTO(out, rc);
+ }
+ }
+ /* restore checksum for future debug messages */
+ msg->gnm_cksum = tmp_cksum;
+
+ if (msg->gnm_magic != GNILND_MSG_MAGIC) {
+ if (__swab32(msg->gnm_magic) != GNILND_MSG_MAGIC) {
+ GNIDBG_MSG(D_NETERROR, msg, "Unexpected magic %08x from %s",
+ msg->gnm_magic, libcfs_nid2str(peer->gnp_nid));
+ rc = -EPROTO;
+ GOTO(out, rc);
+ }
+
+ __swab32s(&msg->gnm_magic);
+ __swab16s(&msg->gnm_version);
+ __swab16s(&msg->gnm_type);
+ __swab64s(&msg->gnm_srcnid);
+ __swab64s(&msg->gnm_connstamp);
+ __swab32s(&msg->gnm_seq);
+
+ /* NB message type checked below; NOT here... */
+ switch (msg->gnm_type) {
+ case GNILND_MSG_PUT_ACK:
+ kgnilnd_swab_rdma_desc(&msg->gnm_u.putack.gnpam_desc);
+ break;
+
+ case GNILND_MSG_GET_REQ:
+ kgnilnd_swab_rdma_desc(&msg->gnm_u.get.gngm_desc);
+ break;
+
+ default:
+ break;
+ }
+ }
+
+ if (msg->gnm_version != GNILND_MSG_VERSION) {
+ GNIDBG_MSG(D_NETERROR, msg, "Unexpected protocol version %d from %s",
+ msg->gnm_version, libcfs_nid2str(peer->gnp_nid));
+ rc = -EPROTO;
+ GOTO(out, rc);
+ }
+
+ if (LNET_NIDADDR(msg->gnm_srcnid) != LNET_NIDADDR(peer->gnp_nid)) {
+ GNIDBG_MSG(D_NETERROR, msg, "Unexpected peer %s from %s",
+ libcfs_nid2str(msg->gnm_srcnid),
+ libcfs_nid2str(peer->gnp_nid));
+ rc = -EPROTO;
+ GOTO(out, rc);
+ }
+
+ if (msg->gnm_connstamp != conn->gnc_peer_connstamp) {
+ GNIDBG_MSG(D_NETERROR, msg, "Unexpected connstamp "LPX64"("LPX64
+ " expected) from %s",
+ msg->gnm_connstamp, conn->gnc_peer_connstamp,
+ libcfs_nid2str(peer->gnp_nid));
+ rc = -EPROTO;
+ GOTO(out, rc);
+ }
+
+ if (msg->gnm_seq != seq) {
+ GNIDBG_MSG(D_NETERROR, msg, "Unexpected sequence number %d(%d expected) from %s",
+ msg->gnm_seq, seq, libcfs_nid2str(peer->gnp_nid));
+ rc = -EPROTO;
+ GOTO(out, rc);
+ }
+
+ atomic_inc(&conn->gnc_device->gnd_short_nrx);
+
+ if (msg->gnm_type == GNILND_MSG_CLOSE) {
+ CDEBUG(D_NETTRACE, "%s sent us CLOSE msg\n",
+ libcfs_nid2str(conn->gnc_peer->gnp_nid));
+ write_lock(&kgnilnd_data.kgn_peer_conn_lock);
+ conn->gnc_close_recvd = GNILND_CLOSE_RX;
+ conn->gnc_peer_error = msg->gnm_u.completion.gncm_retval;
+ /* double check state with lock held */
+ if (conn->gnc_state == GNILND_CONN_ESTABLISHED) {
+ /* only error if we are not already closing */
+ if (conn->gnc_peer_error == -ETIMEDOUT) {
+ unsigned long now = jiffies;
+ CNETERR("peer 0x%p->%s closed connection 0x%p due to timeout. "
+ "Is node down? "
+ "RX %d @ %lus/%lus; TX %d @ %lus/%lus; "
+ "NOOP %lus/%lus/%lus; sched %lus/%lus/%lus ago\n",
+ conn->gnc_peer, libcfs_nid2str(conn->gnc_peer->gnp_nid),
+ conn, last_seq,
+ cfs_duration_sec(now - timestamp),
+ cfs_duration_sec(now - conn->gnc_last_rx_cq),
+ conn->gnc_tx_seq,
+ cfs_duration_sec(now - conn->gnc_last_tx),
+ cfs_duration_sec(now - conn->gnc_last_tx_cq),
+ cfs_duration_sec(now - conn->gnc_last_noop_want),
+ cfs_duration_sec(now - conn->gnc_last_noop_sent),
+ cfs_duration_sec(now - conn->gnc_last_noop_cq),
+ cfs_duration_sec(now - conn->gnc_last_sched_ask),
+ cfs_duration_sec(now - conn->gnc_last_sched_do),
+ cfs_duration_sec(now - conn->gnc_device->gnd_sched_alive));
+ }
+ kgnilnd_close_conn_locked(conn, -ECONNRESET);
+ }
+ write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+ GOTO(out, rc);
+ }
+
+ if (conn->gnc_close_recvd) {
+ GNIDBG_MSG(D_NETERROR, msg, "Unexpected message %s(%d/%d) after CLOSE from %s",
+ kgnilnd_msgtype2str(msg->gnm_type),
+ msg->gnm_type, conn->gnc_close_recvd,
+ libcfs_nid2str(conn->gnc_peer->gnp_nid));
+ rc = -EPROTO;
+ GOTO(out, rc);
+ }
+
+ if (conn->gnc_state != GNILND_CONN_ESTABLISHED) {
+ /* XXX Nic: log message received on bad connection state */
+ GOTO(out, rc);
+ }
+
+ switch (msg->gnm_type) {
+ case GNILND_MSG_NOOP:
+ /* Nothing to do; just a keepalive */
+ break;
+
+ case GNILND_MSG_IMMEDIATE:
+ /* only get SMSG payload for IMMEDIATE */
+ atomic64_add(msg->gnm_payload_len, &conn->gnc_device->gnd_short_rxbytes);
+ rc = lnet_parse(net->gnn_ni, &msg->gnm_u.immediate.gnim_hdr,
+ msg->gnm_srcnid, rx, 0);
+ repost = rc < 0;
+ break;
+
+ case GNILND_MSG_PUT_REQ:
+ rc = lnet_parse(net->gnn_ni, &msg->gnm_u.putreq.gnprm_hdr,
+ msg->gnm_srcnid, rx, 1);
+ repost = rc < 0;
+ break;
+
+ case GNILND_MSG_PUT_NAK:
+ tx = kgnilnd_match_reply_either(conn, GNILND_MSG_PUT_REQ, GNILND_MSG_PUT_ACK,
+ msg->gnm_u.completion.gncm_cookie);
+ if (tx == NULL)
+ break;
+
+ kgnilnd_complete_tx(tx, msg->gnm_u.completion.gncm_retval);
+ break;
+
+ case GNILND_MSG_PUT_ACK:
+ tx = kgnilnd_match_reply(conn, GNILND_MSG_PUT_REQ,
+ msg->gnm_u.putack.gnpam_src_cookie);
+ if (tx == NULL)
+ break;
+
+ /* store putack data for later: deferred rdma or re-try */
+ tx->tx_putinfo = msg->gnm_u.putack;
+
+ saw_complete = 0;
+ spin_lock(&tx->tx_conn->gnc_list_lock);
+
+ GNITX_ASSERTF(tx, tx->tx_state & GNILND_TX_WAITING_REPLY,
+ "not waiting for reply", NULL);
+
+ tx->tx_state &= ~GNILND_TX_WAITING_REPLY;
+
+ if (likely(!(tx->tx_state & GNILND_TX_WAITING_COMPLETION))) {
+ kgnilnd_tx_del_state_locked(tx, NULL, conn, GNILND_TX_ALLOCD);
+ /* sample under lock as follow on steps require gnc_list_lock
+ * - or call kgnilnd_tx_done which requires no locks held over
+ * call to lnet_finalize */
+ saw_complete = 1;
+ } else {
+ /* cannot launch rdma if still waiting for fma-msg completion */
+ CDEBUG(D_NET, "tx 0x%p type 0x%02x will need to "
+ "wait for SMSG completion\n", tx, tx->tx_msg.gnm_type);
+ tx->tx_state |= GNILND_TX_PENDING_RDMA;
+ }
+ spin_unlock(&tx->tx_conn->gnc_list_lock);
+
+ if (saw_complete) {
+ rc = kgnilnd_send_mapped_tx(tx, 0);
+ if (rc < 0)
+ kgnilnd_tx_done(tx, rc);
+ }
+ break;
+
+ case GNILND_MSG_PUT_DONE:
+ tx = kgnilnd_match_reply(conn, GNILND_MSG_PUT_ACK,
+ msg->gnm_u.completion.gncm_cookie);
+ if (tx == NULL)
+ break;
+
+ GNITX_ASSERTF(tx, tx->tx_buftype == GNILND_BUF_PHYS_MAPPED ||
+ tx->tx_buftype == GNILND_BUF_VIRT_MAPPED,
+ "bad tx buftype %d", tx->tx_buftype);
+
+ kgnilnd_finalize_rx_done(tx, msg);
+ break;
+
+ case GNILND_MSG_GET_REQ:
+ rc = lnet_parse(net->gnn_ni, &msg->gnm_u.get.gngm_hdr,
+ msg->gnm_srcnid, rx, 1);
+ repost = rc < 0;
+ break;
+
+ case GNILND_MSG_GET_NAK:
+ tx = kgnilnd_match_reply(conn, GNILND_MSG_GET_REQ,
+ msg->gnm_u.completion.gncm_cookie);
+ if (tx == NULL)
+ break;
+
+ GNITX_ASSERTF(tx, tx->tx_buftype == GNILND_BUF_PHYS_MAPPED ||
+ tx->tx_buftype == GNILND_BUF_VIRT_MAPPED,
+ "bad tx buftype %d", tx->tx_buftype);
+
+ kgnilnd_complete_tx(tx, msg->gnm_u.completion.gncm_retval);
+ break;
+
+ case GNILND_MSG_GET_DONE:
+ tx = kgnilnd_match_reply(conn, GNILND_MSG_GET_REQ,
+ msg->gnm_u.completion.gncm_cookie);
+ if (tx == NULL)
+ break;
+
+ GNITX_ASSERTF(tx, tx->tx_buftype == GNILND_BUF_PHYS_MAPPED ||
+ tx->tx_buftype == GNILND_BUF_VIRT_MAPPED,
+ "bad tx buftype %d", tx->tx_buftype);
+
+ lnet_set_reply_msg_len(net->gnn_ni, tx->tx_lntmsg[1],
+ msg->gnm_u.completion.gncm_retval);
+
+ kgnilnd_finalize_rx_done(tx, msg);
+ break;
+ }
+
+ out:
+ if (rc < 0) /* protocol/comms error */
+ kgnilnd_close_conn(conn, rc);
+
+ if (repost && rx != NULL) {
+ kgnilnd_consume_rx(rx);
+ }
+
+ /* we got an event so assume more there and call for reschedule */
+ if (rc >= 0)
+ kgnilnd_schedule_conn(conn);
+ EXIT;
+}
+
+/* Do the failure injections that we need to affect conn processing in the following function.
+ * When writing tests that use this function make sure to use a fail_loc with a fail mask.
+ * If you don't, you can cause the scheduler threads to spin on the conn without it leaving
+ * process_conns.
+ *
+ * intent is used to signal the calling function whether or not the conn needs to be rescheduled.
+ */
+
+static inline int
+kgnilnd_check_conn_fail_loc(kgn_device_t *dev, kgn_conn_t *conn, int *intent)
+{
+ int rc = 0;
+
+ /* short circuit out when not set */
+ if (likely(!cfs_fail_loc)) {
+ RETURN(rc);
+ }
+
+ /* failure injection to test for stack reset clean ups */
+ if (CFS_FAIL_CHECK(CFS_FAIL_GNI_DROP_CLOSING)) {
+ /* we can't rely on busy loops being nice enough to get the
+ * stack reset triggered - it'd just spin on this conn */
+ CFS_RACE(CFS_FAIL_GNI_DROP_CLOSING);
+ rc = 1;
+ *intent = 1;
+ GOTO(did_fail_loc, rc);
+ }
+
+ if (conn->gnc_state == GNILND_CONN_DESTROY_EP) {
+ /* DESTROY_EP set in kgnilnd_conn_decref on gnc_refcount = 1 */
+
+ if (CFS_FAIL_CHECK(CFS_FAIL_GNI_DROP_DESTROY_EP)) {
+ CFS_RACE(CFS_FAIL_GNI_DROP_DESTROY_EP);
+ rc = 1;
+ *intent = 1;
+ GOTO(did_fail_loc, rc);
+ }
+ }
+
+ /* CFS_FAIL_GNI_FINISH_PURG2 is used to stop a connection from fully closing. This scheduler
+ * will spin on the CFS_FAIL_TIMEOUT until the fail_loc is cleared at which time the connection
+ * will be closed by kgnilnd_complete_closed_conn.
+ */
+ if ((conn->gnc_state == GNILND_CONN_CLOSED) && CFS_FAIL_CHECK(CFS_FAIL_GNI_FINISH_PURG2)) {
+ while (CFS_FAIL_TIMEOUT(CFS_FAIL_GNI_FINISH_PURG2, 1)) {};
+ rc = 1;
+ *intent = 1;
+ GOTO(did_fail_loc, rc);
+ }
+
+ /* this one is a bit gross - we can't hold the mutex from process_conns
+ * across a CFS_RACE here - it'd block the conn threads from doing an ep_bind
+ * and moving onto finish_connect
+ * so, we'll just set the rc - kgnilnd_process_conns will clear
+ * found_work on a fail_loc, getting the scheduler thread to call schedule()
+ * and effectively getting this thread to sleep */
+ if ((conn->gnc_state == GNILND_CONN_CLOSED) && CFS_FAIL_CHECK(CFS_FAIL_GNI_FINISH_PURG)) {
+ rc = 1;
+ *intent = 1;
+ GOTO(did_fail_loc, rc);
+ }
+
+did_fail_loc:
+ RETURN(rc);
+}
+
+static inline void
+kgnilnd_send_conn_close(kgn_conn_t *conn)
+{
+ kgn_tx_t *tx;
+
+ /* we are closing the conn - we will try to send the CLOSE msg
+ * but will not wait for anything else to flush */
+
+ /* send the close if not already done so or received one */
+ if (!conn->gnc_close_sent && !conn->gnc_close_recvd) {
+ /* set close_sent regardless of the success of the
+ * CLOSE message. We are going to try once and then
+ * kick him out of the sandbox */
+ conn->gnc_close_sent = 1;
+ mb();
+
+ /* EP might be null already if remote side initiated a new connection.
+ * kgnilnd_finish_connect destroys existing ep_handles before wiring up the new connection,
+		 * so this check is here to make sure we don't attempt to send with a null ep_handle.
+ */
+ if (conn->gnc_ephandle != NULL) {
+ int rc = 0;
+
+ tx = kgnilnd_new_tx_msg(GNILND_MSG_CLOSE, conn->gnc_peer->gnp_net->gnn_ni->ni_nid);
+ if (tx != NULL) {
+ tx->tx_msg.gnm_u.completion.gncm_retval = conn->gnc_error;
+ tx->tx_state = GNILND_TX_WAITING_COMPLETION;
+ tx->tx_qtime = jiffies;
+
+ if (tx->tx_id.txe_idx == 0) {
+ rc = kgnilnd_set_tx_id(tx, conn);
+ if (rc != 0) {
+ kgnilnd_tx_done(tx, rc);
+ }
+ }
+
+ CDEBUG(D_NETTRACE, "sending close with errno %d\n",
+ conn->gnc_error);
+
+ if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CLOSE_SEND)) {
+ kgnilnd_tx_done(tx, -EAGAIN);
+ } else if (!rc) {
+ rc = kgnilnd_sendmsg(tx, NULL, 0, NULL, GNILND_TX_FMAQ);
+ if (rc) {
+					/* It wasn't sent and we don't care. */
+ kgnilnd_tx_done(tx, rc);
+ }
+ }
+
+ }
+ }
+ }
+
+	/* mark this conn as CLOSED now that we processed it
+	 * - do after TX, so we can use CLOSING in asserts */
+	conn->gnc_state = GNILND_CONN_CLOSED;
+
+ mb();
+
+ if (CFS_FAIL_CHECK(CFS_FAIL_GNI_RX_CLOSE_CLOSED)) {
+ /* simulate a RX CLOSE after the timeout but before
+ * the scheduler thread gets it */
+ conn->gnc_close_recvd = GNILND_CLOSE_INJECT2;
+ conn->gnc_peer_error = -ETIMEDOUT;
+ }
+ /* schedule to allow potential CLOSE and get the complete phase run */
+ kgnilnd_schedule_conn(conn);
+}
+
+int
+kgnilnd_process_mapped_tx(kgn_device_t *dev)
+{
+ int found_work = 0;
+ int rc = 0;
+ kgn_tx_t *tx;
+ int max_retrans = *kgnilnd_tunables.kgn_max_retransmits;
+ int log_retrans, log_retrans_level;
+ static int last_map_version;
+ ENTRY;
+
+ spin_lock(&dev->gnd_lock);
+ if (list_empty(&dev->gnd_map_tx)) {
+ spin_unlock(&dev->gnd_lock);
+ RETURN(0);
+ }
+
+ dev->gnd_sched_alive = jiffies;
+
+ /* we'll retry as fast as possible up to 25% of the limit, then we start
+ * backing off until our map version changes - indicating we unmapped
+ * something */
+ tx = list_first_entry(&dev->gnd_map_tx, kgn_tx_t, tx_list);
+ if ((tx->tx_retrans > (max_retrans / 4)) &&
+ (last_map_version == dev->gnd_map_version)) {
+		GNIDBG_TX(D_NET, tx, "waiting for mapping event to retry", NULL);
+ spin_unlock(&dev->gnd_lock);
+ RETURN(0);
+ }
+
+ /* stash the last map version to let us know when a good one was seen */
+ last_map_version = dev->gnd_map_version;
+
+	/* we need to take the lock and continually refresh the head of the list as
+ * kgnilnd_complete_closed_conn might be nuking stuff and we are cycling the lock
+ * allowing them to squeeze in */
+
+ while (!list_empty(&dev->gnd_map_tx)) {
+ /* make sure we break out early on quiesce */
+ if (unlikely(kgnilnd_data.kgn_quiesce_trigger)) {
+ /* always break with lock held - we unlock outside loop */
+ break;
+ }
+
+ tx = list_first_entry(&dev->gnd_map_tx, kgn_tx_t, tx_list);
+
+ kgnilnd_tx_del_state_locked(tx, NULL, tx->tx_conn, GNILND_TX_ALLOCD);
+ found_work++;
+
+ /* sample with lock held, serializing with kgnilnd_complete_closed_conn */
+ if (tx->tx_conn->gnc_state != GNILND_CONN_ESTABLISHED) {
+ /* if conn is dying, mark tx in tx_ref_table for
+ * kgnilnd_complete_closed_conn to finish up */
+ kgnilnd_tx_add_state_locked(tx, NULL, tx->tx_conn, GNILND_TX_DYING, 1);
+ found_work++;
+
+ /* tx was moved to DYING, get next */
+ continue;
+ }
+
+ spin_unlock(&dev->gnd_lock);
+ rc = kgnilnd_send_mapped_tx(tx, 1);
+
+ /* We made it! skip error handling.. */
+ if (rc >= 0) {
+ /* OK to continue on +ve errors as it won't get seen until
+ * this function is called again - we operate on a copy of the original
+ * list and not the live list */
+ spin_lock(&dev->gnd_lock);
+ continue;
+ } else if (rc != -ENOMEM) {
+ /* carp, failure we can't handle */
+ kgnilnd_tx_done(tx, rc);
+ spin_lock(&dev->gnd_lock);
+ continue;
+ }
+
+ /* time to handle the retry cases.. */
+ tx->tx_retrans++;
+ if (tx->tx_retrans == 1)
+ tx->tx_qtime = jiffies;
+
+ /* only log occasionally once we've retried max / 2 */
+ log_retrans = (tx->tx_retrans >= (max_retrans / 2)) &&
+ ((tx->tx_retrans % 32) == 0);
+ log_retrans_level = log_retrans ? D_NETERROR : D_NET;
+
+ /* make sure we are not off in the weeds with this tx */
+ if (tx->tx_retrans > *kgnilnd_tunables.kgn_max_retransmits) {
+ GNIDBG_TX(D_NETERROR, tx,
+ "giving up on TX, too many retries", NULL);
+ kgnilnd_tx_done(tx, -ENOMEM);
+ GOTO(get_out_mapped, rc);
+ } else {
+ GNIDBG_TX(log_retrans_level, tx,
+ "transient map failure #%d %d pages/%d bytes phys %u@%u "
+ "virt %u@"LPU64" "
+ "nq_map %d mdd# %d/%d GART %ld",
+ tx->tx_retrans, tx->tx_phys_npages, tx->tx_nob,
+ dev->gnd_map_nphys, dev->gnd_map_physnop * PAGE_SIZE,
+ dev->gnd_map_nvirt, dev->gnd_map_virtnob,
+ atomic_read(&dev->gnd_nq_map),
+ atomic_read(&dev->gnd_n_mdd), atomic_read(&dev->gnd_n_mdd_held),
+ atomic64_read(&dev->gnd_nbytes_map));
+ }
+
+ /* we need to stop processing the rest of the list, so add it back in */
+ spin_lock(&dev->gnd_lock);
+ kgnilnd_tx_add_state_locked(tx, NULL, tx->tx_conn, GNILND_TX_MAPQ, 0);
+ spin_unlock(&dev->gnd_lock);
+ GOTO(get_out_mapped, rc);
+ }
+ spin_unlock(&dev->gnd_lock);
+get_out_mapped:
+ RETURN(found_work);
+}
+
+int
+kgnilnd_process_conns(kgn_device_t *dev)
+{
+ int found_work = 0;
+ int conn_sched;
+ int intent = 0;
+ kgn_conn_t *conn;
+
+ spin_lock(&dev->gnd_lock);
+ while (!list_empty(&dev->gnd_ready_conns)) {
+ dev->gnd_sched_alive = jiffies;
+
+ if (unlikely(kgnilnd_data.kgn_quiesce_trigger)) {
+ /* break with lock held */
+ break;
+ }
+
+ conn = list_first_entry(&dev->gnd_ready_conns, kgn_conn_t, gnc_schedlist);
+ list_del_init(&conn->gnc_schedlist);
+ spin_unlock(&dev->gnd_lock);
+
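+		/* atomically claim the conn for processing - the old value tells us how it was scheduled */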
+ conn_sched = xchg(&conn->gnc_scheduled, GNILND_CONN_PROCESS);
+
+ LASSERTF(conn_sched != GNILND_CONN_IDLE &&
+ conn_sched != GNILND_CONN_PROCESS,
+ "conn %p on ready list but in bad state: %d\n",
+ conn, conn_sched);
+
+ CDEBUG(D_INFO, "conn %p@%s for processing\n",
+ conn, kgnilnd_conn_state2str(conn));
+
+ found_work++;
+ set_mb(conn->gnc_last_sched_do, jiffies);
+
+ if (kgnilnd_check_conn_fail_loc(dev, conn, &intent)) {
+
+ /* based on intent see if we should run again. */
+ kgnilnd_schedule_process_conn(conn, intent);
+
+ /* drop ref from gnd_ready_conns */
+ kgnilnd_conn_decref(conn);
+ /* clear this so that scheduler thread doesn't spin */
+ found_work = 0;
+ /* break with lock held... */
+ spin_lock(&dev->gnd_lock);
+ break;
+ }
+
+ if (unlikely(conn->gnc_state == GNILND_CONN_CLOSED)) {
+			/* CONN_CLOSED set in process_fmaq when CLOSE is sent */
+ kgnilnd_complete_closed_conn(conn);
+ } else if (unlikely(conn->gnc_state == GNILND_CONN_DESTROY_EP)) {
+ /* DESTROY_EP set in kgnilnd_conn_decref on gnc_refcount = 1 */
+ /* serialize SMSG CQs with ep_bind and smsg_release */
+ kgnilnd_destroy_conn_ep(conn);
+ } else if (unlikely(conn->gnc_state == GNILND_CONN_CLOSING)) {
+ /* if we need to do some CLOSE sending, etc done here do it */
+ kgnilnd_send_conn_close(conn);
+ kgnilnd_check_fma_rx(conn);
+ } else if (atomic_read(&conn->gnc_peer->gnp_dirty_eps) == 0) {
+ /* start moving traffic if the old conns are cleared out */
+ kgnilnd_check_fma_rx(conn);
+ kgnilnd_process_fmaq(conn);
+ }
+
+ kgnilnd_schedule_process_conn(conn, 0);
+
+ /* drop ref from gnd_ready_conns */
+ kgnilnd_conn_decref(conn);
+
+ /* check list again with lock held */
+ spin_lock(&dev->gnd_lock);
+ }
+ spin_unlock(&dev->gnd_lock);
+
+ RETURN(found_work);
+}
+
+int
+kgnilnd_scheduler(void *arg)
+{
+ int threadno = (long)arg;
+ kgn_device_t *dev;
+ char name[16];
+ int busy_loops = 0;
+ DEFINE_WAIT(wait);
+
+ dev = &kgnilnd_data.kgn_devices[(threadno + 1) % kgnilnd_data.kgn_ndevs];
+
+ snprintf(name, sizeof(name), "kgnilnd_sd_%02d", threadno);
+ cfs_daemonize(name);
+ cfs_block_allsigs();
+
+ /* all gnilnd threads need to run fairly urgently */
+ set_user_nice(current, *kgnilnd_tunables.kgn_nice);
+
+ while (!kgnilnd_data.kgn_shutdown) {
+ int found_work = 0;
+ /* Safe: kgn_shutdown only set when quiescent */
+
+ /* to quiesce or to not quiesce, that is the question */
+
+ if (unlikely(kgnilnd_data.kgn_quiesce_trigger)) {
+ KGNILND_SPIN_QUIESCE;
+ }
+
+ /* tracking for when thread goes AWOL */
+ dev->gnd_sched_alive = jiffies;
+
+ /* let folks know we are up and kicking
+ * - they can use this for latency savings, etc
+		 * - only change if IRQ; if IDLE leave it alone so that
+		 *   schedule_device calls put us back to IRQ */
+ (void)cmpxchg(&dev->gnd_ready, GNILND_DEV_IRQ, GNILND_DEV_LOOP);
+
+ /* always check these - they are super low cost */
+ found_work += kgnilnd_check_fma_send_cq(dev);
+ found_work += kgnilnd_check_fma_rcv_cq(dev);
+
+ /* rdma CQ doesn't care about eps */
+ found_work += kgnilnd_check_rdma_cq(dev);
+
+ /* move some RDMA ? */
+ found_work += kgnilnd_process_rdmaq(dev);
+
+ /* map some pending RDMA requests ? */
+ found_work += kgnilnd_process_mapped_tx(dev);
+
+ /* the EP for a conn is not destroyed until all the references
+ * to it are gone, so these checks should be safe
+ * even if run in parallel with the CQ checking functions
+ * _AND_ a thread that processes the CLOSED->DONE
+		 * transition
+ * ...should.... */
+
+ /* process all conns ready now */
+ found_work += kgnilnd_process_conns(dev);
+
+ /* do an eager check to avoid the IRQ disabling in
+ * prepare_to_wait and friends */
+
+ if (found_work && busy_loops++ < *kgnilnd_tunables.kgn_loops) {
+ found_work = 0;
+ if ((busy_loops % 10) == 0) {
+ /* tickle heartbeat and watchdog to ensure our
+ * piggishness doesn't turn into heartbeat failure */
+ touch_nmi_watchdog();
+ if (kgnilnd_hssops.hb_to_l0 != NULL) {
+ kgnilnd_hssops.hb_to_l0();
+ }
+ }
+ continue;
+ }
+
+ /* if we got here, found_work was zero or busy_loops means we
+ * need to take a break. We'll clear gnd_ready but we'll check
+ * one last time if there is an IRQ that needs processing */
+
+ prepare_to_wait(&dev->gnd_waitq, &wait, TASK_INTERRUPTIBLE);
+
+ /* the first time this will go LOOP -> IDLE and let us do one final check
+ * during which we might get an IRQ, then IDLE->IDLE and schedule()
+ * - this might allow other threads to block us for a bit if they
+ * try to get the mutex, but that is good as we'd need to wake
+ * up soon to handle the CQ or other processing anyways */
+
+ found_work += xchg(&dev->gnd_ready, GNILND_DEV_IDLE);
+
+ if (busy_loops >= *kgnilnd_tunables.kgn_loops) {
+ CDEBUG(D_INFO,
+			       "yielding: found_work %d busy_loops %d\n",
+ found_work, busy_loops);
+ busy_loops = 0;
+ /* use yield if we are bailing due to busy_loops
+ * - this will ensure we wake up soonish. This closes
+ * a race with kgnilnd_device_callback - where it'd
+ * not call wake_up() because gnd_ready == 1, but then
+ * we come down and schedule() because of busy_loops.
+ * We'd not be woken up until something poked our waitq
+ * again. yield() ensures we wake up without another
+ * waitq poke in that case */
+ atomic_inc(&dev->gnd_n_yield);
+ yield();
+			CDEBUG(D_INFO, "awake after yield\n");
+ } else if (found_work == GNILND_DEV_IDLE) {
+ /* busy_loops is low and there is nothing to do,
+ * go to sleep and wait for a waitq poke */
+ CDEBUG(D_INFO,
+ "scheduling: found_work %d busy_loops %d\n",
+ found_work, busy_loops);
+ atomic_inc(&dev->gnd_n_schedule);
+ schedule();
+ CDEBUG(D_INFO, "awake after schedule\n");
+ }
+ finish_wait(&dev->gnd_waitq, &wait);
+ }
+
+ kgnilnd_thread_fini();
+ return 0;
+}
--- /dev/null
+/*
+ * Copyright (C) 2012 Cray, Inc.
+ *
+ * Author: Igor Gorodetsky <iogordet@cray.com>
+ * Author: Nic Henke <nic@cray.com>
+ * Author: James Shimek <jshimek@cray.com>
+ *
+ * This file is part of Lustre, http://www.lustre.org.
+ *
+ * Lustre is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Lustre is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Lustre; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#include "gnilnd.h"
+
+void
+kgnilnd_setup_smsg_attr(gni_smsg_attr_t *smsg_attr)
+{
+ smsg_attr->mbox_maxcredit = *kgnilnd_tunables.kgn_mbox_credits;
+ smsg_attr->msg_maxsize = GNILND_MAX_MSG_SIZE;
+ smsg_attr->msg_type = GNI_SMSG_TYPE_MBOX_AUTO_RETRANSMIT;
+}
+
+int
+kgnilnd_map_fmablk(kgn_device_t *device, kgn_fma_memblock_t *fma_blk)
+{
+ gni_return_t rrc;
+ __u32 flags = GNI_MEM_READWRITE;
+
+ if (fma_blk->gnm_state == GNILND_FMABLK_PHYS) {
+ flags |= GNI_MEM_PHYS_CONT;
+ }
+
+ /* make sure we are mapping a clean block */
+ LASSERTF(fma_blk->gnm_hndl.qword1 == 0UL, "fma_blk %p dirty\n", fma_blk);
+
+ rrc = kgnilnd_mem_register(device->gnd_handle, (__u64)fma_blk->gnm_block,
+ fma_blk->gnm_blk_size, device->gnd_rcv_fma_cqh,
+ flags, &fma_blk->gnm_hndl);
+ if (rrc != GNI_RC_SUCCESS) {
+ /* XXX Nic: need a way to silence this for runtime stuff that is ok to fail
+ * -- like when under MDD or GART pressure on big systems
+ */
+ CNETERR("register fmablk failed 0x%p mbox_size %d flags %u\n",
+ fma_blk, fma_blk->gnm_mbox_size, flags);
+ RETURN(-ENOMEM);
+ }
+
+ /* PHYS_CONT memory isn't really mapped, at least not in GART -
+	 * but all mappings chew up an MDD
+ */
+ if (fma_blk->gnm_state != GNILND_FMABLK_PHYS) {
+ atomic64_add(fma_blk->gnm_blk_size, &device->gnd_nbytes_map);
+ }
+
+ atomic_inc(&device->gnd_n_mdd);
+ /* nfmablk is live (mapped) blocks */
+ atomic_inc(&device->gnd_nfmablk);
+
+ RETURN(0);
+}
+
+int
+kgnilnd_alloc_fmablk(kgn_device_t *device, int use_phys)
+{
+ int rc = 0;
+ int num_mbox;
+ kgn_fma_memblock_t *fma_blk;
+ gni_smsg_attr_t smsg_attr;
+ unsigned long fmablk_vers;
+
+ /* we'll use fmablk_vers and the gnd_fmablk_sem to gate access
+ * to this allocation code. Everyone will sample the version
+ * before and after getting the semaphore. If it has changed,
+ * we'll bail out to check the lists again - this indicates that
+ * some sort of change was made to the lists and it is possible
+ * that there is a mailbox for us to find now. This should prevent
+ * a ton of spinning in the case where there are lots of threads
+ * that need a yet-to-be-allocated mailbox for a connection. */
+
+ fmablk_vers = atomic_read(&device->gnd_fmablk_vers);
+ down(&device->gnd_fmablk_sem);
+
+ if (fmablk_vers != atomic_read(&device->gnd_fmablk_vers)) {
+ /* version changed while we were waiting for semaphore,
+ * we'll recheck the lists assuming something nice happened */
+ up(&device->gnd_fmablk_sem);
+ return 0;
+ }
+
+ LIBCFS_ALLOC(fma_blk, sizeof(kgn_fma_memblock_t));
+ if (fma_blk == NULL) {
+ CNETERR("could not allocate fma block descriptor\n");
+ rc = -ENOMEM;
+ GOTO(out, rc);
+ }
+
+ INIT_LIST_HEAD(&fma_blk->gnm_bufflist);
+
+ kgnilnd_setup_smsg_attr(&smsg_attr);
+
+ gni_smsg_buff_size_needed(&smsg_attr, &fma_blk->gnm_mbox_size);
+
+ LASSERTF(fma_blk->gnm_mbox_size, "mbox size %d\n", fma_blk->gnm_mbox_size);
+
+ /* gni_smsg_buff_size_needed calculates the base mailbox size and since
+ * we want to hold kgn_peer_credits worth of messages in both directions,
+ * we add PAYLOAD to grow the mailbox size
+ */
+
+ fma_blk->gnm_mbox_size += GNILND_MBOX_PAYLOAD;
+
+ /* we'll only use physical during preallocate at startup -- this keeps it nice and
+ * clean for runtime decisions. We'll keep the PHYS ones around until shutdown
+ * as reallocating them is tough if there is memory fragmentation */
+
+ if (use_phys) {
+ fma_blk->gnm_block = cfs_mem_cache_alloc(kgnilnd_data.kgn_mbox_cache, CFS_ALLOC_ATOMIC);
+ if (fma_blk->gnm_block == NULL) {
+ CNETERR("could not allocate physical SMSG mailbox memory\n");
+ rc = -ENOMEM;
+ GOTO(free_desc, rc);
+ }
+ fma_blk->gnm_blk_size = KMALLOC_MAX_SIZE;
+ num_mbox = fma_blk->gnm_blk_size / fma_blk->gnm_mbox_size;
+
+ LASSERTF(num_mbox >= 1,
+ "num_mbox %d blk_size %u mbox_size %d\n",
+ num_mbox, fma_blk->gnm_blk_size, fma_blk->gnm_mbox_size);
+
+ fma_blk->gnm_state = GNILND_FMABLK_PHYS;
+
+ } else {
+ num_mbox = *kgnilnd_tunables.kgn_mbox_per_block;
+ fma_blk->gnm_blk_size = num_mbox * fma_blk->gnm_mbox_size;
+
+ LASSERTF(num_mbox >= 1 && num_mbox >= *kgnilnd_tunables.kgn_mbox_per_block,
+ "num_mbox %d blk_size %u mbox_size %d tunable %d\n",
+ num_mbox, fma_blk->gnm_blk_size, fma_blk->gnm_mbox_size,
+ *kgnilnd_tunables.kgn_mbox_per_block);
+
+ LIBCFS_ALLOC(fma_blk->gnm_block, fma_blk->gnm_blk_size);
+ if (fma_blk->gnm_block == NULL) {
+ CNETERR("could not allocate virtual SMSG mailbox memory, %d bytes\n", fma_blk->gnm_blk_size);
+ rc = -ENOMEM;
+ GOTO(free_desc, rc);
+ }
+
+ fma_blk->gnm_state = GNILND_FMABLK_VIRT;
+ }
+
+ /* allocate just enough space for the bits to track the mailboxes */
+ LIBCFS_ALLOC(fma_blk->gnm_bit_array, BITS_TO_LONGS(num_mbox) * sizeof(unsigned long));
+ if (fma_blk->gnm_bit_array == NULL) {
+ CNETERR("could not allocate mailbox bitmask, %lu bytes for %d mbox\n",
+ sizeof(unsigned long) * BITS_TO_LONGS(num_mbox), num_mbox);
+ rc = -ENOMEM;
+ GOTO(free_blk, rc);
+ }
+ bitmap_zero(fma_blk->gnm_bit_array, num_mbox);
+
+ /* now that the num_mbox is set based on allocation type, get debug info setup */
+ LIBCFS_ALLOC(fma_blk->gnm_mbox_info, sizeof(kgn_mbox_info_t) * num_mbox);
+ if (fma_blk->gnm_mbox_info == NULL) {
+ CNETERR("could not allocate mailbox debug, %lu bytes for %d mbox\n",
+ sizeof(kgn_mbox_info_t) * num_mbox, num_mbox);
+ rc = -ENOMEM;
+ GOTO(free_bit, rc);
+ }
+
+ rc = kgnilnd_map_fmablk(device, fma_blk);
+ if (rc) {
+ GOTO(free_info, rc);
+ }
+
+ fma_blk->gnm_next_avail_mbox = 0;
+ fma_blk->gnm_avail_mboxs = fma_blk->gnm_num_mboxs = num_mbox;
+
+ CDEBUG(D_MALLOC, "alloc fmablk 0x%p num %d msg_maxsize %d credits %d "
+ "mbox_size %d MDD "LPX64"."LPX64"\n",
+ fma_blk, num_mbox, smsg_attr.msg_maxsize, smsg_attr.mbox_maxcredit,
+ fma_blk->gnm_mbox_size, fma_blk->gnm_hndl.qword1,
+ fma_blk->gnm_hndl.qword2);
+
+	/* the lock protects the data structures, not the semaphore */
+
+ spin_lock(&device->gnd_fmablk_lock);
+ list_add_tail(&fma_blk->gnm_bufflist, &device->gnd_fma_buffs);
+
+	/* bump the version under the lock so that once it changes, the list
+	 * is ready for others to traverse */
+ atomic_inc(&device->gnd_fmablk_vers);
+
+ spin_unlock(&device->gnd_fmablk_lock);
+
+ up(&device->gnd_fmablk_sem);
+
+ return 0;
+
+free_info:
+ LIBCFS_FREE(fma_blk->gnm_mbox_info, sizeof(kgn_mbox_info_t)*num_mbox);
+free_bit:
+ LIBCFS_FREE(fma_blk->gnm_bit_array, BITS_TO_LONGS(num_mbox) * sizeof (unsigned long));
+free_blk:
+ if (fma_blk->gnm_state == GNILND_FMABLK_VIRT) {
+ LIBCFS_FREE(fma_blk->gnm_block, fma_blk->gnm_blk_size);
+ } else {
+ cfs_mem_cache_free(kgnilnd_data.kgn_mbox_cache, fma_blk->gnm_block);
+ }
+free_desc:
+ LIBCFS_FREE(fma_blk, sizeof(kgn_fma_memblock_t));
+out:
+ up(&device->gnd_fmablk_sem);
+ return rc;
+}
+
+void
+kgnilnd_unmap_fmablk(kgn_device_t *dev, kgn_fma_memblock_t *fma_blk)
+{
+ gni_return_t rrc;
+
+	/* if some mboxes are held, set hold_timeout from the conn timeouts used in
+	 * this block; during shutdown we skip that and just nuke and pave */
+ if (fma_blk->gnm_held_mboxs && (!kgnilnd_data.kgn_shutdown)) {
+ fma_blk->gnm_hold_timeout = GNILND_TIMEOUT2DEADMAN;
+ }
+
+ /* we are changing the state of a block, tickle version to tell
+ * proc code list is stale now */
+ atomic_inc(&dev->gnd_fmablk_vers);
+
+ rrc = kgnilnd_mem_deregister(dev->gnd_handle, &fma_blk->gnm_hndl, fma_blk->gnm_hold_timeout);
+
+ CDEBUG(rrc == GNI_RC_SUCCESS ? D_MALLOC : D_CONSOLE|D_NETERROR,
+ "unmap fmablk 0x%p@%s sz %u total %d avail %d held %d mbox_size %d "
+ "hold_timeout %d\n",
+ fma_blk, kgnilnd_fmablk_state2str(fma_blk->gnm_state),
+ fma_blk->gnm_blk_size, fma_blk->gnm_num_mboxs,
+ fma_blk->gnm_avail_mboxs, fma_blk->gnm_held_mboxs,
+ fma_blk->gnm_mbox_size, fma_blk->gnm_hold_timeout);
+
+ LASSERTF(rrc == GNI_RC_SUCCESS,
+ "tried to double unmap or something bad, fma_blk %p (rrc %d)\n",
+ fma_blk, rrc);
+
+ if (fma_blk->gnm_hold_timeout) {
+ atomic_inc(&dev->gnd_n_mdd_held);
+ } else {
+ atomic_dec(&dev->gnd_n_mdd);
+ }
+
+ /* PHYS blocks don't get mapped */
+ if (fma_blk->gnm_state != GNILND_FMABLK_PHYS) {
+ atomic64_sub(fma_blk->gnm_blk_size, &dev->gnd_nbytes_map);
+ } else if (kgnilnd_data.kgn_in_reset) {
+ /* in stack reset, clear MDD handle for PHYS blocks, as we'll
+ * re-use the fma_blk after reset so we don't have to drop/allocate
+ * all of those physical blocks */
+ fma_blk->gnm_hndl.qword1 = fma_blk->gnm_hndl.qword2 = 0UL;
+ }
+
+ /* Decrement here as this is the # of mapped blocks */
+ atomic_dec(&dev->gnd_nfmablk);
+}
+
+
+/* needs lock on gnd_fmablk_lock to cover gnd_fma_buffs */
+void
+kgnilnd_free_fmablk_locked(kgn_device_t *dev, kgn_fma_memblock_t *fma_blk)
+{
+ LASSERTF(fma_blk->gnm_avail_mboxs == fma_blk->gnm_num_mboxs,
+ "fma_blk %p@%d free in bad state (%d): blk total %d avail %d held %d\n",
+ fma_blk, fma_blk->gnm_state, fma_blk->gnm_hold_timeout, fma_blk->gnm_num_mboxs,
+ fma_blk->gnm_avail_mboxs, fma_blk->gnm_held_mboxs);
+
+ atomic_inc(&dev->gnd_fmablk_vers);
+
+ if (fma_blk->gnm_hold_timeout) {
+ CDEBUG(D_MALLOC, "mdd release fmablk 0x%p sz %u avail %d held %d "
+ "mbox_size %d\n",
+ fma_blk, fma_blk->gnm_blk_size, fma_blk->gnm_avail_mboxs,
+ fma_blk->gnm_held_mboxs, fma_blk->gnm_mbox_size);
+
+ /* We leave MDD dangling over stack reset */
+ if (!kgnilnd_data.kgn_in_reset) {
+ kgnilnd_mem_mdd_release(dev->gnd_handle, &fma_blk->gnm_hndl);
+ }
+ /* ignoring the return code - if kgni/ghal can't find it
+ * it must be released already */
+ atomic_dec(&dev->gnd_n_mdd_held);
+ atomic_dec(&dev->gnd_n_mdd);
+ }
+
+	/* we can't free the gnm_block until all the conns have released their
+ * purgatory holds. While we have purgatory holds, we might check the conn
+ * RX mailbox during the CLOSING process. It is possible that kgni might
+ * try to look into the RX side for credits when sending the CLOSE msg too */
+ CDEBUG(D_MALLOC, "fmablk %p free buffer %p mbox_size %d\n",
+ fma_blk, fma_blk->gnm_block, fma_blk->gnm_mbox_size);
+
+ if (fma_blk->gnm_state == GNILND_FMABLK_PHYS) {
+ cfs_mem_cache_free(kgnilnd_data.kgn_mbox_cache, fma_blk->gnm_block);
+ } else {
+ LIBCFS_FREE(fma_blk->gnm_block, fma_blk->gnm_blk_size);
+ }
+ fma_blk->gnm_state = GNILND_FMABLK_FREED;
+
+ list_del(&fma_blk->gnm_bufflist);
+
+ LIBCFS_FREE(fma_blk->gnm_mbox_info, sizeof(kgn_mbox_info_t)*fma_blk->gnm_num_mboxs);
+ LIBCFS_FREE(fma_blk->gnm_bit_array, BITS_TO_LONGS(fma_blk->gnm_num_mboxs) * sizeof (unsigned long));
+ LIBCFS_FREE(fma_blk, sizeof(kgn_fma_memblock_t));
+}
+
+void
+kgnilnd_find_free_mbox(kgn_conn_t *conn)
+{
+ kgn_device_t *dev = conn->gnc_device;
+ gni_smsg_attr_t *smsg_attr = &conn->gnpr_smsg_attr;
+ kgn_fma_memblock_t *fma_blk;
+ kgn_mbox_info_t *mbox = NULL;
+ int id;
+
+ spin_lock(&dev->gnd_fmablk_lock);
+
+ list_for_each_entry(fma_blk, &conn->gnc_device->gnd_fma_buffs,
+ gnm_bufflist) {
+ if (fma_blk->gnm_avail_mboxs <= 0 ||
+ fma_blk->gnm_state <= GNILND_FMABLK_IDLE) {
+ continue;
+ }
+ /* look in bitarray for available mailbox */
+ do {
+ id = find_next_zero_bit(
+ fma_blk->gnm_bit_array,
+ fma_blk->gnm_num_mboxs,
+ fma_blk->gnm_next_avail_mbox);
+ if (id == fma_blk->gnm_num_mboxs &&
+ fma_blk->gnm_next_avail_mbox != 0) {
+ /* wrap around */
+ fma_blk->gnm_next_avail_mbox = 0;
+ } else {
+ break;
+ }
+ } while (1);
+
+ LASSERTF(id < fma_blk->gnm_num_mboxs, "id %d max %d\n",
+ id, fma_blk->gnm_num_mboxs);
+ set_bit(id, (volatile unsigned long *)fma_blk->gnm_bit_array);
+ conn->gnc_mbox_id = id;
+
+ fma_blk->gnm_next_avail_mbox =
+ (id == (fma_blk->gnm_num_mboxs - 1)) ? 0 : (id + 1);
+ fma_blk->gnm_avail_mboxs--;
+ conn->gnc_fma_blk = fma_blk;
+
+ kgnilnd_setup_smsg_attr(smsg_attr);
+
+ smsg_attr->msg_buffer = fma_blk->gnm_block;
+ smsg_attr->mbox_offset = fma_blk->gnm_mbox_size * id;
+ smsg_attr->mem_hndl = fma_blk->gnm_hndl;
+ smsg_attr->buff_size = fma_blk->gnm_mbox_size;
+
+ /* We'll set the hndl to zero for PHYS blocks unmapped during stack
+ * reset and re-use the same fma_blk after stack reset. This ensures we've
+ * properly mapped it before we use it */
+ LASSERTF(fma_blk->gnm_hndl.qword1 != 0UL, "unmapped fma_blk %p, state %d\n",
+ fma_blk, fma_blk->gnm_state);
+
+ CDEBUG(D_NET, "conn %p smsg %p fmablk %p "
+ "allocating SMSG mbox %d buf %p "
+ "offset %u hndl "LPX64"."LPX64"\n",
+ conn, smsg_attr, fma_blk, id,
+ smsg_attr->msg_buffer, smsg_attr->mbox_offset,
+ fma_blk->gnm_hndl.qword1,
+ fma_blk->gnm_hndl.qword2);
+
+ mbox = &fma_blk->gnm_mbox_info[id];
+ mbox->mbx_create_conn_memset = jiffies;
+
+ /* zero mbox to remove any old data from our last use.
+ * this better be safe, if not our purgatory timers
+ * are too short or a peer really is misbehaving */
+ memset(smsg_attr->msg_buffer + smsg_attr->mbox_offset,
+ 0, smsg_attr->buff_size);
+ break;
+ }
+
+ spin_unlock(&dev->gnd_fmablk_lock);
+}
+
+int
+kgnilnd_setup_mbox(kgn_conn_t *conn)
+{
+ gni_smsg_attr_t *smsg_attr = &conn->gnpr_smsg_attr;
+ int err = 0;
+
+ smsg_attr->msg_buffer = NULL;
+ /* Look for available mbox */
+ do {
+ kgnilnd_find_free_mbox(conn);
+
+ /* nothing in the existing buffers, make a new one */
+ if (smsg_attr->msg_buffer == NULL) {
+ /* for runtime allocations, we only want vmalloc */
+ err = kgnilnd_alloc_fmablk(conn->gnc_device, 0);
+ if (err) {
+ break;
+ }
+ }
+ } while (smsg_attr->msg_buffer == NULL);
+
+ if (err)
+ CNETERR("couldn't allocate SMSG mbox for conn %p Error: %d\n",
+ conn, err);
+ return err;
+}
+
+void
+kgnilnd_release_mbox(kgn_conn_t *conn, int purgatory_hold)
+{
+ kgn_device_t *dev = conn->gnc_device;
+ gni_smsg_attr_t *smsg_attr = &conn->gnpr_smsg_attr;
+ kgn_fma_memblock_t *fma_blk = NULL;
+ kgn_mbox_info_t *mbox = NULL;
+ int found = 0;
+ int id;
+
+ /* if we failed to setup mbox and now destroying conn */
+ if (smsg_attr->msg_buffer == NULL) {
+ return;
+ }
+
+ id = conn->gnc_mbox_id;
+
+ spin_lock(&dev->gnd_fmablk_lock);
+ /* make sure our conn points at a valid fma_blk
+ * We use this instead of a mem block search out of smsg_attr
+ * because we could have freed a block for fma_blk #1 but the fma_blk
+ * is still in the list for a purgatory hold. This would induce a false
+ * match if that same block gets reallocated to fma_blk #2 */
+ list_for_each_entry(fma_blk, &dev->gnd_fma_buffs, gnm_bufflist) {
+ if (fma_blk == conn->gnc_fma_blk) {
+ found = 1;
+ break;
+ }
+ }
+ LASSERTF(found, "unable to find conn 0x%p with gnc_fma_blk %p "
+ "anywhere in the world\n", conn, conn->gnc_fma_blk);
+
+ LASSERTF(id < fma_blk->gnm_num_mboxs,
+ "bad id %d max %d\n",
+ id, fma_blk->gnm_num_mboxs);
+
+ /* < 0 - was held, now free it
+ * == 0 - just free it
+ * > 0 - hold it for now */
+ if (purgatory_hold == 0) {
+ CDEBUG(D_NET, "conn %p smsg %p fmablk %p freeing SMSG mbox %d "
+ "hndl "LPX64"."LPX64"\n",
+ conn, smsg_attr, fma_blk, id,
+ fma_blk->gnm_hndl.qword1, fma_blk->gnm_hndl.qword2);
+ fma_blk->gnm_avail_mboxs++;
+
+ } else if (purgatory_hold > 0) {
+ CDEBUG(D_NET, "conn %p smsg %p fmablk %p holding SMSG mbox %d "
+ "hndl "LPX64"."LPX64"\n",
+ conn, smsg_attr, fma_blk, id,
+ fma_blk->gnm_hndl.qword1, fma_blk->gnm_hndl.qword2);
+
+ fma_blk->gnm_held_mboxs++;
+ fma_blk->gnm_max_timeout = MAX(fma_blk->gnm_max_timeout,
+ conn->gnc_timeout);
+ } else {
+ CDEBUG(D_NET, "conn %p smsg %p fmablk %p release SMSG mbox %d "
+ "hndl "LPX64"."LPX64"\n",
+ conn, smsg_attr, fma_blk, id,
+ fma_blk->gnm_hndl.qword1, fma_blk->gnm_hndl.qword2);
+
+ fma_blk->gnm_held_mboxs--;
+ fma_blk->gnm_avail_mboxs++;
+ }
+
+ if (purgatory_hold <= 0) {
+ /* if kgni is retransmitting, freeing the smsg block before the EP
+ * is destroyed gets messy. Bug 768295. */
+ LASSERTF(conn->gnc_ephandle == NULL,
+ "can't release mbox before EP is nuked. conn 0x%p\n", conn);
+
+ mbox = &fma_blk->gnm_mbox_info[id];
+ mbox->mbx_release_from_purgatory = jiffies;
+
+ /* clear conn gnc_fmablk if it is gone - this allows us to
+ * not worry about state so much in kgnilnd_destroy_conn
+ * and makes the guaranteed cleanup of the resources easier */
+ LASSERTF(test_and_clear_bit(id, fma_blk->gnm_bit_array),
+ "conn %p bit %d already cleared in fma_blk %p\n",
+ conn, id, fma_blk);
+ conn->gnc_fma_blk = NULL;
+ }
+
+ if (CFS_FAIL_CHECK(CFS_FAIL_GNI_FMABLK_AVAIL)) {
+ CERROR("LBUGs in your future: forcibly marking fma_blk %p "
+ "as mapped\n", fma_blk);
+ fma_blk->gnm_state = GNILND_FMABLK_VIRT;
+ }
+
+ /* we don't release or unmap PHYS blocks as part of the normal cycle --
+ * those are controlled manually from startup/shutdown */
+ if (fma_blk->gnm_state != GNILND_FMABLK_PHYS) {
+ /* we can unmap once all are unused (held or avail)
+ * but check hold_timeout to make sure we are not trying to double
+ * unmap this buffer. If there was no hold_timeout set due to
+		 * held_mboxs, we'll free the mbox here shortly and won't have to
+ * worry about catching a double free for a 'clean' fma_blk */
+ if (((fma_blk->gnm_avail_mboxs + fma_blk->gnm_held_mboxs) == fma_blk->gnm_num_mboxs) &&
+ (!fma_blk->gnm_hold_timeout)) {
+ kgnilnd_unmap_fmablk(dev, fma_blk);
+ }
+
+ /* But we can only free once they are all avail */
+ if (fma_blk->gnm_avail_mboxs == fma_blk->gnm_num_mboxs &&
+ fma_blk->gnm_held_mboxs == 0) {
+ /* all mailboxes are released, free fma_blk */
+ kgnilnd_free_fmablk_locked(dev, fma_blk);
+ }
+ }
+
+ spin_unlock(&dev->gnd_fmablk_lock);
+}
+
+int
+kgnilnd_count_phys_mbox(kgn_device_t *device)
+{
+ int i = 0;
+ kgn_fma_memblock_t *fma_blk;
+
+ spin_lock(&device->gnd_fmablk_lock);
+
+ list_for_each_entry(fma_blk, &device->gnd_fma_buffs, gnm_bufflist) {
+ if (fma_blk->gnm_state == GNILND_FMABLK_PHYS)
+ i += fma_blk->gnm_num_mboxs;
+ }
+ spin_unlock(&device->gnd_fmablk_lock);
+
+ RETURN(i);
+}
+
+int
+kgnilnd_allocate_phys_fmablk(kgn_device_t *device)
+{
+ int rc;
+
+ while (kgnilnd_count_phys_mbox(device) < *kgnilnd_tunables.kgn_nphys_mbox) {
+
+ rc = kgnilnd_alloc_fmablk(device, 1);
+ if (rc) {
+ CERROR("failed phys mbox allocation, stopping at %d, rc %d\n",
+ kgnilnd_count_phys_mbox(device), rc);
+ RETURN(rc);
+ }
+ }
+ RETURN(0);
+}
+
+int
+kgnilnd_map_phys_fmablk(kgn_device_t *device)
+{
+
+ int rc = 0;
+ kgn_fma_memblock_t *fma_blk;
+
+ /* use sem to gate access to single thread, just in case */
+ down(&device->gnd_fmablk_sem);
+
+ spin_lock(&device->gnd_fmablk_lock);
+
+	list_for_each_entry(fma_blk, &device->gnd_fma_buffs, gnm_bufflist) {
+		if (fma_blk->gnm_state == GNILND_FMABLK_PHYS) {
+			rc = kgnilnd_map_fmablk(device, fma_blk);
+			if (rc)
+				break;
+		}
+	}
+ spin_unlock(&device->gnd_fmablk_lock);
+
+ up(&device->gnd_fmablk_sem);
+
+ RETURN(rc);
+}
+
+void
+kgnilnd_unmap_phys_fmablk(kgn_device_t *device)
+{
+
+ kgn_fma_memblock_t *fma_blk;
+
+ /* use sem to gate access to single thread, just in case */
+ down(&device->gnd_fmablk_sem);
+
+ spin_lock(&device->gnd_fmablk_lock);
+
+ list_for_each_entry(fma_blk, &device->gnd_fma_buffs, gnm_bufflist) {
+ if (fma_blk->gnm_state == GNILND_FMABLK_PHYS)
+ kgnilnd_unmap_fmablk(device, fma_blk);
+ }
+ spin_unlock(&device->gnd_fmablk_lock);
+
+ up(&device->gnd_fmablk_sem);
+}
+
+void
+kgnilnd_free_phys_fmablk(kgn_device_t *device)
+{
+
+ kgn_fma_memblock_t *fma_blk, *fma_blkN;
+
+ /* use sem to gate access to single thread, just in case */
+ down(&device->gnd_fmablk_sem);
+
+ spin_lock(&device->gnd_fmablk_lock);
+
+ list_for_each_entry_safe(fma_blk, fma_blkN, &device->gnd_fma_buffs, gnm_bufflist) {
+ if (fma_blk->gnm_state == GNILND_FMABLK_PHYS)
+ kgnilnd_free_fmablk_locked(device, fma_blk);
+ }
+ spin_unlock(&device->gnd_fmablk_lock);
+
+ up(&device->gnd_fmablk_sem);
+}
+
+/* kgnilnd dgram nid->struct management */
+
+static inline struct list_head *
+kgnilnd_nid2dgramlist(kgn_device_t *dev, lnet_nid_t nid)
+{
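+	/* hash the low 32 bits of the NID into the dgram hash table */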
+ unsigned int hash = ((unsigned int)nid) % *kgnilnd_tunables.kgn_peer_hash_size;
+
+ RETURN(&dev->gnd_dgrams[hash]);
+}
+
+
+/* needs dev->gnd_dgram_lock held */
+kgn_dgram_t *
+kgnilnd_find_dgram_locked(kgn_device_t *dev, lnet_nid_t dst_nid)
+{
+ struct list_head *dgram_list = kgnilnd_nid2dgramlist(dev, dst_nid);
+ kgn_dgram_t *dgram;
+
+ list_for_each_entry(dgram, dgram_list, gndg_list) {
+
+ /* if state > POSTED, we are already handling cancel/completion */
+ if ((dgram->gndg_conn_out.gncr_dstnid != dst_nid) ||
+ dgram->gndg_state > GNILND_DGRAM_POSTED)
+ continue;
+
+ CDEBUG(D_NET, "got dgram [%p] -> %s\n",
+ dgram, libcfs_nid2str(dst_nid));
+ return dgram;
+ }
+ return NULL;
+}
+
+int
+kgnilnd_find_and_cancel_dgram(kgn_device_t *dev, lnet_nid_t dst_nid)
+{
+ kgn_dgram_t *dgram;
+
+ spin_lock(&dev->gnd_dgram_lock);
+ dgram = kgnilnd_find_dgram_locked(dev, dst_nid);
+
+ if (dgram) {
+ kgnilnd_cancel_dgram_locked(dgram);
+ }
+ spin_unlock(&dev->gnd_dgram_lock);
+
+ RETURN(!!(dgram == NULL));
+}
+
+int
+kgnilnd_pack_connreq(kgn_connreq_t *connreq, kgn_conn_t *conn,
+ lnet_nid_t srcnid, lnet_nid_t dstnid,
+ kgn_connreq_type_t type)
+{
+ int err = 0;
+
+ /* ensure we haven't violated max datagram size */
+ CLASSERT(sizeof(kgn_connreq_t) <= GNI_DATAGRAM_MAXSIZE);
+
+ /* no need to zero out, we do that when allocating dgram */
+ connreq->gncr_magic = GNILND_MSG_MAGIC;
+
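+	/* failure injection: poison the src/dst NIDs with recognizable bogus values */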
+ if (CFS_FAIL_CHECK(CFS_FAIL_GNI_PACK_SRCNID)) {
+ srcnid = 0xABADBABE;
+ } else if (CFS_FAIL_CHECK(CFS_FAIL_GNI_PACK_DSTNID)) {
+ dstnid = 0xDEFEC8ED;
+ }
+
+ connreq->gncr_srcnid = srcnid;
+ connreq->gncr_dstnid = dstnid;
+
+ if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CONNREQ_PROTO)) {
+ connreq->gncr_version = 99;
+ } else {
+ connreq->gncr_version = GNILND_CONNREQ_VERSION;
+ }
+ if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CONNREQ_PROTO)) {
+ connreq->gncr_type = 99;
+ } else {
+ connreq->gncr_type = type;
+ }
+ if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CONNREQ_PROTO)) {
+ connreq->gncr_peerstamp = 0;
+ } else {
+ connreq->gncr_peerstamp = kgnilnd_data.kgn_peerstamp;
+ }
+ if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CONNREQ_PROTO)) {
+ connreq->gncr_connstamp = 0;
+ } else {
+ connreq->gncr_connstamp = conn->gnc_my_connstamp;
+ }
+ if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CONNREQ_PROTO)) {
+ connreq->gncr_timeout = 0;
+ } else {
+ connreq->gncr_timeout = conn->gnc_timeout;
+ }
+
+ /* the rest pack the data into the payload in other places */
+ if (type == GNILND_CONNREQ_REQ) {
+ kgn_gniparams_t *req_params = &connreq->gncr_gnparams;
+ req_params->gnpr_host_id = conn->gnc_device->gnd_host_id;
+ req_params->gnpr_cqid = conn->gnc_cqid;
+
+ /* allocate mailbox for this connection */
+ err = kgnilnd_setup_mbox(conn);
+ if (err != 0) {
+ CERROR("Failed to setup FMA mailbox (%d)\n", err);
+ }
+ req_params->gnpr_smsg_attr = conn->gnpr_smsg_attr;
+ }
+
+ /* XXX Nic: TBD - checksum computation */
+
+ return err;
+}
+
+int
+kgnilnd_unpack_connreq(kgn_dgram_t *dgram)
+{
+ kgn_connreq_t *connreq = &dgram->gndg_conn_in;
+ int swab, rc = 0;
+ kgn_net_t *net;
+
+ /* the following fields must be handled in a backwards compatible
+ * manner to ensure we can always send and interpret NAKs */
+
+ if (connreq->gncr_magic != GNILND_MSG_MAGIC &&
+ connreq->gncr_magic != __swab32(GNILND_MSG_MAGIC)) {
+ /* Unexpected magic! */
+ CERROR("Unexpected magic %08x\n",
+ connreq->gncr_magic);
+ return -EBADF;
+ }
+
+ swab = (connreq->gncr_magic == __swab32(GNILND_MSG_MAGIC));
+ if (swab) {
+ __swab32s(&connreq->gncr_magic);
+ __swab32s(&connreq->gncr_cksum);
+ __swab16s(&connreq->gncr_type);
+ __swab16s(&connreq->gncr_version);
+ __swab32s(&connreq->gncr_timeout);
+ __swab64s(&connreq->gncr_srcnid);
+ __swab64s(&connreq->gncr_dstnid);
+ __swab64s(&connreq->gncr_peerstamp);
+ __swab64s(&connreq->gncr_connstamp);
+ }
+
+ /* Do NOT return anything but -EBADF before we munge
+ * connreq->gncr_srcnid - we need that to send the nak */
+
+ if (dgram->gndg_conn_out.gncr_dstnid != LNET_NID_ANY) {
+ lnet_nid_t incoming = connreq->gncr_srcnid;
+
+ /* even if the incoming packet is hosed, we know who we sent
+ * the original and can set the srcnid so that we can properly
+ * look up our peer to close the loop on this connreq. We still use
+ * -EBADF to prevent a NAK - just in case there are issues with
+ * the payload coming from a random spot, etc. */
+ connreq->gncr_srcnid = dgram->gndg_conn_out.gncr_dstnid;
+
+ if (LNET_NIDADDR(dgram->gndg_conn_out.gncr_dstnid) !=
+ LNET_NIDADDR(incoming)) {
+ /* we got a datagram match for the wrong nid... */
+ CERROR("matched datagram 0x%p with srcnid %s "
+ "(%x), expecting %s (%x)\n",
+ dgram,
+ libcfs_nid2str(incoming),
+ LNET_NIDADDR(incoming),
+ libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid),
+ LNET_NIDADDR(dgram->gndg_conn_out.gncr_dstnid));
+ return -EBADF;
+ }
+ } else {
+		/* if we have a wildcard datagram it should match an
+		 * incoming "active" datagram that should have a fully formed
+		 * srcnid and dstnid. If we couldn't unpack it, we drop it as a
+		 * corrupted packet; otherwise we just verify that the dstnid
+		 * matches the NID for the net that the dgram was posted on */
+
+		/* make sure their wildcard didn't match ours - that should be impossible */
+ LASSERTF(connreq->gncr_dstnid != LNET_NID_ANY,
+ "dgram 0x%p from %s, connreq 0x%p; "
+			 "wildcard matched wildcard\n", dgram,
+ libcfs_nid2str(connreq->gncr_srcnid), connreq);
+
+ rc = kgnilnd_find_net(connreq->gncr_dstnid, &net);
+
+ if (rc == -ESHUTDOWN) {
+			CERROR("Looking up network: device is in shutdown\n");
+ return rc;
+ } else if (rc == -ENONET) {
+ CERROR("Connection data from %s: she sent "
+ "dst_nid %s, but net lookup failed on "
+ "dgram 0x%p@%s\n",
+ libcfs_nid2str(connreq->gncr_srcnid),
+ libcfs_nid2str(connreq->gncr_dstnid),
+ dgram, kgnilnd_dgram_type2str(dgram));
+ return rc;
+ }
+
+ if (net->gnn_ni->ni_nid != connreq->gncr_dstnid) {
+ CERROR("Bad connection data from %s: she sent "
+ "dst_nid %s, but I am %s with dgram 0x%p@%s\n",
+ libcfs_nid2str(connreq->gncr_srcnid),
+ libcfs_nid2str(connreq->gncr_dstnid),
+ libcfs_nid2str(net->gnn_ni->ni_nid),
+ dgram, kgnilnd_dgram_type2str(dgram));
+ kgnilnd_net_decref(net);
+ return -EBADSLT;
+ }
+
+		/* kgnilnd_find_net takes a ref on the net it finds; we need to decref it when done */
+ kgnilnd_net_decref(net);
+ }
+
+ if (connreq->gncr_version != GNILND_CONNREQ_VERSION) {
+ CERROR("Unexpected version %d\n", connreq->gncr_version);
+ return -EPROTO;
+ }
+
+ /* XXX Nic: TBD - checksum validation */
+ if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CONNREQ_DROP)) {
+ return -EBADF;
+ }
+
+ if (swab && connreq->gncr_type == GNILND_CONNREQ_REQ) {
+ __u64 msg_addr = (__u64) connreq->gncr_gnparams.gnpr_smsg_attr.msg_buffer;
+
+ __swab32s(&connreq->gncr_gnparams.gnpr_host_id);
+ __swab32s(&connreq->gncr_gnparams.gnpr_cqid);
+ __swab32s(&connreq->gncr_gnparams.gnpr_smsg_attr.buff_size);
+ __swab16s(&connreq->gncr_gnparams.gnpr_smsg_attr.mbox_maxcredit);
+ __swab32s(&connreq->gncr_gnparams.gnpr_smsg_attr.mbox_offset);
+ __swab64s(&connreq->gncr_gnparams.gnpr_smsg_attr.mem_hndl.qword1);
+ __swab64s(&connreq->gncr_gnparams.gnpr_smsg_attr.mem_hndl.qword2);
+ __swab64s(&msg_addr);
+ __swab32s(&connreq->gncr_gnparams.gnpr_smsg_attr.msg_maxsize);
+ __swab32s(&connreq->gncr_gnparams.gnpr_smsg_attr.msg_type);
+ } else if (swab && connreq->gncr_type == GNILND_CONNREQ_NAK) {
+ __swab32s(&connreq->gncr_nakdata.gnnd_errno);
+ }
+
+ /* since we use a unique instance ID for each network, the driver
+ * will take care of dropping datagrams if we don't have that network.
+ */
+
+	/* a few more idiot software or configuration checks */
+
+ switch (connreq->gncr_type) {
+ case GNILND_CONNREQ_REQ:
+ /* wire up EP and SMSG block - this will check the incoming data
+ * and barf a NAK back if need to */
+ rc = kgnilnd_set_conn_params(dgram);
+ if (rc)
+ return rc;
+ break;
+ case GNILND_CONNREQ_NAK:
+ case GNILND_CONNREQ_CLOSE:
+ break;
+ default:
+ CERROR("unknown connreq packet type %d\n", connreq->gncr_type);
+ return -EPROTO;
+ }
+
+ if (connreq->gncr_peerstamp == 0 || connreq->gncr_connstamp == 0) {
+		CERROR("Received bad timestamps peer "LPU64" conn "LPU64"\n",
+ connreq->gncr_peerstamp, connreq->gncr_connstamp);
+ return -EPROTO;
+ }
+
+ if (connreq->gncr_timeout < GNILND_MIN_TIMEOUT) {
+ CERROR("Received timeout %d < MIN %d\n",
+ connreq->gncr_timeout, GNILND_MIN_TIMEOUT);
+ return -EPROTO;
+ }
+
+ return 0;
+}
+
+int
+kgnilnd_alloc_dgram(kgn_dgram_t **dgramp, kgn_device_t *dev, kgn_dgram_type_t type)
+{
+ kgn_dgram_t *dgram;
+
+ dgram = cfs_mem_cache_alloc(kgnilnd_data.kgn_dgram_cache,
+ CFS_ALLOC_ATOMIC);
+ if (dgram == NULL)
+ return -ENOMEM;
+
+ /* cache alloc'd memory is not zeroed */
+	memset(dgram, 0, sizeof(*dgram));
+
+ INIT_LIST_HEAD(&dgram->gndg_list);
+ dgram->gndg_state = GNILND_DGRAM_USED;
+ dgram->gndg_type = type;
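+	/* the magic lets kgnilnd_probe_for_dgram sanity check pointers
+	 * recovered from datagram ids handed back by kgni */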
+ dgram->gndg_magic = GNILND_DGRAM_MAGIC;
+
+ atomic_inc(&dev->gnd_ndgrams);
+
+ CDEBUG(D_MALLOC|D_NETTRACE, "slab-alloced 'dgram': %lu at %p.\n",
+ sizeof(*dgram), dgram);
+
+ *dgramp = dgram;
+ return 0;
+}
+
+/* call this on a dgram that came back from kgnilnd_ep_postdata_test_by_id
+ * returns < 0 on dgram to be cleaned up
+ * > 0 on dgram that isn't done yet
+ * == 0 on dgram that is ok and needs connreq processing */
+int
+kgnilnd_process_dgram(kgn_dgram_t *dgram, gni_post_state_t post_state)
+{
+ int rc = 0;
+
+ switch (post_state) {
+ case GNI_POST_COMPLETED:
+ /* normal state for dgrams that need actual processing */
+ /* GOTO to avoid processing dgram as canceled/done */
+ GOTO(process_out, rc);
+
+ case GNI_POST_PENDING:
+ /* we should only see this if we are testing a WC dgram after a
+ * cancel - it means that it needs a full cycle of waiting
+ * for kgni_sm_task to finish moving it to TERMINATED */
+ LASSERTF((dgram->gndg_type == GNILND_DGRAM_WC_REQ) &&
+ (dgram->gndg_state == GNILND_DGRAM_CANCELED),
+ "POST_PENDING dgram 0x%p with bad type %d(%s) or state %d(%s)\n",
+ dgram, dgram->gndg_type, kgnilnd_dgram_type2str(dgram),
+ dgram->gndg_state, kgnilnd_dgram_state2str(dgram));
+
+ /* positive RC as this dgram isn't done yet */
+ rc = EINPROGRESS;
+
+ /* GOTO as this isn't done yet */
+ GOTO(process_out, rc);
+ break;
+
+ case GNI_POST_TERMINATED:
+ /* we've called cancel and it is done or remote guy called cancel and
+		 * we've received it on a WC dgram */
+#if 0
+ /* we are seeing weird terminations on non WC dgrams when we have not
+ * canceled them */
+
+ LASSERTF(dgram->gndg_state == GNILND_DGRAM_CANCELED ||
+ dgram->gndg_conn_out.gncr_dstnid == LNET_NID_ANY,
+ "dgram 0x%p with bad state %d(%s) or dst nid %s\n",
+ dgram, dgram->gndg_state, kgnilnd_dgram_state2str(dgram),
+ libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid));
+#endif
+
+ CDEBUG(D_NETTRACE, "dgram 0x%p saw %s, cleaning it up\n", dgram,
+ dgram->gndg_state == GNILND_DGRAM_CANCELED ? "canceled" : "terminated");
+
+ rc = -ECANCELED;
+ break;
+
+ case GNI_POST_TIMEOUT:
+ /* we could have a timeout on a wildcard dgram too - if
+		 * we got the incoming request but the remote node died
+ * before kgni could send the match data back. We'll just error
+ * on the active case and bail out gracefully */
+ if (dgram->gndg_conn_out.gncr_dstnid != LNET_NID_ANY) {
+ CNETERR("hardware timeout for connect to "
+ "%s after %lu seconds. Is node dead?\n",
+ libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid),
+ cfs_duration_sec(jiffies - dgram->gndg_post_time));
+ }
+
+ rc = -ETIMEDOUT;
+ break;
+
+ default:
+ CERROR("dgram 0x%p with bad post_state %d\n", dgram, post_state);
+ LBUG();
+ }
+
+ /* now finish cleaning up a dgram that is canceled/terminated and needs to
+ * go away */
+
+ /* If this was actively canceled, drop the count now that we are processing */
+ if (dgram->gndg_state == GNILND_DGRAM_CANCELED) {
+ atomic_dec(&dgram->gndg_conn->gnc_device->gnd_canceled_dgrams);
+ /* caller responsible for gndg_list removal */
+ }
+
+process_out:
+
+ RETURN(rc);
+}
+
+/* needs dev->gnd_dgram_lock held */
+void
+kgnilnd_cancel_dgram_locked(kgn_dgram_t *dgram)
+{
+ gni_return_t grc;
+
+ if (dgram->gndg_state != GNILND_DGRAM_POSTED) {
+ return;
+ }
+
+ LASSERTF(dgram->gndg_conn != NULL,
+ "dgram 0x%p with NULL conn\n", dgram);
+
+ /* C.E - WC dgrams could be canceled immediately but
+ * if there was some match pending, we need to call
+ * test_by_id to clear it out. If that test returns
+ * POST_PENDING, it is half done and needs to go along
+ * with the rest of dgrams and go through a kgni_sm_task cycle
+ * and deliver a GNI_POST_TERMINATED event before they
+ * are actually canceled */
+
+ dgram->gndg_state = GNILND_DGRAM_CANCELED;
+
+ if (dgram->gndg_conn->gnc_state >= GNILND_CONN_ESTABLISHED) {
+ /* we don't need to cancel_by_id if the datagram was good */
+ return;
+ }
+
+ /* let folks know there are outstanding cancels */
+ atomic_inc(&dgram->gndg_conn->gnc_device->gnd_canceled_dgrams);
+ /* leave on nid list until cancel is done for debugging fun */
+ grc = kgnilnd_ep_postdata_cancel_by_id(dgram->gndg_conn->gnc_ephandle, (__u64) dgram);
+
+ /* if we don't get success here, we have hosed up the dgram tracking
+ * code and need to bail out */
+ LASSERTF(grc == GNI_RC_SUCCESS,
+ "postdata_cancel returned %d for conn 0x%p to %s\n",
+ grc, dgram->gndg_conn,
+ dgram->gndg_conn->gnc_peer ?
+ libcfs_nid2str(dgram->gndg_conn->gnc_peer->gnp_nid)
+ : "<?>");
+
+ CDEBUG(D_NETTRACE,
+ "canceled dgram 0x%p conn 0x%p ephandle 0x%p\n",
+ dgram, dgram->gndg_conn,
+ dgram->gndg_conn->gnc_ephandle);
+
+ if (dgram->gndg_type == GNILND_DGRAM_WC_REQ) {
+ gni_post_state_t post_state;
+ int rc = 0;
+ __u32 remote_addr = 0, remote_id = 0;
+
+ grc = kgnilnd_ep_postdata_test_by_id(dgram->gndg_conn->gnc_ephandle,
+ (__u64)dgram, &post_state,
+ &remote_addr, &remote_id);
+
+ LASSERTF(grc == GNI_RC_NO_MATCH || grc == GNI_RC_SUCCESS,
+ "bad grc %d from test_by_id on dgram 0x%p\n",
+ grc, dgram);
+
+ /* if WC was canceled immediately, we get NO_MATCH, if needs to go
+ * through full cycle, we get SUCCESS and need to parse post_state */
+
+ CDEBUG(D_NET, "grc %d dgram 0x%p type %s post_state %d "
+ "remote_addr %u remote_id %u\n", grc, dgram,
+ kgnilnd_dgram_type2str(dgram),
+ post_state, remote_addr, remote_id);
+
+ if (grc == GNI_RC_NO_MATCH) {
+ /* she's gone, reduce count and move along */
+ dgram->gndg_state = GNILND_DGRAM_DONE;
+ atomic_dec(&dgram->gndg_conn->gnc_device->gnd_canceled_dgrams);
+ RETURN_EXIT;
+ }
+
+ rc = kgnilnd_process_dgram(dgram, post_state);
+
+ if (rc <= 0) {
+ /* if for some weird reason we get a valid dgram back, just mark as done
+ * so we can drop it and move along.
+ * C.E - if it was completed, we'll just release the conn/mbox
+ * back into the pool and it'll get reused. That said, we should only
+			 * be canceling a WC dgram on stack reset or shutdown, so that is moot */
+ dgram->gndg_state = GNILND_DGRAM_DONE;
+ atomic_dec(&dgram->gndg_conn->gnc_device->gnd_canceled_dgrams);
+
+ /* caller context responsible for calling kgnilnd_release_dgram() */
+ } else {
+ /* still pending, let it simmer until golden brown and delicious */
+ }
+ }
+
+ /* for non WC dgrams, they are still on the nid list but marked canceled waiting
+ * for kgni to return their ID to us via probe - that is when we'll complete their
+ * cancel processing */
+}
+
+void
+kgnilnd_cleanup_dgram(kgn_dgram_t *dgram)
+{
+ /* release the dgram ref on conn */
+ if (dgram->gndg_conn) {
+ kgnilnd_conn_decref(dgram->gndg_conn);
+ dgram->gndg_conn = NULL;
+ }
+}
+
+void
+kgnilnd_free_dgram(kgn_device_t *dev, kgn_dgram_t *dgram)
+{
+ LASSERTF(dgram->gndg_state == GNILND_DGRAM_USED ||
+ dgram->gndg_state == GNILND_DGRAM_DONE,
+ "dgram 0x%p with bad state %s\n",
+ dgram, kgnilnd_dgram_state2str(dgram));
+
+ /* bit of poisoning to help detect bad driver data */
+ dgram->gndg_magic = 0x6f5a6b5f;
+ atomic_dec(&dev->gnd_ndgrams);
+
+ cfs_mem_cache_free(kgnilnd_data.kgn_dgram_cache, dgram);
+ CDEBUG(D_MALLOC|D_NETTRACE, "slab-freed 'dgram': %lu at %p.\n",
+ sizeof(*dgram), dgram);
+}
+
+int
+kgnilnd_post_dgram(kgn_device_t *dev, lnet_nid_t dstnid, kgn_connreq_type_t type,
+ int data_rc)
+{
+ int rc = 0;
+ kgn_dgram_t *dgram = NULL;
+ kgn_dgram_t *tmpdgram;
+ kgn_dgram_type_t dgtype;
+ gni_return_t grc;
+ __u64 srcnid;
+ ENTRY;
+
+ switch (type) {
+ case GNILND_CONNREQ_REQ:
+ if (dstnid == LNET_NID_ANY)
+ dgtype = GNILND_DGRAM_WC_REQ;
+ else
+ dgtype = GNILND_DGRAM_REQ;
+ break;
+ case GNILND_CONNREQ_NAK:
+ LASSERTF(dstnid != LNET_NID_ANY, "can't NAK to LNET_NID_ANY\n");
+ dgtype = GNILND_DGRAM_NAK;
+ break;
+ default:
+ CERROR("unknown connreq type %d\n", type);
+ LBUG();
+ }
+
+ rc = kgnilnd_alloc_dgram(&dgram, dev, dgtype);
+ if (rc < 0) {
+ rc = -ENOMEM;
+ GOTO(post_failed, rc);
+ }
+
+ rc = kgnilnd_create_conn(&dgram->gndg_conn, dev);
+ if (rc) {
+ GOTO(post_failed, rc);
+ }
+
+ if (dgram->gndg_type == GNILND_DGRAM_WC_REQ) {
+ /* clear buffer for sanity on reuse of wildcard */
+ memset(&dgram->gndg_conn_in, 0, sizeof(kgn_connreq_t));
+ }
+
+ if (dstnid == LNET_NID_ANY) {
+ /* set here to reset any dgram re-use */
+ dgram->gndg_conn->gnc_state = GNILND_CONN_LISTEN;
+ } else {
+ __u32 host_id;
+
+ rc = kgnilnd_nid_to_nicaddrs(LNET_NIDADDR(dstnid), 1, &host_id);
+ if (rc <= 0) {
+ rc = -ESRCH;
+ GOTO(post_failed, rc);
+ }
+
+ dgram->gndg_conn->gnc_state = GNILND_CONN_CONNECTING;
+
+ /* don't need to serialize, there are no CQs for the dgram
+ * EP on the kgn_net_t */
+ grc = kgnilnd_ep_bind(dgram->gndg_conn->gnc_ephandle, host_id, dev->gnd_id);
+
+ if (grc != GNI_RC_SUCCESS) {
+ rc = -ECONNABORTED;
+ GOTO(post_failed, rc);
+ }
+
+ }
+
+ /* If we are posting wildcards post using a net of 0, otherwise we'll use the
+ * net of the destination node.
+ */
+
+ if (dstnid == LNET_NID_ANY) {
+ srcnid = LNET_MKNID(LNET_MKNET(GNILND, 0), dev->gnd_nid);
+ } else {
+ srcnid = LNET_MKNID(LNET_NIDNET(dstnid), dev->gnd_nid);
+ }
+
+ rc = kgnilnd_pack_connreq(&dgram->gndg_conn_out, dgram->gndg_conn,
+ srcnid, dstnid, type);
+ if (rc) {
+ GOTO(post_failed, rc);
+ }
+
+ if (type == GNILND_CONNREQ_NAK)
+ dgram->gndg_conn_out.gncr_nakdata.gnnd_errno = data_rc;
+
+ dgram->gndg_post_time = jiffies;
+
+ /* XXX Nic: here is where we'd add in logical network multiplexing */
+
+ CDEBUG(D_NETTRACE, "dgram 0x%p type %s %s->%s cdm %d\n",
+ dgram, kgnilnd_dgram_type2str(dgram),
+ libcfs_nid2str(srcnid),
+ libcfs_nid2str(dstnid), dev->gnd_id);
+
+ /* this allocates memory, can't hold locks across */
+ grc = kgnilnd_ep_postdata_w_id(dgram->gndg_conn->gnc_ephandle,
+ &dgram->gndg_conn_out, sizeof(kgn_connreq_t),
+ &dgram->gndg_conn_in, sizeof(kgn_connreq_t),
+ (__u64)dgram);
+
+ if (grc != GNI_RC_SUCCESS) {
+ CNETERR("dropping failed dgram post id 0x%p type %s"
+ " reqtype %s to %s: rc %d\n",
+ dgram, kgnilnd_dgram_type2str(dgram),
+ kgnilnd_connreq_type2str(&dgram->gndg_conn_out),
+ libcfs_nid2str(dstnid), grc);
+ rc = (grc == GNI_RC_ERROR_NOMEM) ? -ENOMEM : -EBADR;
+ GOTO(post_failed, rc);
+ }
+
+ /* we don't need to add earlier - if someone does del_peer during post,
+	 * that peer will get marked as unlinked and the callers will take care of it.
+	 * The dgram code is largely kgn_peer_t ignorant, so at worst, we'll just drop
+	 * the completed dgram later when we can't find a peer to stuff it into */
+
+ spin_lock(&dev->gnd_dgram_lock);
+
+ /* make sure we are not double posting targeted dgrams
+ * - we can multiple post WC dgrams to help with processing speed */
+ if (dstnid != LNET_NID_ANY) {
+ tmpdgram = kgnilnd_find_dgram_locked(dev, dstnid);
+
+ LASSERTF(tmpdgram == NULL,
+ "dgram 0x%p->%s already posted\n",
+ dgram, libcfs_nid2str(dstnid));
+ }
+
+ /* unmunge dstnid to help processing code cope... */
+ if (CFS_FAIL_CHECK(CFS_FAIL_GNI_PACK_DSTNID)) {
+ dgram->gndg_conn_out.gncr_dstnid = dstnid;
+ }
+
+ list_add_tail(&dgram->gndg_list, kgnilnd_nid2dgramlist(dev, dstnid));
+ dgram->gndg_state = GNILND_DGRAM_POSTED;
+ spin_unlock(&dev->gnd_dgram_lock);
+
+post_failed:
+ if (rc < 0 && dgram != NULL) {
+ kgnilnd_cleanup_dgram(dgram);
+ kgnilnd_free_dgram(dev, dgram);
+ }
+
+ RETURN(rc);
+}
+
+void
+kgnilnd_release_dgram(kgn_device_t *dev, kgn_dgram_t *dgram)
+{
+ spin_lock(&dev->gnd_dgram_lock);
+ kgnilnd_cancel_dgram_locked(dgram);
+ spin_unlock(&dev->gnd_dgram_lock);
+
+ kgnilnd_cleanup_dgram(dgram);
+
+	/* if the dgram is 'canceled' it needs to wait until the event
+ * comes up from kgni that tells us it is safe to release */
+ if (dgram->gndg_state != GNILND_DGRAM_CANCELED) {
+ dgram->gndg_state = GNILND_DGRAM_DONE;
+
+ LASSERTF(list_empty(&dgram->gndg_list), "dgram 0x%p on list\n", dgram);
+
+ /* if it is a wildcard and we are in an appropriate state, repost
+ * the wildcard */
+
+ if ((dgram->gndg_type == GNILND_DGRAM_WC_REQ) &&
+ (!kgnilnd_data.kgn_wc_kill && !kgnilnd_data.kgn_in_reset)) {
+ int rerc;
+
+ rerc = kgnilnd_post_dgram(dev, LNET_NID_ANY, GNILND_CONNREQ_REQ, 0);
+ LASSERTF(rerc == 0,
+ "error %d: dev %d could not repost wildcard datagram id 0x%p\n",
+ rerc, dev->gnd_id, dgram);
+ }
+
+ /* always free the old dgram */
+ kgnilnd_free_dgram(dev, dgram);
+ }
+}
+
+
+int
+kgnilnd_probe_for_dgram(kgn_device_t *dev, kgn_dgram_t **dgramp)
+{
+ kgn_dgram_t *dgram = NULL;
+ gni_post_state_t post_state;
+ gni_return_t grc;
+ int rc = 0;
+ __u64 readyid;
+ __u32 remote_addr = 0, remote_id = 0;
+ ENTRY;
+
+	/* Probe with the lock held. That way if we get a dgram we don't have it canceled
+	 * between finding the ready dgram and grabbing the lock to remove it from the
+	 * list. Otherwise we could be left in an inconsistent state. We own the dgram
+	 * once it's off the list so we don't need to worry about others changing it at
+	 * that point. */
+ spin_lock(&dev->gnd_dgram_lock);
+ grc = kgnilnd_postdata_probe_by_id(dev->gnd_handle, &readyid);
+ if (grc != GNI_RC_SUCCESS) {
+ spin_unlock(&dev->gnd_dgram_lock);
+ /* return 0 to indicate nothing happened */
+ RETURN(0);
+ }
+
+ CDEBUG(D_NET, "ready "LPX64" on device 0x%p\n",
+ readyid, dev);
+
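+	/* we posted with (__u64)dgram as the datagram id, so the ready id
+	 * kgni hands back is the dgram pointer itself */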
+ dgram = (kgn_dgram_t *)readyid;
+
+ LASSERTF(dgram->gndg_magic == GNILND_DGRAM_MAGIC,
+ "dgram 0x%p from id "LPX64" with bad magic %x\n",
+ dgram, readyid, dgram->gndg_magic);
+
+ LASSERTF(dgram->gndg_state == GNILND_DGRAM_POSTED ||
+ dgram->gndg_state == GNILND_DGRAM_CANCELED,
+ "dgram 0x%p with bad state %s\n",
+ dgram, kgnilnd_dgram_state2str(dgram));
+
+ LASSERTF(!list_empty(&dgram->gndg_list),
+ "dgram 0x%p with bad list state %s\n",
+ dgram, kgnilnd_dgram_state2str(dgram));
+
+ /* now we know that the datagram structure is ok, so pull off list */
+ list_del_init(&dgram->gndg_list);
+
+ /* while we have the gnn_dgram_lock and BEFORE we call test_by_id
+ * change the state from POSTED to PROCESSING to ensure that
+ * nobody cancels it after we've pulled it from the wire */
+ if (dgram->gndg_state == GNILND_DGRAM_POSTED) {
+ dgram->gndg_state = GNILND_DGRAM_PROCESSING;
+ }
+
+ spin_unlock(&dev->gnd_dgram_lock);
+
+ /* we now "own" this datagram */
+
+ LASSERTF(dgram->gndg_conn != NULL,
+ "dgram 0x%p with NULL conn\n", dgram);
+
+ grc = kgnilnd_ep_postdata_test_by_id(dgram->gndg_conn->gnc_ephandle,
+ (__u64)dgram, &post_state,
+ &remote_addr, &remote_id);
+
+ LASSERTF(grc != GNI_RC_NO_MATCH, "kgni lied! probe_by_id told us that"
+ " id "LPU64" was ready\n", readyid);
+
+ CDEBUG(D_NET, "grc %d dgram 0x%p type %s post_state %d "
+ "remote_addr %u remote_id %u\n", grc, dgram,
+ kgnilnd_dgram_type2str(dgram),
+ post_state, remote_addr, remote_id);
+
+ if (unlikely(grc != GNI_RC_SUCCESS)) {
+ CNETERR("getting data for dgram 0x%p->%s failed rc %d. Dropping it\n",
+ dgram, libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid),
+ grc);
+ rc = -EINVAL;
+ GOTO(probe_for_out, rc);
+ }
+
+ rc = kgnilnd_process_dgram(dgram, post_state);
+
+ /* we should never get probe finding a dgram for us and then it
+ * being a WC dgram that is still in the middle of processing */
+ LASSERTF(rc <= 0, "bad rc %d from process_dgram 0x%p state %d\n",
+ rc, dgram, post_state);
+
+ if (rc == 0) {
+ /* dgram is good enough for the data to be used */
+ dgram->gndg_state = GNILND_DGRAM_PROCESSING;
+ /* fake rc to mark that we've done something */
+ rc = 1;
+ } else {
+ /* bring out your dead! */
+ dgram->gndg_state = GNILND_DGRAM_DONE;
+ }
+
+ *dgramp = dgram;
+ RETURN(rc);
+
+probe_for_out:
+
+ kgnilnd_release_dgram(dev, dgram);
+ RETURN(rc);
+}
+
+int
+kgnilnd_setup_wildcard_dgram(kgn_device_t *dev)
+{
+	/* if kgn_nwildcard is zero, we return -ENOENT */
+ int rc = -ENOENT, i;
+ ENTRY;
+
+ for (i = 0; i < *kgnilnd_tunables.kgn_nwildcard; i++) {
+ rc = kgnilnd_post_dgram(dev, LNET_NID_ANY, GNILND_CONNREQ_REQ, 0);
+ if (rc < 0) {
+ CERROR("error %d: could not post wildcard datagram # %d\n",
+ rc, i);
+ rc = -EINVAL;
+ GOTO(failed, rc);
+ }
+ }
+
+failed:
+ RETURN(rc);
+}
+
+int
+kgnilnd_cancel_net_dgrams(kgn_net_t *net)
+{
+ kgn_dgram_t *dg, *dgN;
+ struct list_head zombies;
+ int i;
+ ENTRY;
+
+ /* we want to cancel any outstanding dgrams - we don't want to rely
+ * on del_peer_or_conn catching all of them. This helps protect us in cases
+ * where we don't quite keep the peer->dgram mapping in sync due to some
+ * race conditions */
+
+ LASSERTF(net->gnn_shutdown || kgnilnd_data.kgn_in_reset,
+ "called with LND invalid state: net shutdown %d "
+ "in reset %d\n", net->gnn_shutdown,
+ kgnilnd_data.kgn_in_reset);
+
+ INIT_LIST_HEAD(&zombies);
+
+ spin_lock(&net->gnn_dev->gnd_dgram_lock);
+
+ for (i = 0; i < *kgnilnd_tunables.kgn_peer_hash_size; i++) {
+ list_for_each_entry_safe(dg, dgN, &net->gnn_dev->gnd_dgrams[i], gndg_list) {
+
+			/* skip wildcards and nids that are not on our net */
+ if (dg->gndg_type == GNILND_DGRAM_WC_REQ ||
+ net->gnn_netnum != LNET_NETNUM(LNET_NIDNET(dg->gndg_conn_out.gncr_dstnid)))
+ continue;
+
+ kgnilnd_cancel_dgram_locked(dg);
+ }
+ }
+
+ spin_unlock(&net->gnn_dev->gnd_dgram_lock);
+
+ RETURN(0);
+}
+
+int
+kgnilnd_cancel_wc_dgrams(kgn_device_t *dev)
+{
+ kgn_dgram_t *dg, *dgN;
+ struct list_head zombies;
+ ENTRY;
+
+	/* Time to kill the outstanding WCs.
+	 * WCs exist on net 0 only but match on any net...
+	 */
+
+ LASSERTF(kgnilnd_data.kgn_in_reset || kgnilnd_data.kgn_wc_kill,
+ "called with LND invalid state: WC shutdown %d "
+ "in reset %d\n", kgnilnd_data.kgn_wc_kill,
+ kgnilnd_data.kgn_in_reset);
+
+ INIT_LIST_HEAD(&zombies);
+ spin_lock(&dev->gnd_dgram_lock);
+
+ do {
+ dg = kgnilnd_find_dgram_locked(dev, LNET_NID_ANY);
+ if (dg != NULL) {
+ LASSERTF(dg->gndg_type == GNILND_DGRAM_WC_REQ,
+ "dgram 0x%p->%s with bad type %d (%s)\n",
+ dg, libcfs_nid2str(dg->gndg_conn_out.gncr_dstnid),
+ dg->gndg_type, kgnilnd_dgram_type2str(dg));
+
+ kgnilnd_cancel_dgram_locked(dg);
+
+ /* WC could be DONE already, check and if so add to list to be released */
+ if (dg->gndg_state == GNILND_DGRAM_DONE) {
+ list_del_init(&dg->gndg_list);
+ list_add_tail(&dg->gndg_list, &zombies);
+ }
+ }
+ } while (dg != NULL);
+
+ spin_unlock(&dev->gnd_dgram_lock);
+
+ list_for_each_entry_safe(dg, dgN, &zombies, gndg_list) {
+ list_del_init(&dg->gndg_list);
+ kgnilnd_release_dgram(dev, dg);
+ }
+	RETURN(0);
+}
+
+void
+kgnilnd_wait_for_canceled_dgrams(kgn_device_t *dev)
+{
+ int i = 4;
+ int rc;
+ gni_return_t grc;
+ __u64 readyid;
+ kgn_dgram_t *dgram;
+
+	/* use a do/while so at least one check runs, allowing the regression
+	 * test for bug 762072 to hit the bug if it is there */
+
+ /* This function races with the dgram mover during shutdown so it is possible for
+ * a dgram to be seen in kgnilnd_postdata_probe_wait_by_id but be handled in the
+ * dgram mover thread instead of inside of this function.
+ */
+
+	/* This should only be called from within shutdown, base shutdown, or stack reset.
+	 * There are no assertions here to verify that, since base shutdown leaves
+	 * nothing we can check - the net is gone by then.
+	 */
+
+ do {
+ i++;
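+		/* (i & -i) == i only for powers of two, so the D_WARNING
+		 * noise is throttled to iterations 8, 16, 32, ... */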
+ CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET,
+ "Waiting for %d canceled datagrams to clear on device %d\n",
+ atomic_read(&dev->gnd_canceled_dgrams), dev->gnd_id);
+
+		/* check every 250ms */
+ grc = kgnilnd_postdata_probe_wait_by_id(dev->gnd_handle,
+ 250, &readyid);
+
+ if (grc != GNI_RC_SUCCESS)
+ continue;
+
+ CDEBUG(D_NET, "ready "LPX64" on device %d->0x%p\n",
+ readyid, dev->gnd_id, dev);
+
+ rc = kgnilnd_probe_for_dgram(dev, &dgram);
+ if (rc != 0) {
+ /* if we got a valid dgram or one that is now done, clean up */
+ kgnilnd_release_dgram(dev, dgram);
+ }
+ } while (atomic_read(&dev->gnd_canceled_dgrams));
+}
+
+int
+kgnilnd_start_connect(kgn_peer_t *peer)
+{
+ int rc = 0;
+ /* sync point for kgnilnd_del_peer_locked - do an early check to
+ * catch the most common hits where del_peer is done by the
+ * time we get here */
+ if (CFS_FAIL_CHECK(CFS_FAIL_GNI_GNP_CONNECTING1)) {
+ while (CFS_FAIL_TIMEOUT(CFS_FAIL_GNI_GNP_CONNECTING1, 1)) {};
+ }
+
+ write_lock(&kgnilnd_data.kgn_peer_conn_lock);
+ if (!kgnilnd_peer_active(peer) || peer->gnp_connecting != GNILND_PEER_CONNECT) {
+ /* raced with peer getting unlinked */
+ write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+ rc = ESTALE;
+ GOTO(out, rc);
+ }
+ peer->gnp_connecting = GNILND_PEER_POSTING;
+ write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+
+ set_mb(peer->gnp_last_dgram_time, jiffies);
+ if (CFS_FAIL_CHECK(CFS_FAIL_GNI_GNP_CONNECTING2)) {
+ while (CFS_FAIL_TIMEOUT(CFS_FAIL_GNI_GNP_CONNECTING2, 1)) {};
+ }
+
+ if (CFS_FAIL_CHECK(CFS_FAIL_GNI_GNP_CONNECTING3)) {
+ while (CFS_FAIL_TIMEOUT(CFS_FAIL_GNI_GNP_CONNECTING3, 1)) {};
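+		/* fail_val lets the test harness pick the injected errno,
+		 * defaulting to -ENOMEM */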
+ rc = cfs_fail_val ? cfs_fail_val : -ENOMEM;
+ } else {
+ rc = kgnilnd_post_dgram(peer->gnp_net->gnn_dev,
+ peer->gnp_nid, GNILND_CONNREQ_REQ, 0);
+ }
+ if (rc < 0) {
+ set_mb(peer->gnp_last_dgram_errno, rc);
+ GOTO(failed, rc);
+ }
+
+ /* while we're posting someone could have decided this peer/dgram needed to
+ * die a quick death, so we check for state change and process accordingly */
+
+ write_lock(&kgnilnd_data.kgn_peer_conn_lock);
+ if (!kgnilnd_peer_active(peer) || peer->gnp_connecting == GNILND_PEER_NEEDS_DEATH) {
+ if (peer->gnp_connecting == GNILND_PEER_NEEDS_DEATH) {
+ peer->gnp_connecting = GNILND_PEER_KILL;
+ }
+ write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+ /* positive RC to avoid dgram cleanup - we'll have to
+ * wait for the kgni GNI_POST_TERMINATED event to
+ * finish cleaning up */
+ rc = ESTALE;
+ kgnilnd_find_and_cancel_dgram(peer->gnp_net->gnn_dev, peer->gnp_nid);
+ GOTO(out, rc);
+ }
+ peer->gnp_connecting = GNILND_PEER_POSTED;
+ write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+ /* reaper thread will take care of any timeouts */
+ CDEBUG(D_NET, "waiting for connect to finish to %s rc %d\n",
+ libcfs_nid2str(peer->gnp_nid), rc);
+
+ RETURN(rc);
+
+failed:
+	CDEBUG(D_NET, "connect to %s failed: rc %d\n",
+ libcfs_nid2str(peer->gnp_nid), rc);
+out:
+ RETURN(rc);
+}
+
+int
+kgnilnd_finish_connect(kgn_dgram_t *dgram)
+{
+ kgn_conn_t *conn = dgram->gndg_conn;
+ lnet_nid_t her_nid = dgram->gndg_conn_in.gncr_srcnid;
+ kgn_peer_t *new_peer, *peer = NULL;
+ kgn_tx_t *tx;
+ kgn_tx_t *txn;
+ kgn_mbox_info_t *mbox;
+ int rc;
+ int nstale;
+
+ /* try to find a peer that matches the nid we got in the connreq
+ * kgnilnd_unpack_connreq makes sure that conn_in.gncr_srcnid is
+ * HER and conn_out.gncr_srcnid is ME for both active and WC dgrams */
+
+ /* assume this is a new peer - it makes locking cleaner when it isn't */
+	/* no holding kgn_net_rw_sem - we are already at the kgnilnd_dgram_mover level */
+
+ rc = kgnilnd_create_peer_safe(&new_peer, her_nid, NULL);
+ if (rc != 0) {
+ CERROR("Can't create peer for %s\n", libcfs_nid2str(her_nid));
+ return rc;
+ }
+
+ write_lock(&kgnilnd_data.kgn_peer_conn_lock);
+
+ /* this transfers ref from create_peer to the kgn_peer table */
+ kgnilnd_add_peer_locked(her_nid, new_peer, &peer);
+
+ /* if we found an existing peer, is it really ready for a new conn ? */
+ if (peer != new_peer) {
+ /* if this was an active connect attempt but we can't find a peer waiting for it
+		 * we will dump it in the trash */
+
+ if (peer->gnp_connecting == GNILND_PEER_IDLE && dgram->gndg_conn_out.gncr_dstnid != LNET_NID_ANY) {
+ CDEBUG(D_NET, "dropping completed connreq for %s peer 0x%p->%s\n",
+ libcfs_nid2str(her_nid), peer, libcfs_nid2str(peer->gnp_nid));
+ write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+ rc = ECANCELED;
+ GOTO(out, rc);
+ }
+
+ /* check to see if we can catch a connecting peer before it is
+ * removed from the connd_peers list - if not, we need to
+ * let the connreqs race and be handled by kgnilnd_conn_isdup_locked() */
+ if (peer->gnp_connecting != GNILND_PEER_IDLE) {
+ spin_lock(&peer->gnp_net->gnn_dev->gnd_connd_lock);
+ if (!list_empty(&peer->gnp_connd_list)) {
+ list_del_init(&peer->gnp_connd_list);
+ /* drop connd ref */
+ kgnilnd_peer_decref(peer);
+ }
+ spin_unlock(&peer->gnp_net->gnn_dev->gnd_connd_lock);
+ /* clear rc to make sure we don't have fake error */
+ rc = 0;
+ }
+
+ /* no matter what, we are no longer waiting to connect this peer now */
+ peer->gnp_connecting = GNILND_PEER_IDLE;
+
+ /* Refuse to duplicate an existing connection (both sides might try to
+ * connect at once). NB we return success! We _are_ connected so we
+ * _don't_ have any blocked txs to complete with failure. */
+ rc = kgnilnd_conn_isdup_locked(peer, conn);
+ if (rc != 0) {
+ write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+ CDEBUG(D_NET, "Not creating duplicate connection to %s: %d\n",
+ libcfs_nid2str(her_nid), rc);
+ rc = EALREADY;
+ GOTO(out, rc);
+ }
+ }
+
+ nstale = kgnilnd_close_stale_conns_locked(peer, conn);
+
+ /* either way with peer (new or existing), we are ok with ref counts here as the
+ * kgnilnd_add_peer_locked will use our ref on new_peer (from create_peer_safe) as the
+ * ref for the peer table. */
+
+ /* at this point, the connection request is a winner */
+
+ /* mark 'DONE' to avoid cancel being called from release */
+ dgram->gndg_state = GNILND_DGRAM_DONE;
+
+ /* initialise timestamps before reaper looks at them */
+ conn->gnc_last_rx = conn->gnc_last_rx_cq = jiffies;
+
+ /* last_tx is initialized to jiffies - (keepalive*2) so that if the NOOP fails it will
+	 * immediately send a NOOP in the reaper thread during the call to
+ * kgnilnd_check_conn_timeouts_locked
+ */
+ conn->gnc_last_tx = jiffies - (cfs_time_seconds(GNILND_TO2KA(conn->gnc_timeout)) * 2);
+ conn->gnc_state = GNILND_CONN_ESTABLISHED;
+
+ /* refs are not transferred from dgram to tables, so increment to
+ * take ownership */
+ kgnilnd_conn_addref(conn);
+ kgnilnd_peer_addref(peer);
+ conn->gnc_peer = peer;
+ list_add_tail(&conn->gnc_list, &peer->gnp_conns);
+
+ kgnilnd_conn_addref(conn); /* +1 ref for conn table */
+ list_add_tail(&conn->gnc_hashlist,
+ kgnilnd_cqid2connlist(conn->gnc_cqid));
+ kgnilnd_data.kgn_conn_version++;
+
+	/* Don't send NOOP if fail_loc is set */
+ if (!CFS_FAIL_CHECK(CFS_FAIL_GNI_ONLY_NOOP)) {
+ tx = kgnilnd_new_tx_msg(GNILND_MSG_NOOP, peer->gnp_net->gnn_ni->ni_nid);
+ if (tx == NULL) {
+ CNETERR("can't get TX to initiate NOOP to %s\n",
+ libcfs_nid2str(peer->gnp_nid));
+ } else {
+ kgnilnd_queue_tx(conn, tx);
+ }
+ }
+
+ /* Schedule all packets blocking for a connection */
+ list_for_each_entry_safe(tx, txn, &peer->gnp_tx_queue, tx_list) {
+ /* lock held here is the peer_conn lock */
+ kgnilnd_tx_del_state_locked(tx, peer, NULL, GNILND_TX_ALLOCD);
+ kgnilnd_queue_tx(conn, tx);
+ }
+
+ /* If this is an active connection lets mark its timestamp on the MBoX */
+ if (dgram->gndg_conn_out.gncr_dstnid != LNET_NID_ANY) {
+ mbox = &conn->gnc_fma_blk->gnm_mbox_info[conn->gnc_mbox_id];
+		/* conn->gnc_last_rx was just set to jiffies above, so it is valid to use here */
+ mbox->mbx_release_purg_active_dgram = conn->gnc_last_rx;
+ }
+
+ /* Bug 765042: wake up scheduler for a race with finish_connect and
+ * complete_conn_closed with a conn in purgatory
+ * since we can't use CFS_RACE due to mutex_holds in kgnilnd_process_conns,
+ * we just check for set and then clear */
+ if (CFS_FAIL_CHECK(CFS_FAIL_GNI_FINISH_PURG)) {
+ cfs_fail_loc = 0x0;
+ /* get scheduler thread moving again */
+ kgnilnd_schedule_device(conn->gnc_device);
+ }
+
+ CDEBUG(D_NET, "New conn 0x%p->%s dev %d\n",
+ conn, libcfs_nid2str(her_nid), conn->gnc_device->gnd_id);
+
+ /* make sure we reset peer reconnect interval now that we have a good conn */
+ kgnilnd_peer_alive(peer);
+ peer->gnp_reconnect_interval = 0;
+
+	/* clear the unlink attribute - if we don't clear it, kgnilnd_del_conn_or_peer
+	 * will wait on the atomic forever
+	 */
+ if (peer->gnp_pending_unlink) {
+ peer->gnp_pending_unlink = 0;
+ kgnilnd_admin_decref(kgnilnd_data.kgn_npending_unlink);
+		CDEBUG(D_NET, "Clearing peer unlink %p\n", peer);
+ }
+
+ /* add ref to make it hang around until after we drop the lock */
+ kgnilnd_conn_addref(conn);
+
+ /* Once the peer_conn lock is dropped, the conn could actually move into
+ * CLOSING->CLOSED->DONE in the scheduler thread, so hold the
+ * lock until we are really done */
+ write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+
+ /* Notify LNET that we now have a working connection to this peer.
+ * This is a Cray extension to the "standard" LND behavior. */
+ lnet_notify(peer->gnp_net->gnn_ni, peer->gnp_nid,
+ 1, cfs_time_current());
+
+ /* schedule the conn to pick up any SMSG sent by peer before we could
+ * process this dgram */
+ kgnilnd_schedule_conn(conn);
+
+ /* drop our 'hold' ref */
+ kgnilnd_conn_decref(conn);
+
+out:
+ RETURN(rc);
+}
+
+void
+kgnilnd_send_nak(kgn_device_t *dev, lnet_nid_t dst_nid, int error)
+{
+ int rc = 0;
+ ENTRY;
+
+ LASSERTF(dst_nid != LNET_NID_ANY, "bad dst_nid %s\n", libcfs_nid2str(dst_nid));
+
+ CDEBUG(D_NET, "NAK to %s errno %d\n", libcfs_nid2str(dst_nid), error);
+
+ rc = kgnilnd_post_dgram(dev, dst_nid, GNILND_CONNREQ_NAK, error);
+
+ if (rc < 0) {
+		CDEBUG(D_NET, "NAK to %s failed: rc %d\n", libcfs_nid2str(dst_nid), rc);
+ }
+ EXIT;
+}
+
+int
+kgnilnd_process_nak(kgn_dgram_t *dgram)
+{
+ kgn_connreq_t *connreq = &dgram->gndg_conn_in;
+ lnet_nid_t src_nid = connreq->gncr_srcnid;
+ int errno = connreq->gncr_nakdata.gnnd_errno;
+ kgn_peer_t *peer;
+ int rc = 0;
+
+ write_lock(&kgnilnd_data.kgn_peer_conn_lock);
+
+ peer = kgnilnd_find_peer_locked(src_nid);
+ if (peer == NULL) {
+ /* we likely dropped him from bad data when we processed
+ * the original REQ */
+ write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+ return -EBADSLT;
+ }
+
+ /* need to check peerstamp/connstamp against the ones we find
+ * to make sure we don't close new (and good?) conns that we
+ * formed after this connreq failed */
+ if (peer->gnp_connecting == GNILND_PEER_IDLE) {
+ kgn_conn_t conn;
+
+ if (list_empty(&peer->gnp_conns)) {
+			/* assume we already processed the datagram and it barfed up
+ * on this side too */
+ CDEBUG(D_NET, "dropping NAK from %s; "
+ "peer %s is already not connected\n",
+ libcfs_nid2str(connreq->gncr_srcnid),
+ libcfs_nid2str(connreq->gncr_dstnid));
+ write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+ return 0;
+ }
+
+ /* stub up a connection with the connreq XXX_stamps to allow
+		 * us to use close_stale_conns_locked */
+ conn.gnc_peerstamp = connreq->gncr_peerstamp;
+ conn.gnc_my_connstamp = connreq->gncr_connstamp;
+ conn.gnc_peer_connstamp = connreq->gncr_connstamp;
+ conn.gnc_device = peer->gnp_net->gnn_dev;
+
+ rc = kgnilnd_close_stale_conns_locked(peer, &conn);
+
+ LCONSOLE_INFO("Received NAK from %s for %s errno %d; "
+ "closed %d connections\n",
+ libcfs_nid2str(connreq->gncr_srcnid),
+ libcfs_nid2str(connreq->gncr_dstnid), errno, rc);
+ } else {
+ rc = 0;
+ spin_lock(&dgram->gndg_conn->gnc_device->gnd_connd_lock);
+
+ if (list_empty(&peer->gnp_connd_list)) {
+ /* if peer isn't on waiting list, try to find one to nuke */
+ rc = kgnilnd_find_and_cancel_dgram(peer->gnp_net->gnn_dev,
+ peer->gnp_nid);
+
+ if (rc) {
+ LCONSOLE_INFO("Received NAK from %s for %s errno %d; "
+ "canceled pending connect request\n",
+ libcfs_nid2str(connreq->gncr_srcnid),
+ libcfs_nid2str(connreq->gncr_dstnid), errno);
+ }
+
+			/* if we can't find a waiting dgram, we just drop the nak - the
+			 * connect must have failed (we didn't find a conn above and
+			 * cleared connecting), so there is nothing to do besides drop */
+ } else {
+ /* peer is on list, meaning it is a new connect attempt from the one
+ * we started that generated the NAK - so just drop NAK */
+
+ /* use negative to prevent error message */
+ rc = -EAGAIN;
+ }
+ spin_unlock(&dgram->gndg_conn->gnc_device->gnd_connd_lock);
+ }
+
+ /* success! we found a peer and at least marked pending_nak */
+ write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+
+ return 0;
+}
+
+int
+kgnilnd_process_connreq(kgn_dgram_t *dgram, int *needs_nak)
+{
+ int rc;
+
+ rc = kgnilnd_unpack_connreq(dgram);
+ if (rc < 0) {
+ if (rc != -EBADF) {
+ /* only NAK if we have good srcnid to use */
+ *needs_nak = 1;
+ }
+ goto connreq_out;
+ }
+
+ switch (dgram->gndg_conn_in.gncr_type) {
+ case GNILND_CONNREQ_REQ:
+ /* wire up peer & conn, send queued TX */
+ rc = kgnilnd_finish_connect(dgram);
+
+ /* don't nak when the nid is hosed */
+ if ((rc < 0)) {
+ *needs_nak = 1;
+ }
+
+ break;
+ case GNILND_CONNREQ_NAK:
+ rc = kgnilnd_process_nak(dgram);
+ /* return early to prevent reconnect bump */
+ return rc;
+ default:
+ CERROR("unexpected connreq type %s (%d) from %s\n",
+ kgnilnd_connreq_type2str(&dgram->gndg_conn_in),
+ dgram->gndg_conn_in.gncr_type,
+ libcfs_nid2str(dgram->gndg_conn_in.gncr_srcnid));
+ rc = -EINVAL;
+ *needs_nak = 1;
+ break;
+ }
+
+connreq_out:
+ RETURN(rc);
+}
+
+int
+kgnilnd_probe_and_process_dgram(kgn_device_t *dev)
+{
+ int rc;
+ int needs_nak = 0;
+ lnet_nid_t nak_dstnid = LNET_NID_ANY;
+ lnet_nid_t orig_dstnid;
+ kgn_dgram_t *dgram = NULL;
+ kgn_peer_t *peer;
+ ENTRY;
+
+ if (CFS_FAIL_CHECK(CFS_FAIL_GNI_PAUSE_DGRAM_COMP)) {
+ rc = 0;
+ } else {
+ rc = kgnilnd_probe_for_dgram(dev, &dgram);
+ }
+
+ if (rc == 0) {
+ RETURN(0);
+ } else if (rc < 0) {
+ GOTO(inform_peer, rc);
+ } else {
+		/* rc > 0 means it did something, reset for this func */
+ rc = 0;
+ }
+
+ switch (dgram->gndg_type) {
+ case GNILND_DGRAM_WC_REQ:
+ case GNILND_DGRAM_REQ:
+ rc = kgnilnd_process_connreq(dgram, &needs_nak);
+ break;
+ case GNILND_DGRAM_NAK:
+ CDEBUG(D_NETTRACE, "NAK to %s done\n",
+ libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid));
+ break;
+ default:
+ CERROR("unknown datagram type %s (%d)\n",
+ kgnilnd_dgram_type2str(dgram), dgram->gndg_type);
+ break;
+ }
+
+ /* stash data to use after releasing current datagram */
+ /* don't stash net - we are operating on a net already,
+ * so the lock on rw_net_lock is sufficient */
+
+ nak_dstnid = dgram->gndg_conn_in.gncr_srcnid;
+
+inform_peer:
+ LASSERTF(dgram != NULL, "dgram 0x%p rc %d needs_nak %d\n", dgram, rc, needs_nak);
+
+ orig_dstnid = dgram->gndg_conn_out.gncr_dstnid;
+
+ kgnilnd_release_dgram(dev, dgram);
+
+ CDEBUG(D_NET, "cleaning up dgram to %s, rc %d\n",
+ libcfs_nid2str(orig_dstnid), rc);
+
+ /* if this was a WC_REQ that matched an existing peer, it'll get marked done
+ * in kgnilnd_finish_connect - if errors are from before we get to there,
+ * we just drop as it is a WC_REQ - the peer CAN'T be waiting for it */
+ if ((orig_dstnid != LNET_NID_ANY) && (rc < 0)) {
+ /* if we have a negative rc, we want to find a peer to inform about
+ * the bad connection attempt. Sorry buddy, better luck next time! */
+
+ write_lock(&kgnilnd_data.kgn_peer_conn_lock);
+ peer = kgnilnd_find_peer_locked(orig_dstnid);
+
+ if (peer != NULL) {
+ /* add ref to make sure he stays around past the possible unlink
+ * so we can tell LNet about him */
+ kgnilnd_peer_addref(peer);
+
+ /* if he still cares about the outstanding connect */
+ if (peer->gnp_connecting >= GNILND_PEER_CONNECT) {
+ /* check if he is on the connd list and remove.. */
+ spin_lock(&peer->gnp_net->gnn_dev->gnd_connd_lock);
+ if (!list_empty(&peer->gnp_connd_list)) {
+ list_del_init(&peer->gnp_connd_list);
+ /* drop connd ref */
+ kgnilnd_peer_decref(peer);
+ }
+ spin_unlock(&peer->gnp_net->gnn_dev->gnd_connd_lock);
+
+ /* clear gnp_connecting so we don't have a non-connecting peer
+ * on gnd_connd_list */
+ peer->gnp_connecting = GNILND_PEER_IDLE;
+
+ set_mb(peer->gnp_last_dgram_errno, rc);
+
+ kgnilnd_peer_increase_reconnect_locked(peer);
+ }
+ }
+ write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+
+ /* now that we are outside the lock, tell Mommy */
+ if (peer != NULL) {
+ kgnilnd_peer_notify(peer, rc);
+ kgnilnd_peer_decref(peer);
+ }
+ }
+
+ if (needs_nak) {
+ kgnilnd_send_nak(dev, nak_dstnid, rc);
+ }
+
+ RETURN(1);
+}
+
+void
+kgnilnd_reaper_dgram_check(kgn_device_t *dev)
+{
+ kgn_dgram_t *dgram, *tmp;
+ int i;
+
+ spin_lock(&dev->gnd_dgram_lock);
+
+ for (i = 0; i < (*kgnilnd_tunables.kgn_peer_hash_size - 1); i++) {
+ list_for_each_entry_safe(dgram, tmp, &dev->gnd_dgrams[i], gndg_list) {
+ unsigned long now = jiffies;
+ unsigned long timeout;
+
+ /* don't timeout stuff if the network is mucked or shutting down */
+ if (kgnilnd_check_hw_quiesce()) {
+ break;
+ }
+
+ if ((dgram->gndg_state != GNILND_DGRAM_POSTED) ||
+ (dgram->gndg_type == GNILND_DGRAM_WC_REQ)) {
+ continue;
+ }
+ CDEBUG(D_NETTRACE, "checking dgram 0x%p type %s "
+ "state %s conn 0x%p to %s age %lus\n",
+ dgram, kgnilnd_dgram_type2str(dgram),
+ kgnilnd_dgram_state2str(dgram), dgram->gndg_conn,
+ libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid),
+ cfs_duration_sec(now - dgram->gndg_post_time));
+
+ timeout = cfs_time_seconds(*kgnilnd_tunables.kgn_timeout);
+
+ if (time_before(now, (dgram->gndg_post_time + timeout)))
+ continue;
+
+ CNETERR("%s datagram to %s timed out @ %lus dgram "
+ "0x%p state %s conn 0x%p\n",
+ kgnilnd_dgram_type2str(dgram),
+ libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid),
+ cfs_duration_sec(now - dgram->gndg_post_time),
+ dgram, kgnilnd_dgram_state2str(dgram),
+ dgram->gndg_conn);
+
+ kgnilnd_cancel_dgram_locked(dgram);
+ }
+ }
+ spin_unlock(&dev->gnd_dgram_lock);
+}
+
+
+/* use a thread for the possibly long-blocking wait_by_id to prevent
+ * stalling the global workqueues */
+int
+kgnilnd_dgram_waitq(void *arg)
+{
+ kgn_device_t *dev = (kgn_device_t *) arg;
+ char name[16];
+ gni_return_t grc;
+ __u64 readyid;
+ DEFINE_WAIT(mover_done);
+
+ snprintf(name, sizeof(name), "kgnilnd_dgn_%02d", dev->gnd_id);
+ cfs_daemonize(name);
+ cfs_block_allsigs();
+
+ /* all gnilnd threads need to run fairly urgently */
+ set_user_nice(current, *kgnilnd_tunables.kgn_nice);
+
+ /* we dont shut down until the device shuts down ... */
+ while (!kgnilnd_data.kgn_shutdown) {
+ /* to quiesce or to not quiesce, that is the question */
+ if (unlikely(kgnilnd_data.kgn_quiesce_trigger)) {
+ KGNILND_SPIN_QUIESCE;
+ }
+
+ while (CFS_FAIL_TIMEOUT(CFS_FAIL_GNI_PAUSE_DGRAM_COMP, 1)) {}
+
+ /* check once a second */
+ grc = kgnilnd_postdata_probe_wait_by_id(dev->gnd_handle,
+ 1000, &readyid);
+
+ if (grc == GNI_RC_SUCCESS) {
+ CDEBUG(D_INFO, "waking up dgram mover thread\n");
+ kgnilnd_schedule_dgram(dev);
+
+ /* wait for dgram thread to ping us before spinning again */
+ prepare_to_wait(&dev->gnd_dgping_waitq, &mover_done,
+ TASK_INTERRUPTIBLE);
+
+ /* don't sleep if we need to quiesce */
+ if (likely(!kgnilnd_data.kgn_quiesce_trigger)) {
+ schedule();
+ }
+ finish_wait(&dev->gnd_dgping_waitq, &mover_done);
+ }
+ }
+
+ kgnilnd_thread_fini();
+ return 0;
+}
+
+int
+kgnilnd_start_outbound_dgrams(kgn_device_t *dev)
+{
+ int did_something = 0, rc;
+ kgn_peer_t *peer = NULL;
+
+ spin_lock(&dev->gnd_connd_lock);
+
+ /* Active connect - we added this in kgnilnd_launch_tx */
+ while (!list_empty(&dev->gnd_connd_peers)) {
+ peer = list_first_entry(&dev->gnd_connd_peers,
+ kgn_peer_t, gnp_connd_list);
+
+ /* ref for connd removed in if/else below */
+ list_del_init(&peer->gnp_connd_list);
+
+ /* gnp_connecting and membership on gnd_connd_peers should be
+ * done coherently to avoid double adding, etc */
+ /* don't need kgnilnd_data.kgn_peer_conn_lock here as that is only needed
+ * to get the peer to gnp_connecting in the first place. We just need to
+ * rely on gnd_connd_lock to serialize someone pulling him from the list
+ * BEFORE clearing gnp_connecting */
+ LASSERTF(peer->gnp_connecting != GNILND_PEER_IDLE, "peer 0x%p->%s not connecting\n",
+ peer, libcfs_nid2str(peer->gnp_nid));
+
+ spin_unlock(&dev->gnd_connd_lock);
+
+ CDEBUG(D_NET, "processing connect to %s\n",
+ libcfs_nid2str(peer->gnp_nid));
+
+ did_something += 1;
+ rc = kgnilnd_start_connect(peer);
+
+ if (likely(rc >= 0)) {
+ /* 0 on success, positive on 'just drop peer' errors */
+ kgnilnd_peer_decref(peer);
+ } else if (rc == -ENOMEM) {
+			/* if we are out of wildcards, add back to
+			 * connd_list - then break out and we'll try later;
+			 * on other errors, we'll bail & cancel pending tx */
+ write_lock(&kgnilnd_data.kgn_peer_conn_lock);
+ if (peer->gnp_connecting == GNILND_PEER_POSTING) {
+ peer->gnp_connecting = GNILND_PEER_CONNECT;
+ spin_lock(&dev->gnd_connd_lock);
+ list_add_tail(&peer->gnp_connd_list,
+ &dev->gnd_connd_peers);
+ } else {
+ /* connecting changed while we were posting */
+
+ LASSERTF(peer->gnp_connecting == GNILND_PEER_NEEDS_DEATH, "Peer is in invalid"
+ " state 0x%p->%s, connecting %d\n",
+ peer, libcfs_nid2str(peer->gnp_nid), peer->gnp_connecting);
+ peer->gnp_connecting = GNILND_PEER_KILL;
+ spin_lock(&dev->gnd_connd_lock);
+				/* drop the peer ref from the connd list */
+ kgnilnd_peer_decref(peer);
+ /* let the system handle itself */
+ }
+ write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+ /* the datagrams are a global pool,
+ * so break out of trying and hope some free
+ * up soon */
+ did_something -= 1;
+ break;
+ } else {
+ /* something bad happened, you lose */
+ CNETERR("could not start connecting to %s "
+ "rc %d: Will retry until TX timeout\n",
+ libcfs_nid2str(peer->gnp_nid), rc);
+			/* It didn't post, so just set connecting back to idle now.
+			 * The reaper will reattempt the connection if it needs to.
+			 * If the peer needs death, set it so the reaper will clean up.
+			 */
+ write_lock(&kgnilnd_data.kgn_peer_conn_lock);
+ if (peer->gnp_connecting == GNILND_PEER_POSTING) {
+ peer->gnp_connecting = GNILND_PEER_IDLE;
+ kgnilnd_peer_increase_reconnect_locked(peer);
+ } else {
+ LASSERTF(peer->gnp_connecting == GNILND_PEER_NEEDS_DEATH, "Peer is in invalid"
+ " state 0x%p->%s, connecting %d\n",
+ peer, libcfs_nid2str(peer->gnp_nid), peer->gnp_connecting);
+ peer->gnp_connecting = GNILND_PEER_KILL;
+ }
+ write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+
+ /* hold onto ref until we are really done - if it was
+ * unlinked this could result in a destroy */
+ kgnilnd_peer_decref(peer);
+ }
+ spin_lock(&dev->gnd_connd_lock);
+ }
+
+ spin_unlock(&dev->gnd_connd_lock);
+ RETURN(did_something);
+}
+
+static void
+kgnilnd_dgram_poke_with_stick(unsigned long arg)
+{
+ int dev_id = arg;
+ kgn_device_t *dev = &kgnilnd_data.kgn_devices[dev_id];
+
+ wake_up(&dev->gnd_dgram_waitq);
+}
+
+/* use single thread for dgrams - should be sufficient for performance */
+int
+kgnilnd_dgram_mover(void *arg)
+{
+ kgn_device_t *dev = (kgn_device_t *)arg;
+ char name[16];
+ int rc, did_something;
+ unsigned long next_purge_check = jiffies - 1;
+ unsigned long timeout;
+ struct timer_list timer;
+ DEFINE_WAIT(wait);
+
+ snprintf(name, sizeof(name), "kgnilnd_dg_%02d", dev->gnd_id);
+ cfs_daemonize(name);
+ cfs_block_allsigs();
+ /* all gnilnd threads need to run fairly urgently */
+ set_user_nice(current, *kgnilnd_tunables.kgn_nice);
+
+ /* we are ok not locking for these variables as the dgram waitq threads
+ * will block both due to tying up net (kgn_shutdown) and the completion
+ * event for the dgram_waitq (kgn_quiesce_trigger) */
+
+ while (!kgnilnd_data.kgn_shutdown) {
+ /* Safe: kgn_shutdown only set when quiescent */
+
+ /* race with stack reset - we want to hold off seeing any new incoming dgrams
+ * so we can force a dirty WC dgram for Bug 762072 - put right before
+ * quiesce check so that it'll go right into that and not do any
+ * dgram mucking */
+ CFS_RACE(CFS_FAIL_GNI_WC_DGRAM_FREE);
+
+ /* to quiesce or to not quiesce, that is the question */
+ if (unlikely(kgnilnd_data.kgn_quiesce_trigger)) {
+ KGNILND_SPIN_QUIESCE;
+ }
+ did_something = 0;
+
+ CFS_RACE(CFS_FAIL_GNI_QUIESCE_RACE);
+
+ /* process any newly completed dgrams */
+ down_read(&kgnilnd_data.kgn_net_rw_sem);
+
+ rc = kgnilnd_probe_and_process_dgram(dev);
+ if (rc > 0) {
+ did_something += rc;
+ }
+
+ up_read(&kgnilnd_data.kgn_net_rw_sem);
+
+ /* start new outbound dgrams */
+ did_something += kgnilnd_start_outbound_dgrams(dev);
+
+ /* find dead dgrams */
+ if (time_after_eq(jiffies, next_purge_check)) {
+ /* these don't need to be checked that often */
+ kgnilnd_reaper_dgram_check(dev);
+
+ next_purge_check = (long) jiffies +
+ cfs_time_seconds(kgnilnd_data.kgn_new_min_timeout / 4);
+ }
+
+ /* careful with the jiffy wrap... */
+ timeout = (long)(next_purge_check - jiffies);
+
+ CDEBUG(D_INFO, "did %d timeout %lu next %lu jiffies %lu\n",
+ did_something, timeout, next_purge_check, jiffies);
+
+ if (did_something || timeout <= 0) {
+ did_something = 0;
+ continue;
+ }
+
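+		/* standard sleep pattern: get on the waitq, arm a backstop
+		 * timer, then re-check all wake conditions before scheduling
+		 * so a wakeup can't be lost between the check and the sleep */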
+ prepare_to_wait(&dev->gnd_dgram_waitq, &wait, TASK_INTERRUPTIBLE);
+
+ setup_timer(&timer, kgnilnd_dgram_poke_with_stick, dev->gnd_id);
+ mod_timer(&timer, (long) jiffies + timeout);
+
+ /* last second chance for others to poke us */
+ did_something += xchg(&dev->gnd_dgram_ready, GNILND_DGRAM_IDLE);
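+		/* the xchg atomically clears gnd_dgram_ready while telling us
+		 * whether anyone set it since we last looked */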
+
+		/* check flag variables before committing to sleep */
+ if (!did_something &&
+ !kgnilnd_data.kgn_shutdown &&
+ !kgnilnd_data.kgn_quiesce_trigger) {
+ CDEBUG(D_INFO, "schedule timeout %ld (%lu sec)\n",
+ timeout, cfs_duration_sec(timeout));
+ wake_up_all(&dev->gnd_dgping_waitq);
+ schedule();
+ CDEBUG(D_INFO, "awake after schedule\n");
+ }
+
+ del_singleshot_timer_sync(&timer);
+ finish_wait(&dev->gnd_dgram_waitq, &wait);
+ }
+
+ kgnilnd_thread_fini();
+ return 0;
+}
+
--- /dev/null
+/*
+ * Copyright (C) 2009-2012 Cray, Inc.
+ *
+ * Author: Nic Henke <nic@cray.com>
+ *
+ * This file is part of Lustre, http://www.lustre.org.
+ *
+ * Lustre is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Lustre is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Lustre; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+#include "gnilnd.h"
+
+void
+_kgnilnd_debug_msg(kgn_msg_t *msg, struct libcfs_debug_msg_data *msgdata,
+ const char *fmt, ...)
+{
+ va_list args;
+
+ va_start(args, fmt);
+ /* XXX Nic TBD: add handling of gnm_u ? */
+ libcfs_debug_vmsg2(msgdata, fmt, args,
+ " msg@0x%p m/v/ty/ck/pck/pl %08x/%d/%d/%x/%x/%d x%d:%s\n",
+ msg, msg->gnm_magic, msg->gnm_version, msg->gnm_type,
+ msg->gnm_cksum, msg->gnm_payload_cksum,
+ msg->gnm_payload_len, msg->gnm_seq,
+ kgnilnd_msgtype2str(msg->gnm_type));
+ va_end(args);
+}
+
+void
+_kgnilnd_debug_conn(kgn_conn_t *conn, struct libcfs_debug_msg_data *msgdata,
+ const char *fmt, ...)
+{
+ va_list args;
+
+ va_start(args, fmt);
+ libcfs_debug_vmsg2(msgdata, fmt, args,
+ " conn@0x%p->%s:%s cq %u, to %ds, "
+ " RX %d @ %lu/%lus; TX %d @ %lus/%lus; "
+		" NOOP %lus/%lus/%lus; sched %lus/%lus/%lus ago\n",
+ conn, conn->gnc_peer ? libcfs_nid2str(conn->gnc_peer->gnp_nid) :
+ "<?>", kgnilnd_conn_state2str(conn),
+ conn->gnc_cqid, conn->gnc_timeout,
+ conn->gnc_rx_seq,
+ cfs_duration_sec(jiffies - conn->gnc_last_rx),
+ cfs_duration_sec(jiffies - conn->gnc_last_rx_cq),
+ conn->gnc_tx_seq,
+ cfs_duration_sec(jiffies - conn->gnc_last_tx),
+ cfs_duration_sec(jiffies - conn->gnc_last_tx_cq),
+ cfs_duration_sec(jiffies - conn->gnc_last_noop_want),
+ cfs_duration_sec(jiffies - conn->gnc_last_noop_sent),
+ cfs_duration_sec(jiffies - conn->gnc_last_noop_cq),
+ cfs_duration_sec(jiffies - conn->gnc_last_sched_ask),
+ cfs_duration_sec(jiffies - conn->gnc_last_sched_do),
+ cfs_duration_sec(jiffies - conn->gnc_device->gnd_sched_alive));
+
+ va_end(args);
+}
+
+void
+_kgnilnd_debug_tx(kgn_tx_t *tx, struct libcfs_debug_msg_data *msgdata,
+ const char *fmt, ...)
+{
+ kgn_tx_ev_id_t *id = &tx->tx_id;
+ char *nid = "<?>";
+ va_list args;
+
+ if (tx->tx_conn && tx->tx_conn->gnc_peer) {
+ nid = libcfs_nid2str(tx->tx_conn->gnc_peer->gnp_nid);
+ }
+
+ va_start(args, fmt);
+ libcfs_debug_vmsg2(msgdata, fmt, args,
+ " tx@0x%p->%s id "LPX64"/%u/%d:%d msg %x/%s/%d q %s@%lds->0x%p f %x re %d\n",
+ tx, nid, id->txe_cookie, id->txe_smsg_id, id->txe_cqid,
+ id->txe_idx, tx->tx_msg.gnm_type,
+ kgnilnd_msgtype2str(tx->tx_msg.gnm_type), tx->tx_buftype,
+ kgnilnd_tx_state2str(tx->tx_list_state),
+ cfs_duration_sec((long)jiffies - tx->tx_qtime), tx->tx_list_p,
+ tx->tx_state, tx->tx_retrans);
+ va_end(args);
+}
+
+void
+_kgnilnd_api_rc_lbug(const char *rcstr, int rc, struct libcfs_debug_msg_data *msgdata,
+ const char *fmt, ...)
+{
+ va_list args;
+
+ va_start(args, fmt);
+ libcfs_debug_vmsg2(msgdata, fmt, args,
+ " GNI API violated? Unexpected rc %s(%d)!\n",
+ rcstr, rc);
+ va_end(args);
+ LBUG();
+}
--- /dev/null
+/*
+ * Copyright (C) 2010-2012 Cray, Inc.
+ * Author: Nic Henke <nic@cray.com>
+ *
+ * This file is part of Lustre, http://www.lustre.org.
+ *
+ * Lustre is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Lustre is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Lustre; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+#ifndef _GNILND_HSS_OPS_H
+#define _GNILND_HSS_OPS_H
+
+/* for krca nid & nic translation */
+#include <krca_lib.h>
+#include <linux/typecheck.h>
+
+/* the SimNow nodes can't load rca.ko, so we need to detect this
+ * and fake a table that'd work for lookups there */
+
+typedef struct kgn_nid_entry {
+ __u32 nid;
+ __u32 nicaddr;
+} kgn_nid_entry_t;
+
+typedef struct kgn_hssops
+{
+ /* function pointers for nid and nic conversion */
+ /* from krca_lib.h */
+ int (*nid_to_nicaddr)(__u32 nid, int numnic, __u32 *nicaddr);
+ int (*nicaddr_to_nid)(__u32 nicaddr, __u32 *nid);
+ void (*hb_to_l0)(void);
+} kgn_hssops_t;
+
+/* pull in static store in gnilnd.c */
+extern kgn_hssops_t kgnilnd_hssops;
+
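+/* sentinel stored in place of a real function pointer when the matching
+ * RCA symbol can't be found (e.g. under SimNow) */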
+#define GNILND_NO_RCA 0xdeadbeef
+#define GNILND_NO_QUIESCE 0xdeadbeef
+
+static inline int
+kgnilnd_lookup_rca_funcs(void)
+{
+ void *funcp;
+
+ funcp = __symbol_get("send_hb_2_l0");
+	if (funcp == NULL) {
+ CERROR("couldn't find send_hb_2_l0\n");
+ /* not fatal for now */
+ } else {
+ kgnilnd_hssops.hb_to_l0 = funcp;
+ }
+
+ /* if we find one, we should get the other */
+
+ funcp = __symbol_get("krca_nid_to_nicaddrs");
+	if (funcp == NULL) {
+ kgnilnd_hssops.nid_to_nicaddr = (void *)GNILND_NO_RCA;
+ kgnilnd_hssops.nicaddr_to_nid = (void *)GNILND_NO_RCA;
+ LCONSOLE_INFO("using SimNow nid table for RCA translation\n");
+ return 0;
+ }
+ kgnilnd_hssops.nid_to_nicaddr = funcp;
+
+ funcp = __symbol_get("krca_nicaddr_to_nid");
+	if (funcp == NULL) {
+ CERROR("found krca_nid_to_nicaddrs but not "
+ "krca_nicaddr_to_nid\n");
+ return -ESRCH;
+ }
+ kgnilnd_hssops.nicaddr_to_nid = funcp;
+ return 0;
+}
+
+#if defined(CONFIG_CRAY_GEMINI)
+/* Gemini SimNow has a hard coded table to use - no RCA there */
+#define GNILND_MAX_NID_TABLE 0xffffffff
+/* this is all of the nodes defined in the Baker SimNow "sim_platforms" page */
+static kgn_nid_entry_t kgn_nid_table[] = {
+ {0x1, 0x100}, {0x2, 0x101}, {0x3, 0x104}, {0x4, 0x105},
+ {0x5, 0x108}, {0x6, 0x109}, {0x7, 0x10c}, {0x8, 0x10d},
+ {0x9, 0x110}, {0xa, 0x111}, {0xb, 0x114}, {0xc, 0x115},
+ {0xd, 0x118}, {0xe, 0x119}, {0xf, 0x11c}, {0x10, 0x11d},
+ {0x11, 0x120}, {0x12, 0x121}, {0x13, 0x124}, {0x14, 0x125},
+ {0x15, 0x128}, {0x16, 0x129}, {0x17, 0x12c}, {0x18, 0x12d},
+ {0x19, 0x130}, {0x1a, 0x131}, {0x1b, 0x134}, {0x1c, 0x135},
+ {0x1d, 0x138}, {0x1e, 0x139}, {0x1f, 0x13c}, {0x20, 0x13d},
+ {0x21, 0x140}, {0x22, 0x141}, {0x23, 0x144}, {0x24, 0x145},
+ {0x25, 0x148}, {0x26, 0x149}, {0x27, 0x14c}, {0x28, 0x14d},
+ {0x29, 0x150}, {0x2a, 0x151}, {0x2b, 0x154}, {0x2c, 0x155},
+ {0x2d, 0x158}, {0x2e, 0x159}, {0x2f, 0x15c}, {0x30, 0x15d},
+ {0x31, 0x160}, {0x32, 0x161}, {0x33, 0x164}, {0x3d, 0x178},
+ {0x34, 0x165}, {0x3e, 0x179}, {0x35, 0x168}, {0x3f, 0x17c},
+ {0x36, 0x169}, {0x40, 0x17d}, {0x37, 0x16c}, {0x41, 0x180},
+ {0x38, 0x16d}, {0x42, 0x181}, {0x39, 0x170}, {0x3a, 0x171},
+ {0x3b, 0x174}, {0x3c, 0x175}, {0x43, 0x184}, {0x44, 0x185},
+ {0x45, 0x188}, {0x46, 0x189}, {0x47, 0x18c}, {0x48, 0x18d},
+ /* entries after this are for 'dead' peer tests */
+ {0x63, 0x1ff}, {0x111, 0x209},
+ {GNILND_MAX_NID_TABLE, GNILND_MAX_NID_TABLE}
+};
+
+static int
+gemini_nid_to_nicaddr(__u32 nid, int numnic, __u32 *nicaddr)
+{
+ int i;
+
+ /* GNILND_NO_RCA, so use hardcoded table for Gemini SimNow */
+ if (numnic > 1) {
+		CERROR("manual nid2nic translation doesn't support "
+ "multiple nic addrs (you asked for %d)\n",
+ numnic);
+ return -EINVAL;
+ }
+
+	for (i = 0; ; i++) {
+ if (kgn_nid_table[i].nid == GNILND_MAX_NID_TABLE) {
+ CERROR("could not translate %u to a NIC "
+ "address\n", nid);
+ return -ESRCH;
+ }
+ if (kgn_nid_table[i].nid == nid) {
+ *nicaddr = kgn_nid_table[i].nicaddr;
+ return 1;
+ }
+ }
+}
+
+static int
+gemini_nicaddr_to_nid(__u32 nicaddr, __u32 *nid)
+{
+ int i;
+
+	/* GNILND_NO_RCA, so use hardcoded table for SimNow */
+ for (i = 0;;i++) {
+ if (kgn_nid_table[i].nicaddr == GNILND_MAX_NID_TABLE) {
+ CERROR("could not translate NIC address "
+ "%u\n",
+ nicaddr);
+ return -ESRCH;
+ }
+ if (kgn_nid_table[i].nicaddr == nicaddr) {
+ *nid = kgn_nid_table[i].nid;
+ return 1;
+ }
+ }
+}
+
+static inline int
+kgnilnd_setup_nic_translation(__u32 device_id)
+{
+ int rc;
+
+ /* do lookup on first use */
+ if (unlikely(kgnilnd_hssops.nid_to_nicaddr == NULL)) {
+ rc = kgnilnd_lookup_rca_funcs();
+ if (rc)
+ return rc;
+ }
+
+ /* if we have a real function, return - we'll use those going forward */
+ if (likely(kgnilnd_hssops.nid_to_nicaddr != (void *)GNILND_NO_RCA))
+ return 0;
+
+ kgnilnd_hssops.nid_to_nicaddr = gemini_nid_to_nicaddr;
+ kgnilnd_hssops.nicaddr_to_nid = gemini_nicaddr_to_nid;
+ return 0;
+}
+
+#elif defined(CONFIG_CRAY_ARIES)
+/* for libcfs_ipif_query */
+#include <libcfs/libcfs.h>
+
+/* Aries Sim doesn't have hardcoded tables, so we'll hijack the nic_pe
+ * and decode our address and nic addr from that - the rest are just offsets */
+static __u32 aries_sim_base_nid;
+static __u32 aries_sim_nic;
+
+static int
+aries_nid_to_nicaddr(__u32 nid, int numnic, __u32 *nicaddr)
+{
+ if (numnic > 1) {
+ CERROR("manual nid2nic translation doesn't support"
+ "multiple nic addrs (you asked for %d)\n",
+ numnic);
+ return -EINVAL;
+ }
+ if (nid < aries_sim_base_nid) {
+ CERROR("Request for invalid nid translation %u, minimum %u\n",
+ nid, aries_sim_base_nid);
+ return -ESRCH;
+ }
+
+ *nicaddr = nid - aries_sim_base_nid;
+ return 1;
+}
+
+static int
+aries_nicaddr_to_nid(__u32 nicaddr, __u32 *nid)
+{
+ *nid = aries_sim_base_nid + nicaddr;
+ return 1;
+}
+
+/* XXX Nic: This does not support multiple devices!!!! */
+static inline int
+kgnilnd_setup_nic_translation(__u32 device_id)
+{
+ char *if_name = "ipogif0";
+ __u32 ipaddr, netmask, my_nid;
+ int up, rc;
+
+ /* do lookup on first use */
+ if (unlikely(kgnilnd_hssops.nid_to_nicaddr == NULL)) {
+ rc = kgnilnd_lookup_rca_funcs();
+ if (rc)
+ return rc;
+ }
+
+ /* if we have a real function, return - we'll use those going forward */
+ if (likely(kgnilnd_hssops.nid_to_nicaddr != (void *)GNILND_NO_RCA))
+ return 0;
+
+ rc = libcfs_ipif_query(if_name, &up, &ipaddr, &netmask);
+ if (rc != 0) {
+ CERROR("can't get IP interface for %s: %d\n", if_name, rc);
+ return rc;
+ }
+ if (!up) {
+ CERROR("IP interface %s is down\n", if_name);
+ return -ENODEV;
+ }
+
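+	/* the simulator encodes our NID in the low two octets of the
+	 * ipogif0 address; subtracting the device id then yields the base
+	 * NID that the nid <-> nicaddr offset math works from */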
+ my_nid = ((ipaddr >> 8) & 0xFF) + (ipaddr & 0xFF);
+ aries_sim_nic = device_id;
+ aries_sim_base_nid = my_nid - aries_sim_nic;
+
+ kgnilnd_hssops.nid_to_nicaddr = aries_nid_to_nicaddr;
+ kgnilnd_hssops.nicaddr_to_nid = aries_nicaddr_to_nid;
+
+ return 0;
+}
+#else
+#error "Undefined Network Type"
+#endif
+
+/* we use RCA types here to get the compiler to whine when we have
+ * mismatched types */
+static inline int
+kgnilnd_nid_to_nicaddrs(rca_nid_t nid, int numnic, nic_addr_t *nicaddrs)
+{
+ /* compile time checks to ensure that the RCA types match
+ * the LNet idea of NID and NIC */
+ typecheck(__u32, nid);
+ typecheck(__u32, *nicaddrs);
+
+ LASSERTF(kgnilnd_hssops.nid_to_nicaddr != NULL, "missing setup?\n");
+
+ return kgnilnd_hssops.nid_to_nicaddr(nid, numnic, nicaddrs);
+}
+
+static inline int
+kgnilnd_nicaddr_to_nid(nic_addr_t nicaddr, rca_nid_t *nid)
+{
+ /* compile time checks to ensure that the RCA types match
+ * the LNet idea of NID and NIC */
+ typecheck(__u32, nicaddr);
+ typecheck(__u32, nid[0]);
+
+	LASSERTF(kgnilnd_hssops.nicaddr_to_nid != NULL, "missing setup?\n");
+
+ return kgnilnd_hssops.nicaddr_to_nid(nicaddr, nid);
+}
+
+#endif /* _GNILND_HSS_OPS_H */
--- /dev/null
+/*
+ * Copyright (C) 2004 Cluster File Systems, Inc.
+ *
+ * Copyright (C) 2009-2012 Cray, Inc.
+ *
+ * Derived from work by: Eric Barton <eric@bartonsoftware.com>
+ * Author: Nic Henke <nic@cray.com>
+ *
+ * This file is part of Lustre, http://www.lustre.org.
+ *
+ * Lustre is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Lustre is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Lustre; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#include "gnilnd.h"
+
+static int credits = 256;
+CFS_MODULE_PARM(credits, "i", int, 0444,
+ "# concurrent sends");
+
+static int peer_credits = 16;
+CFS_MODULE_PARM(peer_credits, "i", int, 0444,
+ "# LNet peer credits");
+
+/* NB - we'll not actually limit sends to this, we just size the mailbox
+ * buffer such that at most we'll have concurrent_sends messages of up to
+ * max_immediate bytes in the mailbox */
+static int concurrent_sends = 0;
+CFS_MODULE_PARM(concurrent_sends, "i", int, 0444,
+ "# concurrent HW sends to 1 peer");
+
+/* default for 2k nodes @ 16 peer credits */
+static int fma_cq_size = 32768;
+CFS_MODULE_PARM(fma_cq_size, "i", int, 0444,
+ "size of the completion queue");
+
+static int timeout = GNILND_BASE_TIMEOUT;
+/* can't change @ runtime because LNet gets NI data at startup from
+ * this value */
+CFS_MODULE_PARM(timeout, "i", int, 0444,
+ "communications timeout (seconds)");
+
+/* time to wait between datagram timeout and sending of next dgram */
+static int min_reconnect_interval = GNILND_MIN_RECONNECT_TO;
+CFS_MODULE_PARM(min_reconnect_interval, "i", int, 0644,
+ "minimum connection retry interval (seconds)");
+
+/* if this goes longer than timeout, we'll timeout the TX before
+ * the dgram */
+static int max_reconnect_interval = GNILND_MAX_RECONNECT_TO;
+CFS_MODULE_PARM(max_reconnect_interval, "i", int, 0644,
+ "maximum connection retry interval (seconds)");
+
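+/* payloads at or below max_immediate are carried inline in the SMSG
+ * message; larger payloads go via BTE RDMA - hence the "immediate/RDMA
+ * breakpoint" description below */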
+static int max_immediate = (2<<10);
+CFS_MODULE_PARM(max_immediate, "i", int, 0644,
+ "immediate/RDMA breakpoint");
+
+#ifdef CONFIG_CRAY_GEMINI
+static int checksum = GNILND_CHECKSUM_SMSG_BTE;
+#else
+static int checksum = 0;
+#endif
+CFS_MODULE_PARM(checksum, "i", int, 0644,
+ "0: None, 1: headers, 2: short msg, 3: all traffic");
+
+static int checksum_dump = 0;
+CFS_MODULE_PARM(checksum_dump, "i", int, 0644,
+ "0: None, 1: dump log on failure, 2: payload data to D_INFO log");
+
+static int bte_hash = 1;
+CFS_MODULE_PARM(bte_hash, "i", int, 0644,
+ "enable hashing for BTE (RDMA) transfers");
+
+static int bte_adapt = 1;
+CFS_MODULE_PARM(bte_adapt, "i", int, 0644,
+ "enable adaptive request and response for BTE (RDMA) transfers");
+
+static int bte_relaxed_ordering = 1;
+CFS_MODULE_PARM(bte_relaxed_ordering, "i", int, 0644,
+ "enable relaxed ordering (PASSPW) for BTE (RDMA) transfers");
+
+static int ptag = GNI_PTAG_LND;
+CFS_MODULE_PARM(ptag, "i", int, 0444,
+ "ptag for Gemini CDM");
+
+static int max_retransmits = 1024;
+CFS_MODULE_PARM(max_retransmits, "i", int, 0644,
+ "max retransmits for FMA");
+
+static int nwildcard = 4;
+CFS_MODULE_PARM(nwildcard, "i", int, 0444,
+ "# wildcard datagrams to post per net (interface)");
+
+static int nice = -20;
+CFS_MODULE_PARM(nice, "i", int, 0444,
+ "nice value for kgnilnd threads, default -20");
+
+static int rdmaq_intervals = 4;
+CFS_MODULE_PARM(rdmaq_intervals, "i", int, 0644,
+ "# intervals per second for rdmaq throttling, default 4, 0 to disable");
+
+static int loops = 100;
+CFS_MODULE_PARM(loops, "i", int, 0644,
+ "# of loops before scheduler is friendly, default 100");
+
+static int hash_size = 503;
+CFS_MODULE_PARM(hash_size, "i", int, 0444,
+ "prime number for peer/conn hash sizing, default 503");
+
+static int peer_health = 0;
+CFS_MODULE_PARM(peer_health, "i", int, 0444,
+ "Disable peer timeout for LNet peer health, default off, > 0 to enable");
+
+static int vmap_cksum = 0;
+CFS_MODULE_PARM(vmap_cksum, "i", int, 0644,
+ "use vmap for all kiov checksumming, default off");
+
+static int mbox_per_block = GNILND_FMABLK;
+CFS_MODULE_PARM(mbox_per_block, "i", int, 0644,
+ "mailboxes per block");
+
+static int nphys_mbox = 0;
+CFS_MODULE_PARM(nphys_mbox, "i", int, 0444,
+ "# mbox to preallocate from physical memory, default 0");
+
+static int mbox_credits = GNILND_MBOX_CREDITS;
+CFS_MODULE_PARM(mbox_credits, "i", int, 0644,
+ "number of credits per mailbox");
+
+static int sched_threads = GNILND_SCHED_THREADS;
+CFS_MODULE_PARM(sched_threads, "i", int, 0444,
+ "number of threads for moving data");
+
+static int net_hash_size = 11;
+CFS_MODULE_PARM(net_hash_size, "i", int, 0444,
+ "prime number for net hash sizing, default 11");
+
+static int hardware_timeout = GNILND_HARDWARE_TIMEOUT;
+CFS_MODULE_PARM(hardware_timeout, "i", int, 0444,
+ "maximum time for traffic to get from one node to another");
+
+static int mdd_timeout = GNILND_MDD_TIMEOUT;
+CFS_MODULE_PARM(mdd_timeout, "i", int, 0644,
+ "maximum time (in minutes) for mdd to be held");
+
+kgn_tunables_t kgnilnd_tunables = {
+ .kgn_min_reconnect_interval = &min_reconnect_interval,
+ .kgn_max_reconnect_interval = &max_reconnect_interval,
+ .kgn_credits = &credits,
+ .kgn_peer_credits = &peer_credits,
+ .kgn_concurrent_sends = &concurrent_sends,
+ .kgn_fma_cq_size = &fma_cq_size,
+ .kgn_timeout = &timeout,
+ .kgn_max_immediate = &max_immediate,
+ .kgn_checksum = &checksum,
+ .kgn_checksum_dump = &checksum_dump,
+ .kgn_bte_hash = &bte_hash,
+ .kgn_bte_adapt = &bte_adapt,
+ .kgn_bte_relaxed_ordering = &bte_relaxed_ordering,
+ .kgn_ptag = &ptag,
+ .kgn_max_retransmits = &max_retransmits,
+ .kgn_nwildcard = &nwildcard,
+ .kgn_nice = &nice,
+ .kgn_rdmaq_intervals = &rdmaq_intervals,
+ .kgn_loops = &loops,
+ .kgn_peer_hash_size = &hash_size,
+ .kgn_peer_health = &peer_health,
+ .kgn_vmap_cksum = &vmap_cksum,
+ .kgn_mbox_per_block = &mbox_per_block,
+ .kgn_nphys_mbox = &nphys_mbox,
+ .kgn_mbox_credits = &mbox_credits,
+ .kgn_sched_threads = &sched_threads,
+ .kgn_net_hash_size = &net_hash_size,
+ .kgn_hardware_timeout = &hardware_timeout,
+ .kgn_mdd_timeout = &mdd_timeout
+};
+
+#if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM
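+/* without sysfs module parameters, mirror the tunables under
+ * /proc/sys/gnilnd so they can still be inspected (and, for the 0644
+ * entries, changed) at runtime */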
+static cfs_sysctl_table_t kgnilnd_ctl_table[] = {
+ {
+ INIT_CTL_NAME(2)
+ .procname = "min_reconnect_interval",
+ .data = &min_reconnect_interval,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ INIT_CTL_NAME(3)
+ .procname = "max_reconnect_interval",
+ .data = &max_reconnect_interval,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ INIT_CTL_NAME(5)
+ .procname = "credits",
+ .data = &credits,
+ .maxlen = sizeof(int),
+ .mode = 0444,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ INIT_CTL_NAME(6)
+ .procname = "peer_credits",
+ .data = &peer_credits,
+ .maxlen = sizeof(int),
+ .mode = 0444,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ INIT_CTL_NAME(7)
+ .procname = "fma_cq_size",
+ .data = &fma_cq_size,
+ .maxlen = sizeof(int),
+ .mode = 0444,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ INIT_CTL_NAME(8)
+ .procname = "timeout",
+ .data = &timeout,
+ .maxlen = sizeof(int),
+ .mode = 0444,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ INIT_CTL_NAME(9)
+ .procname = "max_immediate",
+ .data = &max_immediate,
+ .maxlen = sizeof(int),
+ .mode = 0444,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ INIT_CTL_NAME(10)
+ .procname = "checksum",
+ .data = &checksum,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ INIT_CTL_NAME(11)
+ .procname = "bte_hash",
+ .data = &bte_hash,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ INIT_CTL_NAME(12)
+ .procname = "bte_adapt",
+ .data = &bte_adapt,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ INIT_CTL_NAME(13)
+ .procname = "ptag",
+ .data = &ptag,
+ .maxlen = sizeof(int),
+ .mode = 0444,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ INIT_CTL_NAME(14)
+ .procname = "nwildcard",
+ .data = &nwildcard,
+ .maxlen = sizeof(int),
+ .mode = 0444,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ INIT_CTL_NAME(15)
+ .procname = "bte_relaxed_ordering",
+ .data = &bte_relaxed_ordering,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ INIT_CTL_NAME(16)
+ .procname = "checksum_dump",
+ .data = &checksum_dump,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ INIT_CTL_NAME(17)
+ .procname = "nice",
+ .data = &nice,
+ .maxlen = sizeof(int),
+ .mode = 0444,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ INIT_CTL_NAME(18)
+ .procname = "rdmaq_intervals",
+ .data = &rdmaq_intervals,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ INIT_CTL_NAME(19)
+ .procname = "loops",
+ .data = &loops,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ INIT_CTL_NAME(20)
+ .procname = "hash_size",
+ .data = &hash_size,
+ .maxlen = sizeof(int),
+ .mode = 0444,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ INIT_CTL_NAME(21)
+ .procname = "peer_health",
+ .data = &peer_health,
+ .maxlen = sizeof(int),
+ .mode = 0444,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ INIT_CTL_NAME(22)
+ .procname = "vmap_cksum",
+ .data = &vmap_cksum,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ INIT_CTL_NAME(23)
+ .procname = "mbox_per_block",
+ .data = &mbox_per_block,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ INIT_CTL_NAME(24)
+ .procname = "mbox_credits"
+ .data = &mbox_credits,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ INIT_CTL_NAME(25)
+ .procname = "sched_threads"
+ .data = &sched_threads,
+ .maxlen = sizeof(int),
+ .mode = 0444,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ INIT_CTL_NAME(26)
+ .procname = "net_hash_size",
+ .data = &net_hash_size,
+ .maxlen = sizeof(int),
+ .mode = 0444,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ INIT_CTL_NAME(27)
+ .procname = "hardware_timeout",
+ .data = &hardware_timeout,
+ .maxlen = sizeof(int),
+ .mode = 0444,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ INIT_CTL_NAME(28)
+ .procname = "mdd_timeout",
+ .data = &mdd_timeout,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ INIT_CTL_NAME(29)
+ .procname = "max_retransmits"
+ .data = &max_retransmits,
+ .maxlen = sizeof(int),
+ .mode = 0444,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ INIT_CTL_NAME(30)
+ .procname = "concurrent_sends",
+ .data = &concurrent_sends,
+ .maxlen = sizeof(int),
+ .mode = 0444,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ INIT_CTL_NAME(31)
+ .procname = "nphys_mbox",
+ .data = &nphys_mbox,
+ .maxlen = sizeof(int),
+ .mode = 0444,
+ .proc_handler = &proc_dointvec
+ },
+ {0}
+};
+
+static cfs_sysctl_table_t kgnilnd_top_ctl_table[] = {
+ {
+ INIT_CTL_NAME(202)
+ .procname = "gnilnd",
+ .data = NULL,
+ .maxlen = 0,
+ .mode = 0555,
+ .child = kgnilnd_ctl_table
+ },
+ { INIT_CTL_NAME(0) }
+};
+#endif
+
+int
+kgnilnd_tunables_init(void)
+{
+ int rc = 0;
+
+#if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM
+ kgnilnd_tunables.kgn_sysctl =
+ cfs_register_sysctl_table(kgnilnd_top_ctl_table, 0);
+
+ if (kgnilnd_tunables.kgn_sysctl == NULL)
+ CWARN("Can't setup /proc tunables\n");
+#endif
+ switch (*kgnilnd_tunables.kgn_checksum) {
+ default:
+ CERROR("Invalid checksum module parameter: %d\n",
+ *kgnilnd_tunables.kgn_checksum);
+ rc = -EINVAL;
+ GOTO(out, rc);
+ case GNILND_CHECKSUM_OFF:
+ /* no checksumming */
+ break;
+ case GNILND_CHECKSUM_SMSG_HEADER:
+ LCONSOLE_INFO("SMSG header only checksumming enabled\n");
+ break;
+ case GNILND_CHECKSUM_SMSG:
+ LCONSOLE_INFO("SMSG checksumming enabled\n");
+ break;
+ case GNILND_CHECKSUM_SMSG_BTE:
+ LCONSOLE_INFO("SMSG + BTE checksumming enabled\n");
+ break;
+ }
+
+ if (*kgnilnd_tunables.kgn_max_immediate > GNILND_MAX_IMMEDIATE) {
+ LCONSOLE_ERROR("kgnilnd module parameter 'max_immediate' too large %d > %d\n",
+ *kgnilnd_tunables.kgn_max_immediate, GNILND_MAX_IMMEDIATE);
+ rc = -EINVAL;
+ GOTO(out, rc);
+ }
+
+ if (*kgnilnd_tunables.kgn_mbox_per_block < 1) {
+ *kgnilnd_tunables.kgn_mbox_per_block = 1;
+ }
+
+ if (*kgnilnd_tunables.kgn_concurrent_sends == 0) {
+ *kgnilnd_tunables.kgn_concurrent_sends = *kgnilnd_tunables.kgn_peer_credits;
+ } else if (*kgnilnd_tunables.kgn_concurrent_sends > *kgnilnd_tunables.kgn_peer_credits) {
+ LCONSOLE_ERROR("kgnilnd parameter 'concurrent_sends' too large: %d > %d (peer_credits)\n",
+ *kgnilnd_tunables.kgn_concurrent_sends, *kgnilnd_tunables.kgn_peer_credits);
+ rc = -EINVAL;
+ }
+out:
+ return rc;
+}
+
+void
+kgnilnd_tunables_fini(void)
+{
+#if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM
+ if (kgnilnd_tunables.kgn_sysctl != NULL)
+ cfs_unregister_sysctl_table(kgnilnd_tunables.kgn_sysctl);
+#endif
+}
--- /dev/null
+/*
+ * Copyright (C) 2009-2012 Cray, Inc.
+ *
+ * Author: Nic Henke <nic@cray.com>
+ *
+ * This file is part of Lustre, http://www.lustre.org.
+ *
+ * Lustre is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Lustre is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Lustre; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/* this code liberated and modified from lnet/lnet/router_proc.c */
+
+#define DEBUG_SUBSYSTEM S_LND
+#include "gnilnd.h"
+#include <linux/seq_file.h>
+
+#define GNILND_PROC_STATS "stats"
+#define GNILND_PROC_MDD "mdd"
+#define GNILND_PROC_SMSG "smsg"
+#define GNILND_PROC_CONN "conn"
+#define GNILND_PROC_PEER "peer"
+#define GNILND_PROC_CKSUM_TEST "cksum_test"
+
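+/* cksum_test takes "case:loops:bytes", e.g. "0:100:16384" runs alignment
+ * case 0 for 100 iterations over 16384 bytes; cases 0-3 vary the odd/even
+ * page offsets of the source and destination kiovs */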
+static int
+_kgnilnd_proc_run_cksum_test(int caseno, int nloops, int nob)
+{
+ lnet_kiov_t *src, *dest;
+ struct timespec begin, end, diff;
+ int niov;
+	int i = 0, n;
+	int rc = 0;
+ __u16 cksum, cksum2;
+ __u64 mbytes;
+
+ LIBCFS_ALLOC(src, LNET_MAX_IOV * sizeof(lnet_kiov_t));
+ LIBCFS_ALLOC(dest, LNET_MAX_IOV * sizeof(lnet_kiov_t));
+
+ if (src == NULL || dest == NULL) {
+ CERROR("couldn't allocate iovs\n");
+		GOTO(unwind, rc = -ENOMEM);
+ }
+
+ for (i = 0; i < LNET_MAX_IOV; i++) {
+ src[i].kiov_offset = 0;
+ src[i].kiov_len = CFS_PAGE_SIZE;
+ src[i].kiov_page = cfs_alloc_page(CFS_ALLOC_STD|CFS_ALLOC_ZERO);
+
+ if (src[i].kiov_page == NULL) {
+ CERROR("couldn't allocate page %d\n", i);
+			GOTO(unwind, rc = -ENOMEM);
+ }
+
+ dest[i].kiov_offset = 0;
+ dest[i].kiov_len = CFS_PAGE_SIZE;
+ dest[i].kiov_page = cfs_alloc_page(CFS_ALLOC_STD|CFS_ALLOC_ZERO);
+
+ if (dest[i].kiov_page == NULL) {
+ CERROR("couldn't allocate page %d\n", i);
+			GOTO(unwind, rc = -ENOMEM);
+ }
+ }
+
+ /* add extra 2 pages - one for offset of src, 2nd to allow dest offset */
+ niov = (nob / PAGE_SIZE) + 2;
+ if (niov > LNET_MAX_IOV) {
+ CERROR("bytes %d too large, requires niov %d > %d\n",
+ nob, niov, LNET_MAX_IOV);
+		GOTO(unwind, rc = -E2BIG);
+ }
+
+ /* setup real data */
+ src[0].kiov_offset = 317;
+ dest[0].kiov_offset = 592;
+ switch (caseno) {
+ default:
+ /* odd -> even */
+ break;
+ case 1:
+ /* odd -> odd */
+ dest[0].kiov_offset -= 1;
+ break;
+ case 2:
+ /* even -> even */
+ src[0].kiov_offset += 1;
+ break;
+ case 3:
+ /* even -> odd */
+ src[0].kiov_offset += 1;
+ dest[0].kiov_offset -= 1;
+ }
+ src[0].kiov_len = PAGE_SIZE - src[0].kiov_offset;
+ dest[0].kiov_len = PAGE_SIZE - dest[0].kiov_offset;
+
+ for (i = 0; i < niov; i++) {
+ memset(page_address(src[i].kiov_page) + src[i].kiov_offset,
+ 0xf0 + i, src[i].kiov_len);
+ }
+
+ lnet_copy_kiov2kiov(niov, dest, 0, niov, src, 0, nob);
+
+ getnstimeofday(&begin);
+
+ for (n = 0; n < nloops; n++) {
+ CDEBUG(D_BUFFS, "case %d loop %d src %d dest %d nob %d niov %d\n",
+ caseno, n, src[0].kiov_offset, dest[0].kiov_offset, nob, niov);
+ cksum = kgnilnd_cksum_kiov(niov, src, 0, nob - n, 1);
+ cksum2 = kgnilnd_cksum_kiov(niov, dest, 0, nob - n, 1);
+
+ if (cksum != cksum2) {
+ CERROR("case %d loop %d different checksums %x expected %x\n",
+ j, n, cksum2, cksum);
+			GOTO(unwind, rc = -ENOKEY);
+ }
+ }
+
+ getnstimeofday(&end);
+
+	mbytes = ((__u64)nloops * nob * 2) / (1024 * 1024);
+
+ diff = kgnilnd_ts_sub(end, begin);
+
+ LCONSOLE_INFO("running "LPD64"MB took %ld.%ld seconds\n",
+ mbytes, diff.tv_sec, diff.tv_nsec);
+
+unwind:
+ CDEBUG(D_NET, "freeing %d pages\n", i);
+ for (i -= 1; i >= 0; i--) {
+ if (src[i].kiov_page != NULL) {
+ cfs_free_page(src[i].kiov_page);
+ }
+ if (dest[i].kiov_page != NULL) {
+ cfs_free_page(dest[i].kiov_page);
+ }
+ }
+
+ if (src != NULL)
+ LIBCFS_FREE(src, LNET_MAX_IOV * sizeof(lnet_kiov_t));
+ if (dest != NULL)
+ LIBCFS_FREE(dest, LNET_MAX_IOV * sizeof(lnet_kiov_t));
+	return rc;
+}
+
+static int
+kgnilnd_proc_cksum_test_write(struct file *file, const char *ubuffer,
+ unsigned long count, void *data)
+{
+ char dummy[256 + 1] = { '\0' };
+ int testno, nloops, nbytes;
+ int rc;
+
+ if (kgnilnd_data.kgn_init < GNILND_INIT_ALL) {
+ CERROR("can't run cksum test, kgnilnd is not initialized yet\n");
+ return -ENOSYS;
+ }
+
+ if (count >= sizeof(dummy) || count == 0)
+ return -EINVAL;
+
+ if (copy_from_user(dummy, ubuffer, count))
+ return -EFAULT;
+
+ if (sscanf(dummy, "%d:%d:%d", &testno, &nloops, &nbytes) == 3) {
+ rc = _kgnilnd_proc_run_cksum_test(testno, nloops, nbytes);
+ if (rc < 0) {
+ RETURN(rc);
+ } else {
+ /* spurious, but lets us know the parse was ok */
+ RETURN(count);
+ }
+ }
+	/* couldn't parse "case:loops:bytes" */
+	RETURN(-EINVAL);
+}
+
+static int
+kgnilnd_proc_stats_read(char *page, char **start, off_t off,
+ int count, int *eof, void *data)
+{
+ kgn_device_t *dev;
+ struct timeval now;
+ int rc;
+
+ if (kgnilnd_data.kgn_init < GNILND_INIT_ALL) {
+ rc = sprintf(page,
+ "kgnilnd is not initialized yet\n");
+ return rc;
+ }
+
+ /* only do the first device */
+ dev = &kgnilnd_data.kgn_devices[0];
+
+ /* sampling is racy, but so is reading this file! */
+ smp_rmb();
+ do_gettimeofday(&now);
+
+ rc = sprintf(page, "time: %lu.%lu\n"
+ "ntx: %d\n"
+ "npeers: %d\n"
+ "nconns: %d\n"
+ "nEPs: %d\n"
+ "ndgrams: %d\n"
+ "nfmablk: %d\n"
+ "n_mdd: %d\n"
+ "n_mdd_held: %d\n"
+ "GART map bytes: %ld\n"
+ "TX queued maps: %d\n"
+ "TX phys nmaps: %d\n"
+ "TX phys bytes: %lu\n"
+ "TX virt nmaps: %d\n"
+ "TX virt bytes: "LPU64"\n"
+ "RDMAQ bytes_auth: %ld\n"
+ "RDMAQ bytes_left: %ld\n"
+ "RDMAQ nstalls: %d\n"
+ "dev mutex delay: %ld\n"
+ "dev n_yield: %d\n"
+ "dev n_schedule: %d\n"
+ "SMSG fast_try: %d\n"
+ "SMSG fast_ok: %d\n"
+ "SMSG fast_block: %d\n"
+ "SMSG ntx: %d\n"
+ "SMSG tx_bytes: %ld\n"
+ "SMSG nrx: %d\n"
+ "SMSG rx_bytes: %ld\n"
+ "RDMA ntx: %d\n"
+ "RDMA tx_bytes: %ld\n"
+ "RDMA nrx: %d\n"
+ "RDMA rx_bytes: %ld\n"
+ "VMAP short: %d\n"
+ "VMAP cksum: %d\n"
+ "KMAP short: %d\n",
+ now.tv_sec, now.tv_usec,
+ atomic_read(&kgnilnd_data.kgn_ntx),
+ atomic_read(&kgnilnd_data.kgn_npeers),
+ atomic_read(&kgnilnd_data.kgn_nconns),
+ atomic_read(&dev->gnd_neps),
+ atomic_read(&dev->gnd_ndgrams),
+ atomic_read(&dev->gnd_nfmablk),
+ atomic_read(&dev->gnd_n_mdd), atomic_read(&dev->gnd_n_mdd_held),
+ atomic64_read(&dev->gnd_nbytes_map),
+ atomic_read(&dev->gnd_nq_map),
+ dev->gnd_map_nphys, dev->gnd_map_physnop * PAGE_SIZE,
+ dev->gnd_map_nvirt, dev->gnd_map_virtnob,
+ atomic64_read(&dev->gnd_rdmaq_bytes_out),
+ atomic64_read(&dev->gnd_rdmaq_bytes_ok),
+ atomic_read(&dev->gnd_rdmaq_nstalls),
+ dev->gnd_mutex_delay,
+ atomic_read(&dev->gnd_n_yield), atomic_read(&dev->gnd_n_schedule),
+ atomic_read(&dev->gnd_fast_try), atomic_read(&dev->gnd_fast_ok),
+ atomic_read(&dev->gnd_fast_block),
+ atomic_read(&dev->gnd_short_ntx), atomic64_read(&dev->gnd_short_txbytes),
+ atomic_read(&dev->gnd_short_nrx), atomic64_read(&dev->gnd_short_rxbytes),
+ atomic_read(&dev->gnd_rdma_ntx), atomic64_read(&dev->gnd_rdma_txbytes),
+ atomic_read(&dev->gnd_rdma_nrx), atomic64_read(&dev->gnd_rdma_rxbytes),
+ atomic_read(&kgnilnd_data.kgn_nvmap_short),
+ atomic_read(&kgnilnd_data.kgn_nvmap_cksum),
+ atomic_read(&kgnilnd_data.kgn_nkmap_short));
+
+ return rc;
+}
+
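+/* writing anything to the stats file resets the per-device and global
+ * counters sampled by the read handler above */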
+static int
+kgnilnd_proc_stats_write(struct file *file, const char *ubuffer,
+ unsigned long count, void *data)
+{
+ kgn_device_t *dev;
+
+ if (kgnilnd_data.kgn_init < GNILND_INIT_ALL) {
+ CERROR("kgnilnd is not initialized for stats write\n");
+ return -EINVAL;
+ }
+
+ /* only do the first device */
+ dev = &kgnilnd_data.kgn_devices[0];
+
+ atomic_set(&dev->gnd_short_ntx, 0);
+ atomic_set(&dev->gnd_short_nrx, 0);
+ atomic64_set(&dev->gnd_short_txbytes, 0);
+ atomic64_set(&dev->gnd_short_rxbytes, 0);
+ atomic_set(&dev->gnd_rdma_ntx, 0);
+ atomic_set(&dev->gnd_rdma_nrx, 0);
+ atomic_set(&dev->gnd_fast_ok, 0);
+ atomic_set(&dev->gnd_fast_try, 0);
+ atomic_set(&dev->gnd_fast_block, 0);
+ atomic64_set(&dev->gnd_rdma_txbytes, 0);
+ atomic64_set(&dev->gnd_rdma_rxbytes, 0);
+ atomic_set(&dev->gnd_rdmaq_nstalls, 0);
+ set_mb(dev->gnd_mutex_delay, 0);
+ atomic_set(&dev->gnd_n_yield, 0);
+ atomic_set(&dev->gnd_n_schedule, 0);
+ atomic_set(&kgnilnd_data.kgn_nvmap_short, 0);
+ atomic_set(&kgnilnd_data.kgn_nvmap_cksum, 0);
+ atomic_set(&kgnilnd_data.kgn_nkmap_short, 0);
+ /* sampling is racy, but so is writing this file! */
+ smp_wmb();
+ return count;
+}
+
+typedef struct {
+ kgn_device_t *gmdd_dev;
+ kgn_tx_t *gmdd_tx;
+ loff_t gmdd_off;
+} kgn_mdd_seq_iter_t;
+
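+/* position the iterator at entry 'off' of the device map list; offset 0
+ * is reserved for the header row printed by kgnilnd_mdd_seq_show(), and
+ * forward seeks resume from the cached position instead of rescanning */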
+int
+kgnilnd_mdd_seq_seek(kgn_mdd_seq_iter_t *gseq, loff_t off)
+{
+ kgn_tx_t *tx;
+ struct list_head *r;
+ loff_t here;
+ int rc = 0;
+
+ if (off == 0) {
+ gseq->gmdd_tx = NULL;
+ gseq->gmdd_off = 0;
+ return 0;
+ }
+
+ tx = gseq->gmdd_tx;
+
+ if (tx == NULL || gseq->gmdd_off > off) {
+ /* search from start */
+ r = gseq->gmdd_dev->gnd_map_list.next;
+ here = 1;
+ } else {
+ /* continue current search */
+ r = &tx->tx_map_list;
+ here = gseq->gmdd_off;
+ }
+
+ gseq->gmdd_off = off;
+
+ while (r != &gseq->gmdd_dev->gnd_map_list) {
+ kgn_tx_t *t;
+
+ t = list_entry(r, kgn_tx_t, tx_map_list);
+
+ if (here == off) {
+ gseq->gmdd_tx = t;
+ rc = 0;
+ goto out;
+ }
+ r = r->next;
+ here++;
+ }
+
+ gseq->gmdd_tx = NULL;
+ rc = -ENOENT;
+out:
+ return rc;
+}
+
+static void *
+kgnilnd_mdd_seq_start(struct seq_file *s, loff_t *pos)
+{
+ kgn_mdd_seq_iter_t *gseq;
+ int rc;
+
+ if (kgnilnd_data.kgn_init < GNILND_INIT_ALL) {
+ return NULL;
+ }
+
+ LIBCFS_ALLOC(gseq, sizeof(*gseq));
+ if (gseq == NULL) {
+ CERROR("could not allocate mdd sequence iterator\n");
+ return NULL;
+ }
+
+ /* only doing device 0 for now */
+ gseq->gmdd_dev = &kgnilnd_data.kgn_devices[0];
+ gseq->gmdd_tx = NULL;
+
+ /* need to lock map while we poke - huge disturbance
+ * but without it, no way to get the data printed */
+ spin_lock(&gseq->gmdd_dev->gnd_map_lock);
+
+ /* set private to gseq for stop */
+ s->private = gseq;
+
+ rc = kgnilnd_mdd_seq_seek(gseq, *pos);
+ if (rc == 0)
+ return gseq;
+ else
+ return NULL;
+}
+
+static void
+kgnilnd_mdd_seq_stop(struct seq_file *s, void *iter)
+{
+ kgn_mdd_seq_iter_t *gseq = s->private;
+
+ if (gseq != NULL) {
+ spin_unlock(&gseq->gmdd_dev->gnd_map_lock);
+ LIBCFS_FREE(gseq, sizeof(*gseq));
+ }
+}
+
+static void *
+kgnilnd_mdd_seq_next(struct seq_file *s, void *iter, loff_t *pos)
+{
+ kgn_mdd_seq_iter_t *gseq = iter;
+ int rc;
+ loff_t next = *pos + 1;
+
+ rc = kgnilnd_mdd_seq_seek(gseq, next);
+ if (rc != 0) {
+ return NULL;
+ }
+ *pos = next;
+ return gseq;
+}
+
+static int
+kgnilnd_mdd_seq_show(struct seq_file *s, void *iter)
+{
+ kgn_mdd_seq_iter_t *gseq = iter;
+ kgn_tx_t *tx;
+ __u64 nob;
+ __u32 physnop;
+ int id;
+ int buftype;
+ gni_mem_handle_t hndl;
+
+ if (gseq->gmdd_off == 0) {
+ seq_printf(s, "%s %22s %16s %8s %8s %37s\n",
+ "tx", "tx_id", "nob", "physnop",
+ "buftype", "mem handle");
+ return 0;
+ }
+
+ tx = gseq->gmdd_tx;
+ LASSERT(tx != NULL);
+
+ id = tx->tx_id.txe_smsg_id;
+ nob = tx->tx_nob;
+ physnop = tx->tx_phys_npages;
+ buftype = tx->tx_buftype;
+ hndl.qword1 = tx->tx_map_key.qword1;
+ hndl.qword2 = tx->tx_map_key.qword2;
+
+ seq_printf(s, "%p %x %16"LPF64"u %8d %#8x "LPX64"."LPX64"x\n",
+ tx, id, nob, physnop, buftype,
+ hndl.qword1, hndl.qword2);
+
+ return 0;
+}
+
+static struct seq_operations kgn_mdd_sops = {
+ .start = kgnilnd_mdd_seq_start,
+ .stop = kgnilnd_mdd_seq_stop,
+ .next = kgnilnd_mdd_seq_next,
+ .show = kgnilnd_mdd_seq_show,
+};
+
+static int
+kgnilnd_mdd_seq_open(struct inode *inode, struct file *file)
+{
+ struct seq_file *sf;
+ int rc;
+
+ rc = seq_open(file, &kgn_mdd_sops);
+ if (rc == 0) {
+ sf = file->private_data;
+
+ /* NULL means we've not yet open() */
+ sf->private = NULL;
+ }
+ return rc;
+}
+
+static struct file_operations kgn_mdd_fops = {
+ .owner = THIS_MODULE,
+ .open = kgnilnd_mdd_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+typedef struct {
+ __u64 gsmsg_version;
+ kgn_device_t *gsmsg_dev;
+ kgn_fma_memblock_t *gsmsg_fmablk;
+ loff_t gsmsg_off;
+} kgn_smsg_seq_iter_t;
+
+int
+kgnilnd_smsg_seq_seek(kgn_smsg_seq_iter_t *gseq, loff_t off)
+{
+ kgn_fma_memblock_t *fmablk;
+ kgn_device_t *dev;
+ struct list_head *r;
+ loff_t here;
+ int rc = 0;
+
+ /* offset 0 is the header, so we start real entries at
+ * here == off == 1 */
+ if (off == 0) {
+ gseq->gsmsg_fmablk = NULL;
+ gseq->gsmsg_off = 0;
+ return 0;
+ }
+
+ fmablk = gseq->gsmsg_fmablk;
+ dev = gseq->gsmsg_dev;
+
+ spin_lock(&dev->gnd_fmablk_lock);
+
+ if (fmablk != NULL &&
+ gseq->gsmsg_version != atomic_read(&dev->gnd_fmablk_vers)) {
+ /* list changed */
+ rc = -ESTALE;
+ goto out;
+ }
+
+ if (fmablk == NULL || gseq->gsmsg_off > off) {
+ /* search from start */
+ r = dev->gnd_fma_buffs.next;
+ here = 1;
+ } else {
+ /* continue current search */
+ r = &fmablk->gnm_bufflist;
+ here = gseq->gsmsg_off;
+ }
+
+ gseq->gsmsg_version = atomic_read(&dev->gnd_fmablk_vers);
+ gseq->gsmsg_off = off;
+
+ while (r != &dev->gnd_fma_buffs) {
+ kgn_fma_memblock_t *t;
+
+ t = list_entry(r, kgn_fma_memblock_t, gnm_bufflist);
+
+ if (here == off) {
+ gseq->gsmsg_fmablk = t;
+ rc = 0;
+ goto out;
+ }
+ r = r->next;
+ here++;
+ }
+
+ gseq->gsmsg_fmablk = NULL;
+ rc = -ENOENT;
+out:
+ spin_unlock(&dev->gnd_fmablk_lock);
+ return rc;
+}
+
+static void *
+kgnilnd_smsg_seq_start(struct seq_file *s, loff_t *pos)
+{
+ kgn_smsg_seq_iter_t *gseq;
+ int rc;
+
+ if (kgnilnd_data.kgn_init < GNILND_INIT_ALL) {
+ return NULL;
+ }
+
+ LIBCFS_ALLOC(gseq, sizeof(*gseq));
+ if (gseq == NULL) {
+ CERROR("could not allocate smsg sequence iterator\n");
+ return NULL;
+ }
+
+ /* only doing device 0 for now */
+ gseq->gsmsg_dev = &kgnilnd_data.kgn_devices[0];
+ gseq->gsmsg_fmablk = NULL;
+ rc = kgnilnd_smsg_seq_seek(gseq, *pos);
+ if (rc == 0)
+ return gseq;
+
+ LIBCFS_FREE(gseq, sizeof(*gseq));
+ return NULL;
+}
+
+static void
+kgnilnd_smsg_seq_stop(struct seq_file *s, void *iter)
+{
+ kgn_smsg_seq_iter_t *gseq = iter;
+
+ if (gseq != NULL)
+ LIBCFS_FREE(gseq, sizeof(*gseq));
+}
+
+static void *
+kgnilnd_smsg_seq_next(struct seq_file *s, void *iter, loff_t *pos)
+{
+ kgn_smsg_seq_iter_t *gseq = iter;
+ int rc;
+ loff_t next = *pos + 1;
+
+ rc = kgnilnd_smsg_seq_seek(gseq, next);
+ if (rc != 0) {
+ LIBCFS_FREE(gseq, sizeof(*gseq));
+ return NULL;
+ }
+ *pos = next;
+ return gseq;
+}
+
+static int
+kgnilnd_smsg_seq_show(struct seq_file *s, void *iter)
+{
+ kgn_smsg_seq_iter_t *gseq = iter;
+ kgn_fma_memblock_t *fmablk;
+ kgn_device_t *dev;
+ int avail_mboxs, held_mboxs, num_mboxs;
+ unsigned int blk_size;
+ int live;
+ kgn_fmablk_state_t state;
+ gni_mem_handle_t hndl;
+
+ if (gseq->gsmsg_off == 0) {
+ seq_printf(s, "%5s %4s %6s/%5s/%5s %9s %18s %37s\n",
+ "blk#", "type", "avail", "held", "total", "size",
+ "fmablk", "mem handle");
+ return 0;
+ }
+
+ fmablk = gseq->gsmsg_fmablk;
+ dev = gseq->gsmsg_dev;
+ LASSERT(fmablk != NULL);
+
+ spin_lock(&dev->gnd_fmablk_lock);
+
+ if (gseq->gsmsg_version != atomic_read(&dev->gnd_fmablk_vers)) {
+ /* list changed */
+ spin_unlock(&dev->gnd_fmablk_lock);
+ return -ESTALE;
+ }
+
+ live = fmablk->gnm_hold_timeout == 0;
+ /* none are available if it isn't live... */
+ avail_mboxs = live ? fmablk->gnm_avail_mboxs : 0;
+ held_mboxs = fmablk->gnm_held_mboxs;
+ num_mboxs = fmablk->gnm_num_mboxs;
+ blk_size = fmablk->gnm_blk_size;
+ state = fmablk->gnm_state;
+ hndl.qword1 = fmablk->gnm_hndl.qword1;
+ hndl.qword2 = fmablk->gnm_hndl.qword2;
+
+ spin_unlock(&dev->gnd_fmablk_lock);
+
+ if (live) {
+ seq_printf(s, "%5d %4s %6d/%5d/%5d %9d %18p "LPX64"."LPX64"\n",
+ (int) gseq->gsmsg_off, kgnilnd_fmablk_state2str(state),
+ avail_mboxs, held_mboxs, num_mboxs, blk_size,
+ fmablk, hndl.qword1, hndl.qword2);
+ } else {
+ seq_printf(s, "%5d %4s %6d/%5d/%5d %9d %18p %37s\n",
+ (int) gseq->gsmsg_off, kgnilnd_fmablk_state2str(state),
+ avail_mboxs, held_mboxs, num_mboxs, blk_size,
+ fmablk, "PURGATORY.HOLD");
+ }
+
+ return 0;
+}
+
+static struct seq_operations kgn_smsg_sops = {
+ .start = kgnilnd_smsg_seq_start,
+ .stop = kgnilnd_smsg_seq_stop,
+ .next = kgnilnd_smsg_seq_next,
+ .show = kgnilnd_smsg_seq_show,
+};
+
+static int
+kgnilnd_smsg_seq_open(struct inode *inode, struct file *file)
+{
+ struct proc_dir_entry *dp = PDE(inode);
+ struct seq_file *sf;
+ int rc;
+
+ rc = seq_open(file, &kgn_smsg_sops);
+ if (rc == 0) {
+ sf = file->private_data;
+ sf->private = dp->data;
+ }
+
+ return rc;
+}
+
+static struct file_operations kgn_smsg_fops = {
+ .owner = THIS_MODULE,
+ .open = kgnilnd_smsg_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+typedef struct {
+ __u64 gconn_version;
+ struct list_head *gconn_list;
+ kgn_conn_t *gconn_conn;
+ loff_t gconn_off;
+ int gconn_hashidx;
+} kgn_conn_seq_iter_t;
+
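+/* map the linear seq_file offset onto the conn hash: walk the buckets in
+ * order, counting entries until 'off' is reached; returns -ESTALE if the
+ * conn table version changed under us since the last seek */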
+int
+kgnilnd_conn_seq_seek(kgn_conn_seq_iter_t *gseq, loff_t off)
+{
+ struct list_head *list, *tmp;
+ loff_t here = 0;
+ int rc = 0;
+
+ if (off == 0) {
+ gseq->gconn_hashidx = 0;
+ gseq->gconn_list = NULL;
+ }
+
+ if (off > atomic_read(&kgnilnd_data.kgn_nconns)) {
+ gseq->gconn_list = NULL;
+ rc = -ENOENT;
+ }
+
+ read_lock(&kgnilnd_data.kgn_peer_conn_lock);
+ if (gseq->gconn_list != NULL &&
+ gseq->gconn_version != kgnilnd_data.kgn_conn_version) {
+ /* list changed */
+ rc = -ESTALE;
+ goto out;
+ }
+
+ if ((gseq->gconn_list == NULL) ||
+ (gseq->gconn_off > off) ||
+ (gseq->gconn_hashidx >= *kgnilnd_tunables.kgn_peer_hash_size)) {
+ /* search from start */
+ gseq->gconn_hashidx = 0;
+ list = &kgnilnd_data.kgn_conns[gseq->gconn_hashidx];
+ here = 0;
+ } else {
+ /* continue current search */
+ list = gseq->gconn_list;
+ }
+
+ gseq->gconn_version = kgnilnd_data.kgn_conn_version;
+ gseq->gconn_off = off;
+
+start_list:
+
+ list_for_each(tmp, list) {
+ if (here == off) {
+ kgn_conn_t *conn;
+ conn = list_entry(tmp, kgn_conn_t, gnc_hashlist);
+ gseq->gconn_conn = conn;
+ rc = 0;
+ goto out;
+ }
+ here++;
+ }
+ /* if we got through this hash bucket with 'off' still to go, try next*/
+ gseq->gconn_hashidx++;
+ if ((here <= off) &&
+ (gseq->gconn_hashidx < *kgnilnd_tunables.kgn_peer_hash_size)) {
+ list = &kgnilnd_data.kgn_conns[gseq->gconn_hashidx];
+ goto start_list;
+ }
+
+ gseq->gconn_list = NULL;
+ rc = -ENOENT;
+out:
+ read_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+ return rc;
+}
+
+static void *
+kgnilnd_conn_seq_start(struct seq_file *s, loff_t *pos)
+{
+ kgn_conn_seq_iter_t *gseq;
+ int rc;
+
+ if (kgnilnd_data.kgn_init < GNILND_INIT_ALL) {
+ return NULL;
+ }
+
+ LIBCFS_ALLOC(gseq, sizeof(*gseq));
+ if (gseq == NULL) {
+ CERROR("could not allocate conn sequence iterator\n");
+ return NULL;
+ }
+
+	gseq->gconn_list = NULL;
+ rc = kgnilnd_conn_seq_seek(gseq, *pos);
+ if (rc == 0)
+ return gseq;
+
+ LIBCFS_FREE(gseq, sizeof(*gseq));
+ return NULL;
+}
+
+static void
+kgnilnd_conn_seq_stop(struct seq_file *s, void *iter)
+{
+ kgn_conn_seq_iter_t *gseq = iter;
+
+ if (gseq != NULL)
+ LIBCFS_FREE(gseq, sizeof(*gseq));
+}
+
+static void *
+kgnilnd_conn_seq_next(struct seq_file *s, void *iter, loff_t *pos)
+{
+ kgn_conn_seq_iter_t *gseq = iter;
+ int rc;
+ loff_t next = *pos + 1;
+
+ rc = kgnilnd_conn_seq_seek(gseq, next);
+ if (rc != 0) {
+ LIBCFS_FREE(gseq, sizeof(*gseq));
+ return NULL;
+ }
+ *pos = next;
+ return gseq;
+}
+
+static int
+kgnilnd_conn_seq_show(struct seq_file *s, void *iter)
+{
+ kgn_conn_seq_iter_t *gseq = iter;
+ kgn_peer_t *peer = NULL;
+ kgn_conn_t *conn;
+
+ /* there is no header data for conns, so offset 0 is the first
+ * real entry. */
+
+ conn = gseq->gconn_conn;
+ LASSERT(conn != NULL);
+
+ read_lock(&kgnilnd_data.kgn_peer_conn_lock);
+ if (gseq->gconn_list != NULL &&
+ gseq->gconn_version != kgnilnd_data.kgn_conn_version) {
+ /* list changed */
+ read_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+ return -ESTALE;
+ }
+
+ /* instead of saving off the data, just refcount */
+ kgnilnd_conn_addref(conn);
+ if (conn->gnc_peer) {
+ /* don't use link - after unlock it could get nuked */
+ peer = conn->gnc_peer;
+ kgnilnd_peer_addref(peer);
+ }
+
+ read_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+
+ seq_printf(s, "%p->%s [%d] q %d/%d/%d "
+ "tx sq %u %dms/%dms "
+ "rx sq %u %dms/%dms "
+ "noop r/s %d/%d w/s/cq %lds/%lds/%lds "
+ "sched a/d %lds/%lds "
+ "tx_re "LPD64" TO %ds %s\n",
+ conn, peer ? libcfs_nid2str(peer->gnp_nid) : "<?>",
+ atomic_read(&conn->gnc_refcount),
+ kgnilnd_count_list(&conn->gnc_fmaq),
+ atomic_read(&conn->gnc_nlive_fma),
+ atomic_read(&conn->gnc_nlive_rdma),
+ conn->gnc_tx_seq,
+ jiffies_to_msecs(jiffies - conn->gnc_last_tx),
+ jiffies_to_msecs(jiffies - conn->gnc_last_tx_cq),
+ conn->gnc_rx_seq,
+ jiffies_to_msecs(jiffies - conn->gnc_last_rx),
+ jiffies_to_msecs(jiffies - conn->gnc_last_rx_cq),
+ atomic_read(&conn->gnc_reaper_noop),
+ atomic_read(&conn->gnc_sched_noop),
+ cfs_duration_sec(jiffies - conn->gnc_last_noop_want),
+ cfs_duration_sec(jiffies - conn->gnc_last_noop_sent),
+ cfs_duration_sec(jiffies - conn->gnc_last_noop_cq),
+ cfs_duration_sec(jiffies - conn->gnc_last_sched_ask),
+ cfs_duration_sec(jiffies - conn->gnc_last_sched_do),
+ conn->gnc_tx_retrans, conn->gnc_timeout,
+ kgnilnd_conn_state2str(conn));
+
+ if (peer)
+ kgnilnd_peer_decref(peer);
+ kgnilnd_conn_decref(conn);
+
+ return 0;
+}
+
+static struct seq_operations kgn_conn_sops = {
+ .start = kgnilnd_conn_seq_start,
+ .stop = kgnilnd_conn_seq_stop,
+ .next = kgnilnd_conn_seq_next,
+ .show = kgnilnd_conn_seq_show,
+};
+
+static int
+kgnilnd_conn_seq_open(struct inode *inode, struct file *file)
+{
+ struct proc_dir_entry *dp = PDE(inode);
+ struct seq_file *sf;
+ int rc;
+
+ rc = seq_open(file, &kgn_conn_sops);
+ if (rc == 0) {
+ sf = file->private_data;
+ sf->private = dp->data;
+ }
+
+ return rc;
+}
+
+static struct file_operations kgn_conn_fops = {
+ .owner = THIS_MODULE,
+ .open = kgnilnd_conn_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+typedef struct {
+ __u64 gpeer_version;
+ struct list_head *gpeer_list;
+ kgn_peer_t *gpeer_peer;
+ loff_t gpeer_off;
+ int gpeer_hashidx;
+} kgn_peer_seq_iter_t;
+
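+/* same bucket-walk scheme as kgnilnd_conn_seq_seek(), applied to the
+ * peer hash and guarded by kgn_peer_version instead */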
+int
+kgnilnd_peer_seq_seek(kgn_peer_seq_iter_t *gseq, loff_t off)
+{
+ struct list_head *list, *tmp;
+ loff_t here = 0;
+ int rc = 0;
+
+ if (off == 0) {
+ gseq->gpeer_hashidx = 0;
+ gseq->gpeer_list = NULL;
+ }
+
+ if (off > atomic_read(&kgnilnd_data.kgn_npeers)) {
+ gseq->gpeer_list = NULL;
+ rc = -ENOENT;
+ }
+
+ read_lock(&kgnilnd_data.kgn_peer_conn_lock);
+ if (gseq->gpeer_list != NULL &&
+ gseq->gpeer_version != kgnilnd_data.kgn_peer_version) {
+ /* list changed */
+ rc = -ESTALE;
+ goto out;
+ }
+
+ if ((gseq->gpeer_list == NULL) ||
+ (gseq->gpeer_off > off) ||
+ (gseq->gpeer_hashidx >= *kgnilnd_tunables.kgn_peer_hash_size)) {
+ /* search from start */
+ gseq->gpeer_hashidx = 0;
+ list = &kgnilnd_data.kgn_peers[gseq->gpeer_hashidx];
+ here = 0;
+ } else {
+ /* continue current search */
+ list = gseq->gpeer_list;
+ }
+
+ gseq->gpeer_version = kgnilnd_data.kgn_peer_version;
+ gseq->gpeer_off = off;
+
+start_list:
+
+ list_for_each(tmp, list) {
+ if (here == off) {
+ kgn_peer_t *peer;
+ peer = list_entry(tmp, kgn_peer_t, gnp_list);
+ gseq->gpeer_peer = peer;
+ rc = 0;
+ goto out;
+ }
+ here++;
+ }
+ /* if we got through this hash bucket with 'off' still to go, try next*/
+ gseq->gpeer_hashidx++;
+ if ((here <= off) &&
+ (gseq->gpeer_hashidx < *kgnilnd_tunables.kgn_peer_hash_size)) {
+ list = &kgnilnd_data.kgn_peers[gseq->gpeer_hashidx];
+ goto start_list;
+ }
+
+ gseq->gpeer_list = NULL;
+ rc = -ENOENT;
+out:
+ read_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+ return rc;
+}
+
+static void *
+kgnilnd_peer_seq_start(struct seq_file *s, loff_t *pos)
+{
+ kgn_peer_seq_iter_t *gseq;
+ int rc;
+
+ if (kgnilnd_data.kgn_init < GNILND_INIT_ALL) {
+ return NULL;
+ }
+
+ LIBCFS_ALLOC(gseq, sizeof(*gseq));
+ if (gseq == NULL) {
+ CERROR("could not allocate peer sequence iterator\n");
+ return NULL;
+ }
+
+	gseq->gpeer_list = NULL;
+ rc = kgnilnd_peer_seq_seek(gseq, *pos);
+ if (rc == 0)
+ return gseq;
+
+ LIBCFS_FREE(gseq, sizeof(*gseq));
+ return NULL;
+}
+
+static void
+kgnilnd_peer_seq_stop(struct seq_file *s, void *iter)
+{
+ kgn_peer_seq_iter_t *gseq = iter;
+
+ if (gseq != NULL)
+ LIBCFS_FREE(gseq, sizeof(*gseq));
+}
+
+static void *
+kgnilnd_peer_seq_next(struct seq_file *s, void *iter, loff_t *pos)
+{
+ kgn_peer_seq_iter_t *gseq = iter;
+ int rc;
+ loff_t next = *pos + 1;
+
+ rc = kgnilnd_peer_seq_seek(gseq, next);
+ if (rc != 0) {
+ LIBCFS_FREE(gseq, sizeof(*gseq));
+ return NULL;
+ }
+ *pos = next;
+ return gseq;
+}
+
+static int
+kgnilnd_peer_seq_show(struct seq_file *s, void *iter)
+{
+ kgn_peer_seq_iter_t *gseq = iter;
+ kgn_peer_t *peer;
+ kgn_conn_t *conn;
+ char conn_str;
+ int purg_count = 0;
+ /* there is no header data for peers, so offset 0 is the first
+ * real entry. */
+
+ peer = gseq->gpeer_peer;
+ LASSERT(peer != NULL);
+
+ read_lock(&kgnilnd_data.kgn_peer_conn_lock);
+ if (gseq->gpeer_list != NULL &&
+ gseq->gpeer_version != kgnilnd_data.kgn_peer_version) {
+ /* list changed */
+ read_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+ return -ESTALE;
+ }
+
+ /* instead of saving off the data, just refcount */
+ kgnilnd_peer_addref(peer);
+ conn = kgnilnd_find_conn_locked(peer);
+
+ if (peer->gnp_connecting) {
+ conn_str = 'S';
+ } else if (conn != NULL) {
+ conn_str = 'C';
+ } else {
+ conn_str = 'D';
+ }
+
+ list_for_each_entry(conn, &peer->gnp_conns, gnc_list) {
+ if (conn->gnc_in_purgatory) {
+ purg_count++;
+ }
+ }
+
+ read_unlock(&kgnilnd_data.kgn_peer_conn_lock);
+
+ seq_printf(s, "%p->%s [%d] NIC 0x%x q %d conn %c purg %d "
+ "last %d@%dms dgram %d@%dms "
+ "reconn %dms to %lus \n",
+ peer, libcfs_nid2str(peer->gnp_nid),
+ atomic_read(&peer->gnp_refcount),
+ peer->gnp_host_id,
+ kgnilnd_count_list(&peer->gnp_tx_queue),
+ conn_str,
+ purg_count,
+ peer->gnp_last_errno,
+ jiffies_to_msecs(jiffies - peer->gnp_last_alive),
+ peer->gnp_last_dgram_errno,
+ jiffies_to_msecs(jiffies - peer->gnp_last_dgram_time),
+ peer->gnp_reconnect_interval != 0
+ ? jiffies_to_msecs(jiffies - peer->gnp_reconnect_time)
+ : 0,
+ peer->gnp_reconnect_interval);
+
+ kgnilnd_peer_decref(peer);
+
+ return 0;
+}
+
+static struct seq_operations kgn_peer_sops = {
+ .start = kgnilnd_peer_seq_start,
+ .stop = kgnilnd_peer_seq_stop,
+ .next = kgnilnd_peer_seq_next,
+ .show = kgnilnd_peer_seq_show,
+};
+
+static int
+kgnilnd_peer_seq_open(struct inode *inode, struct file *file)
+{
+ struct proc_dir_entry *dp = PDE(inode);
+ struct seq_file *sf;
+ int rc;
+
+ rc = seq_open(file, &kgn_peer_sops);
+ if (rc == 0) {
+ sf = file->private_data;
+ sf->private = dp->data;
+ }
+
+ return rc;
+}
+
+static struct file_operations kgn_peer_fops = {
+ .owner = THIS_MODULE,
+ .open = kgnilnd_peer_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static struct proc_dir_entry *kgn_proc_root;
+
+void
+kgnilnd_proc_init(void)
+{
+ struct proc_dir_entry *pde;
+ int rc = 0;
+ ENTRY;
+
+ /* setup dir */
+ kgn_proc_root = proc_mkdir(libcfs_lnd2modname(GNILND), NULL);
+ if (kgn_proc_root == NULL) {
+ CERROR("couldn't create proc dir %s\n",
+ libcfs_lnd2modname(GNILND));
+ return;
+ }
+
+ /* Initialize CKSUM_TEST */
+ pde = create_proc_entry(GNILND_PROC_CKSUM_TEST, 0200, kgn_proc_root);
+ if (pde == NULL) {
+ CERROR("couldn't create proc entry %s\n", GNILND_PROC_CKSUM_TEST);
+ rc = -ENOENT;
+ GOTO(remove_dir, rc);
+ }
+
+ pde->data = NULL;
+ pde->write_proc = kgnilnd_proc_cksum_test_write;
+
+ /* Initialize STATS */
+ pde = create_proc_entry(GNILND_PROC_STATS, 0644, kgn_proc_root);
+ if (pde == NULL) {
+ CERROR("couldn't create proc entry %s\n", GNILND_PROC_STATS);
+ rc = -ENOENT;
+ GOTO(remove_test, rc);
+ }
+
+ pde->data = NULL;
+ pde->read_proc = kgnilnd_proc_stats_read;
+ pde->write_proc = kgnilnd_proc_stats_write;
+
+ /* Initialize MDD */
+ pde = create_proc_entry(GNILND_PROC_MDD, 0444, kgn_proc_root);
+ if (pde == NULL) {
+ CERROR("couldn't create proc entry %s\n", GNILND_PROC_MDD);
+ rc = -ENOENT;
+ GOTO(remove_stats, rc);
+ }
+
+ pde->data = NULL;
+ pde->proc_fops = &kgn_mdd_fops;
+
+ /* Initialize SMSG */
+ pde = create_proc_entry(GNILND_PROC_SMSG, 0444, kgn_proc_root);
+ if (pde == NULL) {
+ CERROR("couldn't create proc entry %s\n", GNILND_PROC_SMSG);
+ rc = -ENOENT;
+ GOTO(remove_mdd, rc);
+ }
+
+ pde->data = NULL;
+ pde->proc_fops = &kgn_smsg_fops;
+
+ /* Initialize CONN */
+ pde = create_proc_entry(GNILND_PROC_CONN, 0444, kgn_proc_root);
+ if (pde == NULL) {
+ CERROR("couldn't create proc entry %s\n", GNILND_PROC_CONN);
+ rc = -ENOENT;
+ GOTO(remove_smsg, rc);
+ }
+
+ pde->data = NULL;
+ pde->proc_fops = &kgn_conn_fops;
+
+ /* Initialize PEER */
+ pde = create_proc_entry(GNILND_PROC_PEER, 0444, kgn_proc_root);
+ if (pde == NULL) {
+ CERROR("couldn't create proc entry %s\n", GNILND_PROC_PEER);
+ rc = -ENOENT;
+ GOTO(remove_conn, rc);
+ }
+
+ pde->data = NULL;
+ pde->proc_fops = &kgn_peer_fops;
+ RETURN_EXIT;
+
+remove_conn:
+ remove_proc_entry(GNILND_PROC_CONN, kgn_proc_root);
+remove_smsg:
+ remove_proc_entry(GNILND_PROC_SMSG, kgn_proc_root);
+remove_mdd:
+ remove_proc_entry(GNILND_PROC_MDD, kgn_proc_root);
+remove_stats:
+ remove_proc_entry(GNILND_PROC_STATS, kgn_proc_root);
+remove_test:
+ remove_proc_entry(GNILND_PROC_CKSUM_TEST, kgn_proc_root);
+remove_dir:
+ remove_proc_entry(kgn_proc_root->name, NULL);
+
+ RETURN_EXIT;
+}
+
+void
+kgnilnd_proc_fini(void)
+{
+ remove_proc_entry(GNILND_PROC_PEER, kgn_proc_root);
+ remove_proc_entry(GNILND_PROC_CONN, kgn_proc_root);
+ remove_proc_entry(GNILND_PROC_MDD, kgn_proc_root);
+ remove_proc_entry(GNILND_PROC_SMSG, kgn_proc_root);
+ remove_proc_entry(GNILND_PROC_STATS, kgn_proc_root);
+ remove_proc_entry(GNILND_PROC_CKSUM_TEST, kgn_proc_root);
+ remove_proc_entry(kgn_proc_root->name, NULL);
+}
--- /dev/null
+/*
+ * Copyright (C) 2012 Cray, Inc.
+ *
+ * Author: Nic Henke <nic@cray.com>
+ *
+ * This file is part of Lustre, http://www.lustre.org.
+ *
+ * Lustre is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Lustre is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Lustre; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+#include "gnilnd.h"
+
+/* Advance all timeouts by nap_time seconds. */
+void
+kgnilnd_bump_timeouts(__u32 nap_time, char *reason)
+{
+	int i, j;
+ kgn_peer_t *peer;
+ kgn_conn_t *conn;
+ kgn_tx_t *tx;
+ kgn_device_t *dev;
+ kgn_dgram_t *dgram;
+
+ LCONSOLE_INFO("%s: bumping all timeouts by %ds\n", reason, nap_time);
+
+ LASSERTF(GNILND_IS_QUIESCED, "gnilnd not quiesced %d != %d\n",
+ atomic_read(&kgnilnd_data.kgn_nquiesce),
+ atomic_read(&kgnilnd_data.kgn_nthreads));
+
+ /* requiring that the threads are paused ensures a couple of things:
+ * - combined code paths for stack reset and quiesce event as stack reset
+ * runs with the threads paused
+ * - prevents traffic to the Gemini during a quiesce period
+ * - reduces the locking requirements
+ */
+
+ for (i = 0; i < *kgnilnd_tunables.kgn_peer_hash_size; i++) {
+ list_for_each_entry(peer, &kgnilnd_data.kgn_peers[i], gnp_list) {
+
+ /* we can reconnect again at any time */
+ peer->gnp_reconnect_time = jiffies;
+ /* reset now that network is healthy */
+ peer->gnp_reconnect_interval = 0;
+ /* tell LNet dude is still alive */
+ kgnilnd_peer_alive(peer);
+
+ list_for_each_entry(tx, &peer->gnp_tx_queue, tx_list) {
+ tx->tx_qtime = jiffies;
+ }
+
+ list_for_each_entry(conn, &peer->gnp_conns, gnc_list) {
+ unsigned long timeout;
+
+ timeout = cfs_time_seconds(conn->gnc_timeout);
+
+ /* bump last_rx/last_rx_cq on all conns - including
+ * closed ones, this will have the effect of
+ * bumping the purgatory timers for those */
+ conn->gnc_last_rx = conn->gnc_last_rx_cq = jiffies;
+
+ /* we don't timeout based on old gnc_last_tx, so
+ * we'll back it up and schedule the conn to trigger
+ * a NOOP */
+ conn->gnc_last_tx = jiffies - timeout;
+ kgnilnd_schedule_conn(conn);
+ }
+ }
+ }
+
+ for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) {
+ dev = &kgnilnd_data.kgn_devices[i];
+ for (i = 0; i < (*kgnilnd_tunables.kgn_peer_hash_size - 1); i++) {
+ list_for_each_entry(dgram, &dev->gnd_dgrams[i], gndg_list) {
+ dgram->gndg_post_time = jiffies;
+ }
+ }
+ }
+}
+
+/* Quiesce or wake up the stack. The caller must hold the kgn_quiesce_sem semaphore
+ * on entry, which holds off any pending stack shutdown. */
+void
+kgnilnd_quiesce_wait(char *reason)
+{
+ int i;
+
+ if (kgnilnd_data.kgn_quiesce_trigger) {
+ unsigned long quiesce_deadline, quiesce_to;
+ /* FREEZE TAG!!!! */
+
+ /* morning sunshine */
+ spin_lock(&kgnilnd_data.kgn_reaper_lock);
+ wake_up_all(&kgnilnd_data.kgn_reaper_waitq);
+ spin_unlock(&kgnilnd_data.kgn_reaper_lock);
+
+ for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) {
+ kgn_device_t *dev = &kgnilnd_data.kgn_devices[i];
+
+ wake_up_all(&dev->gnd_waitq);
+ wake_up_all(&dev->gnd_dgram_waitq);
+ wake_up_all(&dev->gnd_dgping_waitq);
+ }
+
+ /* we'll wait for 10x the timeout for the threads to pause */
+ quiesce_to = cfs_time_seconds(*kgnilnd_tunables.kgn_timeout * 10);
+ quiesce_deadline = (long) jiffies + quiesce_to;
+
+ /* wait for everyone to check-in as quiesced */
+ i = 1;
+ while (!GNILND_IS_QUIESCED) {
+ i++;
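+			/* (i & -i) == i only when i is a power of two, so
+			 * console-level warnings back off exponentially */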
+ LCONSOLE((((i) & (-i)) == i) ? D_WARNING : D_NET,
+ "%s: Waiting for %d threads to pause\n",
+ reason,
+ atomic_read(&kgnilnd_data.kgn_nthreads) -
+ atomic_read(&kgnilnd_data.kgn_nquiesce));
+ CFS_RACE(CFS_FAIL_GNI_QUIESCE_RACE);
+ cfs_pause(cfs_time_seconds(1 * i));
+
+ LASSERTF(quiesce_deadline > jiffies,
+ "couldn't quiesce threads in %lu seconds, falling over now\n",
+ cfs_duration_sec(quiesce_to));
+ }
+
+ LCONSOLE_WARN("%s: All threads paused!\n", reason);
+ /* XXX Nic: Is there a set of counters we can grab here to
+ * ensure that there is no traffic until quiesce is over ?*/
+ } else {
+ /* GO! GO! GO! */
+
+ for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) {
+ kgn_device_t *dev = &kgnilnd_data.kgn_devices[i];
+ kgnilnd_schedule_dgram(dev);
+ }
+
+ /* wait for everyone to check-in as running - they will be spinning
+ * and looking, so no need to poke any waitq */
+ i = 1;
+ while (atomic_read(&kgnilnd_data.kgn_nquiesce) > 0) {
+ i++;
+ LCONSOLE((((i) & (-i)) == i) ? D_WARNING : D_NET,
+ "%s: Waiting for %d threads to wake up\n",
+ reason,
+ atomic_read(&kgnilnd_data.kgn_nquiesce));
+ cfs_pause(cfs_time_seconds(1 * i));
+ }
+
+ LCONSOLE_WARN("%s: All threads awake!\n", reason);
+ }
+}
+
+/* Reset the stack. */
+void
+kgnilnd_reset_stack(void)
+{
+ int i, rc = 0;
+ kgn_net_t *net;
+ kgn_peer_t *peer, *peerN;
+ LIST_HEAD (souls);
+ char *reason = "critical hardware error";
+ __u32 seconds;
+ unsigned long start, end;
+ ENTRY;
+
+ /* Race with del_peer and its atomics */
+ CFS_RACE(CFS_FAIL_GNI_RACE_RESET);
+
+ if (kgnilnd_data.kgn_init != GNILND_INIT_ALL) {
+ CERROR("can't reset the stack, gnilnd is not initialized\n");
+ RETURN_EXIT;
+ }
+
+ /* First make sure we are not already quiesced - we panic if so,
+ * as that could leave software in a bad state */
+ LASSERTF(kgnilnd_data.kgn_quiesce_trigger == GNILND_QUIESCE_IDLE,
+ "can't reset the stack, already doing so: trigger %d\n",
+ kgnilnd_data.kgn_quiesce_trigger);
+
+ set_mb(kgnilnd_data.kgn_quiesce_trigger, GNILND_QUIESCE_RESET);
+
+ /* wake up the dgram waitq thread - but after trigger set to make sure it
+ * goes into quiesce */
+ CFS_RACE(CFS_FAIL_GNI_WC_DGRAM_FREE);
+	/* same for the scheduler that is dropping state transitions */
+ CFS_RACE(CFS_FAIL_GNI_DROP_CLOSING);
+ CFS_RACE(CFS_FAIL_GNI_DROP_DESTROY_EP);
+
+ kgnilnd_quiesce_wait(reason);
+
+ start = jiffies;
+
+ kgnilnd_data.kgn_in_reset = 1;
+ kgnilnd_data.kgn_nresets++;
+ LCONSOLE_WARN("%s: resetting all resources (count %d)\n",
+ reason, kgnilnd_data.kgn_nresets);
+
+ for (i = 0; i < *kgnilnd_tunables.kgn_net_hash_size; i++) {
+ list_for_each_entry(net, &kgnilnd_data.kgn_nets[i], gnn_list) {
+ rc = kgnilnd_cancel_net_dgrams(net);
+ LASSERTF(rc == 0, "couldn't cleanup datagrams: %d\n", rc);
+ }
+ }
+
+ /* error -ENOTRECOVERABLE is stack reset */
+ kgnilnd_del_conn_or_peer(NULL, LNET_NID_ANY, GNILND_DEL_CONN, -ENOTRECOVERABLE);
+
+ for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) {
+ kgn_device_t *dev = &kgnilnd_data.kgn_devices[i];
+ kgnilnd_cancel_wc_dgrams(dev);
+ kgnilnd_wait_for_canceled_dgrams(dev);
+ }
+
+ /* manually do some conn processing ala kgnilnd_process_conns */
+ for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) {
+ kgn_device_t *dev = &kgnilnd_data.kgn_devices[i];
+ kgn_conn_t *conn;
+ int conn_sched;
+
+ /* go find all the closed conns that need to be nuked - the
+ * scheduler thread isn't running to do this for us */
+
+ CDEBUG(D_NET, "will try to clear up %d ready_conns\n",
+ kgnilnd_count_list(&dev->gnd_ready_conns));
+
+ /* use while/list_first_entry loop to ensure we can handle any
+ * DESTROY_EP conns added from kgnilnd_complete_closed_conn */
+ while (!list_empty(&dev->gnd_ready_conns)) {
+ conn = list_first_entry(&dev->gnd_ready_conns,
+ kgn_conn_t, gnc_schedlist);
+ conn_sched = xchg(&conn->gnc_scheduled, GNILND_CONN_PROCESS);
+
+ LASSERTF(conn_sched != GNILND_CONN_IDLE &&
+ conn_sched != GNILND_CONN_PROCESS,
+ "conn %p on ready list but in bad state: %d\n",
+ conn, conn_sched);
+
+ list_del_init(&conn->gnc_schedlist);
+
+ if (conn->gnc_state == GNILND_CONN_CLOSING) {
+ /* bump to CLOSED to fake out send of CLOSE */
+ conn->gnc_state = GNILND_CONN_CLOSED;
+ conn->gnc_close_sent = 1;
+ }
+
+ if (conn->gnc_state == GNILND_CONN_DESTROY_EP) {
+ kgnilnd_destroy_conn_ep(conn);
+ } else {
+ kgnilnd_complete_closed_conn(conn);
+ }
+
+ /* there really shouldn't be any other states here -
+ * they would have been cleared out in the del_peer_or_conn or the dgram
+ * aborts above.
+ * there is an LASSERTF in kgnilnd_complete_closed_conn that will take
+ * care of catching anything else for us */
+
+ kgnilnd_schedule_process_conn(conn, -1);
+
+ kgnilnd_conn_decref(conn);
+ }
+ }
+
+	/* don't let the little weaselly purgatory conns hide from us */
+ for (i = 0; i < *kgnilnd_tunables.kgn_peer_hash_size; i++) {
+ list_for_each_entry_safe(peer, peerN, &kgnilnd_data.kgn_peers[i], gnp_list) {
+ kgn_conn_t *conn, *connN;
+
+ list_for_each_entry_safe(conn, connN, &peer->gnp_conns, gnc_list) {
+ kgnilnd_detach_purgatory_locked(conn, &souls);
+ }
+ }
+ }
+
+ CDEBUG(D_NET, "about to release %d purgatory entries\n",
+ kgnilnd_count_list(&souls));
+
+ kgnilnd_release_purgatory_list(&souls);
+
+ /* validate we are now clean */
+ for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) {
+ kgn_device_t *dev = &kgnilnd_data.kgn_devices[i];
+
+		/* now all the conns/mboxes should be cleaned up, including purgatory
+ * so go through and release the MDDs for our persistent PHYS fma_blks
+ */
+ kgnilnd_unmap_phys_fmablk(dev);
+
+ LASSERTF(atomic_read(&dev->gnd_nfmablk) == 0,
+ "reset failed: fma blocks still live %d\n",
+ atomic_read(&dev->gnd_nfmablk));
+
+ LASSERTF(atomic_read(&dev->gnd_neps) == 0,
+ "reset failed: EP handles still live %d\n",
+ atomic_read(&dev->gnd_neps));
+ }
+
+ LASSERTF(atomic_read(&kgnilnd_data.kgn_nconns) == 0,
+ "reset failed: conns left %d\n",
+ atomic_read(&kgnilnd_data.kgn_nconns));
+
+ /* fine to have peers left - they are waiting for new conns
+ * but should not be holding any open HW resources */
+
+ /* like the last part of kgnilnd_base_shutdown() */
+
+ CFS_RACE(CFS_FAIL_GNI_SR_DOWN_RACE);
+
+ for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) {
+ kgnilnd_dev_fini(&kgnilnd_data.kgn_devices[i]);
+ }
+
+ /* no need to free and recreate the TX descriptors
+ * we nuked all the ones that could be using HW resources in
+ * kgnilnd_close_matching_conns and asserted it worked in
+ * kgnilnd_dev_fini */
+
+ /* At this point, all HW is torn down, start to reset */
+
+ /* only reset our known devs */
+ for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) {
+ kgn_device_t *dev = &kgnilnd_data.kgn_devices[i];
+ rc = kgnilnd_dev_init(dev);
+ LASSERTF(rc == 0, "dev_init failed for dev %d\n", i);
+		rc = kgnilnd_map_phys_fmablk(dev);
+		LASSERTF(rc == 0, "map_phys_fmablk failed for dev %d\n", i);
+ rc = kgnilnd_setup_wildcard_dgram(dev);
+		LASSERTF(rc == 0, "couldn't setup datagrams on dev %d: %d\n",
+ i, rc);
+ }
+
+ /* Now the fun restarts... - release the hounds! */
+
+ end = jiffies;
+ seconds = cfs_duration_sec((long)end - start);
+ kgnilnd_bump_timeouts(seconds, reason);
+
+ kgnilnd_data.kgn_in_reset = 0;
+ set_mb(kgnilnd_data.kgn_quiesce_trigger, GNILND_QUIESCE_IDLE);
+ kgnilnd_quiesce_wait(reason);
+ LCONSOLE_WARN("%s reset of all hardware resources\n",
+ rc ? "failed" : "successful");
+
+ RETURN_EXIT;
+}
+
+/* A thread that handles hardware quiesce and stack reset events.
+ * We do the same thing regardless of which device reported the event. */
+int
+kgnilnd_ruhroh_thread(void *arg)
+{
+ int i = 1;
+ DEFINE_WAIT(wait);
+
+ cfs_daemonize("kgnilnd_rr");
+ cfs_block_allsigs();
+ set_user_nice(current, *kgnilnd_tunables.kgn_nice);
+ kgnilnd_data.kgn_ruhroh_running = 1;
+
+ while (1) {
+
+		/* Block until there's a request.  A reset request could come in
+		 * while we're handling a quiesce one, or vice versa.
+		 * Keep processing requests until there are none. */
+ prepare_to_wait(&kgnilnd_data.kgn_ruhroh_waitq, &wait, TASK_INTERRUPTIBLE);
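+		/* the flags are re-checked after every wakeup, so a spurious
+		 * wakeup just drops us back into schedule() */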
+ while (!(kgnilnd_data.kgn_ruhroh_shutdown ||
+ kgnilnd_data.kgn_needs_reset || kgnilnd_data.kgn_needs_pause))
+ schedule();
+ finish_wait(&kgnilnd_data.kgn_ruhroh_waitq, &wait);
+
+ /* Exit if the driver is shutting down. */
+ if (kgnilnd_data.kgn_ruhroh_shutdown)
+ break;
+
+ /* Serialize with driver startup and shutdown. */
+ down(&kgnilnd_data.kgn_quiesce_sem);
+
+ CDEBUG(D_NET, "trigger %d reset %d to_bump %d pause %d\n",
+ kgnilnd_data.kgn_quiesce_trigger,
+ kgnilnd_data.kgn_needs_reset,
+ kgnilnd_data.kgn_bump_info_rdy,
+ kgnilnd_data.kgn_needs_pause);
+
+ /* Do we need to do a pause/quiesce? */
+ if (kgnilnd_data.kgn_needs_pause) {
+
+ /* Pause all other kgnilnd threads. */
+ set_mb(kgnilnd_data.kgn_quiesce_trigger, GNILND_QUIESCE_HW_QUIESCE);
+ kgnilnd_quiesce_wait("hardware quiesce flag");
+
+ /* If the hardware quiesce flag is set, wait for it to clear.
+ * This should happen relatively quickly, so we wait for it.
+ * This will hold up the eventd thread, but on everything but
+			 * the simulator, this is ok; there is one thread per core.
+ *
+ * Handle (possibly multiple) quiesce events while we wait. The
+ * memory barrier ensures that the core doesn't start fetching
+ * kgn_bump_info_rdy before it fetches kgn_needs_pause, and
+ * matches the second mb in kgnilnd_quiesce_end_callback(). */
+ smp_rmb();
+ while (kgnilnd_hw_in_quiesce() || kgnilnd_data.kgn_bump_info_rdy) {
+
+ i++;
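+				/* warn on the console only when i is a power of
+				 * two, so the message backs off exponentially */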
+ LCONSOLE((((i) & (-i)) == i) ? D_WARNING : D_NET,
+ "Waiting for hardware quiesce flag to clear\n");
+ cfs_pause(cfs_time_seconds(1 * i));
+
+				/* If we got a quiesce event with bump info, DO THE BUMP! */
+ if (kgnilnd_data.kgn_bump_info_rdy) {
+ /* reset console rate limiting for each event */
+ i = 1;
+
+ /* Make sure the core doesn't start fetching
+ * kgni_quiesce_seconds until after it sees
+ * kgn_bump_info_rdy set. This is the match to the
+ * first mb in kgnilnd_quiesce_end_callback(). */
+ smp_rmb();
+ (void) kgnilnd_bump_timeouts(kgnilnd_data.kgn_quiesce_secs,
+ "hardware quiesce callback");
+ set_mb(kgnilnd_data.kgn_quiesce_secs, 0);
+ set_mb(kgnilnd_data.kgn_bump_info_rdy, 0);
+ }
+ }
+
+ /* Reset the kgn_needs_pause flag before coming out of
+ * the pause. This ordering avoids a race with the
+ * setting of this flag in kgnilnd_pause_threads(). */
+ set_mb(kgnilnd_data.kgn_needs_pause, 0);
+
+ /* ok, let the kids back into the pool */
+ set_mb(kgnilnd_data.kgn_quiesce_trigger, GNILND_QUIESCE_IDLE);
+ kgnilnd_quiesce_wait("hardware quiesce");
+ }
+
+ /* Do a stack reset if needed. */
+ if (kgnilnd_data.kgn_needs_reset) {
+ kgnilnd_reset_stack();
+ set_mb(kgnilnd_data.kgn_needs_reset, 0);
+ }
+
+ up(&kgnilnd_data.kgn_quiesce_sem);
+ }
+
+ kgnilnd_data.kgn_ruhroh_running = 0;
+ return 0;
+}
+
+/* Set pause request flag. Any functions that
+ * call this one are responsible for ensuring that
+ * variables they set up are visible on other cores before
+ * this flag setting. This executes in interrupt or kernel
+ * thread context. */
+void
+kgnilnd_pause_threads(void)
+{
+ /* only device 0 gets the handle, see kgnilnd_dev_init */
+ kgn_device_t *dev = &kgnilnd_data.kgn_devices[0];
+ LASSERTF(dev != NULL, "dev 0 is NULL\n");
+
+ /* If we're currently in a pause triggered by the pause flag,
+ * there's no need to set it again. We clear the kgn_needs_pause
+ * flag before we reset kgn_quiesce_trigger to avoid a race. The
+	 * read memory barrier matches the set_mb() on the trigger in
+	 * kgnilnd_ruhroh_thread(). */
+ smp_rmb();
+ if (!(kgnilnd_data.kgn_quiesce_trigger == GNILND_QUIESCE_HW_QUIESCE &&
+ GNILND_IS_QUIESCED)) {
+ CDEBUG(D_NET, "requesting thread pause\n");
+
+ kgnilnd_data.kgn_needs_pause = 1;
+
+ wake_up(&kgnilnd_data.kgn_ruhroh_waitq);
+ } else {
+ CDEBUG(D_NET, "thread pause already underway\n");
+ }
+}
+
+/* Return non-zero if the GNI hardware quiesce flag is set */
+int
+kgnilnd_hw_in_quiesce(void)
+{
+ /* only device 0 gets the handle, see kgnilnd_dev_init */
+ kgn_device_t *dev0 = &kgnilnd_data.kgn_devices[0];
+
+ LASSERTF(dev0 != NULL, "dev 0 is NULL\n");
+
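+	/* read barrier so we pick up the latest quiesce state set elsewhere */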
+ smp_rmb();
+ return kgnilnd_get_quiesce_status(dev0->gnd_handle) != 0;
+}
+
+
+/* If the GNI hardware quiesce flag is set, initiate our pause and
+ * return non-zero. Also return non-zero if the stack is shutting down. */
+int
+kgnilnd_check_hw_quiesce(void)
+{
+ if (likely(!kgnilnd_hw_in_quiesce()))
+ return 0;
+
+ if (!kgnilnd_data.kgn_ruhroh_shutdown) {
+ CDEBUG(D_NET, "initiating thread pause\n");
+ kgnilnd_pause_threads();
+ } else {
+ CDEBUG(D_NET, "thread pause bypassed because of shutdown\n");
+ }
+
+ return 1;
+}
+
+/* Callback from kgni with the quiesce duration.  This executes
+ * in interrupt context. */
+void
+kgnilnd_quiesce_end_callback(gni_nic_handle_t nic_handle, uint64_t msecs)
+{
+ /* only device 0 gets the handle, see kgnilnd_dev_init */
+ kgn_device_t *dev = &kgnilnd_data.kgn_devices[0];
+ LASSERTF(dev != NULL, "dev 0 is NULL\n");
+
+ if (!kgnilnd_data.kgn_ruhroh_shutdown) {
+
+		CDEBUG(D_NET, "requesting timeout bump by "LPU64" msecs\n", msecs);
+
+ /* Save the bump interval and request the bump.
+ * The memory barrier ensures that the interval is in place before
+ * the bump flag can be seen (in case a core is already running the
+	 * ruhroh task), and that the bump request flag is in place before
+ * the pause request can be seen (to ensure a core doesn't miss the bump
+ * request flag). */
+ /* If another callback occurred before the ruhroh task
+ * finished processing the first bump request, we'd over-write its info.
+ * Nic says that callbacks occur so slowly that this isn't an issue. */
+ set_mb(kgnilnd_data.kgn_quiesce_secs, msecs / MSEC_PER_SEC);
+ set_mb(kgnilnd_data.kgn_bump_info_rdy, 1);
+ kgnilnd_pause_threads();
+ } else {
+ CDEBUG(D_NET, "timeout bump bypassed because of shutdown\n");
+ }
+}
+
+void
+kgnilnd_critical_error(struct gni_err *err_handle)
+{
+ /* only device 0 gets the handle, see kgnilnd_dev_init */
+ kgn_device_t *dev = &kgnilnd_data.kgn_devices[0];
+ LASSERTF(dev != NULL, "dev 0 is NULL\n");
+
+ if (!kgnilnd_data.kgn_ruhroh_shutdown) {
+ CDEBUG(D_NET, "requesting stack reset\n");
+ kgnilnd_data.kgn_needs_reset = 1;
+ wake_up(&kgnilnd_data.kgn_ruhroh_waitq);
+ } else {
+ CDEBUG(D_NET, "stack reset bypassed because of shutdown\n");
+ }
+}
--- /dev/null
+/*
+ * Copyright (C) 2012 Cray, Inc.
+ *
+ * Author: Nic Henke <nic@cray.com>
+ * Author: James Shimek <jshimek@cray.com>
+ *
+ * This file is part of Lustre, http://www.lustre.org.
+ *
+ * Lustre is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Lustre is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Lustre; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/* this code liberated and modified from Lustre */
+
+#define DEBUG_SUBSYSTEM S_LND
+
+#include "gnilnd.h"
+
+typedef struct kgn_sysctl_data {
+	int ksd_pause_trigger;  /* new value to write into kgn_quiesce_trigger */
+	int ksd_quiesce_secs;   /* seconds of timeout bump for a simulated quiesce */
+	int ksd_rdmaq_override; /* RDMAQ bandwidth cap, in mbytes/sec */
+} kgn_sysctl_data_t;
+
+static kgn_sysctl_data_t kgnilnd_sysctl;
+
+#if defined(CONFIG_SYSCTL)
+
+static cfs_sysctl_table_header_t *kgnilnd_table_header = NULL;
+#ifndef HAVE_SYSCTL_UNNUMBERED
+
+enum {
+ GNILND_VERSION = 1,
+ GNILND_THREAD_PAUSE,
+ GNILND_HW_QUIESCE,
+ GNILND_STACK_RESET,
+ GNILND_RDMAQ_OVERRIDE,
+};
+#else
+#define GNILND_VERSION CTL_UNNUMBERED
+#define GNILND_THREAD_PAUSE CTL_UNNUMBERED
+#define GNILND_HW_QUIESCE CTL_UNNUMBERED
+#define GNILND_STACK_RESET CTL_UNNUMBERED
+#define GNILND_RDMAQ_OVERRIDE CTL_UNNUMBERED
+#endif /* !HAVE_SYSCTL_UNNUMBERED */
+
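+/* Writing a new value drives kgn_quiesce_trigger (under the quiesce
+ * semaphore) to pause or resume the kgnilnd threads */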
+static int LL_PROC_PROTO(proc_toggle_thread_pause)
+{
+ int old_val = kgnilnd_sysctl.ksd_pause_trigger;
+ int rc = 0;
+ ENTRY;
+
+ rc = ll_proc_dointvec(table, write, filp, buffer, lenp, ppos);
+ if (!write) {
+ /* read */
+ RETURN(rc);
+ }
+
+ if (kgnilnd_data.kgn_init != GNILND_INIT_ALL) {
+ rc = -EINVAL;
+ RETURN(rc);
+ }
+
+ if (old_val != kgnilnd_sysctl.ksd_pause_trigger) {
+ down(&kgnilnd_data.kgn_quiesce_sem);
+		CDEBUG(D_NET, "setting quiesce_trigger %d\n",
+		       kgnilnd_sysctl.ksd_pause_trigger);
+ kgnilnd_data.kgn_quiesce_trigger = kgnilnd_sysctl.ksd_pause_trigger;
+ kgnilnd_quiesce_wait("admin sysctl");
+ up(&kgnilnd_data.kgn_quiesce_sem);
+ }
+
+ RETURN(rc);
+}
+
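+/* Writing N seconds here fakes the end-of-quiesce callback, bumping
+ * timeouts as if the hardware had been quiesced for N seconds */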
+static int LL_PROC_PROTO(proc_hw_quiesce)
+{
+ int rc = 0;
+ kgn_device_t *dev;
+ ENTRY;
+
+ rc = ll_proc_dointvec(table, write, filp, buffer, lenp, ppos);
+ if (!write) {
+ /* read */
+ RETURN(rc);
+ }
+
+ if (kgnilnd_data.kgn_init != GNILND_INIT_ALL) {
+ rc = -EINVAL;
+ RETURN(rc);
+ }
+
+ /* only device 0 gets the handle, see kgnilnd_dev_init */
+ dev = &kgnilnd_data.kgn_devices[0];
+
+ LASSERTF(dev != NULL, "dev 0 is NULL\n");
+
+ kgnilnd_quiesce_end_callback(dev->gnd_handle,
+ kgnilnd_sysctl.ksd_quiesce_secs * MSEC_PER_SEC);
+
+ RETURN(rc);
+}
+
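+/* Any write here requests a full stack reset, then waits for the
+ * ruhroh thread to finish processing it */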
+int LL_PROC_PROTO(proc_trigger_stack_reset)
+{
+ int rc = 0;
+ int i = 1;
+ kgn_device_t *dev;
+ ENTRY;
+
+ if (!write) {
+ /* read */
+ rc = ll_proc_dointvec(table, write, filp, buffer, lenp, ppos);
+ RETURN(rc);
+ }
+
+ /* only device 0 gets the handle, see kgnilnd_dev_init */
+ dev = &kgnilnd_data.kgn_devices[0];
+
+ LASSERTF(dev != NULL, "dev 0 is NULL\n");
+
+ kgnilnd_critical_error(dev->gnd_err_handle);
+
+ /* Wait for the reset to complete. This prevents any races in testing
+ * where we'd immediately try to send traffic again */
+ while (kgnilnd_data.kgn_needs_reset != 0) {
+ i++;
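+		/* console warning only when i is a power of two, so the
+		 * message backs off exponentially while we wait */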
+ LCONSOLE((((i) & (-i)) == i) ? D_WARNING : D_NET,
+ "Waiting for stack reset request to clear\n");
+ cfs_pause(cfs_time_seconds(1 * i));
+ }
+
+ RETURN(rc);
+}
+
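+/* Writing a value caps RDMAQ bandwidth at that many mbytes/sec; the
+ * override is converted to bytes before being stored */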
+static int LL_PROC_PROTO(proc_toggle_rdmaq_override)
+{
+ int old_val = kgnilnd_sysctl.ksd_rdmaq_override;
+ int rc = 0;
+ ENTRY;
+
+ rc = ll_proc_dointvec(table, write, filp, buffer, lenp, ppos);
+ if (!write) {
+ /* read */
+ RETURN(rc);
+ }
+
+ if (kgnilnd_data.kgn_init != GNILND_INIT_ALL) {
+ rc = -EINVAL;
+ RETURN(rc);
+ }
+
+ if (old_val != kgnilnd_sysctl.ksd_rdmaq_override) {
+		long new_bytes = kgnilnd_sysctl.ksd_rdmaq_override * (long)(1024*1024);
+ LCONSOLE_INFO("changing RDMAQ override to %d mbytes/sec\n",
+ kgnilnd_sysctl.ksd_rdmaq_override);
+ /* override proc is mbytes, but we calc in bytes */
+		kgnilnd_data.kgn_rdmaq_override = new_bytes;
+ smp_wmb();
+ }
+
+ RETURN(rc);
+}
+
+static cfs_sysctl_table_t kgnilnd_table[] = {
+ /*
+ * NB No .strategy entries have been provided since sysctl(8) prefers
+ * to go via /proc for portability.
+ */
+ {
+ INIT_CTL_NAME(GNILND_VERSION)
+ .procname = "version",
+ .data = KGNILND_BUILD_REV,
+ .maxlen = sizeof(KGNILND_BUILD_REV),
+ .mode = 0444,
+ .proc_handler = &proc_dostring
+ },
+ {
+ INIT_CTL_NAME(GNILND_THREAD_PAUSE)
+ .procname = "thread_pause",
+ .data = &kgnilnd_sysctl.ksd_pause_trigger,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_toggle_thread_pause,
+ },
+ {
+ INIT_CTL_NAME(GNILND_HW_QUIESCE)
+ .procname = "hw_quiesce",
+ .data = &kgnilnd_sysctl.ksd_quiesce_secs,
+		.maxlen   = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_hw_quiesce,
+ },
+ {
+ INIT_CTL_NAME(GNILND_STACK_RESET)
+ .procname = "stack_reset",
+ .data = NULL,
+ .maxlen = sizeof(int),
+ .mode = 0600,
+ .proc_handler = &proc_trigger_stack_reset,
+ },
+ {
+ INIT_CTL_NAME(GNILND_RDMAQ_OVERRIDE)
+ .procname = "rdmaq_override",
+ .data = &kgnilnd_sysctl.ksd_rdmaq_override,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_toggle_rdmaq_override,
+ },
+ { INIT_CTL_NAME(0) }
+};
+
+static cfs_sysctl_table_t kgnilnd_top_table[2] = {
+ {
+ INIT_CTL_NAME(CTL_GNILND)
+ .procname = "kgnilnd",
+ .data = NULL,
+ .maxlen = 0,
+ .mode = 0555,
+ .child = kgnilnd_table
+ },
+ { INIT_CTL_NAME(0) }
+};
+
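+/* Register the tables above; with this layout the controls should
+ * appear under /proc/sys/kgnilnd/ (thread_pause, hw_quiesce,
+ * stack_reset, rdmaq_override), depending on how CTL_GNILND is rooted */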
+void kgnilnd_insert_sysctl(void)
+{
+ if (kgnilnd_table_header == NULL)
+ kgnilnd_table_header = cfs_register_sysctl_table(kgnilnd_top_table, 0);
+}
+
+void kgnilnd_remove_sysctl(void)
+{
+ if (kgnilnd_table_header != NULL)
+ cfs_unregister_sysctl_table(kgnilnd_table_header);
+
+ kgnilnd_table_header = NULL;
+}
+
+#else
+void kgnilnd_insert_sysctl(void) {}
+void kgnilnd_remove_sysctl(void) {}
+#endif /* CONFIG_SYSCTL */
--- /dev/null
+#define KGNILND_BUILD_REV SVN_CODE_REV
{ "kmxlnd", "lnet/klnds/mxlnd" },
{ "ko2iblnd", "lnet/klnds/o2iblnd" },
{ "kptllnd", "lnet/klnds/ptllnd" },
+	{ "kgnilnd", "lnet/klnds/gnilnd" },
{ "kqswlnd", "lnet/klnds/qswlnd" },
{ "kralnd", "lnet/klnds/ralnd" },
{ "ksocklnd", "lnet/klnds/socklnd" },
int rc;
if (!g_net_is_compatible (argv[0], SOCKLND, RALND, PTLLND, MXLND,
- O2IBLND, 0))
+ O2IBLND, GNILND, 0))
return -1;
for (index = 0;;index++) {
ptl_ipaddr_2_str(data.ioc_u32[0], buffer[1],
sizeof(buffer[1]), 1),
data.ioc_u32[1]); /* peer port */
+ } else if (g_net_is_compatible(NULL, GNILND, 0)) {
+ int disconn = data.ioc_flags >> 16;
+ char *state;
+
+ if (disconn)
+ state = "D";
+ else
+ state = data.ioc_flags & 0xffff ? "C" : "U";
+
+ printf ("%-20s (%d) %s [%d] "LPU64" "
+ "sq %d/%d tx %d/%d/%d\n",
+ libcfs_nid2str(data.ioc_nid), /* peer nid */
+ data.ioc_net, /* gemini device id */
+ state, /* peer is Connecting, Up, or Down */
+ data.ioc_count, /* peer refcount */
+ data.ioc_u64[0], /* peerstamp */
+ data.ioc_u32[2], data.ioc_u32[3], /* tx and rx seq */
+ /* fmaq, nfma, nrdma */
+ data.ioc_u32[0], data.ioc_u32[1], data.ioc_u32[4]
+ );
} else {
printf ("%-20s [%d]\n",
libcfs_nid2str(data.ioc_nid), data.ioc_count);
int port = 0;
int rc;
- if (!g_net_is_compatible (argv[0], SOCKLND, RALND, 0))
+ if (!g_net_is_compatible (argv[0], SOCKLND, RALND,
+ GNILND, 0))
return -1;
if (argc != 4) {
- fprintf (stderr, "usage(tcp,ra): %s nid ipaddr port\n",
+ fprintf (stderr, "usage(tcp,ra,gni): %s nid ipaddr port\n",
argv[0]);
return 0;
}
int rc;
if (!g_net_is_compatible (argv[0], SOCKLND, RALND, MXLND, PTLLND,
- O2IBLND, 0))
+ O2IBLND, GNILND, 0))
return -1;
if (g_net_is_compatible(NULL, SOCKLND, 0)) {
int index;
int rc;
- if (!g_net_is_compatible (argv[0], SOCKLND, RALND, MXLND, O2IBLND, 0))
+ if (!g_net_is_compatible (argv[0], SOCKLND, RALND, MXLND, O2IBLND,
+ GNILND, 0))
return -1;
for (index = 0; ; index++) {
printf ("%s mtu %d\n",
libcfs_nid2str(data.ioc_nid),
data.ioc_u32[0]); /* path MTU */
+ } else if (g_net_is_compatible (NULL, GNILND, 0)) {
+ printf ("%-20s [%d]\n",
+ libcfs_nid2str(data.ioc_nid),
+ data.ioc_u32[0] /* device id */);
} else {
printf ("%s\n", libcfs_nid2str(data.ioc_nid));
}
return 0;
}
- if (!g_net_is_compatible (NULL, SOCKLND, RALND, MXLND, O2IBLND, 0))
+ if (!g_net_is_compatible (NULL, SOCKLND, RALND, MXLND, O2IBLND,
+ GNILND, 0))
return 0;
if (argc >= 2 &&
return 0;
}
- if (!g_net_is_compatible (argv[0], SOCKLND, 0))
+ if (!g_net_is_compatible (argv[0], SOCKLND, GNILND, 0))
return -1;
if (argc > 1 &&