From: James Simmons Date: Wed, 5 Dec 2012 18:54:39 +0000 (-0500) Subject: LU-1419 lnet: Add support for Cray's Gemini interconnect X-Git-Tag: 2.3.58~20 X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=4d381ef9f179b21217c237ad1cc83055a2448550 LU-1419 lnet: Add support for Cray's Gemini interconnect This patch adds LNET support to use Cray's Gemini interconnect on their newer systems. The gnilnd was originally based off of the ralnd. Signed-off-by: James Simmons Signed-off-by: Chris Horn Signed-off-by: Cory Spitz Change-Id: Ia98a44f4f3d68773438d820c49fe554a3d551dc5 Reviewed-on: http://review.whamcloud.com/3381 Tested-by: Hudson Tested-by: Maloo Reviewed-by: Isaac Huang Reviewed-by: Doug Oucharek Reviewed-by: Oleg Drokin --- diff --git a/lnet/autoconf/lustre-lnet.m4 b/lnet/autoconf/lustre-lnet.m4 index 4cd30eb..9d9ca11 100644 --- a/lnet/autoconf/lustre-lnet.m4 +++ b/lnet/autoconf/lustre-lnet.m4 @@ -526,13 +526,88 @@ AC_SUBST(RACPPFLAGS) AC_SUBST(RALND) ]) +# +# LN_CONFIG_GNILND +# +# check whether to use the Gemini Network Interface lnd +# +AC_DEFUN([LN_CONFIG_GNILND], +[#### Gemini Network Interface +AC_MSG_CHECKING([whether to enable GNI lnd]) +AC_ARG_ENABLE([gni], + AC_HELP_STRING([--enable-gni], + [enable GNI lnd]), + [],[enable_gni='no']) +AC_MSG_RESULT([$enable_gni]) + +if test x$enable_gni = xyes ; then + AC_MSG_CHECKING([if GNI kernel headers are present]) + # placeholder + # GNICPPFLAGS was set in spec file + EXTRA_KCFLAGS_save="$EXTRA_KCFLAGS" + EXTRA_KCFLAGS="$EXTRA_KCFLAGS $GNICPPFLAGS" + LB_LINUX_TRY_COMPILE([ + #include + #include + ],[ + gni_cdm_handle_t kgni_domain; + gni_return_t rc; + int rrc; + + rc = gni_cdm_create(0, 1, 1, 0, &kgni_domain); + + rrc = (rc == GNI_RC_SUCCESS) ? 0 : 1; + + return rrc; + ],[ + AC_MSG_RESULT([yes]) + GNILND="gnilnd" + ],[ + AC_MSG_RESULT([no]) + AC_MSG_ERROR([can't compile gnilnd with given GNICPPFLAGS: $GNICPPFLAGS]) + ]) + # at this point, we have gnilnd basic support, now check for extra features + AC_MSG_CHECKING([to use RCA in gnilnd]) + LB_LINUX_TRY_COMPILE([ + #include + #include + #include + ],[ + gni_cdm_handle_t kgni_domain; + gni_return_t rc; + krca_ticket_t ticket = KRCA_NULL_TICKET; + int rrc; + __u32 nid = 0, nic_addr; + + rc = gni_cdm_create(0, 1, 1, 0, &kgni_domain); + + rrc = (rc == GNI_RC_SUCCESS) ? 0 : 1; + + rrc += krca_nid_to_nicaddrs(nid, 1, &nic_addr); + + rrc += krca_register(&ticket, RCA_MAKE_SERVICE_INDEX(RCA_IO_CLASS, 9), 99, 0); + + return rrc; + ],[ + AC_MSG_RESULT([yes]) + GNICPPFLAGS="$GNICPPFLAGS -DGNILND_USE_RCA=1" + GNILNDRCA="gnilndrca" + ],[ + AC_MSG_RESULT([no]) + ]) + EXTRA_KCFLAGS="$EXTRA_KCFLAGS_save" +fi +AC_SUBST(GNICPPFLAGS) +AC_SUBST(GNILNDRCA) +AC_SUBST(GNILND) +]) # # # LN_CONFIG_USERSPACE # -# This is defined but empty because it is called from +# This is defined but empty because it is called from # build/autconf/lustre-build.m4 which is shared by all branches. 
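The configure check above appends -DGNILND_USE_RCA=1 to GNICPPFLAGS only when the RCA kernel interfaces compile, so RCA support in the C sources becomes a compile-time gate. Below is a minimal, stand-alone illustration of consuming such a define; it is not code from this patch, and the gnilnd sources may gate RCA support differently.

/* Illustrative only: a compile-time gate driven by -DGNILND_USE_RCA=1,
 * the flag the configure check above adds to GNICPPFLAGS.
 * Build once plainly and once with -DGNILND_USE_RCA=1 to see both paths. */
#include <stdio.h>

#ifdef GNILND_USE_RCA
static const char *rca_status = "RCA support compiled in";
#else
static const char *rca_status = "RCA support not compiled in";
#endif

int main(void)
{
	printf("%s\n", rca_status);
	return 0;
}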
# AC_DEFUN([LN_CONFIG_USERSPACE], @@ -598,6 +673,7 @@ LN_CONFIG_BACKOFF LN_CONFIG_QUADRICS LN_CONFIG_O2IB LN_CONFIG_RALND +LN_CONFIG_GNILND LN_CONFIG_PTLLND LN_CONFIG_MX # 2.6.32 @@ -740,6 +816,8 @@ AC_DEFUN([LN_CONDITIONALS], AM_CONDITIONAL(BUILD_MXLND, test x$MXLND = "xmxlnd") AM_CONDITIONAL(BUILD_O2IBLND, test x$O2IBLND = "xo2iblnd") AM_CONDITIONAL(BUILD_RALND, test x$RALND = "xralnd") +AM_CONDITIONAL(BUILD_GNILND, test x$GNILND = "xgnilnd") +AM_CONDITIONAL(BUILD_GNILND_RCA, test x$GNILNDRCA = "xgnilndrca") AM_CONDITIONAL(BUILD_PTLLND, test x$PTLLND = "xptllnd") AM_CONDITIONAL(BUILD_USOCKLND, test x$USOCKLND = "xusocklnd") ]) @@ -769,6 +847,8 @@ lnet/klnds/qswlnd/Makefile lnet/klnds/qswlnd/autoMakefile lnet/klnds/ralnd/Makefile lnet/klnds/ralnd/autoMakefile +lnet/klnds/gnilnd/Makefile +lnet/klnds/gnilnd/autoMakefile lnet/klnds/socklnd/Makefile lnet/klnds/socklnd/autoMakefile lnet/klnds/ptllnd/Makefile diff --git a/lnet/klnds/Makefile.in b/lnet/klnds/Makefile.in index f0586ae..0d99a87 100644 --- a/lnet/klnds/Makefile.in +++ b/lnet/klnds/Makefile.in @@ -1,5 +1,6 @@ @BUILD_MXLND_TRUE@subdir-m += mxlnd @BUILD_RALND_TRUE@subdir-m += ralnd +@BUILD_GNILND_TRUE@subdir-m += gnilnd @BUILD_O2IBLND_TRUE@subdir-m += o2iblnd @BUILD_QSWLND_TRUE@subdir-m += qswlnd @BUILD_PTLLND_TRUE@subdir-m += ptllnd diff --git a/lnet/klnds/autoMakefile.am b/lnet/klnds/autoMakefile.am index 57d709c..1591d87 100644 --- a/lnet/klnds/autoMakefile.am +++ b/lnet/klnds/autoMakefile.am @@ -34,4 +34,4 @@ # Lustre is a trademark of Sun Microsystems, Inc. # -SUBDIRS = socklnd qswlnd mxlnd ralnd ptllnd o2iblnd +SUBDIRS = socklnd qswlnd mxlnd ralnd gnilnd ptllnd o2iblnd diff --git a/lnet/klnds/gnilnd/Makefile.in b/lnet/klnds/gnilnd/Makefile.in new file mode 100644 index 0000000..14e8c30 --- /dev/null +++ b/lnet/klnds/gnilnd/Makefile.in @@ -0,0 +1,9 @@ +MODULES := kgnilnd +kgnilnd-objs := gnilnd.o gnilnd_cb.o gnilnd_modparams.o gnilnd_debug.o gnilnd_proc.o \ + gnilnd_sysctl.o gnilnd_stack.o gnilnd_conn.o + +EXTRA_POST_CFLAGS := -D"SVN_CODE_REV=KBUILD_STR(${SVN_CODE_REV})" @GNICPPFLAGS@ + +EXTRA_DIST = $(kgnilnd-objs:%.o=%.c) gnilnd.h gnilnd_api_wrap.h + +@INCLUDE_RULES@ diff --git a/lnet/klnds/gnilnd/autoMakefile.am b/lnet/klnds/gnilnd/autoMakefile.am new file mode 100644 index 0000000..888b68e --- /dev/null +++ b/lnet/klnds/gnilnd/autoMakefile.am @@ -0,0 +1,12 @@ +# Copyright (C) 2009 Cray, Inc. +# +# This code is issued under the GNU General Public License. +# See the file COPYING in this distribution + +if MODULES +if BUILD_GNILND +modulenet_DATA = kgnilnd$(KMODEXT) +endif +endif + +MOSTLYCLEANFILES = @MOSTLYCLEANFILES@ diff --git a/lnet/klnds/gnilnd/gnilnd.c b/lnet/klnds/gnilnd/gnilnd.c new file mode 100644 index 0000000..fcc05fa --- /dev/null +++ b/lnet/klnds/gnilnd/gnilnd.c @@ -0,0 +1,2698 @@ +/* + * Copyright (C) 2012 Cray, Inc. + * + * Author: Igor Gorodetsky + * Author: Nic Henke + * Author: James Shimek + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ +#include "gnilnd.h" + +/* Primary entry points from LNET. There are no guarantees against reentrance. */ +lnd_t the_kgnilnd = { + .lnd_type = GNILND, + .lnd_startup = kgnilnd_startup, + .lnd_shutdown = kgnilnd_shutdown, + .lnd_ctl = kgnilnd_ctl, + .lnd_send = kgnilnd_send, + .lnd_recv = kgnilnd_recv, + .lnd_eager_recv = kgnilnd_eager_recv, + .lnd_query = kgnilnd_query, +}; + +kgn_data_t kgnilnd_data; +kgn_hssops_t kgnilnd_hssops; + +/* needs write_lock on kgn_peer_conn_lock */ +int +kgnilnd_close_stale_conns_locked(kgn_peer_t *peer, kgn_conn_t *newconn) +{ + kgn_conn_t *conn; + struct list_head *ctmp, *cnxt; + int loopback; + int count = 0; + + loopback = peer->gnp_nid == peer->gnp_net->gnn_ni->ni_nid; + + list_for_each_safe(ctmp, cnxt, &peer->gnp_conns) { + conn = list_entry(ctmp, kgn_conn_t, gnc_list); + + if (conn->gnc_state != GNILND_CONN_ESTABLISHED) + continue; + + if (conn == newconn) + continue; + + if (conn->gnc_device != newconn->gnc_device) + continue; + + /* This is a two connection loopback - one talking to the other */ + if (loopback && + newconn->gnc_my_connstamp == conn->gnc_peer_connstamp && + newconn->gnc_peer_connstamp == conn->gnc_my_connstamp) { + CDEBUG(D_NET, "skipping prune of %p, " + "loopback and matching stamps" + " connstamp "LPU64"("LPU64")" + " peerstamp "LPU64"("LPU64")\n", + conn, newconn->gnc_my_connstamp, + conn->gnc_peer_connstamp, + newconn->gnc_peer_connstamp, + conn->gnc_my_connstamp); + continue; + } + + if (conn->gnc_peerstamp != newconn->gnc_peerstamp) { + LASSERTF(conn->gnc_peerstamp < newconn->gnc_peerstamp, + "conn 0x%p peerstamp "LPU64" >= " + "newconn 0x%p peerstamp "LPU64"\n", + conn, conn->gnc_peerstamp, + newconn, newconn->gnc_peerstamp); + + CDEBUG(D_NET, "Closing stale conn nid: %s " + " peerstamp:"LPX64"("LPX64")\n", + libcfs_nid2str(peer->gnp_nid), + conn->gnc_peerstamp, newconn->gnc_peerstamp); + } else { + + LASSERTF(conn->gnc_peer_connstamp < newconn->gnc_peer_connstamp, + "conn 0x%p peer_connstamp "LPU64" >= " + "newconn 0x%p peer_connstamp "LPU64"\n", + conn, conn->gnc_peer_connstamp, + newconn, newconn->gnc_peer_connstamp); + + CDEBUG(D_NET, "Closing stale conn nid: %s" + " connstamp:"LPU64"("LPU64")\n", + libcfs_nid2str(peer->gnp_nid), + conn->gnc_peer_connstamp, newconn->gnc_peer_connstamp); + } + + count++; + kgnilnd_close_conn_locked(conn, -ESTALE); + } + + if (count != 0) { + CWARN("Closed %d stale conns to %s\n", count, libcfs_nid2str(peer->gnp_nid)); + } + + RETURN(count); +} + +int +kgnilnd_conn_isdup_locked(kgn_peer_t *peer, kgn_conn_t *newconn) +{ + kgn_conn_t *conn; + struct list_head *tmp; + int loopback; + ENTRY; + + loopback = peer->gnp_nid == peer->gnp_net->gnn_ni->ni_nid; + + list_for_each(tmp, &peer->gnp_conns) { + conn = list_entry(tmp, kgn_conn_t, gnc_list); + CDEBUG(D_NET, "checking conn 0x%p for peer %s" + " lo %d new "LPU64" existing "LPU64 + " new peer "LPU64" existing peer "LPU64 + " new dev %p existing dev %p\n", + conn, libcfs_nid2str(peer->gnp_nid), + loopback, + newconn->gnc_peerstamp, conn->gnc_peerstamp, + newconn->gnc_peer_connstamp, conn->gnc_peer_connstamp, + newconn->gnc_device, conn->gnc_device); + + /* conn is in the process of closing */ + if (conn->gnc_state != GNILND_CONN_ESTABLISHED) + continue; + + /* 'newconn' is from an earlier version of 'peer'!!! 
*/ + if (newconn->gnc_peerstamp < conn->gnc_peerstamp) + RETURN(1); + + /* 'conn' is from an earlier version of 'peer': it will be + * removed when we cull stale conns later on... */ + if (newconn->gnc_peerstamp > conn->gnc_peerstamp) + continue; + + /* Different devices are OK */ + if (conn->gnc_device != newconn->gnc_device) + continue; + + /* It's me connecting to myself */ + if (loopback && + newconn->gnc_my_connstamp == conn->gnc_peer_connstamp && + newconn->gnc_peer_connstamp == conn->gnc_my_connstamp) + continue; + + /* 'newconn' is an earlier connection from 'peer'!!! */ + if (newconn->gnc_peer_connstamp < conn->gnc_peer_connstamp) + RETURN(2); + + /* 'conn' is an earlier connection from 'peer': it will be + * removed when we cull stale conns later on... */ + if (newconn->gnc_peer_connstamp > conn->gnc_peer_connstamp) + continue; + + /* 'newconn' has the SAME connection stamp; 'peer' isn't + * playing the game... */ + RETURN(3); + } + + RETURN(0); +} + +int +kgnilnd_create_conn(kgn_conn_t **connp, kgn_device_t *dev) +{ + kgn_conn_t *conn; + gni_return_t rrc; + int rc = 0; + + LASSERT (!in_interrupt()); + atomic_inc(&kgnilnd_data.kgn_nconns); + + /* divide by 2 to allow for complete reset and immediate reconnect */ + if (atomic_read(&kgnilnd_data.kgn_nconns) >= GNILND_MAX_CQID/2) { + CERROR("Too many conn are live: %d > %d\n", + atomic_read(&kgnilnd_data.kgn_nconns), GNILND_MAX_CQID/2); + atomic_dec(&kgnilnd_data.kgn_nconns); + return -E2BIG; + } + + LIBCFS_ALLOC(conn, sizeof(*conn)); + if (conn == NULL) { + atomic_dec(&kgnilnd_data.kgn_nconns); + return -ENOMEM; + } + + LIBCFS_ALLOC(conn->gnc_tx_ref_table, GNILND_MAX_MSG_ID * sizeof(void *)); + if (conn->gnc_tx_ref_table == NULL) { + CERROR("Can't allocate conn tx_ref_table\n"); + rc = -ENOMEM; + GOTO(failed, rc); + } + + atomic_set(&conn->gnc_refcount, 1); + atomic_set(&conn->gnc_reaper_noop, 0); + atomic_set(&conn->gnc_sched_noop, 0); + INIT_LIST_HEAD(&conn->gnc_list); + INIT_LIST_HEAD(&conn->gnc_hashlist); + INIT_LIST_HEAD(&conn->gnc_schedlist); + INIT_LIST_HEAD(&conn->gnc_fmaq); + INIT_LIST_HEAD(&conn->gnc_mdd_list); + spin_lock_init(&conn->gnc_list_lock); + spin_lock_init(&conn->gnc_tx_lock); + + /* set tx id to nearly the end to make sure we find wrapping + * issues soon */ + conn->gnc_next_tx = (int) GNILND_MAX_MSG_ID - 10; + + /* if this fails, we have conflicts and MAX_TX is too large */ + CLASSERT(GNILND_MAX_MSG_ID < GNILND_MSGID_CLOSE); + + /* get a new unique CQ id for this conn */ + write_lock(&kgnilnd_data.kgn_peer_conn_lock); + conn->gnc_my_connstamp = kgnilnd_data.kgn_connstamp++; + conn->gnc_cqid = kgnilnd_get_cqid_locked(); + write_unlock(&kgnilnd_data.kgn_peer_conn_lock); + + if (conn->gnc_cqid == 0) { + CERROR("Could not allocate unique CQ ID for conn 0x%p\n", conn); + rc = -E2BIG; + GOTO(failed, rc); + } + + CDEBUG(D_NET, "alloc cqid %u for conn 0x%p\n", + conn->gnc_cqid, conn); + + /* need to be set before gnc_ephandle to allow kgnilnd_destroy_conn_ep to + * check context */ + conn->gnc_device = dev; + + conn->gnc_timeout = MAX(*kgnilnd_tunables.kgn_timeout, + GNILND_MIN_TIMEOUT); + kgnilnd_update_reaper_timeout(conn->gnc_timeout); + + /* this is the ep_handle for doing SMSG & BTE */ + mutex_lock(&dev->gnd_cq_mutex); + rrc = kgnilnd_ep_create(dev->gnd_handle, dev->gnd_snd_fma_cqh, + &conn->gnc_ephandle); + mutex_unlock(&dev->gnd_cq_mutex); + if (rrc != GNI_RC_SUCCESS) { + rc = -ENETDOWN; + GOTO(failed, rc); + } + + CDEBUG(D_NET, "created conn 0x%p ep_hndl 0x%p\n", + conn, conn->gnc_ephandle); + + /* add ref 
for EP canceling */ + kgnilnd_conn_addref(conn); + atomic_inc(&dev->gnd_neps); + + *connp = conn; + return 0; + +failed: + atomic_dec(&kgnilnd_data.kgn_nconns); + LIBCFS_FREE(conn->gnc_tx_ref_table, GNILND_MAX_MSG_ID * sizeof(void *)); + LIBCFS_FREE(conn, sizeof(*conn)); + return rc; +} + +/* needs to be called with kgn_peer_conn_lock held (read or write) */ +kgn_conn_t * +kgnilnd_find_conn_locked(kgn_peer_t *peer) +{ + kgn_conn_t *conn = NULL; + ENTRY; + + /* if we are in reset, this conn is going to die soon */ + if (unlikely(kgnilnd_data.kgn_in_reset)) { + RETURN(NULL); + } + + /* just return the first ESTABLISHED connection */ + list_for_each_entry(conn, &peer->gnp_conns, gnc_list) { + /* kgnilnd_finish_connect doesn't put connections on the + * peer list until they are actually established */ + LASSERTF(conn->gnc_state >= GNILND_CONN_ESTABLISHED, + "found conn %p state %s on peer %p (%s)\n", + conn, kgnilnd_conn_state2str(conn), peer, + libcfs_nid2str(peer->gnp_nid)); + if (conn->gnc_state != GNILND_CONN_ESTABLISHED) + continue; + + RETURN(conn); + } + RETURN(NULL); +} + +/* needs write_lock on kgn_peer_conn_lock held */ +kgn_conn_t * +kgnilnd_find_or_create_conn_locked(kgn_peer_t *peer) { + + kgn_device_t *dev = peer->gnp_net->gnn_dev; + kgn_conn_t *conn; + + conn = kgnilnd_find_conn_locked(peer); + + if (conn != NULL) { + return conn; + } + + /* if the peer was previously connecting, check if we should + * trigger another connection attempt yet. */ + if (time_before(jiffies, peer->gnp_reconnect_time)) { + return NULL; + } + + /* This check prevents us from creating a new connection to a peer while we are + * still in the process of closing an existing connection to the peer. + */ + list_for_each_entry(conn, &peer->gnp_conns, gnc_list) { + if (conn->gnc_ephandle != NULL) { + CDEBUG(D_NET, "Not connecting non-null ephandle found peer 0x%p->%s\n", peer, + libcfs_nid2str(peer->gnp_nid)); + return NULL; + } + } + + if (peer->gnp_connecting != GNILND_PEER_IDLE) { + /* if we are not connecting, fire up a new connection */ + /* or if we are anything but IDLE DONT start a new connection */ + return NULL; + } + + CDEBUG(D_NET, "starting connect to %s\n", + libcfs_nid2str(peer->gnp_nid)); + peer->gnp_connecting = GNILND_PEER_CONNECT; + kgnilnd_peer_addref(peer); /* extra ref for connd */ + + spin_lock(&dev->gnd_connd_lock); + list_add_tail(&peer->gnp_connd_list, &dev->gnd_connd_peers); + spin_unlock(&dev->gnd_connd_lock); + + kgnilnd_schedule_dgram(dev); + CDEBUG(D_NETTRACE, "scheduling new connect\n"); + + return NULL; +} + +/* Caller is responsible for deciding if/when to call this */ +void +kgnilnd_destroy_conn_ep(kgn_conn_t *conn) +{ + gni_return_t rrc; + gni_ep_handle_t tmp_ep; + + /* only if we actually initialized it, + * then set NULL to tell kgnilnd_destroy_conn to leave it alone */ + + tmp_ep = xchg(&conn->gnc_ephandle, NULL); + if (tmp_ep != NULL) { + /* we never re-use the EP, so unbind is not needed */ + mutex_lock(&conn->gnc_device->gnd_cq_mutex); + rrc = kgnilnd_ep_destroy(tmp_ep); + + mutex_unlock(&conn->gnc_device->gnd_cq_mutex); + + /* if this fails, it could hork up kgni smsg retransmit and others + * since we could free the SMSG mbox memory, etc. 
*/ + LASSERTF(rrc == GNI_RC_SUCCESS, "rrc %d conn 0x%p ep 0x%p\n", + rrc, conn, conn->gnc_ephandle); + + atomic_dec(&conn->gnc_device->gnd_neps); + + /* clear out count added in kgnilnd_close_conn_locked + * conn will have a peer once it hits finish_connect, where it + * is the first spot we'll mark it ESTABLISHED as well */ + if (conn->gnc_peer) { + kgnilnd_admin_decref(conn->gnc_peer->gnp_dirty_eps); + } + + /* drop ref for EP */ + kgnilnd_conn_decref(conn); + } +} + +void +kgnilnd_destroy_conn(kgn_conn_t *conn) +{ + LASSERTF(!in_interrupt() && + !conn->gnc_scheduled && + !conn->gnc_in_purgatory && + conn->gnc_ephandle == NULL && + list_empty(&conn->gnc_list) && + list_empty(&conn->gnc_hashlist) && + list_empty(&conn->gnc_schedlist) && + list_empty(&conn->gnc_mdd_list), + "conn 0x%p->%s IRQ %d sched %d purg %d ep 0x%p lists %d/%d/%d/%d\n", + conn, conn->gnc_peer ? libcfs_nid2str(conn->gnc_peer->gnp_nid) + : "", + !!in_interrupt(), conn->gnc_scheduled, + conn->gnc_in_purgatory, + conn->gnc_ephandle, + list_empty(&conn->gnc_list), + list_empty(&conn->gnc_hashlist), + list_empty(&conn->gnc_schedlist), + list_empty(&conn->gnc_mdd_list)); + + /* Tripping these is especially bad, as it means we have items on the + * lists that didn't keep their refcount on the connection - or + * somebody evil released their own */ + LASSERTF(list_empty(&conn->gnc_fmaq) && + atomic_read(&conn->gnc_nlive_fma) == 0 && + atomic_read(&conn->gnc_nlive_rdma) == 0, + "conn 0x%p fmaq %d@0x%p nfma %d nrdma %d\n", + conn, kgnilnd_count_list(&conn->gnc_fmaq), &conn->gnc_fmaq, + atomic_read(&conn->gnc_nlive_fma), atomic_read(&conn->gnc_nlive_rdma)); + + CDEBUG(D_NET, "destroying conn %p ephandle %p error %d\n", + conn, conn->gnc_ephandle, conn->gnc_error); + + /* if there is an FMA blk left here, we'll tear it down */ + if (conn->gnc_fma_blk) { + kgnilnd_release_mbox(conn, 0); + } + + if (conn->gnc_peer != NULL) + kgnilnd_peer_decref(conn->gnc_peer); + + if (conn->gnc_tx_ref_table != NULL) { + LIBCFS_FREE(conn->gnc_tx_ref_table, + GNILND_MAX_MSG_ID * sizeof(void *)); + } + + LIBCFS_FREE(conn, sizeof(*conn)); + atomic_dec(&kgnilnd_data.kgn_nconns); +} + +/* peer_alive and peer_notify done in the style of the o2iblnd */ +void +kgnilnd_peer_alive(kgn_peer_t *peer) +{ + set_mb(peer->gnp_last_alive, jiffies); +} + +void +kgnilnd_peer_notify(kgn_peer_t *peer, int error) +{ + int tell_lnet = 0; + int nnets = 0; + int rc; + int i, j; + kgn_conn_t *conn; + kgn_net_t **nets; + kgn_net_t *net; + + + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_DONT_NOTIFY)) + return; + + /* Tell LNet we are giving ups on this peer - but only + * if it isn't already reconnected or trying to reconnect */ + read_lock(&kgnilnd_data.kgn_peer_conn_lock); + + /* use kgnilnd_find_conn_locked to avoid any conns in the process of being nuked + * + * don't tell LNet if we are in reset - we assume that everyone will be able to + * reconnect just fine + */ + conn = kgnilnd_find_conn_locked(peer); + + CDEBUG(D_NETTRACE, "peer 0x%p->%s ting %d conn 0x%p, rst %d error %d\n", + peer, libcfs_nid2str(peer->gnp_nid), peer->gnp_connecting, conn, + kgnilnd_data.kgn_in_reset, error); + + if ((peer->gnp_connecting == GNILND_PEER_IDLE) && + (conn == NULL) && + (!kgnilnd_data.kgn_in_reset) && + (!kgnilnd_conn_clean_errno(error))) { + tell_lnet = 1; + } + + read_unlock(&kgnilnd_data.kgn_peer_conn_lock); + + if (!tell_lnet) { + /* short circuit if we dont need to notify Lnet */ + return; + } + + rc = down_read_trylock(&kgnilnd_data.kgn_net_rw_sem); + + if (rc) { + /* dont do this if this 
fails since LNET is in shutdown or something else + */ + + for (i = 0; i < *kgnilnd_tunables.kgn_net_hash_size; i++) { + list_for_each_entry(net , &kgnilnd_data.kgn_nets[i], gnn_list) { + /* if gnn_shutdown set for any net shutdown is in progress just return */ + if (net->gnn_shutdown) { + up_read(&kgnilnd_data.kgn_net_rw_sem); + return; + } + nnets++; + } + } + + if (nnets == 0) { + /* shutdown in progress most likely */ + up_read(&kgnilnd_data.kgn_net_rw_sem); + return; + } + + LIBCFS_ALLOC(nets, nnets * sizeof(*nets)); + + if (nets == NULL) { + up_read(&kgnilnd_data.kgn_net_rw_sem); + CERROR("Failed to allocate nets[%d]\n", nnets); + return; + } + + j = 0; + for (i = 0; i < *kgnilnd_tunables.kgn_net_hash_size; i++) { + list_for_each_entry(net, &kgnilnd_data.kgn_nets[i], gnn_list) { + nets[j] = net; + kgnilnd_net_addref(net); + j++; + } + } + up_read(&kgnilnd_data.kgn_net_rw_sem); + + for (i = 0; i < nnets; i++) { + lnet_nid_t peer_nid; + + net = nets[i]; + + peer_nid = kgnilnd_lnd2lnetnid(net->gnn_ni->ni_nid, + peer->gnp_nid); + + CDEBUG(D_NET, "peer 0x%p->%s last_alive %lu (%lus ago)\n", + peer, libcfs_nid2str(peer_nid), peer->gnp_last_alive, + cfs_duration_sec(jiffies - peer->gnp_last_alive)); + + lnet_notify(net->gnn_ni, peer_nid, 0, peer->gnp_last_alive); + + + kgnilnd_net_decref(net); + } + + LIBCFS_FREE(nets, nnets * sizeof(*nets)); + } +} + +/* need write_lock on kgn_peer_conn_lock */ +void +kgnilnd_close_conn_locked(kgn_conn_t *conn, int error) +{ + kgn_peer_t *peer = conn->gnc_peer; + ENTRY; + + LASSERT(!in_interrupt()); + + /* store error for tx completion */ + conn->gnc_error = error; + peer->gnp_last_errno = error; + + /* use real error from peer if possible */ + if (error == -ECONNRESET) { + error = conn->gnc_peer_error; + } + + /* if we NETERROR, make sure it is rate limited */ + if (!kgnilnd_conn_clean_errno(error)) { + CNETERR("closing conn to %s: error %d\n", + libcfs_nid2str(peer->gnp_nid), error); + } else { + CDEBUG(D_NET, "closing conn to %s: error %d\n", + libcfs_nid2str(peer->gnp_nid), error); + } + + LASSERTF(conn->gnc_state == GNILND_CONN_ESTABLISHED, + "conn %p to %s with bogus state %s\n", conn, + libcfs_nid2str(conn->gnc_peer->gnp_nid), + kgnilnd_conn_state2str(conn)); + LASSERT(!list_empty(&conn->gnc_hashlist)); + LASSERT(!list_empty(&conn->gnc_list)); + + + /* mark peer count here so any place the EP gets destroyed will + * open up the peer count so that a new ESTABLISHED conn is then free + * to send new messages -- sending before the previous EPs are destroyed + * could end up with messages on the network for the old conn _after_ + * the new conn and break the mbox safety protocol */ + kgnilnd_admin_addref(conn->gnc_peer->gnp_dirty_eps); + + /* Remove from conn hash table: no new callbacks */ + list_del_init(&conn->gnc_hashlist); + kgnilnd_data.kgn_conn_version++; + + /* if we are in reset, go right to CLOSED as there is no scheduler + * thread to move from CLOSING to CLOSED */ + if (unlikely(kgnilnd_data.kgn_in_reset)) { + conn->gnc_state = GNILND_CONN_CLOSED; + } else { + conn->gnc_state = GNILND_CONN_CLOSING; + } + + /* leave on peer->gnp_conns to make sure we don't let the reaper + * or others try to unlink this peer until the conn is fully + * processed for closing */ + + if (kgnilnd_check_purgatory_conn(conn)) { + kgnilnd_add_purgatory_locked(conn, conn->gnc_peer); + } + + /* Reset RX timeout to ensure we wait for an incoming CLOSE + * for the full timeout. If we get a CLOSE we know the + * peer has stopped all RDMA. 
Otherwise if we wait for + * the full timeout we can also be sure all RDMA has stopped. */ + conn->gnc_last_rx = conn->gnc_last_rx_cq = jiffies; + mb(); + + /* schedule sending CLOSE - if we are in quiesce, this adds to + * gnd_ready_conns and allows us to find it in quiesce processing */ + kgnilnd_schedule_conn(conn); + + /* lose peer's ref */ + kgnilnd_conn_decref(conn); + /* -1 for conn table */ + kgnilnd_conn_decref(conn); + + EXIT; +} + +void +kgnilnd_close_conn(kgn_conn_t *conn, int error) +{ + write_lock(&kgnilnd_data.kgn_peer_conn_lock); + /* need to check the state here - this call is racy and we don't + * know the state until after the lock is grabbed */ + if (conn->gnc_state == GNILND_CONN_ESTABLISHED) { + kgnilnd_close_conn_locked(conn, error); + } + write_unlock(&kgnilnd_data.kgn_peer_conn_lock); +} + +void +kgnilnd_complete_closed_conn(kgn_conn_t *conn) +{ + LIST_HEAD (sinners); + kgn_tx_t *tx, *txn; + int nlive = 0; + int nrdma = 0; + int nq_rdma = 0; + int logmsg; + ENTRY; + + /* Dump log on cksum error - wait until complete phase to let + * RX of error happen */ + if (*kgnilnd_tunables.kgn_checksum_dump && + (conn != NULL && conn->gnc_peer_error == -ENOKEY)) { + libcfs_debug_dumplog(); + } + + /* _CLOSED set in kgnilnd_process_fmaq once we decide to + * send the CLOSE or not */ + LASSERTF(conn->gnc_state == GNILND_CONN_CLOSED, + "conn 0x%p->%s with bad state %s\n", + conn, conn->gnc_peer ? + libcfs_nid2str(conn->gnc_peer->gnp_nid) : + "", + kgnilnd_conn_state2str(conn)); + + LASSERT(list_empty(&conn->gnc_hashlist)); + + /* we've sent the close, start nuking */ + + /* we don't use lists to track things that we can get out of the + * tx_ref table... */ + + /* need to hold locks for tx_list_state, sampling it is too racy: + * - the lock actually protects tx != NULL, but we can't take the proper + * lock until we check tx_list_state, which would be too late and + * we could have the TX change under us. + * gnd_rdmaq_lock and gnd_lock and not used together, so taking both + * should be fine */ + spin_lock(&conn->gnc_device->gnd_rdmaq_lock); + spin_lock(&conn->gnc_device->gnd_lock); + + for (nrdma = 0; nrdma < GNILND_MAX_MSG_ID; nrdma++) { + tx = conn->gnc_tx_ref_table[nrdma]; + + if (tx != NULL) { + /* only print the first error and if not CLOSE, we often don't see + * CQ events for that by the time we get here... and really don't care */ + if (nlive || tx->tx_msg.gnm_type == GNILND_MSG_CLOSE) + tx->tx_state |= GNILND_TX_QUIET_ERROR; + nlive++; + GNIDBG_TX(D_NET, tx, "cleaning up on close, nlive %d", nlive); + + /* don't worry about gnc_lock here as nobody else should be + * touching this conn */ + kgnilnd_tx_del_state_locked(tx, NULL, conn, GNILND_TX_ALLOCD); + list_add_tail(&tx->tx_list, &sinners); + } + } + spin_unlock(&conn->gnc_device->gnd_lock); + spin_unlock(&conn->gnc_device->gnd_rdmaq_lock); + + /* nobody should have marked this as needing scheduling after + * we called close - so only ref should be us handling it */ + LASSERTF(conn->gnc_scheduled == GNILND_CONN_PROCESS, + "conn 0x%p scheduled %d\n", conn, conn->gnc_scheduled); + + /* now reset a few to actual counters... 
*/ + nrdma = atomic_read(&conn->gnc_nlive_rdma); + nq_rdma = atomic_read(&conn->gnc_nq_rdma); + + if (!list_empty(&sinners)) { + list_for_each_entry_safe(tx, txn, &sinners, tx_list) { + /* clear tx_list to make tx_add_list_locked happy */ + list_del_init(&tx->tx_list); + /* The error codes determine if we hold onto the MDD */ + kgnilnd_tx_done(tx, conn->gnc_error); + } + } + + logmsg = (nlive + nrdma + nq_rdma); + + if (logmsg) { + if (conn->gnc_peer_error != 0) { + CNETERR("Closed conn 0x%p->%s (errno %d, peer errno %d): " + "canceled %d TX, %d/%d RDMA\n", + conn, libcfs_nid2str(conn->gnc_peer->gnp_nid), + conn->gnc_error, conn->gnc_peer_error, + nlive, nq_rdma, nrdma); + } else { + CNETERR("Closed conn 0x%p->%s (errno %d): " + "canceled %d TX, %d/%d RDMA\n", + conn, libcfs_nid2str(conn->gnc_peer->gnp_nid), + conn->gnc_error, + nlive, nq_rdma, nrdma); + } + } + + kgnilnd_destroy_conn_ep(conn); + + /* Bug 765042 - race this with completing a new conn to same peer - we need + * finish_connect to detach purgatory before we can do it ourselves here */ + CFS_RACE(CFS_FAIL_GNI_FINISH_PURG); + + /* now it is safe to remove from peer list - anyone looking at + * gnp_conns now is free to unlink if not on purgatory */ + write_lock(&kgnilnd_data.kgn_peer_conn_lock); + + conn->gnc_state = GNILND_CONN_DONE; + + /* Decrement counter if we are marked by del_conn_or_peers for closing + */ + if (conn->gnc_needs_closing) + kgnilnd_admin_decref(kgnilnd_data.kgn_npending_conns); + + /* Remove from peer's list of valid connections if its not in purgatory */ + if (!conn->gnc_in_purgatory) { + list_del_init(&conn->gnc_list); + } + + /* NB - only unlinking if we set pending in del_peer_locked from admin or + * shutdown */ + if (kgnilnd_peer_active(conn->gnc_peer) && + conn->gnc_peer->gnp_pending_unlink && + kgnilnd_can_unlink_peer_locked(conn->gnc_peer)) { + kgnilnd_unlink_peer_locked(conn->gnc_peer); + } + + write_unlock(&kgnilnd_data.kgn_peer_conn_lock); + + /* I'm telling Mommy! - use peer_error if they initiated close */ + kgnilnd_peer_notify(conn->gnc_peer, + conn->gnc_error == -ECONNRESET ? conn->gnc_peer_error + : conn->gnc_error); + + EXIT; +} + +int +kgnilnd_set_conn_params(kgn_dgram_t *dgram) +{ + kgn_conn_t *conn = dgram->gndg_conn; + kgn_connreq_t *connreq = &dgram->gndg_conn_in; + kgn_gniparams_t *rem_param = &connreq->gncr_gnparams; + gni_return_t rrc; + int rc = 0; + + /* set timeout vals in conn early so we can use them for the NAK */ + + /* use max of the requested and our timeout, peer will do the same */ + conn->gnc_timeout = MAX(conn->gnc_timeout, connreq->gncr_timeout); + + /* only ep_bind really mucks around with the CQ */ + /* only ep bind if we are not connecting to ourself and the dstnid is not a wildcard. this check + * is necessary as you can only bind an ep once and we must make sure we dont bind when already bound. 
+ */ + if (connreq->gncr_dstnid != LNET_NID_ANY && dgram->gndg_conn_out.gncr_dstnid != connreq->gncr_srcnid) { + mutex_lock(&conn->gnc_device->gnd_cq_mutex); + rrc = kgnilnd_ep_bind(conn->gnc_ephandle, + connreq->gncr_gnparams.gnpr_host_id, + conn->gnc_cqid); + mutex_unlock(&conn->gnc_device->gnd_cq_mutex); + if (rrc != GNI_RC_SUCCESS) { + rc = -ECONNABORTED; + goto return_out; + } + } + + rrc = kgnilnd_ep_set_eventdata(conn->gnc_ephandle, conn->gnc_cqid, + connreq->gncr_gnparams.gnpr_cqid); + if (rrc != GNI_RC_SUCCESS) { + rc = -ECONNABORTED; + goto cleanup_out; + } + + /* Initialize SMSG */ + rrc = kgnilnd_smsg_init(conn->gnc_ephandle, &conn->gnpr_smsg_attr, + &connreq->gncr_gnparams.gnpr_smsg_attr); + if (unlikely(rrc == GNI_RC_INVALID_PARAM)) { + gni_smsg_attr_t *local = &conn->gnpr_smsg_attr; + gni_smsg_attr_t *remote = &connreq->gncr_gnparams.gnpr_smsg_attr; + /* help folks figure out if there is a tunable off, etc. */ + LCONSOLE_ERROR("SMSG attribute mismatch. Data from local/remote:" + " type %d/%d msg_maxsize %u/%u" + " mbox_maxcredit %u/%u. Please check kgni" + " logs for further data\n", + local->msg_type, remote->msg_type, + local->msg_maxsize, remote->msg_maxsize, + local->mbox_maxcredit, remote->mbox_maxcredit); + } + if (rrc != GNI_RC_SUCCESS) { + rc = -ECONNABORTED; + goto cleanup_out; + } + + /* log this for help in debuggin SMSG buffer re-use */ + CDEBUG(D_NET, "conn %p src %s dst %s smsg %p acquired" + " local cqid %u SMSG %p->%u hndl "LPX64"."LPX64 + " remote cqid %u SMSG %p->%u hndl "LPX64"."LPX64"\n", + conn, libcfs_nid2str(connreq->gncr_srcnid), + libcfs_nid2str(connreq->gncr_dstnid), + &conn->gnpr_smsg_attr, + conn->gnc_cqid, + conn->gnpr_smsg_attr.msg_buffer, + conn->gnpr_smsg_attr.mbox_offset, + conn->gnpr_smsg_attr.mem_hndl.qword1, + conn->gnpr_smsg_attr.mem_hndl.qword2, + rem_param->gnpr_cqid, + rem_param->gnpr_smsg_attr.msg_buffer, + rem_param->gnpr_smsg_attr.mbox_offset, + rem_param->gnpr_smsg_attr.mem_hndl.qword1, + rem_param->gnpr_smsg_attr.mem_hndl.qword2); + + conn->gnc_peerstamp = connreq->gncr_peerstamp; + conn->gnc_peer_connstamp = connreq->gncr_connstamp; + + /* We update the reaper timeout once we have a valid conn and timeout */ + kgnilnd_update_reaper_timeout(GNILND_TO2KA(conn->gnc_timeout)); + + return 0; + +cleanup_out: + rrc = kgnilnd_ep_unbind(conn->gnc_ephandle); + /* not sure I can just let this fly */ + LASSERTF(rrc == GNI_RC_SUCCESS, + "bad rc from gni_ep_unbind trying to cleanup: %d\n", rrc); + +return_out: + LASSERTF(rc != 0, "SOFTWARE BUG: rc == 0\n"); + CERROR("Error setting connection params from %s: %d\n", + libcfs_nid2str(connreq->gncr_srcnid), rc); + return rc; +} + +/* needs down_read on kgn_net_rw_sem held from before this call until + * after the write_lock on kgn_peer_conn_lock - this ensures we stay sane + * with kgnilnd_shutdown - it'll get the sem and set shutdown, then get the + * kgn_peer_conn_lock to start del_peer'ing. If we hold the sem until after + * kgn_peer_conn_lock is held, we guarantee that nobody calls + * kgnilnd_add_peer_locked without checking gnn_shutdown */ +int +kgnilnd_create_peer_safe(kgn_peer_t **peerp, lnet_nid_t nid, kgn_net_t *net) +{ + kgn_peer_t *peer; + int rc; + + LASSERT(nid != LNET_NID_ANY); + + /* We dont pass the net around in the dgram anymore so here is where we find it + * this will work unless its in shutdown or the nid has a net that is invalid. + * Either way error code needs to be returned in that case. 
+ * + * If the net passed in is not NULL then we can use it, this alleviates looking it + * when the calling function has access to the data. + */ + if (net == NULL) { + rc = kgnilnd_find_net(nid, &net); + if (rc < 0) + return rc; + } else { + /* find net adds a reference on the net if we are not using + * it we must do it manually so the net references are + * correct when tearing down the net + */ + kgnilnd_net_addref(net); + } + + LIBCFS_ALLOC(peer, sizeof(*peer)); + if (peer == NULL) { + kgnilnd_net_decref(net); + return -ENOMEM; + } + peer->gnp_nid = nid; + + /* translate from nid to nic addr & store */ + rc = kgnilnd_nid_to_nicaddrs(LNET_NIDADDR(nid), 1, &peer->gnp_host_id); + if (rc <= 0) { + kgnilnd_net_decref(net); + LIBCFS_FREE(peer, sizeof(*peer)); + return -ESRCH; + } + CDEBUG(D_NET, "peer 0x%p->%s -> NIC 0x%x\n", peer, + libcfs_nid2str(nid), peer->gnp_host_id); + + atomic_set(&peer->gnp_refcount, 1); /* 1 ref for caller */ + atomic_set(&peer->gnp_dirty_eps, 0); + + INIT_LIST_HEAD(&peer->gnp_list); + INIT_LIST_HEAD(&peer->gnp_connd_list); + INIT_LIST_HEAD(&peer->gnp_conns); + INIT_LIST_HEAD(&peer->gnp_tx_queue); + + /* the first reconnect should happen immediately, so we leave + * gnp_reconnect_interval set to 0 */ + + LASSERTF(net != NULL, "peer 0x%p->%s with NULL net\n", + peer, libcfs_nid2str(nid)); + + /* must have kgn_net_rw_sem held for this... */ + if (net->gnn_shutdown) { + /* shutdown has started already */ + kgnilnd_net_decref(net); + LIBCFS_FREE(peer, sizeof(*peer)); + return -ESHUTDOWN; + } + + peer->gnp_net = net; + + atomic_inc(&kgnilnd_data.kgn_npeers); + + *peerp = peer; + return 0; +} + +void +kgnilnd_destroy_peer(kgn_peer_t *peer) +{ + CDEBUG(D_NET, "peer %s %p deleted\n", + libcfs_nid2str(peer->gnp_nid), peer); + LASSERTF(atomic_read(&peer->gnp_refcount) == 0, + "peer 0x%p->%s refs %d\n", + peer, libcfs_nid2str(peer->gnp_nid), + atomic_read(&peer->gnp_refcount)); + LASSERTF(atomic_read(&peer->gnp_dirty_eps) == 0, + "peer 0x%p->%s dirty eps %d\n", + peer, libcfs_nid2str(peer->gnp_nid), + atomic_read(&peer->gnp_dirty_eps)); + LASSERTF(peer->gnp_net != NULL, "peer %p (%s) with NULL net\n", + peer, libcfs_nid2str(peer->gnp_nid)); + LASSERTF(!kgnilnd_peer_active(peer), + "peer 0x%p->%s\n", + peer, libcfs_nid2str(peer->gnp_nid)); + LASSERTF(peer->gnp_connecting == GNILND_PEER_IDLE || peer->gnp_connecting == GNILND_PEER_KILL, + "peer 0x%p->%s, connecting %d\n", + peer, libcfs_nid2str(peer->gnp_nid), peer->gnp_connecting); + LASSERTF(list_empty(&peer->gnp_conns), + "peer 0x%p->%s\n", + peer, libcfs_nid2str(peer->gnp_nid)); + LASSERTF(list_empty(&peer->gnp_tx_queue), + "peer 0x%p->%s\n", + peer, libcfs_nid2str(peer->gnp_nid)); + LASSERTF(list_empty(&peer->gnp_connd_list), + "peer 0x%p->%s\n", + peer, libcfs_nid2str(peer->gnp_nid)); + + /* NB a peer's connections keep a reference on their peer until + * they are destroyed, so we can be assured that _all_ state to do + * with this peer has been cleaned up when its refcount drops to + * zero. 
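The assertions in kgnilnd_destroy_peer above encode the usual reference-counting discipline: the creator holds one reference, each connection holds one, and the peer is freed only when the count reaches zero. Below is a stand-alone model of that pattern; the names are illustrative, and the real kgnilnd_peer_addref/decref helpers use kernel atomics rather than C11 ones.

/* Model of the rule stated above: connections keep a reference on their peer,
 * so the peer is freed only after the last reference is dropped.
 * Illustrative only -- not the kgnilnd implementation. */
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct demo_peer {
	atomic_int refcount;
};

static struct demo_peer *peer_create(void)
{
	struct demo_peer *p = calloc(1, sizeof(*p));

	if (p == NULL)
		abort();
	atomic_init(&p->refcount, 1);	/* one reference for the creator */
	return p;
}

static void peer_addref(struct demo_peer *p)
{
	atomic_fetch_add(&p->refcount, 1);
}

static void peer_decref(struct demo_peer *p)
{
	if (atomic_fetch_sub(&p->refcount, 1) == 1) {
		printf("last reference dropped, freeing peer\n");
		free(p);
	}
}

int main(void)
{
	struct demo_peer *p = peer_create();

	peer_addref(p);		/* a connection takes its reference */
	peer_decref(p);		/* the connection is destroyed */
	peer_decref(p);		/* creator's reference: the peer is freed here */
	return 0;
}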
*/ + + atomic_dec(&kgnilnd_data.kgn_npeers); + kgnilnd_net_decref(peer->gnp_net); + + LIBCFS_FREE(peer, sizeof(*peer)); +} + +/* the conn might not have made it all the way through to a connected + * state - but we need to purgatory any conn that a remote peer might + * have seen through a posted dgram as well */ +void +kgnilnd_add_purgatory_locked(kgn_conn_t *conn, kgn_peer_t *peer) +{ + kgn_mbox_info_t *mbox = NULL; + ENTRY; + + /* NB - the caller should own conn by removing him from the + * scheduler thread when finishing the close */ + + LASSERTF(peer != NULL, "conn %p with NULL peer\n", conn); + + /* If this is still true, need to add the calls to unlink back in and + * figure out how to close the hole on loopback conns */ + LASSERTF(kgnilnd_peer_active(peer), "can't use inactive peer %s (%p)" + " we'll never recover the resources\n", + libcfs_nid2str(peer->gnp_nid), peer); + + CDEBUG(D_NET, "conn %p peer %p dev %p\n", conn, peer, + conn->gnc_device); + + /* add ref for mbox purgatory hold */ + kgnilnd_peer_addref(peer); + kgnilnd_conn_addref(conn); + conn->gnc_in_purgatory = 1; + + mbox = &conn->gnc_fma_blk->gnm_mbox_info[conn->gnc_mbox_id]; + mbox->mbx_prev_nid = peer->gnp_nid; + mbox->mbx_add_purgatory = jiffies; + kgnilnd_release_mbox(conn, 1); + + LASSERTF(list_empty(&conn->gnc_mdd_list), + "conn 0x%p->%s with active purgatory hold MDD %d\n", + conn, libcfs_nid2str(peer->gnp_nid), + kgnilnd_count_list(&conn->gnc_mdd_list)); + + EXIT; +} + +/* Instead of detaching everything from purgatory here we just mark the conn as needing + * detach, when the reaper checks the conn the next time it will detach it. + * Calling function requires write_lock held on kgn_peer_conn_lock + */ +void +kgnilnd_mark_for_detach_purgatory_all_locked(kgn_peer_t *peer) { + kgn_conn_t *conn; + + list_for_each_entry(conn, &peer->gnp_conns, gnc_list) { + if (conn->gnc_in_purgatory && !conn->gnc_needs_detach) { + conn->gnc_needs_detach = 1; + kgnilnd_admin_addref(kgnilnd_data.kgn_npending_detach); + } + } +} + +/* Calling function needs a write_lock held on kgn_peer_conn_lock */ +void +kgnilnd_detach_purgatory_locked(kgn_conn_t *conn, struct list_head *conn_list) +{ + kgn_mbox_info_t *mbox = NULL; + + /* if needed, add the conn purgatory data to the list passed in */ + if (conn->gnc_in_purgatory) { + CDEBUG(D_NET, "peer %p->%s purg_conn %p@%s mdd_list #tx %d\n", + conn->gnc_peer, libcfs_nid2str(conn->gnc_peer->gnp_nid), + conn, kgnilnd_conn_state2str(conn), + kgnilnd_count_list(&conn->gnc_mdd_list)); + + mbox = &conn->gnc_fma_blk->gnm_mbox_info[conn->gnc_mbox_id]; + mbox->mbx_detach_of_purgatory = jiffies; + + /* conn->gnc_list is the entry point on peer->gnp_conns, so detaching it + * here removes it from the list of 'valid' peer connections. + * We put the current conn onto a list of conns to call kgnilnd_release_purgatory_locked() + * and as such the caller of kgnilnd_detach_purgatory_locked() now owns that conn, since its not + * on the peer's conn_list anymore. + */ + + kgnilnd_peer_decref(conn->gnc_peer); + list_del_init(&conn->gnc_list); + + /* NB - only unlinking if we set pending in del_peer_locked from admin or + * shutdown */ + if (kgnilnd_peer_active(conn->gnc_peer) && + conn->gnc_peer->gnp_pending_unlink && + kgnilnd_can_unlink_peer_locked(conn->gnc_peer)) { + kgnilnd_unlink_peer_locked(conn->gnc_peer); + } + /* The reaper will not call detach unless the conn is fully through kgnilnd_complete_closed_conn. 
+ * If the conn is not in a DONE state somehow we are attempting to detach even though + * the conn has not been fully cleaned up. If we detach while the conn is still closing + * we will end up with an orphaned connection that has valid ep_handle, that is not on a + * peer. + */ + + LASSERTF(conn->gnc_state == GNILND_CONN_DONE, "Conn in invalid state %p@%s \n", + conn, kgnilnd_conn_state2str(conn)); + + /* move from peer to the delayed release list */ + list_add_tail(&conn->gnc_list, conn_list); + } +} + +void +kgnilnd_release_purgatory_list(struct list_head *conn_list) +{ + kgn_device_t *dev; + kgn_conn_t *conn, *connN; + kgn_mdd_purgatory_t *gmp, *gmpN; + + list_for_each_entry_safe(conn, connN, conn_list, gnc_list) { + dev = conn->gnc_device; + + kgnilnd_release_mbox(conn, -1); + conn->gnc_in_purgatory = 0; + + list_del_init(&conn->gnc_list); + + /* gnc_needs_detach is set in kgnilnd_del_conn_or_peer. It is used to keep track + * of conns that have been marked for detach by kgnilnd_del_conn_or_peer. + * The function uses kgn_npending_detach to verify the conn has + * actually been detached. + */ + + if (conn->gnc_needs_detach) + kgnilnd_admin_decref(kgnilnd_data.kgn_npending_detach); + + /* if this guy is really dead (we are doing release from reaper), + * make sure we tell LNet - if this is from other context, + * the checks in the function will prevent an errant + * notification */ + kgnilnd_peer_notify(conn->gnc_peer, conn->gnc_error); + + list_for_each_entry_safe(gmp, gmpN, &conn->gnc_mdd_list, + gmp_list) { + CDEBUG(D_NET, + "dev %p releasing held mdd "LPX64"."LPX64"\n", + conn->gnc_device, gmp->gmp_map_key.qword1, + gmp->gmp_map_key.qword2); + + atomic_dec(&dev->gnd_n_mdd_held); + kgnilnd_mem_mdd_release(conn->gnc_device->gnd_handle, + &gmp->gmp_map_key); + /* ignoring the return code - if kgni/ghal can't find it + * it must be released already */ + + list_del_init(&gmp->gmp_list); + LIBCFS_FREE(gmp, sizeof(*gmp)); + } + /* lose conn ref for purgatory */ + kgnilnd_conn_decref(conn); + } +} + +/* needs write_lock on kgnilnd_data.kgn_peer_conn_lock held */ +void +kgnilnd_peer_increase_reconnect_locked(kgn_peer_t *peer) +{ + int current_to; + + current_to = peer->gnp_reconnect_interval; + + /* we'll try to reconnect fast the first time, then back-off */ + if (current_to == 0) { + peer->gnp_reconnect_time = jiffies - 1; + current_to = *kgnilnd_tunables.kgn_min_reconnect_interval; + } else { + peer->gnp_reconnect_time = jiffies + cfs_time_seconds(current_to); + /* add 50% of min timeout & retry */ + current_to += *kgnilnd_tunables.kgn_min_reconnect_interval / 2; + } + + current_to = MIN(current_to, + *kgnilnd_tunables.kgn_max_reconnect_interval); + + peer->gnp_reconnect_interval = current_to; + CDEBUG(D_NET, "peer %s can reconnect at %lu interval %lu\n", + libcfs_nid2str(peer->gnp_nid), peer->gnp_reconnect_time, + peer->gnp_reconnect_interval); +} + +/* needs kgnilnd_data.kgn_peer_conn_lock held */ +kgn_peer_t * +kgnilnd_find_peer_locked(lnet_nid_t nid) +{ + struct list_head *peer_list = kgnilnd_nid2peerlist(nid); + kgn_peer_t *peer; + + /* Chopping nid down to only NIDADDR using LNET_NIDADDR so we only + * have a single peer per device instead of a peer per nid/net combo. 
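The comment above is the key to peer lookup: only the address half of the NID is compared, so one physical peer reached over several LNet networks collapses to a single kgn_peer_t. The sketch below models that rule with local stand-in macros; the split assumed here (network number in the high 32 bits, address in the low 32 bits of the 64-bit NID) follows the standard LNET_NIDNET/LNET_NIDADDR layout, but the macros are not the libcfs ones.

/* Model of matching peers by NID address only, as kgnilnd_find_peer_locked
 * does below.  DEMO_NIDADDR/DEMO_NIDNET are stand-ins for the LNet macros. */
#include <stdint.h>
#include <stdio.h>

#define DEMO_NIDADDR(nid)	((uint32_t)((nid) & 0xffffffffULL))
#define DEMO_NIDNET(nid)	((uint32_t)((nid) >> 32))

static int same_peer(uint64_t a, uint64_t b)
{
	return DEMO_NIDADDR(a) == DEMO_NIDADDR(b);
}

int main(void)
{
	uint64_t nid_net1 = ((uint64_t)1 << 32) | 0x2a;	/* address 42 on net 1 */
	uint64_t nid_net2 = ((uint64_t)2 << 32) | 0x2a;	/* address 42 on net 2 */

	printf("nets %u and %u, same gnilnd peer: %d\n",
	       DEMO_NIDNET(nid_net1), DEMO_NIDNET(nid_net2),
	       same_peer(nid_net1, nid_net2));
	return 0;
}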
+ */ + + list_for_each_entry(peer, peer_list, gnp_list) { + if (LNET_NIDADDR(nid) != LNET_NIDADDR(peer->gnp_nid)) + continue; + + CDEBUG(D_NET, "got peer [%p] -> %s c %d (%d)\n", + peer, libcfs_nid2str(nid), + peer->gnp_connecting, + atomic_read(&peer->gnp_refcount)); + return peer; + } + return NULL; +} + +/* need write_lock on kgn_peer_conn_lock */ +void +kgnilnd_unlink_peer_locked(kgn_peer_t *peer) +{ + LASSERTF(list_empty(&peer->gnp_conns), + "peer 0x%p->%s\n", + peer, libcfs_nid2str(peer->gnp_nid)); + LASSERTF(list_empty(&peer->gnp_tx_queue), + "peer 0x%p->%s\n", + peer, libcfs_nid2str(peer->gnp_nid)); + LASSERTF(kgnilnd_peer_active(peer), + "peer 0x%p->%s\n", + peer, libcfs_nid2str(peer->gnp_nid)); + CDEBUG(D_NET, "unlinking peer 0x%p->%s\n", + peer, libcfs_nid2str(peer->gnp_nid)); + + list_del_init(&peer->gnp_list); + kgnilnd_data.kgn_peer_version++; + kgnilnd_admin_decref(kgnilnd_data.kgn_npending_unlink); + /* lose peerlist's ref */ + kgnilnd_peer_decref(peer); +} + +int +kgnilnd_get_peer_info(int index, + kgn_peer_t **found_peer, + lnet_nid_t *id, __u32 *nic_addr, + int *refcount, int *connecting) +{ + struct list_head *ptmp; + kgn_peer_t *peer; + int i; + int rc = -ENOENT; + + read_lock(&kgnilnd_data.kgn_peer_conn_lock); + + for (i = 0; i < *kgnilnd_tunables.kgn_peer_hash_size; i++) { + + list_for_each(ptmp, &kgnilnd_data.kgn_peers[i]) { + peer = list_entry(ptmp, kgn_peer_t, gnp_list); + + if (peer->gnp_nid != *id) + continue; + + if (index-- > 0) + continue; + + CDEBUG(D_NET, "found peer %p (%s) at index %d\n", + peer, libcfs_nid2str(peer->gnp_nid), index); + + *found_peer = peer; + *id = peer->gnp_nid; + *nic_addr = peer->gnp_host_id; + *refcount = atomic_read(&peer->gnp_refcount); + *connecting = peer->gnp_connecting; + + rc = 0; + goto out; + } + } +out: + read_unlock(&kgnilnd_data.kgn_peer_conn_lock); + if (rc) + CDEBUG(D_NET, "no gni peer at index %d\n", index); + return rc; +} + +/* requires write_lock on kgn_peer_conn_lock held */ +void +kgnilnd_add_peer_locked(lnet_nid_t nid, kgn_peer_t *new_stub_peer, kgn_peer_t **peerp) +{ + kgn_peer_t *peer, *peer2; + + LASSERTF(new_stub_peer != NULL, "bad stub peer for nid %s\n", + libcfs_nid2str(nid)); + + peer2 = kgnilnd_find_peer_locked(nid); + if (peer2 != NULL) { + /* A peer was created during the lock transition, so drop + * the new one we created */ + kgnilnd_peer_decref(new_stub_peer); + peer = peer2; + } else { + peer = new_stub_peer; + /* peer table takes existing ref on peer */ + + LASSERTF(!kgnilnd_peer_active(peer), + "peer 0x%p->%s already in peer table\n", + peer, libcfs_nid2str(peer->gnp_nid)); + list_add_tail(&peer->gnp_list, + kgnilnd_nid2peerlist(nid)); + kgnilnd_data.kgn_peer_version++; + } + + LASSERTF(peer->gnp_net != NULL, "peer 0x%p->%s with NULL net\n", + peer, libcfs_nid2str(peer->gnp_nid)); + *peerp = peer; +} + +int +kgnilnd_add_peer(kgn_net_t *net, lnet_nid_t nid, kgn_peer_t **peerp) +{ + kgn_peer_t *peer; + int rc; + ENTRY; + + if (nid == LNET_NID_ANY) + return -EINVAL; + + /* NB - this will not block during normal operations - + * the only writer of this is in the startup/shutdown path. 
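kgnilnd_peer_increase_reconnect_locked, a little earlier in this file, reconnects immediately after the first failure and then waits the stored interval, adding half of the minimum interval after each subsequent failure and capping at the maximum. Below is a stand-alone sketch of that schedule; the 60 s/600 s values are assumed examples, not the kgn_min_reconnect_interval/kgn_max_reconnect_interval defaults.

/* Sketch of the back-off in kgnilnd_peer_increase_reconnect_locked: the first
 * failure retries at once, later failures wait the stored interval, which
 * grows by min/2 each time and is capped at max.
 * The tunable values below are assumed for illustration only. */
#include <stdio.h>

int main(void)
{
	const int min_interval = 60;	/* assumed seconds */
	const int max_interval = 600;	/* assumed seconds */
	int interval = 0;
	int failure;

	for (failure = 1; failure <= 22; failure++) {
		int wait = interval;	/* 0 on the first failure: retry at once */

		printf("failure %2d: next attempt in %3d s\n", failure, wait);

		if (interval == 0)
			interval = min_interval;
		else
			interval += min_interval / 2;
		if (interval > max_interval)
			interval = max_interval;
	}
	return 0;
}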
*/ + rc = down_read_trylock(&kgnilnd_data.kgn_net_rw_sem); + if (!rc) { + rc = -ESHUTDOWN; + RETURN(rc); + } + rc = kgnilnd_create_peer_safe(&peer, nid, net); + if (rc != 0) { + up_read(&kgnilnd_data.kgn_net_rw_sem); + RETURN(rc); + } + + write_lock(&kgnilnd_data.kgn_peer_conn_lock); + up_read(&kgnilnd_data.kgn_net_rw_sem); + + kgnilnd_add_peer_locked(nid, peer, peerp); + + CDEBUG(D_NET, "peer 0x%p->%s connecting %d\n", + peerp, libcfs_nid2str((*peerp)->gnp_nid), + (*peerp)->gnp_connecting); + + write_unlock(&kgnilnd_data.kgn_peer_conn_lock); + RETURN(0); +} + +/* needs write_lock on kgn_peer_conn_lock */ +void +kgnilnd_cancel_peer_connect_locked(kgn_peer_t *peer, struct list_head *zombies) +{ + kgn_tx_t *tx, *txn; + + /* we do care about state of gnp_connecting - we could be between + * reconnect attempts, so try to find the dgram and cancel the TX + * anyways. If we are in the process of posting DONT do anything; + * once it fails or succeeds we can nuke the connect attempt. + * We have no idea where in kgnilnd_post_dgram we are so we cant + * attempt to cancel until the function is done. + */ + + /* make sure peer isn't in process of connecting or waiting for connect*/ + spin_lock(&peer->gnp_net->gnn_dev->gnd_connd_lock); + if (!(list_empty(&peer->gnp_connd_list))) { + list_del_init(&peer->gnp_connd_list); + /* remove connd ref */ + kgnilnd_peer_decref(peer); + } + spin_unlock(&peer->gnp_net->gnn_dev->gnd_connd_lock); + + if (peer->gnp_connecting == GNILND_PEER_POSTING || peer->gnp_connecting == GNILND_PEER_NEEDS_DEATH) { + peer->gnp_connecting = GNILND_PEER_NEEDS_DEATH; + /* We are in process of posting right now the xchg set it up for us to + * cancel the connect so we are finished for now */ + } else { + /* no need for exchange we have the peer lock and its ready for us to nuke */ + LASSERTF(peer->gnp_connecting != GNILND_PEER_POSTING, + "Peer in invalid state 0x%p->%s, connecting %d\n", + peer, libcfs_nid2str(peer->gnp_nid), peer->gnp_connecting); + peer->gnp_connecting = GNILND_PEER_IDLE; + set_mb(peer->gnp_last_dgram_errno, -ETIMEDOUT); + kgnilnd_find_and_cancel_dgram(peer->gnp_net->gnn_dev, + peer->gnp_nid); + } + + /* The least we can do is nuke the tx's no matter what.... 
*/ + list_for_each_entry_safe(tx, txn, &peer->gnp_tx_queue, tx_list) { + kgnilnd_tx_del_state_locked(tx, peer, NULL, + GNILND_TX_ALLOCD); + list_add_tail(&tx->tx_list, zombies); + } +} + +/* needs write_lock on kgn_peer_conn_lock */ +void +kgnilnd_del_peer_locked(kgn_peer_t *peer, int error) +{ + /* this peer could be passive and only held for purgatory, + * take a ref to ensure it doesn't disappear in this function */ + kgnilnd_peer_addref(peer); + + CFS_RACE(CFS_FAIL_GNI_FIND_TARGET); + + /* if purgatory release cleared it out, don't try again */ + if (kgnilnd_peer_active(peer)) { + /* always do this to allow kgnilnd_start_connect and + * kgnilnd_finish_connect to catch this before they + * wrap up their operations */ + if (kgnilnd_can_unlink_peer_locked(peer)) { + /* already released purgatory, so only active + * conns hold it */ + kgnilnd_unlink_peer_locked(peer); + } else { + kgnilnd_close_peer_conns_locked(peer, error); + /* peer unlinks itself when last conn is closed */ + } + } + + /* we are done, release back to the wild */ + kgnilnd_peer_decref(peer); +} + +int +kgnilnd_del_conn_or_peer(kgn_net_t *net, lnet_nid_t nid, int command, + int error) +{ + LIST_HEAD (souls); + LIST_HEAD (zombies); + struct list_head *ptmp, *pnxt; + kgn_peer_t *peer; + int lo; + int hi; + int i; + int rc = -ENOENT; + + write_lock(&kgnilnd_data.kgn_peer_conn_lock); + + if (nid != LNET_NID_ANY) + lo = hi = kgnilnd_nid2peerlist(nid) - kgnilnd_data.kgn_peers; + else { + lo = 0; + hi = *kgnilnd_tunables.kgn_peer_hash_size - 1; + /* wildcards always succeed */ + rc = 0; + } + + for (i = lo; i <= hi; i++) { + list_for_each_safe(ptmp, pnxt, &kgnilnd_data.kgn_peers[i]) { + peer = list_entry(ptmp, kgn_peer_t, gnp_list); + + LASSERTF(peer->gnp_net != NULL, + "peer %p (%s) with NULL net\n", + peer, libcfs_nid2str(peer->gnp_nid)); + + if (net != NULL && peer->gnp_net != net) + continue; + + if (!(nid == LNET_NID_ANY || LNET_NIDADDR(peer->gnp_nid) == LNET_NIDADDR(nid))) + continue; + + /* In both cases, we want to stop any in-flight + * connect attempts */ + kgnilnd_cancel_peer_connect_locked(peer, &zombies); + + switch (command) { + case GNILND_DEL_CONN: + kgnilnd_close_peer_conns_locked(peer, error); + break; + case GNILND_DEL_PEER: + peer->gnp_pending_unlink = 1; + kgnilnd_admin_addref(kgnilnd_data.kgn_npending_unlink); + kgnilnd_mark_for_detach_purgatory_all_locked(peer); + kgnilnd_del_peer_locked(peer, error); + break; + case GNILND_CLEAR_PURGATORY: + /* Mark everything ready for detach reaper will cleanup + * once we release the kgn_peer_conn_lock + */ + kgnilnd_mark_for_detach_purgatory_all_locked(peer); + peer->gnp_last_errno = -EISCONN; + /* clear reconnect so he can reconnect soon */ + peer->gnp_reconnect_time = 0; + peer->gnp_reconnect_interval = 0; + break; + default: + CERROR("bad command %d\n", command); + LBUG(); + } + /* we matched something */ + rc = 0; + } + } + + write_unlock(&kgnilnd_data.kgn_peer_conn_lock); + + /* release all of the souls found held in purgatory */ + kgnilnd_release_purgatory_list(&souls); + + /* nuke peer TX */ + kgnilnd_txlist_done(&zombies, error); + + /* This function does not return until the commands it initiated have completed, + * since they have to work there way through the other threads. In the case of shutdown + * threads are not woken up until after this call is initiated so we cannot wait, we just + * need to return. The same applies for stack reset we shouldnt wait as the reset thread + * handles closing. 
+ */ + + CFS_RACE(CFS_FAIL_GNI_RACE_RESET); + + if (error == -ENOTRECOVERABLE || error == -ESHUTDOWN) { + return rc; + } + + i = 4; + while (atomic_read(&kgnilnd_data.kgn_npending_conns) || + atomic_read(&kgnilnd_data.kgn_npending_detach) || + atomic_read(&kgnilnd_data.kgn_npending_unlink)) { + + cfs_pause(cfs_time_seconds(1)); + i++; + + CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, "Waiting on %d peers %d closes %d detaches\n", + atomic_read(&kgnilnd_data.kgn_npending_unlink), + atomic_read(&kgnilnd_data.kgn_npending_conns), + atomic_read(&kgnilnd_data.kgn_npending_detach)); + } + + return rc; +} + +kgn_conn_t * +kgnilnd_get_conn_by_idx(int index) +{ + kgn_peer_t *peer; + struct list_head *ptmp; + kgn_conn_t *conn; + struct list_head *ctmp; + int i; + + + for (i = 0; i < *kgnilnd_tunables.kgn_peer_hash_size; i++) { + read_lock(&kgnilnd_data.kgn_peer_conn_lock); + list_for_each(ptmp, &kgnilnd_data.kgn_peers[i]) { + + peer = list_entry(ptmp, kgn_peer_t, gnp_list); + + list_for_each(ctmp, &peer->gnp_conns) { + conn = list_entry(ctmp, kgn_conn_t, gnc_list); + + if (conn->gnc_state != GNILND_CONN_ESTABLISHED) + continue; + + if (index-- > 0) + continue; + + CDEBUG(D_NET, "++conn[%p] -> %s (%d)\n", conn, + libcfs_nid2str(conn->gnc_peer->gnp_nid), + atomic_read(&conn->gnc_refcount)); + kgnilnd_conn_addref(conn); + read_unlock(&kgnilnd_data.kgn_peer_conn_lock); + return conn; + } + } + read_unlock(&kgnilnd_data.kgn_peer_conn_lock); + } + + return NULL; +} + +int +kgnilnd_get_conn_info(kgn_peer_t *peer, + int *device_id, __u64 *peerstamp, + int *tx_seq, int *rx_seq, + int *fmaq_len, int *nfma, int *nrdma) +{ + kgn_conn_t *conn; + int rc = 0; + + read_lock(&kgnilnd_data.kgn_peer_conn_lock); + + conn = kgnilnd_find_conn_locked(peer); + if (conn == NULL) { + rc = -ENOENT; + goto out; + } + + *device_id = conn->gnc_device->gnd_host_id; + *peerstamp = conn->gnc_peerstamp; + *tx_seq = conn->gnc_tx_seq; + *rx_seq = conn->gnc_rx_seq; + *fmaq_len = kgnilnd_count_list(&conn->gnc_fmaq); + *nfma = atomic_read(&conn->gnc_nlive_fma); + *nrdma = atomic_read(&conn->gnc_nlive_rdma); +out: + read_unlock(&kgnilnd_data.kgn_peer_conn_lock); + return rc; +} + +/* needs write_lock on kgn_peer_conn_lock */ +int +kgnilnd_close_peer_conns_locked(kgn_peer_t *peer, int why) +{ + kgn_conn_t *conn; + struct list_head *ctmp, *cnxt; + int count = 0; + + list_for_each_safe(ctmp, cnxt, &peer->gnp_conns) { + conn = list_entry(ctmp, kgn_conn_t, gnc_list); + + if (conn->gnc_state != GNILND_CONN_ESTABLISHED) + continue; + + count++; + /* we mark gnc_needs closing and increment kgn_npending_conns so that + * kgnilnd_del_conn_or_peer can wait on the other threads closing + * and cleaning up the connection. 
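The wait loop in kgnilnd_del_conn_or_peer above polls the kgn_npending_* counters once a second and picks its log level with ((i & (-i)) == i), which is true exactly when i is a power of two, so the WARNING-level message fires on exponentially spaced iterations rather than every second. A quick stand-alone illustration of the idiom:

/* (i & -i) isolates the lowest set bit of i; it equals i only when i is a
 * power of two, which is what throttles the louder log level above. */
#include <stdio.h>

int main(void)
{
	int i;

	for (i = 4; i <= 64; i++) {
		if ((i & (-i)) == i)
			printf("iteration %2d would log at WARNING level\n", i);
	}
	return 0;
}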
+ */ + if (!conn->gnc_needs_closing) { + conn->gnc_needs_closing = 1; + kgnilnd_admin_addref(kgnilnd_data.kgn_npending_conns); + } + kgnilnd_close_conn_locked(conn, why); + } + return count; +} + +int +kgnilnd_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg) +{ + struct libcfs_ioctl_data *data = arg; + kgn_net_t *net = ni->ni_data; + int rc = -EINVAL; + + LASSERT(ni == net->gnn_ni); + + switch (cmd) { + case IOC_LIBCFS_GET_PEER: { + lnet_nid_t nid = 0; + kgn_peer_t *peer = NULL; + __u32 nic_addr = 0; + __u64 peerstamp = 0; + int peer_refcount = 0, peer_connecting = 0; + int device_id = 0; + int tx_seq = 0, rx_seq = 0; + int fmaq_len = 0, nfma = 0, nrdma = 0; + + rc = kgnilnd_get_peer_info(data->ioc_count, &peer, + &nid, &nic_addr, &peer_refcount, + &peer_connecting); + if (rc) + break; + + /* Barf */ + /* LNET_MKNID is used to mask from lnet the multiplexing/demultiplexing of connections and peers + * LNET assumes a conn and peer per net, the LNET_MKNID/LNET_NIDADDR allows us to let Lnet see what it + * wants to see instead of the underlying network that is being used to send the data + */ + data->ioc_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), LNET_NIDADDR(nid)); + data->ioc_flags = peer_connecting; + data->ioc_count = peer_refcount; + + rc = kgnilnd_get_conn_info(peer, &device_id, &peerstamp, + &tx_seq, &rx_seq, &fmaq_len, + &nfma, &nrdma); + + /* This is allowable - a persistent peer could not + * have a connection */ + if (rc) { + /* flag to indicate we are not connected - + * need to print as such */ + data->ioc_flags |= (1<<16); + rc = 0; + } else { + /* still barf */ + data->ioc_net = device_id; + data->ioc_u64[0] = peerstamp; + data->ioc_u32[0] = fmaq_len; + data->ioc_u32[1] = nfma; + data->ioc_u32[2] = tx_seq; + data->ioc_u32[3] = rx_seq; + data->ioc_u32[4] = nrdma; + } + break; + } + case IOC_LIBCFS_ADD_PEER: { + /* just dummy value to allow using common interface */ + kgn_peer_t *peer; + rc = kgnilnd_add_peer(net, data->ioc_nid, &peer); + break; + } + case IOC_LIBCFS_DEL_PEER: { + /* NULL is passed in so it affects all peers in existence without regard to network + * as the peer may not exist on the network LNET believes it to be on. + */ + rc = kgnilnd_del_conn_or_peer(NULL, data->ioc_nid, + GNILND_DEL_PEER, -EUCLEAN); + break; + } + case IOC_LIBCFS_GET_CONN: { + kgn_conn_t *conn = kgnilnd_get_conn_by_idx(data->ioc_count); + + if (conn == NULL) + rc = -ENOENT; + else { + rc = 0; + /* LNET_MKNID is used to build the correct address based on what LNET wants to see instead of + * the generic connection that is used to send the data + */ + data->ioc_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), LNET_NIDADDR(conn->gnc_peer->gnp_nid)); + data->ioc_u32[0] = conn->gnc_device->gnd_id; + kgnilnd_conn_decref(conn); + } + break; + } + case IOC_LIBCFS_CLOSE_CONNECTION: { + /* use error = -ENETRESET to indicate it was lctl disconnect */ + /* NULL is passed in so it affects all the nets as the connection is virtual + * and may not exist on the network LNET believes it to be on. 
+ */ + rc = kgnilnd_del_conn_or_peer(NULL, data->ioc_nid, + GNILND_DEL_CONN, -ENETRESET); + break; + } + case IOC_LIBCFS_PUSH_CONNECTION: { + /* we use this to flush purgatory */ + rc = kgnilnd_del_conn_or_peer(NULL, data->ioc_nid, + GNILND_CLEAR_PURGATORY, -EUCLEAN); + break; + } + case IOC_LIBCFS_REGISTER_MYNID: { + /* Ignore if this is a noop */ + if (data->ioc_nid == ni->ni_nid) { + rc = 0; + } else { + CERROR("obsolete IOC_LIBCFS_REGISTER_MYNID: %s(%s)\n", + libcfs_nid2str(data->ioc_nid), + libcfs_nid2str(ni->ni_nid)); + rc = -EINVAL; + } + break; + } + } + + return rc; +} + +void +kgnilnd_query(lnet_ni_t *ni, lnet_nid_t nid, cfs_time_t *when) +{ + kgn_net_t *net = ni->ni_data; + kgn_tx_t *tx; + kgn_peer_t *peer = NULL; + kgn_conn_t *conn = NULL; + lnet_process_id_t id = {.nid = nid, .pid = LUSTRE_SRV_LNET_PID}; + ENTRY; + + /* I expect to find him, so only take a read lock */ + read_lock(&kgnilnd_data.kgn_peer_conn_lock); + peer = kgnilnd_find_peer_locked(nid); + if (peer != NULL) { + /* LIE if in a quiesce - we will update the timeouts after, + * but we don't want sends failing during it */ + if (kgnilnd_data.kgn_quiesce_trigger) { + *when = jiffies; + read_unlock(&kgnilnd_data.kgn_peer_conn_lock); + GOTO(out, 0); + } + + /* Update to best guess, might refine on later checks */ + *when = peer->gnp_last_alive; + + /* we have a peer, how about a conn? */ + conn = kgnilnd_find_conn_locked(peer); + + if (conn == NULL) { + /* if there is no conn, check peer last errno to see if clean disconnect + * - if it was, we lie to LNet because we believe a TX would complete + * on reconnect */ + if (kgnilnd_conn_clean_errno(peer->gnp_last_errno)) { + *when = jiffies; + } + /* we still want to fire a TX and new conn in this case */ + } else { + /* gnp_last_alive is valid, run for the hills */ + read_unlock(&kgnilnd_data.kgn_peer_conn_lock); + GOTO(out, 0); + } + } + /* if we get here, either we have no peer or no conn for him, so fire off + * new TX to trigger conn setup */ + read_unlock(&kgnilnd_data.kgn_peer_conn_lock); + + /* if we couldn't find him, we'll fire up a TX and get connected - + * if we don't do this, after ni_peer_timeout, LNet will declare him dead. + * So really we treat kgnilnd_query as a bit of a 'connect now' type + * event because it'll only do this when it wants to send + * + * Use a real TX for this to get the proper gnp_tx_queue behavior, etc + * normally we'd use kgnilnd_send_ctlmsg for this, but we don't really + * care that this goes out quickly since we already know we need a new conn + * formed */ + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_NOOP_SEND)) + return; + + tx = kgnilnd_new_tx_msg(GNILND_MSG_NOOP, ni->ni_nid); + if (tx != NULL) { + kgnilnd_launch_tx(tx, net, &id); + } +out: + CDEBUG(D_NETTRACE, "peer 0x%p->%s when %lu\n", peer, + libcfs_nid2str(nid), *when); + EXIT; +} + +int +kgnilnd_dev_init(kgn_device_t *dev) +{ + gni_return_t rrc; + int rc = 0; + unsigned int cq_size; + ENTRY; + + /* size of these CQs should be able to accommodate the outgoing + * RDMA and SMSG transactions. Since we really don't know what we + * really need here, we'll take credits * 2 * 3 to allow a bunch. + * We need to dig into this more with the performance work. 
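+ * For a rough sense of scale only: if kgn_credits were set to 256,
+ * this sizing would give 256 * 2 * 3 = 1536 entries per send CQ;
+ * 256 is purely an illustrative value here, not the module default.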
*/ + cq_size = *kgnilnd_tunables.kgn_credits * 2 * 3; + + rrc = kgnilnd_cdm_create(dev->gnd_id, *kgnilnd_tunables.kgn_ptag, + GNILND_COOKIE, 0, + &dev->gnd_domain); + if (rrc != GNI_RC_SUCCESS) { + CERROR("Can't create CDM %d (%d)\n", dev->gnd_id, rrc); + rc = -ENODEV; + GOTO(failed, rc); + } + + rrc = kgnilnd_cdm_attach(dev->gnd_domain, dev->gnd_id, + &dev->gnd_host_id, &dev->gnd_handle); + if (rrc != GNI_RC_SUCCESS) { + CERROR("Can't attach CDM to device %d (%d)\n", + dev->gnd_id, rrc); + rc = -ENODEV; + GOTO(failed, rc); + } + + rc = kgnilnd_setup_nic_translation(dev->gnd_host_id); + if (rc != 0) { + rc = -ENODEV; + GOTO(failed, rc); + } + + /* only dev 0 gets the errors - no need to reset the stack twice + * - this works because we have a single PTAG, if we had more + * then we'd need to have multiple handlers */ + if (dev->gnd_id == 0) { + rrc = kgnilnd_subscribe_errors(dev->gnd_handle, GNI_ERRMASK_CRITICAL, + 0, NULL, kgnilnd_critical_error, + &dev->gnd_err_handle); + if (rrc != GNI_RC_SUCCESS) { + CERROR("Can't subscribe for errors on device %d: rc %d\n", + dev->gnd_id, rrc); + rc = -ENODEV; + GOTO(failed, rc); + } + + rc = kgnilnd_set_quiesce_callback(dev->gnd_handle, + kgnilnd_quiesce_end_callback); + if (rc != GNI_RC_SUCCESS) { + CERROR("Can't subscribe for quiesce callback on device %d: rc %d\n", + dev->gnd_id, rrc); + rc = -ENODEV; + GOTO(failed, rc); + } + } + + rc = kgnilnd_nicaddr_to_nid(dev->gnd_host_id, &dev->gnd_nid); + if (rc < 0) { + /* log messages during startup */ + if (kgnilnd_data.kgn_init < GNILND_INIT_ALL) { + CERROR("couldn't translate host_id 0x%x to nid. rc %d\n", + dev->gnd_host_id, rc); + } + rc = -ESRCH; + GOTO(failed, rc); + } + CDEBUG(D_NET, "NIC %x -> NID %d\n", dev->gnd_host_id, dev->gnd_nid); + + rrc = kgnilnd_cq_create(dev->gnd_handle, cq_size, + 0, kgnilnd_device_callback, + dev->gnd_id, &dev->gnd_snd_rdma_cqh); + if (rrc != GNI_RC_SUCCESS) { + CERROR("Can't create rdma send cq size %u for device " + "%d (%d)\n", cq_size, dev->gnd_id, rrc); + rc = -EINVAL; + GOTO(failed, rc); + } + + rrc = kgnilnd_cq_create(dev->gnd_handle, cq_size, + 0, kgnilnd_device_callback, dev->gnd_id, + &dev->gnd_snd_fma_cqh); + if (rrc != GNI_RC_SUCCESS) { + CERROR("Can't create fma send cq size %u for device %d (%d)\n", + cq_size, dev->gnd_id, rrc); + rc = -EINVAL; + GOTO(failed, rc); + } + + /* This one we size differently - overflows are possible and it needs to be + * sized based on machine size */ + rrc = kgnilnd_cq_create(dev->gnd_handle, + *kgnilnd_tunables.kgn_fma_cq_size, + 0, kgnilnd_device_callback, dev->gnd_id, + &dev->gnd_rcv_fma_cqh); + if (rrc != GNI_RC_SUCCESS) { + CERROR("Can't create fma cq size %d for device %d (%d)\n", + *kgnilnd_tunables.kgn_fma_cq_size, dev->gnd_id, rrc); + rc = -EINVAL; + GOTO(failed, rc); + } + + RETURN(0); + +failed: + kgnilnd_dev_fini(dev); + RETURN(rc); +} + +void +kgnilnd_dev_fini(kgn_device_t *dev) +{ + gni_return_t rrc; + ENTRY; + + /* At quiesce or rest time, need to loop through and clear gnd_ready_conns ?*/ + LASSERTF(list_empty(&dev->gnd_ready_conns) && + list_empty(&dev->gnd_map_tx) && + list_empty(&dev->gnd_rdmaq), + "dev 0x%p ready_conns %d@0x%p map_tx %d@0x%p rdmaq %d@0x%p\n", + dev, kgnilnd_count_list(&dev->gnd_ready_conns), &dev->gnd_ready_conns, + kgnilnd_count_list(&dev->gnd_map_tx), &dev->gnd_map_tx, + kgnilnd_count_list(&dev->gnd_rdmaq), &dev->gnd_rdmaq); + + /* These should follow from tearing down all connections */ + LASSERTF(dev->gnd_map_nphys == 0 && dev->gnd_map_physnop == 0, + "%d physical mappings of %d 
pages still mapped\n", + dev->gnd_map_nphys, dev->gnd_map_physnop); + + LASSERTF(dev->gnd_map_nvirt == 0 && dev->gnd_map_virtnob == 0, + "%d virtual mappings of "LPU64" bytes still mapped\n", + dev->gnd_map_nvirt, dev->gnd_map_virtnob); + + LASSERTF(atomic_read(&dev->gnd_n_mdd) == 0 && + atomic_read(&dev->gnd_n_mdd_held) == 0 && + atomic64_read(&dev->gnd_nbytes_map) == 0, + "%d SMSG mappings of %ld bytes still mapped or held %d\n", + atomic_read(&dev->gnd_n_mdd), + atomic64_read(&dev->gnd_nbytes_map), atomic_read(&dev->gnd_n_mdd_held)); + + LASSERT(list_empty(&dev->gnd_map_list)); + + /* What other assertions needed to ensure all connections torn down ? */ + + /* check all counters == 0 (EP, MDD, etc) */ + + /* if we are resetting due to quiese (stack reset), don't check + * thread states */ + LASSERTF(kgnilnd_data.kgn_quiesce_trigger || + atomic_read(&kgnilnd_data.kgn_nthreads) == 0, + "tried to shutdown with threads active\n"); + + if (dev->gnd_rcv_fma_cqh) { + rrc = kgnilnd_cq_destroy(dev->gnd_rcv_fma_cqh); + LASSERTF(rrc == GNI_RC_SUCCESS, + "bad rc from gni_cq_destroy on rcv_fma_cqh: %d\n", rrc); + dev->gnd_rcv_fma_cqh = NULL; + } + + if (dev->gnd_snd_rdma_cqh) { + rrc = kgnilnd_cq_destroy(dev->gnd_snd_rdma_cqh); + LASSERTF(rrc == GNI_RC_SUCCESS, + "bad rc from gni_cq_destroy on send_rdma_cqh: %d\n", rrc); + dev->gnd_snd_rdma_cqh = NULL; + } + + if (dev->gnd_snd_fma_cqh) { + rrc = kgnilnd_cq_destroy(dev->gnd_snd_fma_cqh); + LASSERTF(rrc == GNI_RC_SUCCESS, + "bad rc from gni_cq_destroy on snd_fma_cqh: %d\n", rrc); + dev->gnd_snd_fma_cqh = NULL; + } + + if (dev->gnd_err_handle) { + rrc = kgnilnd_release_errors(dev->gnd_err_handle); + LASSERTF(rrc == GNI_RC_SUCCESS, + "bad rc from gni_release_errors: %d\n", rrc); + dev->gnd_err_handle = NULL; + } + + if (dev->gnd_domain) { + rrc = kgnilnd_cdm_destroy(dev->gnd_domain); + LASSERTF(rrc == GNI_RC_SUCCESS, + "bad rc from gni_cdm_destroy: %d\n", rrc); + dev->gnd_domain = NULL; + } + + EXIT; +} + + +int kgnilnd_base_startup(void) +{ + struct timeval tv; + int pkmem = atomic_read(&libcfs_kmemory); + int rc; + int i; + kgn_device_t *dev; + struct task_struct *thrd; + ENTRY; + + LASSERTF(kgnilnd_data.kgn_init == GNILND_INIT_NOTHING, + "init %d\n", kgnilnd_data.kgn_init); + + /* zero pointers, flags etc */ + memset(&kgnilnd_data, 0, sizeof(kgnilnd_data)); + memset(&kgnilnd_hssops, 0, sizeof(kgnilnd_hssops)); + + /* CAVEAT EMPTOR: Every 'Fma' message includes the sender's NID and + * a unique (for all time) connstamp so we can uniquely identify + * the sender. The connstamp is an incrementing counter + * initialised with seconds + microseconds at startup time. So we + * rely on NOT creating connections more frequently on average than + * 1MHz to ensure we don't use old connstamps when we reboot. 
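+ * Worked example: booting at tv_sec = 1354730000, tv_usec = 500000
+ * gives an initial stamp of 1354730000 * 1000000 + 500000 =
+ * 1354730000500000, and each new connection then increments from there.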
*/ + do_gettimeofday(&tv); + kgnilnd_data.kgn_connstamp = + kgnilnd_data.kgn_peerstamp = + (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec; + + init_rwsem(&kgnilnd_data.kgn_net_rw_sem); + + for (i = 0; i < GNILND_MAXDEVS; i++) { + kgn_device_t *dev = &kgnilnd_data.kgn_devices[i]; + + dev->gnd_id = i; + INIT_LIST_HEAD(&dev->gnd_ready_conns); + INIT_LIST_HEAD(&dev->gnd_map_tx); + INIT_LIST_HEAD(&dev->gnd_fma_buffs); + mutex_init(&dev->gnd_cq_mutex); + sema_init(&dev->gnd_fmablk_sem, 1); + spin_lock_init(&dev->gnd_fmablk_lock); + init_waitqueue_head(&dev->gnd_waitq); + init_waitqueue_head(&dev->gnd_dgram_waitq); + init_waitqueue_head(&dev->gnd_dgping_waitq); + spin_lock_init(&dev->gnd_lock); + INIT_LIST_HEAD(&dev->gnd_map_list); + spin_lock_init(&dev->gnd_map_lock); + atomic_set(&dev->gnd_nfmablk, 0); + atomic_set(&dev->gnd_fmablk_vers, 1); + atomic_set(&dev->gnd_neps, 0); + atomic_set(&dev->gnd_canceled_dgrams, 0); + INIT_LIST_HEAD(&dev->gnd_connd_peers); + spin_lock_init(&dev->gnd_connd_lock); + spin_lock_init(&dev->gnd_dgram_lock); + spin_lock_init(&dev->gnd_rdmaq_lock); + INIT_LIST_HEAD(&dev->gnd_rdmaq); + + /* alloc & setup nid based dgram table */ + LIBCFS_ALLOC(dev->gnd_dgrams, + sizeof(struct list_head) * *kgnilnd_tunables.kgn_peer_hash_size); + + if (dev->gnd_dgrams == NULL) { + rc = -ENOMEM; + GOTO(failed, rc); + } + + for (i = 0; i < *kgnilnd_tunables.kgn_peer_hash_size; i++) { + INIT_LIST_HEAD(&dev->gnd_dgrams[i]); + } + atomic_set(&dev->gnd_ndgrams, 0); + + /* setup timer for RDMAQ processing */ + setup_timer(&dev->gnd_rdmaq_timer, kgnilnd_schedule_device_timer, + (unsigned long)dev); + } + + /* CQID 0 isn't allowed, set to MAX_MSG_ID - 1 to check for conflicts early */ + kgnilnd_data.kgn_next_cqid = GNILND_MAX_MSG_ID - 1; + kgnilnd_data.kgn_new_min_timeout = *kgnilnd_tunables.kgn_timeout; + init_waitqueue_head(&kgnilnd_data.kgn_reaper_waitq); + init_waitqueue_head(&kgnilnd_data.kgn_ruhroh_waitq); + spin_lock_init(&kgnilnd_data.kgn_reaper_lock); + + sema_init(&kgnilnd_data.kgn_quiesce_sem, 1); + atomic_set(&kgnilnd_data.kgn_nquiesce, 0); + atomic_set(&kgnilnd_data.kgn_npending_conns, 0); + atomic_set(&kgnilnd_data.kgn_npending_unlink, 0); + atomic_set(&kgnilnd_data.kgn_npending_detach, 0); + /* OK to call kgnilnd_api_shutdown() to cleanup now */ + kgnilnd_data.kgn_init = GNILND_INIT_DATA; + PORTAL_MODULE_USE; + + rwlock_init(&kgnilnd_data.kgn_peer_conn_lock); + + LIBCFS_ALLOC(kgnilnd_data.kgn_peers, + sizeof(struct list_head) * *kgnilnd_tunables.kgn_peer_hash_size); + + if (kgnilnd_data.kgn_peers == NULL) { + rc = -ENOMEM; + GOTO(failed, rc); + } + + for (i = 0; i < *kgnilnd_tunables.kgn_peer_hash_size; i++) { + INIT_LIST_HEAD(&kgnilnd_data.kgn_peers[i]); + } + + LIBCFS_ALLOC(kgnilnd_data.kgn_conns, + sizeof(struct list_head) * *kgnilnd_tunables.kgn_peer_hash_size); + + if (kgnilnd_data.kgn_conns == NULL) { + rc = -ENOMEM; + GOTO(failed, rc); + } + + for (i = 0; i < *kgnilnd_tunables.kgn_peer_hash_size; i++) { + INIT_LIST_HEAD(&kgnilnd_data.kgn_conns[i]); + } + + LIBCFS_ALLOC(kgnilnd_data.kgn_nets, + sizeof(struct list_head) * *kgnilnd_tunables.kgn_net_hash_size); + + if (kgnilnd_data.kgn_nets == NULL) { + rc = -ENOMEM; + GOTO(failed, rc); + } + + for (i = 0; i < *kgnilnd_tunables.kgn_net_hash_size; i++) { + INIT_LIST_HEAD(&kgnilnd_data.kgn_nets[i]); + } + + kgnilnd_data.kgn_mbox_cache = + cfs_mem_cache_create("kgn_mbox_block", + KMALLOC_MAX_SIZE, + 0, /* offset */ + SLAB_HWCACHE_ALIGN); /* flags */ + if (kgnilnd_data.kgn_mbox_cache == NULL) { + CERROR("Can't create slab for physical 
mbox blocks\n"); + rc = -ENOMEM; + GOTO(failed, rc); + } + + kgnilnd_data.kgn_rx_cache = + cfs_mem_cache_create("kgn_rx_t", + sizeof(kgn_rx_t), + 0, /* offset */ + 0); /* flags */ + if (kgnilnd_data.kgn_rx_cache == NULL) { + CERROR("Can't create slab for kgn_rx_t descriptors\n"); + rc = -ENOMEM; + GOTO(failed, rc); + } + + kgnilnd_data.kgn_tx_cache = + cfs_mem_cache_create("kgn_tx_t", + sizeof(kgn_tx_t), + 0, /* offset */ + 0); /* flags */ + if (kgnilnd_data.kgn_tx_cache == NULL) { + CERROR("Can't create slab for kgn_tx_t\n"); + rc = -ENOMEM; + GOTO(failed, rc); + } + + kgnilnd_data.kgn_tx_phys_cache = + cfs_mem_cache_create("kgn_tx_phys", + LNET_MAX_IOV * sizeof(gni_mem_segment_t), + 0, /* offset */ + 0); /* flags */ + if (kgnilnd_data.kgn_tx_phys_cache == NULL) { + CERROR("Can't create slab for kgn_tx_phys\n"); + rc = -ENOMEM; + GOTO(failed, rc); + } + + kgnilnd_data.kgn_dgram_cache = + cfs_mem_cache_create("kgn_dgram_t", + sizeof(kgn_dgram_t), + 0, /* offset */ + 0); /* flags */ + if (kgnilnd_data.kgn_dgram_cache == NULL) { + CERROR("Can't create slab for outgoing datagrams\n"); + rc = -ENOMEM; + GOTO(failed, rc); + } + + /* allocate a MAX_IOV array of page pointers for each cpu */ + kgnilnd_data.kgn_cksum_map_pages = kmalloc(num_possible_cpus() * sizeof (struct page *), + GFP_KERNEL); + if (kgnilnd_data.kgn_cksum_map_pages == NULL) { + CERROR("Can't allocate vmap cksum pages\n"); + rc = -ENOMEM; + GOTO(failed, rc); + } + kgnilnd_data.kgn_cksum_npages = num_possible_cpus(); + memset(kgnilnd_data.kgn_cksum_map_pages, 0, + kgnilnd_data.kgn_cksum_npages * sizeof (struct page *)); + + for (i = 0; i < kgnilnd_data.kgn_cksum_npages; i++) { + kgnilnd_data.kgn_cksum_map_pages[i] = kmalloc(LNET_MAX_IOV * sizeof (struct page *), + GFP_KERNEL); + if (kgnilnd_data.kgn_cksum_map_pages[i] == NULL) { + CERROR("Can't allocate vmap cksum pages for cpu %d\n", i); + rc = -ENOMEM; + GOTO(failed, rc); + } + } + + LASSERT(kgnilnd_data.kgn_ndevs == 0); + + /* Use all available GNI devices */ + for (i = 0; i < GNILND_MAXDEVS; i++) { + dev = &kgnilnd_data.kgn_devices[kgnilnd_data.kgn_ndevs]; + + rc = kgnilnd_dev_init(dev); + if (rc == 0) { + /* Increment here so base_shutdown cleans it up */ + kgnilnd_data.kgn_ndevs++; + + rc = kgnilnd_allocate_phys_fmablk(dev); + if (rc) { + GOTO(failed, rc); + } + } + } + + if (kgnilnd_data.kgn_ndevs == 0) { + CERROR("Can't initialise any GNI devices\n"); + rc = -ENODEV; + GOTO(failed, rc); + } + + rc = kgnilnd_thread_start(kgnilnd_reaper, NULL, "kgnilnd_rpr", 0); + if (rc != 0) { + CERROR("Can't spawn gnilnd reaper: %d\n", rc); + GOTO(failed, rc); + } + + /* + * Start ruhroh thread. We can't use kgnilnd_thread_start() because + * we don't want this thread included in kgnilnd_data.kgn_nthreads + * count. This thread controls quiesce, so it mustn't + * quiesce itself. 
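+ * (kgn_nthreads is the counter kgnilnd_base_shutdown() spins on while
+ * waiting for worker threads to exit; the ruhroh thread instead uses
+ * its own kgn_ruhroh_shutdown / kgn_ruhroh_running handshake there.)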
+ */ + thrd = kthread_run(kgnilnd_ruhroh_thread, NULL, "%s_%02d", "kgnilnd_rr", 0); + if (IS_ERR(thrd)) { + rc = PTR_ERR(thrd); + CERROR("Can't spawn gnilnd ruhroh thread: %d\n", rc); + GOTO(failed, rc); + } + + /* threads will load balance across devs as they are available */ + for (i = 0; i < *kgnilnd_tunables.kgn_sched_threads; i++) { + rc = kgnilnd_thread_start(kgnilnd_scheduler, (void *)((long)i), + "kgnilnd_sd", i); + if (rc != 0) { + CERROR("Can't spawn gnilnd scheduler[%d]: %d\n", + i, rc); + GOTO(failed, rc); + } + } + + for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) { + dev = &kgnilnd_data.kgn_devices[i]; + rc = kgnilnd_thread_start(kgnilnd_dgram_mover, dev, + "kgnilnd_dg", dev->gnd_id); + if (rc != 0) { + CERROR("Can't spawn gnilnd dgram_mover[%d]: %d\n", + dev->gnd_id, rc); + GOTO(failed, rc); + } + + rc = kgnilnd_thread_start(kgnilnd_dgram_waitq, dev, + "kgnilnd_dgn", dev->gnd_id); + if (rc != 0) { + CERROR("Can't spawn gnilnd dgram_waitq[%d]: %d\n", + dev->gnd_id, rc); + GOTO(failed, rc); + } + + rc = kgnilnd_setup_wildcard_dgram(dev); + + if (rc != 0) { + CERROR("Can't create wildcard dgrams[%d]: %d\n", + dev->gnd_id, rc); + GOTO(failed, rc); + } + } + + + + /* flag everything initialised */ + kgnilnd_data.kgn_init = GNILND_INIT_ALL; + /*****************************************************/ + + CDEBUG(D_MALLOC, "initial kmem %d\n", pkmem); + RETURN(0); + +failed: + kgnilnd_base_shutdown(); + kgnilnd_data.kgn_init = GNILND_INIT_NOTHING; + RETURN(rc); +} + +void +kgnilnd_base_shutdown(void) +{ + int i; + ENTRY; + + while (CFS_FAIL_TIMEOUT(CFS_FAIL_GNI_PAUSE_SHUTDOWN, 1)) {}; + + kgnilnd_data.kgn_wc_kill = 1; + + for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) { + kgn_device_t *dev = &kgnilnd_data.kgn_devices[i]; + kgnilnd_cancel_wc_dgrams(dev); + kgnilnd_del_conn_or_peer(NULL, LNET_NID_ANY, GNILND_DEL_PEER, -ESHUTDOWN); + kgnilnd_wait_for_canceled_dgrams(dev); + } + + /* Peer state all cleaned up BEFORE setting shutdown, so threads don't + * have to worry about shutdown races. NB connections may be created + * while there are still active connds, but these will be temporary + * since peer creation always fails after the listener has started to + * shut down. + * all peers should have been cleared out on the nets */ + LASSERTF(atomic_read(&kgnilnd_data.kgn_npeers) == 0, + "peers left %d\n", atomic_read(&kgnilnd_data.kgn_npeers)); + + /* Wait for the ruhroh thread to shut down. */ + kgnilnd_data.kgn_ruhroh_shutdown = 1; + wake_up(&kgnilnd_data.kgn_ruhroh_waitq); + i = 2; + while (kgnilnd_data.kgn_ruhroh_running != 0) { + i++; + CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, + "Waiting for ruhroh thread to terminate\n"); + cfs_pause(cfs_time_seconds(1)); + } + + /* Flag threads to terminate */ + kgnilnd_data.kgn_shutdown = 1; + + for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) { + kgn_device_t *dev = &kgnilnd_data.kgn_devices[i]; + + /* should clear all the MDDs */ + kgnilnd_unmap_phys_fmablk(dev); + + kgnilnd_schedule_device(dev); + wake_up_all(&dev->gnd_dgram_waitq); + wake_up_all(&dev->gnd_dgping_waitq); + LASSERT(list_empty(&dev->gnd_connd_peers)); + } + + spin_lock(&kgnilnd_data.kgn_reaper_lock); + wake_up_all(&kgnilnd_data.kgn_reaper_waitq); + spin_unlock(&kgnilnd_data.kgn_reaper_lock); + + /* Wait for threads to exit */ + i = 2; + while (atomic_read(&kgnilnd_data.kgn_nthreads) != 0) { + i++; + CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? 
*/ + "Waiting for %d threads to terminate\n", + atomic_read(&kgnilnd_data.kgn_nthreads)); + cfs_pause(cfs_time_seconds(1)); + } + + LASSERTF(atomic_read(&kgnilnd_data.kgn_npeers) == 0, + "peers left %d\n", atomic_read(&kgnilnd_data.kgn_npeers)); + + if (kgnilnd_data.kgn_peers != NULL) { + for (i = 0; i < *kgnilnd_tunables.kgn_peer_hash_size; i++) + LASSERT(list_empty(&kgnilnd_data.kgn_peers[i])); + + LIBCFS_FREE(kgnilnd_data.kgn_peers, + sizeof (struct list_head) * + *kgnilnd_tunables.kgn_peer_hash_size); + } + + down_write(&kgnilnd_data.kgn_net_rw_sem); + if (kgnilnd_data.kgn_nets != NULL) { + for (i = 0; i < *kgnilnd_tunables.kgn_net_hash_size; i++) + LASSERT(list_empty(&kgnilnd_data.kgn_nets[i])); + + LIBCFS_FREE(kgnilnd_data.kgn_nets, + sizeof (struct list_head) * + *kgnilnd_tunables.kgn_net_hash_size); + } + up_write(&kgnilnd_data.kgn_net_rw_sem); + + LASSERTF(atomic_read(&kgnilnd_data.kgn_nconns) == 0, + "conns left %d\n", atomic_read(&kgnilnd_data.kgn_nconns)); + + if (kgnilnd_data.kgn_conns != NULL) { + for (i = 0; i < *kgnilnd_tunables.kgn_peer_hash_size; i++) + LASSERT(list_empty(&kgnilnd_data.kgn_conns[i])); + + LIBCFS_FREE(kgnilnd_data.kgn_conns, + sizeof (struct list_head) * + *kgnilnd_tunables.kgn_peer_hash_size); + } + + for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) { + kgn_device_t *dev = &kgnilnd_data.kgn_devices[i]; + kgnilnd_dev_fini(dev); + + LASSERTF(atomic_read(&dev->gnd_ndgrams) == 0, + "dgrams left %d\n", atomic_read(&dev->gnd_ndgrams)); + + if (dev->gnd_dgrams != NULL) { + for (i = 0; i < *kgnilnd_tunables.kgn_peer_hash_size; i++) + LASSERT(list_empty(&dev->gnd_dgrams[i])); + + LIBCFS_FREE(dev->gnd_dgrams, + sizeof (struct list_head) * + *kgnilnd_tunables.kgn_peer_hash_size); + } + + kgnilnd_free_phys_fmablk(dev); + } + + if (kgnilnd_data.kgn_mbox_cache != NULL) { + i = cfs_mem_cache_destroy(kgnilnd_data.kgn_mbox_cache); + LASSERTF(i == 0, "rc %d destroying kgn_mbox_cache\n", i); + } + + if (kgnilnd_data.kgn_rx_cache != NULL) { + i = cfs_mem_cache_destroy(kgnilnd_data.kgn_rx_cache); + LASSERTF(i == 0, "rc %d destroying kgn_rx_cache\n", i); + } + + if (kgnilnd_data.kgn_tx_cache != NULL) { + i = cfs_mem_cache_destroy(kgnilnd_data.kgn_tx_cache); + LASSERTF(i == 0, "rc %d destroying kgn_tx_cache\n", i); + } + + if (kgnilnd_data.kgn_tx_phys_cache != NULL) { + i = cfs_mem_cache_destroy(kgnilnd_data.kgn_tx_phys_cache); + LASSERTF(i == 0, "rc %d destroying kgn_tx_phys_cache\n", i); + } + + if (kgnilnd_data.kgn_dgram_cache != NULL) { + i = cfs_mem_cache_destroy(kgnilnd_data.kgn_dgram_cache); + LASSERTF(i == 0, "rc %d destroying kgn_dgram_cache\n", i); + } + + if (kgnilnd_data.kgn_cksum_map_pages != NULL) { + for (i = 0; i < kgnilnd_data.kgn_cksum_npages; i++) { + if (kgnilnd_data.kgn_cksum_map_pages[i] != NULL) { + kfree(kgnilnd_data.kgn_cksum_map_pages[i]); + } + } + kfree(kgnilnd_data.kgn_cksum_map_pages); + } + + CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n", + atomic_read(&libcfs_kmemory)); + + kgnilnd_data.kgn_init = GNILND_INIT_NOTHING; + PORTAL_MODULE_UNUSE; + + EXIT; +} + +int +kgnilnd_startup(lnet_ni_t *ni) +{ + int rc, devno; + kgn_net_t *net; + ENTRY; + + LASSERTF(ni->ni_lnd == &the_kgnilnd, + "bad LND 0x%p != the_kgnilnd @ 0x%p\n", + ni->ni_lnd, &the_kgnilnd); + + if (kgnilnd_data.kgn_init == GNILND_INIT_NOTHING) { + rc = kgnilnd_base_startup(); + if (rc != 0) + RETURN(rc); + } + + /* Serialize with shutdown. 
*/ + down(&kgnilnd_data.kgn_quiesce_sem); + + LIBCFS_ALLOC(net, sizeof(*net)); + if (net == NULL) { + CERROR("could not allocate net for new interface instance\n"); + rc = -ENOMEM; + /* no need to cleanup the CDM... */ + GOTO(failed, rc); + } + INIT_LIST_HEAD(&net->gnn_list); + ni->ni_data = net; + net->gnn_ni = ni; + ni->ni_maxtxcredits = *kgnilnd_tunables.kgn_credits; + ni->ni_peertxcredits = *kgnilnd_tunables.kgn_peer_credits; + + if (*kgnilnd_tunables.kgn_peer_health) { + int fudge; + + /* give this a bit of leeway - we don't have a hard timeout + * as we only check timeouts periodically - see comment in kgnilnd_reaper */ + fudge = (GNILND_TO2KA(*kgnilnd_tunables.kgn_timeout) / GNILND_REAPER_NCHECKS); + + ni->ni_peertimeout = *kgnilnd_tunables.kgn_timeout + fudge; + + LCONSOLE_INFO("Enabling LNet peer health for gnilnd, timeout %ds\n", + ni->ni_peertimeout); + } + + atomic_set(&net->gnn_refcount, 1); + + /* if we have multiple devices, spread the nets around */ + net->gnn_netnum = LNET_NETNUM(LNET_NIDNET(ni->ni_nid)); + + devno = LNET_NIDNET(ni->ni_nid) % GNILND_MAXDEVS; + net->gnn_dev = &kgnilnd_data.kgn_devices[devno]; + + /* allocate a 'dummy' cdm for datagram use. We can only have a single + * datagram between a nid:inst_id and nid2:inst_id. The fake cdm + * give us additional inst_id to use, allowing the datagrams to flow + * like rivers of honey and beer */ + + /* the instance id for the cdm is the NETNUM offset by MAXDEVS - + * ensuring we'll have a unique id */ + + + ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), net->gnn_dev->gnd_nid); + CDEBUG(D_NET, "adding net %p nid=%s on dev %d \n", + net, libcfs_nid2str(ni->ni_nid), net->gnn_dev->gnd_id); + /* until the gnn_list is set, we need to cleanup ourselves as + * kgnilnd_shutdown is just gonna get confused */ + + down_write(&kgnilnd_data.kgn_net_rw_sem); + list_add_tail(&net->gnn_list, kgnilnd_netnum2netlist(net->gnn_netnum)); + up_write(&kgnilnd_data.kgn_net_rw_sem); + + /* we need a separate thread to call probe_wait_by_id until + * we get a function callback notifier from kgni */ + up(&kgnilnd_data.kgn_quiesce_sem); + RETURN(0); + failed: + up(&kgnilnd_data.kgn_quiesce_sem); + kgnilnd_shutdown(ni); + RETURN(rc); +} + +void +kgnilnd_shutdown(lnet_ni_t *ni) +{ + kgn_net_t *net = ni->ni_data; + int i; + int rc; + ENTRY; + + CFS_RACE(CFS_FAIL_GNI_SR_DOWN_RACE); + + LASSERTF(kgnilnd_data.kgn_init == GNILND_INIT_ALL, + "init %d\n", kgnilnd_data.kgn_init); + + /* Serialize with startup. 
*/ + down(&kgnilnd_data.kgn_quiesce_sem); + CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n", + atomic_read(&libcfs_kmemory)); + + if (net == NULL) { + CERROR("got NULL net for ni %p\n", ni); + rc = -EINVAL; + GOTO(out, rc); + } + + LASSERTF(ni == net->gnn_ni, + "ni %p gnn_ni %p\n", net, net->gnn_ni); + + ni->ni_data = NULL; + + LASSERT(!net->gnn_shutdown); + LASSERTF(atomic_read(&net->gnn_refcount) != 0, + "net %p refcount %d\n", + net, atomic_read(&net->gnn_refcount)); + + if (!list_empty(&net->gnn_list)) { + /* serialize with peer creation */ + down_write(&kgnilnd_data.kgn_net_rw_sem); + net->gnn_shutdown = 1; + up_write(&kgnilnd_data.kgn_net_rw_sem); + + kgnilnd_cancel_net_dgrams(net); + + kgnilnd_del_conn_or_peer(net, LNET_NID_ANY, GNILND_DEL_PEER, -ESHUTDOWN); + + /* if we are quiesced, need to wake up - we need those threads + * alive to release peers, etc */ + if (GNILND_IS_QUIESCED) { + set_mb(kgnilnd_data.kgn_quiesce_trigger, GNILND_QUIESCE_IDLE); + kgnilnd_quiesce_wait("shutdown"); + } + + kgnilnd_wait_for_canceled_dgrams(net->gnn_dev); + + /* We wait until the nets ref's are 1, we will release final ref which is ours + * this allows us to make sure everything else is done before we free the + * net. + */ + i = 4; + while (atomic_read(&net->gnn_refcount) != 1) { + i++; + CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, + "Waiting for %d references to clear on net %d\n", + atomic_read(&net->gnn_refcount), + net->gnn_netnum); + cfs_pause(cfs_time_seconds(1)); + } + + /* release ref from kgnilnd_startup */ + kgnilnd_net_decref(net); + /* serialize with reaper and conn_task looping */ + down_write(&kgnilnd_data.kgn_net_rw_sem); + list_del_init(&net->gnn_list); + up_write(&kgnilnd_data.kgn_net_rw_sem); + + } + + /* not locking, this can't race with writers */ + LASSERTF(atomic_read(&net->gnn_refcount) == 0, + "net %p refcount %d\n", + net, atomic_read(&net->gnn_refcount)); + LIBCFS_FREE(net, sizeof(*net)); + +out: + down_read(&kgnilnd_data.kgn_net_rw_sem); + for (i = 0; i < *kgnilnd_tunables.kgn_net_hash_size; i++) { + if (!list_empty(&kgnilnd_data.kgn_nets[i])) { + up_read(&kgnilnd_data.kgn_net_rw_sem); + break; + } + + if (i == *kgnilnd_tunables.kgn_net_hash_size - 1) { + up_read(&kgnilnd_data.kgn_net_rw_sem); + kgnilnd_base_shutdown(); + } + } + CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n", + atomic_read(&libcfs_kmemory)); + + up(&kgnilnd_data.kgn_quiesce_sem); + EXIT; + return; +} + +void __exit +kgnilnd_module_fini(void) +{ + lnet_unregister_lnd(&the_kgnilnd); + kgnilnd_proc_fini(); + kgnilnd_remove_sysctl(); + kgnilnd_tunables_fini(); +} + +int __init +kgnilnd_module_init(void) +{ + int rc; + + rc = kgnilnd_tunables_init(); + if (rc != 0) + return rc; + + printk(KERN_INFO "Lustre: kgnilnd build version: "KGNILND_BUILD_REV"\n"); + + kgnilnd_insert_sysctl(); + kgnilnd_proc_init(); + + lnet_register_lnd(&the_kgnilnd); + + return 0; +} + +MODULE_AUTHOR("Cray, Inc. "); +MODULE_DESCRIPTION("Kernel Gemini LND v"KGNILND_BUILD_REV); +MODULE_LICENSE("GPL"); + +module_init(kgnilnd_module_init); +module_exit(kgnilnd_module_fini); diff --git a/lnet/klnds/gnilnd/gnilnd.h b/lnet/klnds/gnilnd/gnilnd.h new file mode 100644 index 0000000..de43728 --- /dev/null +++ b/lnet/klnds/gnilnd/gnilnd.h @@ -0,0 +1,1790 @@ +/* + * Copyright (C) 2004 Cluster File Systems, Inc. + * + * Copyright (C) 2009-2012 Cray, Inc. + * + * Derived from work by: Eric Barton + * Author: Nic Henke + * Author: James Shimek + * + * This file is part of Lustre, http://www.lustre.org. 
+ * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ +#ifndef _GNILND_GNILND_H_ +#define _GNILND_GNILND_H_ + +#define DEBUG_SUBSYSTEM S_LND + +#include +#include +#include +#include + +#include +#include "gnilnd_version.h" +#include "gnilnd_hss_ops.h" + +/* tunables determined at compile time */ +#define GNILND_MIN_TIMEOUT 5 /* minimum timeout interval (seconds) */ +#define GNILND_BASE_TIMEOUT 60 /* default sane timeout */ +#define GNILND_TO2KA(t) (((t)-1)/2) /* timeout -> keepalive interval */ +#define GNILND_MIN_RECONNECT_TO (GNILND_BASE_TIMEOUT/4) +#define GNILND_MAX_RECONNECT_TO GNILND_BASE_TIMEOUT +#define GNILND_HARDWARE_TIMEOUT 15 /* maximum time for data to travel between nodes */ +#define GNILND_MDD_TIMEOUT 15 /* MDD hold timeout in minutes */ + +/* reaper thread wakup interval */ +#define GNILND_REAPER_THREAD_WAKE 1 +/* reaper thread checks each conn NCHECKS time every kgnilnd_data.kgn_new_min_timeout */ +#define GNILND_REAPER_NCHECKS 4 + +/* fixed constants */ +#define GNILND_MAXDEVS 1 /* max # of GNI devices currently supported */ +#define GNILND_MBOX_CREDITS 256 /* number of credits per mailbox */ +#define GNILND_COOKIE 0xa3579 /* cookie used by along with ptag by GNI */ + +/* checksum values */ +#define GNILND_CHECKSUM_OFF 0 /* checksum turned off */ +#define GNILND_CHECKSUM_SMSG_HEADER 1 /* Only checksum SMSG header */ +#define GNILND_CHECKSUM_SMSG 2 /* checksum entire SMSG packet */ +#define GNILND_CHECKSUM_SMSG_BTE 3 /* Full checksum support */ + +/* tune down some COMPUTE options as they won't see the same number of connections and + * don't need the throughput of multiple threads by default */ +#if defined(CONFIG_CRAY_COMPUTE) +#define GNILND_SCHED_THREADS 1 /* default # of kgnilnd_scheduler threads */ +#define GNILND_FMABLK 64 /* default number of mboxes per fmablk */ +#else +#define GNILND_SCHED_THREADS 3 /* default # of kgnilnd_scheduler threads */ +#define GNILND_FMABLK 1024 /* default number of mboxes per fmablk */ +#endif + +/* EXTRA_BITS are there to allow us to hide NOOP/CLOSE and anything else out of band */ +#define GNILND_EXTRA_BITS 1 +/* maximum number of conns & bits for cqid in the SMSG event data */ +#define GNILND_CQID_NBITS (21 - GNILND_EXTRA_BITS) +#define GNILND_MSGID_TX_NBITS (32 - GNILND_CQID_NBITS) +#define GNILND_MAX_CQID (1 << GNILND_CQID_NBITS) +#define GNILND_MAX_MSG_ID (1 << GNILND_MSGID_TX_NBITS) +#define GNILND_MAX_MSG_SIZE (*kgnilnd_tunables.kgn_max_immediate + sizeof(kgn_msg_t)) + +/* need sane upper bound to limit copy overhead */ +#define GNILND_MAX_IMMEDIATE (64<<10) + +/* payload size to add to the base mailbox size + * This is subtracting 2 from the concurrent_sends as 4 messages are included in the size + * gni_smsg_buff_size_needed calculates, the MAX_PAYLOAD is added to + * the calculation return from that function.*/ +#define GNILND_MBOX_PAYLOAD \ + (GNILND_MAX_MSG_SIZE * \ + ((*kgnilnd_tunables.kgn_concurrent_sends - 2) * 2)); + +/* timeout -> deadman 
timer for kgni mdd holds */ +#define GNILND_TIMEOUT2DEADMAN ((*kgnilnd_tunables.kgn_mdd_timeout) * 1000 * 60) + +/* timeout for failing sends in t is in jiffies*/ +#define GNILND_TIMEOUTRX(t) (t + cfs_time_seconds(*kgnilnd_tunables.kgn_hardware_timeout)) + +/* time when to release from purgatory in the reaper thread in jiffies */ +#define GNILND_PURG_RELEASE(t) (GNILND_TIMEOUTRX(t) * 3) + +/* Macro for finding last_rx 2 datapoints are compared + * and the most recent one in jiffies is returned. + */ +#define GNILND_LASTRX(conn) (time_after(conn->gnc_last_rx, conn->gnc_last_rx_cq) \ + ? conn->gnc_last_rx : conn->gnc_last_rx_cq) + +/************************************************************************ + * Enum, flag and tag data + */ +#define GNILND_INIT_NOTHING 0 +#define GNILND_INIT_DATA 1 +#define GNILND_INIT_ALL 2 + +/* If you change the ordering away from MAPPED = UNMAPPED + 1, things break */ +#define GNILND_BUF_NONE 0 /* buffer type not set */ +#define GNILND_BUF_IMMEDIATE 1 /* immediate data */ +#define GNILND_BUF_IMMEDIATE_KIOV 2 /* immediate data */ +#define GNILND_BUF_PHYS_UNMAPPED 3 /* physical: not mapped yet */ +#define GNILND_BUF_PHYS_MAPPED 4 /* physical: mapped already */ +#define GNILND_BUF_VIRT_UNMAPPED 5 /* virtual: not mapped yet */ +#define GNILND_BUF_VIRT_MAPPED 6 /* virtual: mapped already */ + +#define GNILND_TX_WAITING_REPLY (1<<1) /* expecting to receive reply */ +#define GNILND_TX_WAITING_COMPLETION (1<<2) /* waiting for smsg_send to complete */ +#define GNILND_TX_PENDING_RDMA (1<<3) /* RDMA transaction pending until we get prev. completion */ +#define GNILND_TX_QUIET_ERROR (1<<4) /* don't print error on tx_done */ +#define GNILND_TX_FAIL_SMSG (1<<5) /* pass down error injection for SMSG fail */ + +/* stash above max CQID to avoid any collision */ +#define GNILND_MSGID_NOOP (GNILND_MAX_CQID + 128) +#define GNILND_MSGID_CLOSE (GNILND_MSGID_NOOP + 1) + +/* kgn_msg_t::gnm_type */ +#define GNILND_MSG_NONE 0x00 /* illegal message */ +#define GNILND_MSG_NOOP 0x01 /* empty gnm_u (keepalive) */ +#define GNILND_MSG_IMMEDIATE 0x02 /* gnm_u.immediate */ +#define GNILND_MSG_PUT_REQ 0x03 /* gnm_u.putreq (src->sink) */ +#define GNILND_MSG_PUT_NAK 0x04 /* gnm_u.completion (no PUT match: sink->src) */ +#define GNILND_MSG_PUT_ACK 0x05 /* gnm_u.putack (PUT matched: sink->src) */ +#define GNILND_MSG_PUT_DONE 0x06 /* gnm_u.completion (src->sink) */ +#define GNILND_MSG_GET_REQ 0x07 /* gnm_u.get (sink->src) */ +#define GNILND_MSG_GET_NAK 0x08 /* gnm_u.completion (no GET match: src->sink) */ +#define GNILND_MSG_GET_DONE 0x09 /* gnm_u.completion (src->sink) */ +#define GNILND_MSG_CLOSE 0x0a /* empty gnm_u */ + +/* defines for gnc_*scheduled states */ +#define GNILND_CONN_IDLE 0 +#define GNILND_CONN_SCHED 1 +#define GNILND_CONN_WANTS_SCHED 2 +#define GNILND_CONN_PROCESS 3 + +#define GNILND_DEV_IDLE 0 +#define GNILND_DEV_IRQ 1 +#define GNILND_DEV_LOOP 2 + +#define GNILND_DGRAM_IDLE 0 +#define GNILND_DGRAM_SCHED 1 +#define GNILND_DGRAM_PROCESS 2 + +#define GNILND_PEER_IDLE 0 +#define GNILND_PEER_CONNECT 1 +#define GNILND_PEER_POSTING 2 +#define GNILND_PEER_POSTED 3 +#define GNILND_PEER_NEEDS_DEATH 4 +#define GNILND_PEER_KILL 5 + +/* for gnc_close_recvd */ +#define GNILND_CLOSE_RX 1 +#define GNILND_CLOSE_INJECT1 2 +#define GNILND_CLOSE_INJECT2 3 +#define GNILND_CLOSE_EARLY 4 + +/* defines for why quiesce trigger set */ +#define GNILND_QUIESCE_IDLE 0 +#define GNILND_QUIESCE_ADMIN 1 +#define GNILND_QUIESCE_RESET 2 +#define GNILND_QUIESCE_HW_QUIESCE 3 + +#define GNILND_PEER_CLEAN 0 +#define 
GNILND_PEER_PERSISTING 1 + +#define GNILND_DEL_CONN 0 +#define GNILND_DEL_PEER 1 +#define GNILND_CLEAR_PURGATORY 2 + +typedef enum kgn_fmablk_state { + GNILND_FMABLK_IDLE = 0, /* is allocated or ready to be freed */ + GNILND_FMABLK_PHYS, /* allocated out of slab of physical memory */ + GNILND_FMABLK_VIRT, /* 'standard' vmalloc hunk */ + GNILND_FMABLK_FREED, /* after free */ +} kgn_fmablk_state_t; + +typedef enum kgn_tx_list_state { + GNILND_TX_IDLE = 0, /* TX is on the idle list, kgn_idle_txs */ + GNILND_TX_ALLOCD, /* TX has been alloced (off of idle), could be in any state transition */ + GNILND_TX_PEERQ, /* TX on peer->gnp_tx_queue (no live conn) */ + GNILND_TX_MAPQ, /* TX on dev:gnd_map_tx for buffer mapping */ + GNILND_TX_FMAQ, /* TX waiting to be send on conn FMA */ + GNILND_TX_LIVE_FMAQ, /* TX live on the FMA wire, waiting for completion or reply */ + GNILND_TX_RDMAQ, /* TX waiting to send FMA confirmation to auth RDMA PUT */ + GNILND_TX_LIVE_RDMAQ, /* TX live on the RDMA wire, waiting for completion */ + GNILND_TX_DYING, /* TX got caught on MAPQ or RDMAQ while conn was closing, needs someone to call tx_done */ + GNILND_TX_FREED /* TX is free! */ +} kgn_tx_list_state_t; + +typedef enum kgn_conn_state { + /* don't start @ 0 - prevent memset(0) badness */ + GNILND_CONN_DUMMY = 0, + GNILND_CONN_LISTEN, + GNILND_CONN_CONNECTING, + GNILND_CONN_ESTABLISHED, + GNILND_CONN_CLOSING, + GNILND_CONN_CLOSED, + GNILND_CONN_DONE, + GNILND_CONN_DESTROY_EP +} kgn_conn_state_t; + +/* changing these requires a change to GNILND_CONNREQ_VERSION and + * will result in dropped packets instead of NAKs. Adding to this is + * acceptable without changing the CONNREQ_VERSION, but code should + * be ready to handle NAKs on version mismatch */ +typedef enum kgn_connreq_type { + GNILND_CONNREQ_REQ = 1, /* how YOU doin' ? */ + GNILND_CONNREQ_NAK, /* NO soup for you! */ + GNILND_CONNREQ_CLOSE, /* we should see other people */ +} kgn_connreq_type_t; + +typedef enum kgn_dgram_state { + /* don't use 0 to avoid thinking a memset of zero is valid data */ + GNILND_DGRAM_USED = 1, + GNILND_DGRAM_POSTING, + GNILND_DGRAM_POSTED, + GNILND_DGRAM_PROCESSING, + GNILND_DGRAM_CANCELED, + GNILND_DGRAM_DONE, +} kgn_dgram_state_t; + +typedef enum kgn_dgram_type { + GNILND_DGRAM_REQ = 1, /* how YOU doin' ? */ + GNILND_DGRAM_WC_REQ, /* you talkin' to ME? */ + GNILND_DGRAM_NAK, /* NO soup for you! */ + GNILND_DGRAM_CLOSE, /* we should see other people */ +} kgn_dgram_type_t; + +/************************************************************************ + * Wire message structs. These are sent in sender's byte order + * (i.e. receiver checks magic and flips if required). + */ + +#define GNILND_MSG_MAGIC LNET_PROTO_GNI_MAGIC /* unique magic */ +#define GNILND_DGRAM_MAGIC 0x0DDBA11 + +/* kgn_msg_t - FMA/SMSG wire struct + v2: + * - added checksum to FMA + * moved seq before paylod + * WIRE_ATTR added for alignment + v3: + * added gnm_payload_len for FMA payload size + v4: + * added gncm_retval to completion, allowing return code transmission + on RDMA NAKs + v5: + * changed how CQID and TX ids are assigned + v6: + * added retval on CLOSE + v7: + * added payload checksumming + v8: + * reworked checksumming a bit, changed payload checksums +*/ +#define GNILND_MSG_VERSION 8 +/* kgn_connreq_t connection request datagram wire struct + v2: + * added NAKs +*/ + +#define GNILND_CONNREQ_VERSION 2 + +typedef struct kgn_gniparams { + __u32 gnpr_host_id; /* ph. 
host ID of the NIC */ + __u32 gnpr_cqid; /* cqid I want peer to use when sending events to me */ + gni_smsg_attr_t gnpr_smsg_attr; /* my short msg. attributes */ +} WIRE_ATTR kgn_gniparams_t; + +typedef struct kgn_nak_data { + __s32 gnnd_errno; /* errno reason for NAK */ + +} WIRE_ATTR kgn_nak_data_t; + +/* the first bits of the connreq struct CANNOT CHANGE FORM EVER + * without breaking the ability for us to properly NAK someone */ +typedef struct kgn_connreq { /* connection request/response */ + __u32 gncr_magic; /* I'm an gnilnd connreq */ + __u32 gncr_cksum; /* checksum (0 == disabled) */ + __u16 gncr_type; /* REQ, NAK, etc */ + __u16 gncr_version; /* this is my version number */ + __u32 gncr_timeout; /* sender's timeout */ + __u64 gncr_srcnid; /* sender's NID */ + __u64 gncr_dstnid; /* who sender expects to listen */ + __u64 gncr_peerstamp; /* sender's instance stamp */ + __u64 gncr_connstamp; /* sender's connection stamp */ + + /* everything before this needs to stay static, adding after should + * result in a change to GNILND_CONNREQ_VERSION */ + + union { + kgn_gniparams_t gncr_gnparams; /* sender's endpoint info */ + kgn_nak_data_t gncr_nakdata; /* data (rc, etc) for NAK */ + }; +} WIRE_ATTR kgn_connreq_t; + +typedef struct { + gni_mem_handle_t gnrd_key; + __u64 gnrd_addr; + __u32 gnrd_nob; +} WIRE_ATTR kgn_rdma_desc_t; + +typedef struct { + lnet_hdr_t gnim_hdr; /* LNet header */ + /* LNet payload is in FMA "Message Data" */ +} WIRE_ATTR kgn_immediate_msg_t; + +typedef struct { + lnet_hdr_t gnprm_hdr; /* LNet header */ + __u64 gnprm_cookie; /* opaque completion cookie */ +} WIRE_ATTR kgn_putreq_msg_t; + +typedef struct { + __u64 gnpam_src_cookie; /* reflected completion cookie */ + __u64 gnpam_dst_cookie; /* opaque completion cookie */ + kgn_rdma_desc_t gnpam_desc; /* sender's sink buffer */ +} WIRE_ATTR kgn_putack_msg_t; + +typedef struct { + lnet_hdr_t gngm_hdr; /* LNet header */ + __u64 gngm_cookie; /* opaque completion cookie */ + kgn_rdma_desc_t gngm_desc; /* sender's sink buffer */ +} WIRE_ATTR kgn_get_msg_t; + +typedef struct { + int gncm_retval; /* error on NAK, size on REQ */ + __u64 gncm_cookie; /* reflected completion cookie */ +} WIRE_ATTR kgn_completion_msg_t; + +typedef struct { /* NB must fit in FMA "Prefix" */ + __u32 gnm_magic; /* I'm an gni message */ + __u16 gnm_version; /* this is my version number */ + __u16 gnm_type; /* msg type */ + __u64 gnm_srcnid; /* sender's NID */ + __u64 gnm_connstamp; /* sender's connection stamp */ + __u32 gnm_seq; /* incrementing sequence number */ + __u16 gnm_cksum; /* checksum (0 == no checksum ) */ + __u16 gnm_payload_cksum; /* payload checksum (0 == no checksum ) */ + __u32 gnm_payload_len; /* size of the FMA payload sent */ + union { + kgn_immediate_msg_t immediate; + kgn_putreq_msg_t putreq; + kgn_putack_msg_t putack; + kgn_get_msg_t get; + kgn_completion_msg_t completion; + } gnm_u; +} WIRE_ATTR kgn_msg_t; + +/************************************************************************ + * runtime tunable data + */ + +typedef struct kgn_tunables { + int *kgn_min_reconnect_interval; /* connreq starting timeout & retransmit interval */ + int *kgn_max_reconnect_interval; /* ...exponentially increasing to this */ + int *kgn_credits; /* # concurrent sends */ + int *kgn_fma_cq_size; /* # entries in receive CQ */ + int *kgn_peer_credits; /* # LNet peer credits */ + int *kgn_concurrent_sends; /* max # of max_immediate in mbox */ + int *kgn_timeout; /* comms timeout (seconds) */ + int *kgn_max_immediate; /* immediate payload breakpoint */ 
+ int *kgn_checksum; /* checksum data */ + int *kgn_checksum_dump; /* dump raw data to D_INFO log when checksumming */ + int *kgn_bte_hash; /* hashing on BTE transfers */ + int *kgn_bte_adapt; /* adaptive routing on BTE transfers */ + int *kgn_bte_relaxed_ordering; /* relaxed ordering (PASSPW) on BTE transfers */ + int *kgn_ptag; /* PTAG for cdm_create */ + int *kgn_max_retransmits; /* max number of FMA retransmits */ + int *kgn_nwildcard; /* # wildcard per net to post */ + int *kgn_nice; /* nice value for kgnilnd threads */ + int *kgn_rdmaq_intervals; /* # intervals per second for rdmaq throttle */ + int *kgn_loops; /* # of loops sched does before flush/heartbeat tickle */ + int *kgn_peer_hash_size; /* size of kgn_peers */ + int *kgn_peer_health; /* enable/disable peer health */ + int *kgn_vmap_cksum; /* enable/disable vmap of kiov checksums */ + int *kgn_mbox_per_block; /* mailboxes per fmablk */ + int *kgn_nphys_mbox; /* # mailboxes to preallocate with physical memory */ + int *kgn_mbox_credits; /* max credits per fma */ + int *kgn_sched_threads; /* number of kgnilnd_scheduler threads */ + int *kgn_net_hash_size; /* size of kgn_net_ht */ + int *kgn_hardware_timeout; /* max time for a message to get across the network */ + int *kgn_mdd_timeout; /* max time for ghal to hold an mdd in minutes */ +#if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM + cfs_sysctl_table_header_t *kgn_sysctl; /* sysctl interface */ +#endif +} kgn_tunables_t; + +typedef struct kgn_mbox_info { + lnet_nid_t mbx_prev_nid; + unsigned long mbx_create_conn_memset; + unsigned long mbx_add_purgatory; + unsigned long mbx_detach_of_purgatory; + unsigned long mbx_release_from_purgatory; + unsigned long mbx_release_purg_active_dgram; +} kgn_mbox_info_t; + +typedef struct kgn_fma_memblock { + struct list_head gnm_bufflist; /* memblock is part of device's gnd_fma_buffs */ + kgn_fmablk_state_t gnm_state; /* how this memory allocated & state of it */ + int gnm_hold_timeout; /* hold_timeout if used at unmap time */ + int gnm_num_mboxs; /* total mboxes allocated */ + int gnm_avail_mboxs; /* number of available mailboxes in the block */ + int gnm_held_mboxs; /* number of purgatory held mailboxes */ + int gnm_mbox_size; /* size of the single mailbox */ + int gnm_next_avail_mbox; /* next available mailbox */ + long gnm_max_timeout; /* max timeout for possible purgatory hold */ + unsigned int gnm_blk_size; /* how big is our hunk o memory ? */ + void *gnm_block; /* pointer to mem. block */ + gni_mem_handle_t gnm_hndl; /* mem. handle of the block */ + unsigned long *gnm_bit_array; /* bit array tracking allocation of mailboxes */ + kgn_mbox_info_t *gnm_mbox_info; /* array of mbox_information about each mbox */ +} kgn_fma_memblock_t; + +typedef struct kgn_device { + gni_nic_handle_t gnd_handle; /* device handle */ + gni_cdm_handle_t gnd_domain; /* GNI communication domain */ + gni_err_handle_t gnd_err_handle; /* device error handle */ + unsigned long gnd_sched_alive; /* scheduler thread alive stamp */ + gni_cq_handle_t gnd_rcv_fma_cqh; /* FMA rcv. completion queue handle */ + gni_cq_handle_t gnd_snd_rdma_cqh; /* rdma send completion queue handle */ + gni_cq_handle_t gnd_snd_fma_cqh; /* rdma send completion queue handle */ + struct mutex gnd_cq_mutex; /* CQ access serialization */ + __u32 gnd_host_id; /* ph. 
host ID of the NIC */ + int gnd_id; /* device id, also index in kgn_devices */ + __u32 gnd_nid; /* ph host ID translated to NID */ + struct list_head gnd_fma_buffs; /* list of FMA memory blocks */ + struct semaphore gnd_fmablk_sem; /* semaphore for FMA block memory alloc/free */ + spinlock_t gnd_fmablk_lock; /* lock for mbox alloc/release */ + atomic_t gnd_nfmablk; /* # of fmablk live */ + atomic_t gnd_fmablk_vers; /* gnd_fma_bufs stamp */ + atomic_t gnd_neps; /* # EP allocated to conns */ + short gnd_ready; /* stuff to do in scheduler thread */ + struct list_head gnd_ready_conns; /* connections ready to tx/rx */ + struct list_head gnd_map_tx; /* TX: needing buffer mapping */ + wait_queue_head_t gnd_waitq; /* scheduler wakeup */ + spinlock_t gnd_lock; /* serialise gnd_ready_conns */ + struct list_head gnd_connd_peers; /* peers waiting for a connection */ + spinlock_t gnd_connd_lock; /* serialise connd_peers */ + wait_queue_head_t gnd_dgram_waitq; /* dgram_mover thread wakeup */ + wait_queue_head_t gnd_dgping_waitq; /* dgram thread ping-pong */ + int gnd_dgram_ready; /* dgrams need movin' */ + struct list_head *gnd_dgrams; /* nid hash to dgrams */ + atomic_t gnd_ndgrams; /* # dgrams extant */ + spinlock_t gnd_dgram_lock; /* serialize gnd_dgrams */ + struct list_head gnd_map_list; /* list of all mapped regions */ + int gnd_map_version; /* version flag for map list */ + atomic_t gnd_n_mdd; /* number of total MDD - fma, tx, etc */ + atomic_t gnd_n_mdd_held; /* number of total MDD held - fma, tx, etc */ + atomic_t gnd_nq_map; /* # queued waiting for mapping (MDD/GART) */ + atomic64_t gnd_nbytes_map; /* bytes of total GART maps - fma, tx, etc */ + __u32 gnd_map_nphys; /* # TX phys mappings */ + __u32 gnd_map_physnop; /* # TX phys pages mapped */ + __u32 gnd_map_nvirt; /* # TX virt mappings */ + __u64 gnd_map_virtnob; /* # TX virt bytes mapped */ + spinlock_t gnd_map_lock; /* serialize gnd_map_XXX */ + struct list_head gnd_rdmaq; /* RDMA to be sent */ + spinlock_t gnd_rdmaq_lock; /* play nice with others */ + atomic64_t gnd_rdmaq_bytes_out; /* # bytes authorized */ + atomic64_t gnd_rdmaq_bytes_ok; /* # bytes allowed until deadline */ + atomic_t gnd_rdmaq_nstalls; /* # stalls due to throttle */ + unsigned long gnd_rdmaq_deadline; /* when does bucket roll over ? 
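+ * (presumably rolled over when gnd_rdmaq_timer just below, armed in
+ * kgnilnd_base_startup() with kgnilnd_schedule_device_timer, kicks
+ * the scheduler; kgn_rdmaq_intervals sets how many buckets per second)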
*/ + struct timer_list gnd_rdmaq_timer; /* wakey-wakey */ + atomic_t gnd_short_ntx; /* TX stats: short messages */ + atomic64_t gnd_short_txbytes; /* TX stats: short message payload*/ + atomic_t gnd_rdma_ntx; /* TX stats: rdma messages */ + atomic64_t gnd_rdma_txbytes; /* TX stats: rdma message payload*/ + atomic_t gnd_short_nrx; /* RX stats: short messages */ + atomic64_t gnd_short_rxbytes; /* RX stats: short message payload*/ + atomic_t gnd_rdma_nrx; /* RX stats: rdma messages */ + atomic64_t gnd_rdma_rxbytes; /* RX stats: rdma message payload*/ + atomic_t gnd_fast_try; /* # of times fast send tried */ + atomic_t gnd_fast_ok; /* # of times fast send ok */ + atomic_t gnd_fast_block; /* # of times fast send blocked */ + unsigned long gnd_mutex_delay; + atomic_t gnd_n_yield; + atomic_t gnd_n_schedule; + atomic_t gnd_canceled_dgrams; /* # of outstanding cancels */ +} kgn_device_t; + +typedef struct kgn_net { + struct list_head gnn_list; /* chain on kgni_data::kgn_nets */ + kgn_device_t *gnn_dev; /* device for this net */ + lnet_ni_t *gnn_ni; /* network interface instance */ + atomic_t gnn_refcount; /* # current references */ + int gnn_shutdown; /* lnd_shutdown set */ + __u16 gnn_netnum; /* stash netnum for quicker lookup */ +} kgn_net_t; + +static inline lnet_nid_t +kgnilnd_lnd2lnetnid(lnet_nid_t ni_nid, lnet_nid_t kgnilnd_nid) +{ + return LNET_MKNID(LNET_NIDNET(ni_nid), LNET_NIDADDR(kgnilnd_nid)); +} + +static inline lnet_nid_t +kgnilnd_lnet2lndnid(lnet_nid_t lnet_nid, lnet_nid_t kgnilnd_nid) +{ + return LNET_MKNID(LNET_NIDNET(kgnilnd_nid), LNET_NIDADDR(lnet_nid)); +} + +/* The code for this is a bit ugly - but really this just boils down to a __u64 + * that can have various parts accessed separately. + * + * The lower 32 bits is the ID + * we give to SMSG for our completion event - it needs to be globally unique across + * all TX currently in flight. We separate that out into the CQID so that we can + * reference the connection (kgnilnd_cqid2conn_locked) and then the msg_id to pull + * the actual TX out of the per-connection gnc_tx_ref_table. + * + * The upper 32 bits are just extra stuff we put into the cookie to ensure this TX + * has a unique value we can send with RDMA setup messages to ensure the completion for + * those is unique across the wire. The extra 32 bits are there to ensure that TX id + * reuse is separated. + */ + +typedef struct kgn_tx_ev_id { + union { + __u64 txe_cookie; /* are you my mommy ? 
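+ * With GNILND_EXTRA_BITS at 1 this packs a 20 bit txe_cqid (up to
+ * GNILND_MAX_CQID = 1048576 connections) and a 12 bit txe_idx
+ * (GNILND_MAX_MSG_ID = 4096 ids per conn) into the SMSG half of the
+ * cookie, while txe_chips holds the reuse disambiguation bits
+ * described above.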
*/ + struct { + __u32 txe_chips; /* extra bits to ensure ID unique across reuse */ + union { + __u32 txe_smsg_id; /* ID for SMSG CQ event */ + /* N.B: Never ever ever ever use the bit shifts directly, + * you are just asking for a world of pain and are at the + * mercy of the compiler layouts */ + struct { + __u32 txe_cqid :GNILND_CQID_NBITS; + __u32 txe_idx :GNILND_MSGID_TX_NBITS; + }; + }; + }; + }; +} kgn_tx_ev_id_t; + +typedef struct kgn_dgram { + struct list_head gndg_list; /* on hash dev::gnd_dgrams */ + kgn_dgram_state_t gndg_state; /* state of this dgram */ + kgn_dgram_type_t gndg_type; /* REQ, NAK, etc */ + __u32 gndg_magic; /* saftey word */ + unsigned long gndg_post_time; /* time when we posted */ + struct kgn_conn *gndg_conn; /* unbound conn with ep & smsg */ + kgn_connreq_t gndg_conn_out; /* connreq from local node */ + kgn_connreq_t gndg_conn_in; /* connreq from remote node */ +} kgn_dgram_t; + +typedef struct kgn_tx { /* message descriptor */ + struct list_head tx_list; /* TX queues - peer, conn, rdma */ + kgn_tx_list_state_t tx_list_state;/* where in state machine is this TX ? */ + struct list_head *tx_list_p; /* pointer to current list */ + struct kgn_conn *tx_conn; /* owning conn */ + lnet_msg_t *tx_lntmsg[2]; /* ptl msgs to finalize on completion */ + unsigned long tx_qtime; /* when tx started to wait for something (jiffies) */ + unsigned long tx_cred_wait; /* time spend waiting for smsg creds */ + struct list_head tx_map_list; /* list entry on device map list */ + unsigned int tx_nob; /* # bytes of payload */ + int tx_buftype; /* payload buffer type */ + int tx_phys_npages; /* # physical pages */ + gni_mem_handle_t tx_map_key; /* mapping key */ + gni_mem_segment_t *tx_phys; /* page descriptors */ + kgn_msg_t tx_msg; /* FMA message buffer */ + kgn_tx_ev_id_t tx_id; /* who are you, who ? who ? 
*/ + __u8 tx_state; /* state of the descriptor */ + int tx_retrans; /* retrans count of RDMA */ + int tx_rc; /* if we need to stash the ret code until we see completion */ + void *tx_buffer; /* source/sink buffer */ + union { + gni_post_descriptor_t tx_rdma_desc; /* rdma descriptor */ + struct page *tx_imm_pages[GNILND_MAX_IMMEDIATE/PAGE_SIZE]; /* page array to map kiov for immediate send */ + }; + + /* we only use one or the other */ + union { + kgn_putack_msg_t tx_putinfo; /* data for differed rdma & re-try */ + kgn_get_msg_t tx_getinfo; /* data for rdma re-try*/ + }; +} kgn_tx_t; + +typedef struct kgn_conn { + kgn_device_t *gnc_device; /* which device */ + struct kgn_peer *gnc_peer; /* owning peer */ + struct list_head gnc_list; /* stash on peer's conn list - or pending purgatory lists as we clear them */ + struct list_head gnc_hashlist; /* stash in connection hash table */ + struct list_head gnc_schedlist; /* schedule (on gnd_?_conns) for attention */ + struct list_head gnc_fmaq; /* txs queued for FMA */ + struct list_head gnc_mdd_list; /* hold list for MDD on hard conn reset */ + __u64 gnc_peerstamp; /* peer's unique stamp */ + __u64 gnc_peer_connstamp; /* peer's unique connection stamp */ + __u64 gnc_my_connstamp; /* my unique connection stamp */ + unsigned long gnc_first_rx; /* when I first received an FMA message (jiffies) */ + unsigned long gnc_last_tx; /* when I last sent an FMA message (jiffies) */ + unsigned long gnc_last_rx; /* when I last sent an FMA message (jiffies) */ + unsigned long gnc_last_tx_cq; /* when I last received an FMA CQ (jiffies) */ + unsigned long gnc_last_rx_cq; /* when I last received an FMA CQ (jiffies) */ + unsigned long gnc_last_noop_want; /* time I wanted to send NOOP */ + unsigned long gnc_last_noop_sent; /* time I did gni_smsg_send on NOOP */ + unsigned long gnc_last_noop_cq; /* time when NOOP completed */ + unsigned long gnc_last_sched_ask; /* time when conn added to ready_conns */ + unsigned long gnc_last_sched_do; /* time when conn processed from ready_conns */ + atomic_t gnc_reaper_noop; /* # reaper triggered NOOP */ + atomic_t gnc_sched_noop; /* # sched triggered NOOP */ + unsigned int gnc_timeout; /* infer peer death if no rx for this many seconds */ + __u32 gnc_cqid; /* my completion callback id (non-unique) */ + __u32 gnc_tx_seq; /* tx msg sequence number */ + __u32 gnc_rx_seq; /* rx msg sequence number */ + __u64 gnc_tx_retrans; /* # retrans on SMSG */ + atomic_t gnc_nlive_fma; /* # live FMA */ + atomic_t gnc_nq_rdma; /* # queued (on device) RDMA */ + atomic_t gnc_nlive_rdma; /* # live RDMA */ + short gnc_close_sent; /* I've sent CLOSE */ + short gnc_close_recvd; /* I've received CLOSE */ + short gnc_in_purgatory; /* in the sin bin */ + int gnc_error; /* errno when conn being closed due to error */ + int gnc_peer_error; /* errno peer sent us on CLOSE */ + kgn_conn_state_t gnc_state; /* connection state */ + int gnc_scheduled; /* being attented to */ + atomic_t gnc_refcount; /* # users */ + spinlock_t gnc_list_lock; /* serialise tx lists, max_rx_age */ + gni_ep_handle_t gnc_ephandle; /* GNI endpoint */ + kgn_fma_memblock_t *gnc_fma_blk; /* pointer to fma block for our mailbox */ + gni_smsg_attr_t gnpr_smsg_attr; /* my short msg. 
attributes */ + spinlock_t gnc_tx_lock; /* protect tx alloc/free */ + __u8 gnc_tx_bits[GNILND_MAX_MSG_ID/8]; /* bit table for tx id */ + int gnc_next_tx; /* next tx to use in tx_ref_table */ + kgn_tx_t **gnc_tx_ref_table; /* table of TX descriptors for this conn */ + int gnc_mbox_id; /* id of mbox in fma_blk */ + short gnc_needs_detach; /* flag set in detach_purgatory_all_locked so reaper will clear out purgatory */ + short gnc_needs_closing; /* flag set in del_conns when called from kgnilnd_del_peer_or_conn */ +} kgn_conn_t; + +typedef struct kgn_mdd_purgatory { + gni_mem_handle_t gmp_map_key; /* mapping key */ + struct list_head gmp_list; /* entry point for purgatory list */ +} kgn_mdd_purgatory_t; + +typedef struct kgn_peer { + struct list_head gnp_list; /* stash on global peer list */ + struct list_head gnp_connd_list; /* schedule on kgn_connd_peers */ + struct list_head gnp_conns; /* all active connections and all conns in purgatory for the peer */ + struct list_head gnp_tx_queue; /* msgs waiting for a conn */ + kgn_net_t *gnp_net; /* net instance for this peer */ + lnet_nid_t gnp_nid; /* who's on the other end(s) */ + atomic_t gnp_refcount; /* # users */ + __u32 gnp_host_id; /* ph. host ID of the peer */ + short gnp_connecting; /* connection forming */ + short gnp_pending_unlink; /* need last conn close to trigger unlink */ + int gnp_last_errno; /* last error conn saw */ + unsigned long gnp_last_alive; /* last time I had valid comms */ + int gnp_last_dgram_errno; /* last error dgrams saw */ + unsigned long gnp_last_dgram_time; /* last time I tried to connect */ + unsigned long gnp_reconnect_time; /* CURRENT_SECONDS when reconnect OK */ + unsigned long gnp_reconnect_interval; /* exponential backoff */ + atomic_t gnp_dirty_eps; /* # of old but yet to be destroyed EPs from conns */ +} kgn_peer_t; + +/* the kgn_rx_t is a struct for handing to LNET as the private pointer for things + * like lnet_parse. It allows a single pointer to let us get enough + * information in _recv and friends */ +typedef struct kgn_rx { + kgn_conn_t *grx_conn; /* connection */ + kgn_msg_t *grx_msg; /* message */ + lnet_msg_t *grx_lntmsg; /* lnet msg for this rx (eager only) */ + int grx_eager; /* if eager, we copied msg to somewhere */ + struct timespec grx_received; /* time this msg received */ +} kgn_rx_t; + +typedef struct kgn_data { + int kgn_init; /* initialisation state */ + int kgn_shutdown; /* shut down? */ + int kgn_wc_kill; /* Should I repost the WC */ + atomic_t kgn_nthreads; /* # live threads */ + int kgn_nresets; /* number of stack resets */ + int kgn_in_reset; /* are we in stack reset ? */ + + kgn_device_t kgn_devices[GNILND_MAXDEVS]; /* device/ptag/cq etc */ + int kgn_ndevs; /* # devices */ + + int kgn_ruhroh_running; /* ruhroh thread is running */ + int kgn_ruhroh_shutdown; /* ruhroh thread should or is shut down */ + wait_queue_head_t kgn_ruhroh_waitq; /* ruhroh thread wakeup */ + int kgn_quiesce_trigger; /* should we quiesce ? */ + atomic_t kgn_nquiesce; /* how many quiesced ? */ + struct semaphore kgn_quiesce_sem; /* serialize ruhroh task, startup and shutdown */ + int kgn_needs_reset; /* we need stack reset */ + + /* These next three members implement communication from gnilnd into + * the ruhroh task. To ensure correct operation of the task, code that + * writes into them must use memory barriers to ensure that the changes + * are visible to other cores in the order the members appear below. 
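+ * A minimal illustrative sketch of a conforming writer, assuming the
+ * usual kernel barrier helpers (the real update path lives elsewhere
+ * in gnilnd, and secs is just a stand-in value):
+ *
+ *     kgnilnd_data.kgn_quiesce_secs  = secs;
+ *     smp_wmb();
+ *     kgnilnd_data.kgn_bump_info_rdy = 1;
+ *     smp_wmb();
+ *     kgnilnd_data.kgn_needs_pause   = 1;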
*/ + __u32 kgn_quiesce_secs; /* seconds to bump timeouts */ + int kgn_bump_info_rdy; /* we have info needed to bump */ + int kgn_needs_pause; /* we need to pause for network quiesce */ + + struct list_head *kgn_nets; /* hashtable of kgn_net instances */ + struct rw_semaphore kgn_net_rw_sem; /* serialise gnn_shutdown, kgn_nets */ + + rwlock_t kgn_peer_conn_lock; /* stabilize peer/conn ops */ + struct list_head *kgn_peers; /* hash table of all my known peers */ + atomic_t kgn_npeers; /* # peers extant */ + int kgn_peer_version; /* version flag for peer tables */ + + struct list_head *kgn_conns; /* conns hashed by cqid */ + atomic_t kgn_nconns; /* # connections extant */ + __u64 kgn_peerstamp; /* when I started up */ + __u64 kgn_connstamp; /* conn stamp generator */ + int kgn_conn_version; /* version flag for conn tables */ + int kgn_next_cqid; /* cqid generator */ + + long kgn_new_min_timeout; /* minimum timeout on any new conn */ + wait_queue_head_t kgn_reaper_waitq; /* reaper sleeps here */ + spinlock_t kgn_reaper_lock; /* serialise */ + + cfs_mem_cache_t *kgn_rx_cache; /* rx descriptor space */ + cfs_mem_cache_t *kgn_tx_cache; /* tx descriptor memory */ + cfs_mem_cache_t *kgn_tx_phys_cache; /* tx phys descriptor memory */ + atomic_t kgn_ntx; /* # tx in use */ + cfs_mem_cache_t *kgn_dgram_cache; /* outgoing datagrams */ + + struct page ***kgn_cksum_map_pages; /* page arrays for mapping pages on checksum */ + __u64 kgn_cksum_npages; /* Number of pages allocated for checksumming */ + atomic_t kgn_nvmap_cksum; /* # times we vmapped for checksums */ + atomic_t kgn_nvmap_short; /* # times we vmapped for short kiov */ + + atomic_t kgn_nkmap_short; /* # time we kmapped for a short kiov */ + long kgn_rdmaq_override; /* bytes per second override */ + + struct kmem_cache *kgn_mbox_cache; /* mailboxes from not-GART */ + + atomic_t kgn_npending_unlink; /* # of peers pending unlink */ + atomic_t kgn_npending_conns; /* # of conns with pending closes */ + atomic_t kgn_npending_detach; /* # of conns with a pending detach */ + +} kgn_data_t; + +extern kgn_data_t kgnilnd_data; +extern kgn_tunables_t kgnilnd_tunables; + +extern void kgnilnd_destroy_peer(kgn_peer_t *peer); +extern void kgnilnd_destroy_conn(kgn_conn_t *conn); +extern void kgnilnd_schedule_conn(kgn_conn_t *conn); + +static inline int +kgnilnd_thread_start(int(*fn)(void *arg), void *arg, char *name, int id) +{ + struct task_struct *thrd = kthread_run(fn, arg, "%s_%02d", name, id); + if (IS_ERR(thrd)) + return PTR_ERR(thrd); + + atomic_inc(&kgnilnd_data.kgn_nthreads); + return 0; +} + +static inline void +kgnilnd_thread_fini(void) +{ + atomic_dec(&kgnilnd_data.kgn_nthreads); +} + +/* like mutex_trylock but with a jiffies spinner. This is to allow certain + * parts of the code to avoid a scheduler trip when the mutex is held + * + * Try to acquire the mutex atomically for 1 jiffie. Returns 1 if the mutex + * has been acquired successfully, and 0 on contention. + * + * NOTE: this function follows the spin_trylock() convention, so + * it is negated to the down_trylock() return values! Be careful + * about this when converting semaphore users to mutexes. + * + * This function must not be used in interrupt context. The + * mutex must be released by the same task that acquired it. 
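+ *
+ * Illustrative use (hypothetical caller and lock, not part of this patch):
+ *
+ *	struct mutex *lock = ...;
+ *
+ *	if (kgnilnd_mutex_trylock(lock)) {
+ *		... short critical section ...
+ *		mutex_unlock(lock);
+ *	} else {
+ *		... contended: back off and retry on a later pass ...
+ *	}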
+ */ +static inline int kgnilnd_mutex_trylock(struct mutex *lock) +{ + int ret; + unsigned long timeout; + + LASSERT(!in_interrupt()); + + for (timeout = jiffies + 1; time_before(jiffies, timeout);) { + + ret = mutex_trylock(lock); + if (ret) + return ret; + } + return 0; +} + +/* Copied from DEBUG_REQ in Lustre - the dance is needed to save stack space */ + +extern void +_kgnilnd_debug_msg(kgn_msg_t *msg, + struct libcfs_debug_msg_data *data, const char *fmt, ... ); + +#define kgnilnd_debug_msg(msgdata, mask, cdls, msg, fmt, a...) \ +do { \ + CFS_CHECK_STACK(msgdata, mask, cdls); \ + \ + if (((mask) & D_CANTMASK) != 0 || \ + ((libcfs_debug & (mask)) != 0 && \ + (libcfs_subsystem_debug & DEBUG_SUBSYSTEM) != 0)) \ + _kgnilnd_debug_msg((msg), msgdata, fmt, ##a); \ +} while(0) + +/* for most callers (level is a constant) this is resolved at compile time */ +#define GNIDBG_MSG(level, msg, fmt, args...) \ +do { \ + if ((level) & (D_ERROR | D_WARNING | D_NETERROR)) { \ + static cfs_debug_limit_state_t cdls; \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, level, &cdls); \ + kgnilnd_debug_msg(&msgdata, level, &cdls, msg, \ + "$$ "fmt" from %s ", ## args, \ + libcfs_nid2str((msg)->gnm_srcnid)); \ + } else { \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, level, NULL); \ + kgnilnd_debug_msg(&msgdata, level, NULL, msg, \ + "$$ "fmt" from %s ", ## args, \ + libcfs_nid2str((msg)->gnm_srcnid)); \ + } \ +} while (0) + +/* user puts 'to nid' in msg for us */ +#define GNIDBG_TOMSG(level, msg, fmt, args...) \ +do { \ + if ((level) & (D_ERROR | D_WARNING | D_NETERROR)) { \ + static cfs_debug_limit_state_t cdls; \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, level, &cdls); \ + kgnilnd_debug_msg(&msgdata, level, &cdls, msg, \ + "$$ "fmt" ", ## args); \ + } else { \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, level, NULL); \ + kgnilnd_debug_msg(&msgdata, level, NULL, msg, \ + "$$ "fmt" ", ## args); \ + } \ +} while (0) + +extern void +_kgnilnd_debug_conn(kgn_conn_t *conn, + struct libcfs_debug_msg_data *data, const char *fmt, ... ); + +#define kgnilnd_debug_conn(msgdata, mask, cdls, conn, fmt, a...) \ +do { \ + CFS_CHECK_STACK(msgdata, mask, cdls); \ + \ + if (((mask) & D_CANTMASK) != 0 || \ + ((libcfs_debug & (mask)) != 0 && \ + (libcfs_subsystem_debug & DEBUG_SUBSYSTEM) != 0)) \ + _kgnilnd_debug_conn((conn), msgdata, fmt, ##a); \ +} while(0) + +/* for most callers (level is a constant) this is resolved at compile time */ +#define GNIDBG_CONN(level, conn, fmt, args...) \ +do { \ + if ((level) & (D_ERROR | D_WARNING | D_NETERROR)) { \ + static cfs_debug_limit_state_t cdls; \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, level, &cdls); \ + kgnilnd_debug_conn(&msgdata, level, &cdls, conn, \ + "$$ "fmt" ", ## args); \ + } else { \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, level, NULL); \ + kgnilnd_debug_conn(&msgdata, level, NULL, conn, \ + "$$ "fmt" ", ## args); \ + } \ +} while (0) + +extern void +_kgnilnd_debug_tx(kgn_tx_t *tx, + struct libcfs_debug_msg_data *data, const char *fmt, ... ); + +#define kgnilnd_debug_tx(msgdata, mask, cdls, tx, fmt, a...) \ +do { \ + CFS_CHECK_STACK(msgdata, mask, cdls); \ + \ + if (((mask) & D_CANTMASK) != 0 || \ + ((libcfs_debug & (mask)) != 0 && \ + (libcfs_subsystem_debug & DEBUG_SUBSYSTEM) != 0)) \ + _kgnilnd_debug_tx((tx), msgdata, fmt, ##a); \ +} while(0) + +/* for most callers (level is a constant) this is resolved at compile time */ +#define GNIDBG_TX(level, tx, fmt, args...) 
\ +do { \ + if ((level) & (D_ERROR | D_WARNING | D_NETERROR)) { \ + static cfs_debug_limit_state_t cdls; \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, level, &cdls); \ + kgnilnd_debug_tx(&msgdata, level, &cdls, tx, \ + "$$ "fmt" ", ## args); \ + } else { \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, level, NULL); \ + kgnilnd_debug_tx(&msgdata, level, NULL, tx, \ + "$$ "fmt" ", ## args); \ + } \ +} while (0) + +#define GNITX_ASSERTF(tx, cond, fmt, a...) \ +({ \ + if (unlikely(!(cond))) { \ + GNIDBG_TX(D_EMERG, tx, "ASSERTION(" #cond ") failed:" fmt, a); \ + LBUG(); \ + } \ +}) + +#define GNILND_IS_QUIESCED \ + (atomic_read(&kgnilnd_data.kgn_nquiesce) == \ + atomic_read(&kgnilnd_data.kgn_nthreads)) + +#define KGNILND_SPIN_QUIESCE \ +do { \ + /* E.T phone home */ \ + atomic_inc(&kgnilnd_data.kgn_nquiesce); \ + CDEBUG(D_NET, "Waiting for thread pause to be over...\n"); \ + while (kgnilnd_data.kgn_quiesce_trigger) { \ + set_current_state(TASK_INTERRUPTIBLE); \ + cfs_schedule_timeout_and_set_state(TASK_INTERRUPTIBLE, \ + cfs_time_seconds(1)); \ + } \ + /* Mom, my homework is done */ \ + CDEBUG(D_NET, "Waking up from thread pause\n"); \ + atomic_dec(&kgnilnd_data.kgn_nquiesce); \ +} while(0) + +/* use macros for addref/decref to get the calling function name in the CDEBUG */ +#ifndef LIBCFS_DEBUG +#error "this code uses actions inside LASSERT for ref counting" +#endif + +#define kgnilnd_admin_addref(atomic) \ +do { \ + int val = atomic_inc_return(&atomic); \ + LASSERTF(val > 0, #atomic " refcount %d\n", val); \ + CDEBUG(D_NETTRACE, #atomic " refcount %d\n", val); \ +} while (0) + +#define kgnilnd_admin_decref(atomic) \ +do { \ + int val = atomic_dec_return(&atomic); \ + LASSERTF(val >=0, #atomic " refcount %d\n", val); \ + CDEBUG(D_NETTRACE, #atomic " refcount %d\n", val); \ +}while (0) + +#define kgnilnd_net_addref(net) \ +do { \ + int val = atomic_inc_return(&net->gnn_refcount); \ + LASSERTF(val > 1, "net %p refcount %d\n", net, val); \ + CDEBUG(D_NETTRACE, "net %p->%s++ (%d)\n", net, \ + libcfs_nid2str(net->gnn_ni->ni_nid), val); \ +} while (0) + +#define kgnilnd_net_decref(net) \ +do { \ + int val = atomic_dec_return(&net->gnn_refcount); \ + LASSERTF(val >= 0, "net %p refcount %d\n", net, val); \ + CDEBUG(D_NETTRACE, "net %p->%s-- (%d)\n", net, \ + libcfs_nid2str(net->gnn_ni->ni_nid), val); \ +} while (0) + +#define kgnilnd_peer_addref(peer) \ +do { \ + int val = atomic_inc_return(&peer->gnp_refcount); \ + LASSERTF(val > 1, "peer %p refcount %d\n", peer, val); \ + CDEBUG(D_NETTRACE, "peer %p->%s++ (%d)\n", peer, \ + libcfs_nid2str(peer->gnp_nid), val); \ +} while (0) + +#define kgnilnd_peer_decref(peer) \ +do { \ + int val = atomic_dec_return(&peer->gnp_refcount); \ + LASSERTF(val >= 0, "peer %p refcount %d\n", peer, val); \ + CDEBUG(D_NETTRACE, "peer %p->%s--(%d)\n", peer, \ + libcfs_nid2str(peer->gnp_nid), val); \ + if (atomic_read(&peer->gnp_refcount) == 0) \ + kgnilnd_destroy_peer(peer); \ +} while(0) + +#define kgnilnd_conn_addref(conn) \ +do { \ + int val; \ + \ + smp_wmb(); \ + val = atomic_inc_return(&conn->gnc_refcount); \ + LASSERTF(val >= 0, "conn %p refc %d to %s\n", \ + conn, val, \ + conn->gnc_peer \ + ? libcfs_nid2str(conn->gnc_peer->gnp_nid) \ + : ""); \ + CDEBUG(D_NETTRACE, "conn %p->%s++ (%d)\n", conn, \ + conn->gnc_peer \ + ? libcfs_nid2str(conn->gnc_peer->gnp_nid) \ + : "", \ + val); \ +} while (0) + +/* we hijack conn_decref && gnc_refcount = 1 to allow us to push the conn + * through the scheduler thread to get the EP destroyed. 
This avoids some + * messy semaphore business and allows us to reuse the connd_list and existing + * linkage and avoid creating extra lists just for destroying EPs */ + +/* Safety Disclaimer: + * Q: If we decrement the refcount and then check it again, is it possible that + * another caller could have passed through this macro concurrently? If so, + * then it is possible that both will attempt to call kgnilnd_destroy_conn(). + * + * A: Yes, entirely possible in most cases, but we can't get concurrent users + * once we are refcount <= 2. It hinges around gnc_state and membership of + * gnc_hashlist. There are two ways to find a connection - either ask for + * it from the peer, kgnilnd_find_conn_locked(peer) or from the CQ id, + * kgnilnd_cqid2conn_locked(id). While a conn is live, we'll have at least + * 4 refcounts + * + * - #1 from create (kgnilnd_create_conn) + * - #2 for EP (kgnilnd_create_conn) + * - #3 - living on peer (gnc_list, kgnilnd_finish_connect) + * - #4 living in global hash (gnc_hashlist, kgnilnd_finish_connect). + * + * Actually, only 3 live, as at the end of kgnilnd_finish_connect, we drop: + * - #1 - the ref the dgram inherited from kgnilnd_create_conn. + * + * There could be more from TX descriptors during the lifetime of a live + * conn. + * + * If we nuke the conn before finish_connect, we won't have parallel paths + * because nobody besides the dgram handler for the single outstanding + * dgram can find the connection as it isn't in any searchable tables yet. + * + * This leaves connection close, we'll drop 2 refs (#4 and #3) but only + * after calling kgnilnd_schedule_conn, which would add a new ref (#5). At + * this point gnc_refcount=2 (#2, #5). We have a 'maybe' send of the CLOSE + * now on the next scheduler loop, this could be #6 (schedule_conn again) + * and #7 (TX on gnc_fmaq). Both would be cleared quickly as that TX is + * sent. Now the gnc_state == CLOSED, so we hit + * kgnilnd_complete_closed_conn. At this point, nobody can 'find' this conn + * - we've nuked them from the peer and CQ id tables, so we own them and + * are guaranteed serial access - hence the complete lack of conn list + * locking in kgnilnd_complete_closed_conn. We are free then to mark the + * conn DESTROY_EP (add #6 for schedule_conn), then lose #5 in + * kgnilnd_process_conns. Then the next scheduler loop would call + * kgnilnd_destroy_conn_ep (drop #2 for EP) and lose #6 (refcount=0) in + * kgnilnd_process_conns. + * + * Clearly, we are totally safe. Clearly. + */ + +#define kgnilnd_conn_decref(conn) \ +do { \ + int val; \ + \ + smp_wmb(); \ + val = atomic_dec_return(&conn->gnc_refcount); \ + LASSERTF(val >= 0, "conn %p refc %d to %s\n", \ + conn, val, \ + conn->gnc_peer \ + ? libcfs_nid2str(conn->gnc_peer->gnp_nid) \ + : ""); \ + CDEBUG(D_NETTRACE, "conn %p->%s-- (%d)\n", conn, \ + conn->gnc_peer \ + ? 
libcfs_nid2str(conn->gnc_peer->gnp_nid) \ + : "", \ + val); \ + smp_rmb(); \ + if ((atomic_read(&conn->gnc_refcount) == 1) && \ + (conn->gnc_ephandle != NULL) && \ + (conn->gnc_state != GNILND_CONN_DESTROY_EP)) { \ + set_mb(conn->gnc_state, GNILND_CONN_DESTROY_EP); \ + kgnilnd_schedule_conn(conn); \ + } else if (atomic_read(&conn->gnc_refcount) == 0) { \ + kgnilnd_destroy_conn(conn); \ + } \ +} while (0) + +static inline struct list_head * +kgnilnd_nid2peerlist(lnet_nid_t nid) +{ + unsigned int hash = ((unsigned int)LNET_NIDADDR(nid)) % *kgnilnd_tunables.kgn_peer_hash_size; + + RETURN(&kgnilnd_data.kgn_peers[hash]); +} + +static inline struct list_head * +kgnilnd_netnum2netlist(__u16 netnum) +{ + unsigned int hash = ((unsigned int) netnum) % *kgnilnd_tunables.kgn_net_hash_size; + + RETURN(&kgnilnd_data.kgn_nets[hash]); +} + +static inline int +kgnilnd_peer_active(kgn_peer_t *peer) +{ + /* Am I in the peer hash table? */ + return (!list_empty(&peer->gnp_list)); +} + +/* need write_lock on kgn_peer_conn_lock */ +static inline int +kgnilnd_can_unlink_peer_locked(kgn_peer_t *peer) +{ + CDEBUG(D_NET, "peer 0x%p->%s conns? %d tx? %d\n", + peer, libcfs_nid2str(peer->gnp_nid), + !list_empty(&peer->gnp_conns), + !list_empty(&peer->gnp_tx_queue)); + + /* kgn_peer_conn_lock protects us from conflict with + * kgnilnd_peer_notify and gnp_persistent */ + RETURN ((list_empty(&peer->gnp_conns)) && + (list_empty(&peer->gnp_tx_queue))); +} + +/* returns positive if error was for a clean shutdown of conn */ +static inline int +kgnilnd_conn_clean_errno(int errno) +{ + /* - ESHUTDOWN - LND is unloading + * - EUCLEAN - admin requested via "lctl del_peer" + * - ENETRESET - admin requested via "lctl disconnect" + * - ENOTRECOVERABLE - stack reset + * - EISCONN - cleared via "lctl push" + * not doing ESTALE - that isn't clean */ + RETURN ((errno == 0) || + (errno == -ESHUTDOWN) || + (errno == -EUCLEAN) || + (errno == -ENETRESET) || + (errno == -EISCONN) || + (errno == -ENOTRECOVERABLE)); +} + +/* returns positive if error results in purgatory hold */ +static inline int +kgnilnd_check_purgatory_errno(int errno) +{ + /* We don't want to save the purgatory lists these cases: + * - EUCLEAN - admin requested via "lctl del_peer" + * - ESHUTDOWN - LND is unloading + */ + RETURN ((errno != -ESHUTDOWN) && + (errno != -EUCLEAN)); + +} + +/* returns positive if a purgatory hold is needed */ +static inline int +kgnilnd_check_purgatory_conn(kgn_conn_t *conn) +{ + int loopback = 0; + + if (conn->gnc_peer) { + loopback = conn->gnc_peer->gnp_nid == + conn->gnc_peer->gnp_net->gnn_ni->ni_nid; + } else { + /* short circuit - a conn that didn't complete + * setup never needs a purgatory hold */ + RETURN(0); + } + CDEBUG(D_NETTRACE, "conn 0x%p->%s loopback %d close_recvd %d\n", + conn, conn->gnc_peer ? + libcfs_nid2str(conn->gnc_peer->gnp_nid) : + "", + loopback, conn->gnc_close_recvd); + + /* we only use a purgatory hold if we've not received the CLOSE msg + * from our peer - without that message, we can't know the state of + * the other end of this connection and must put it into purgatory + * to prevent reuse and corruption. 
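+ * (e.g. a peer node that dies outright never sends CLOSE, so its conn is
+ *  held here; an admin "lctl del_peer" sets -EUCLEAN and, per
+ *  kgnilnd_check_purgatory_errno() above, skips the hold.)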
+ * The theory is that a TX error can be communicated in all other cases + */ + RETURN(likely(!loopback) && !conn->gnc_close_recvd && + kgnilnd_check_purgatory_errno(conn->gnc_error)); +} + +static inline const char * +kgnilnd_tx_state2str(kgn_tx_list_state_t state); + +static inline struct list_head * +kgnilnd_tx_state2list(kgn_peer_t *peer, kgn_conn_t *conn, + kgn_tx_list_state_t to_state) +{ + switch (to_state) { + case GNILND_TX_PEERQ: + return &peer->gnp_tx_queue; + case GNILND_TX_FMAQ: + return &conn->gnc_fmaq; + case GNILND_TX_LIVE_FMAQ: + case GNILND_TX_LIVE_RDMAQ: + case GNILND_TX_DYING: + return NULL; + case GNILND_TX_MAPQ: + return &conn->gnc_device->gnd_map_tx; + case GNILND_TX_RDMAQ: + return &conn->gnc_device->gnd_rdmaq; + default: + /* IDLE, FREED or ALLOCD is not valid "on list" state */ + CERROR("invalid state requested: %s\n", + kgnilnd_tx_state2str(to_state)); + LBUG(); + break; + } +} + +/* should hold tx, conn or peer lock when calling */ +static inline void +kgnilnd_tx_add_state_locked(kgn_tx_t *tx, kgn_peer_t *peer, + kgn_conn_t *conn, kgn_tx_list_state_t state, + int add_tail) +{ + struct list_head *list = NULL; + + /* make sure we have a sane TX state to start */ + GNITX_ASSERTF(tx, (tx->tx_list_p == NULL && + tx->tx_list_state == GNILND_TX_ALLOCD) && + list_empty(&tx->tx_list), + "bad state with tx_list %s", + list_empty(&tx->tx_list) ? "empty" : "not empty"); + + /* WTF - you are already on that state buttmunch */ + GNITX_ASSERTF(tx, state != tx->tx_list_state, + "already at %s", kgnilnd_tx_state2str(state)); + + /* get proper list from the state requested */ + list = kgnilnd_tx_state2list(peer, conn, state); + + /* add refcount */ + switch (state) { + case GNILND_TX_PEERQ: + kgnilnd_peer_addref(peer); + break; + case GNILND_TX_ALLOCD: + /* no refs needed */ + break; + case GNILND_TX_FMAQ: + kgnilnd_conn_addref(conn); + break; + case GNILND_TX_MAPQ: + atomic_inc(&conn->gnc_device->gnd_nq_map); + kgnilnd_conn_addref(conn); + break; + case GNILND_TX_LIVE_FMAQ: + atomic_inc(&conn->gnc_nlive_fma); + kgnilnd_conn_addref(conn); + break; + case GNILND_TX_LIVE_RDMAQ: + atomic_inc(&conn->gnc_nlive_rdma); + kgnilnd_conn_addref(conn); + break; + case GNILND_TX_RDMAQ: + atomic_inc(&conn->gnc_nq_rdma); + kgnilnd_conn_addref(conn); + break; + case GNILND_TX_DYING: + kgnilnd_conn_addref(conn); + break; + default: + CERROR("invalid state requested: %s\n", + kgnilnd_tx_state2str(state)); + LBUG(); + break;; + } + + /* if this changes, change kgnilnd_alloc_tx */ + tx->tx_list_state = state; + + /* some states don't have lists - we track them in the per conn + * TX table instead. Waste not, want not! 
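+ * (GNILND_TX_LIVE_FMAQ, GNILND_TX_LIVE_RDMAQ and GNILND_TX_DYING map to
+ *  NULL in kgnilnd_tx_state2list() above, so they take the dummy
+ *  tx_list_p branch below.)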
*/ + if (list != NULL) { + tx->tx_list_p = list; + if (add_tail) + list_add_tail(&tx->tx_list, list); + else + list_add(&tx->tx_list, list); + } else { + /* set dummy list_p to make book keeping happy and let debugging + * be a hair easier */ + tx->tx_list_p = (void *)state; + } + + GNIDBG_TX(D_NET, tx, "onto %s->0x%p", + kgnilnd_tx_state2str(state), list); +} + +static inline void +kgnilnd_tx_del_state_locked(kgn_tx_t *tx, kgn_peer_t *peer, + kgn_conn_t *conn, kgn_tx_list_state_t new_state) +{ + /* These is only 1 "off-list" state */ + GNITX_ASSERTF(tx, new_state == GNILND_TX_ALLOCD, + "invalid new_state %s", kgnilnd_tx_state2str(new_state)); + + /* new_state == ALLOCD means we are deallocating this tx, + * so make sure it was on a valid list to start with */ + GNITX_ASSERTF(tx, (tx->tx_list_p != NULL) && + (((tx->tx_list_state == GNILND_TX_LIVE_FMAQ) || + (tx->tx_list_state == GNILND_TX_LIVE_RDMAQ) || + (tx->tx_list_state == GNILND_TX_DYING)) == list_empty(&tx->tx_list)), + "bad state", NULL); + + GNIDBG_TX(D_NET, tx, "off %p", tx->tx_list_p); + + /* drop refcount */ + switch (tx->tx_list_state) { + case GNILND_TX_PEERQ: + kgnilnd_peer_decref(peer); + break; + case GNILND_TX_FREED: + case GNILND_TX_IDLE: + case GNILND_TX_ALLOCD: + /* no refs needed */ + break; + case GNILND_TX_DYING: + kgnilnd_conn_decref(conn); + break; + case GNILND_TX_FMAQ: + kgnilnd_conn_decref(conn); + break; + case GNILND_TX_MAPQ: + atomic_dec(&conn->gnc_device->gnd_nq_map); + kgnilnd_conn_decref(conn); + break; + case GNILND_TX_LIVE_FMAQ: + atomic_dec(&conn->gnc_nlive_fma); + kgnilnd_conn_decref(conn); + break; + case GNILND_TX_LIVE_RDMAQ: + atomic_dec(&conn->gnc_nlive_rdma); + kgnilnd_conn_decref(conn); + break; + case GNILND_TX_RDMAQ: + atomic_dec(&conn->gnc_nq_rdma); + kgnilnd_conn_decref(conn); + /* don't need to assert on default, already did in set */ + } + + /* for ALLOCD, this might already be true, but no harm doing it again */ + list_del_init(&tx->tx_list); + tx->tx_list_p = NULL; + tx->tx_list_state = new_state; +} + +static inline int +kgnilnd_tx_mapped(kgn_tx_t *tx) +{ + return (tx->tx_buftype == GNILND_BUF_VIRT_MAPPED || + tx->tx_buftype == GNILND_BUF_PHYS_MAPPED); +} + +static inline struct list_head * +kgnilnd_cqid2connlist(__u32 cqid) +{ + unsigned int hash = cqid % *kgnilnd_tunables.kgn_peer_hash_size; + + return (&kgnilnd_data.kgn_conns [hash]); +} + +static inline kgn_conn_t * +kgnilnd_cqid2conn_locked(__u32 cqid) +{ + struct list_head *conns = kgnilnd_cqid2connlist(cqid); + struct list_head *tmp; + kgn_conn_t *conn; + + list_for_each(tmp, conns) { + conn = list_entry(tmp, kgn_conn_t, gnc_hashlist); + + if (conn->gnc_cqid == cqid) + return conn; + } + + return NULL; +} + +/* returns 1..GNILND_MAX_CQID on success, 0 on failure */ +static inline __u32 +kgnilnd_get_cqid_locked(void) +{ + int looped = 0; + __u32 cqid; + + do { + cqid = kgnilnd_data.kgn_next_cqid++; + if (kgnilnd_data.kgn_next_cqid >= GNILND_MAX_CQID) { + if (looped) { + return 0; + } + kgnilnd_data.kgn_next_cqid = 1; + looped = 1; + } + } while (kgnilnd_cqid2conn_locked(cqid) != NULL); + + return cqid; +} + +static inline void +kgnilnd_validate_tx_ev_id(kgn_tx_ev_id_t *ev_id, kgn_tx_t **txp, kgn_conn_t **connp) +{ + kgn_tx_t *tx = NULL; + kgn_conn_t *conn = NULL; + + /* set to NULL so any early return is an error */ + *txp = NULL; + *connp = NULL; + + LASSERTF((ev_id->txe_idx > 0) && + (ev_id->txe_idx < GNILND_MAX_MSG_ID), + "bogus txe_idx %d >= %d\n", + ev_id->txe_idx, GNILND_MAX_MSG_ID); + + LASSERTF((ev_id->txe_cqid > 0) && + 
(ev_id->txe_cqid < GNILND_MAX_CQID), + "bogus txe_cqid %d >= %d\n", + ev_id->txe_cqid, GNILND_MAX_CQID); + + read_lock(&kgnilnd_data.kgn_peer_conn_lock); + conn = kgnilnd_cqid2conn_locked(ev_id->txe_cqid); + + if (conn == NULL) { + /* Conn was destroyed? */ + read_unlock(&kgnilnd_data.kgn_peer_conn_lock); + CDEBUG(D_NET, "CQID %d lookup failed\n", ev_id->txe_cqid); + return; + } + /* just insurance */ + kgnilnd_conn_addref(conn); + read_unlock(&kgnilnd_data.kgn_peer_conn_lock); + + /* we know this is safe - as the TX won't be reused until AFTER + * the conn is unlinked from the cqid hash, so we can use the TX + * (serializing to avoid any cache oddness) freely from the conn tx ref table */ + + spin_lock(&conn->gnc_tx_lock); + tx = conn->gnc_tx_ref_table[ev_id->txe_idx]; + spin_unlock(&conn->gnc_tx_lock); + + /* We could have a tx that was cleared out by other forces + * lctl disconnect or del_peer. */ + if (tx == NULL) { + CNETERR("txe_idx %d is gone, ignoring event\n", ev_id->txe_idx); + kgnilnd_conn_decref(conn); + return; + } + + /* check tx->tx_msg magic to make sure kgni didn't eat it */ + GNITX_ASSERTF(tx, tx->tx_msg.gnm_magic == GNILND_MSG_MAGIC, + "came back from kgni with bad magic %x", tx->tx_msg.gnm_magic); + + GNITX_ASSERTF(tx, tx->tx_id.txe_idx == ev_id->txe_idx, + "conn 0x%p->%s tx_ref_table hosed: wanted txe_idx %d " + "found tx %p txe_idx %d", + conn, libcfs_nid2str(conn->gnc_peer->gnp_nid), + ev_id->txe_idx, tx, tx->tx_id.txe_idx); + + GNITX_ASSERTF(tx, tx->tx_conn != NULL, "tx with NULL connection", NULL); + + GNITX_ASSERTF(tx, tx->tx_conn == conn, "tx conn does not equal conn", NULL); + + *txp = tx; + *connp = conn; + + GNIDBG_TX(D_NET, tx, "validated to 0x%p", conn); +} + +/* set_normalized_timepsec isn't exported from the kernel, so + * we need to do the same thing inline */ +static inline struct timespec +kgnilnd_ts_sub(struct timespec lhs, struct timespec rhs) +{ + time_t sec; + long nsec; + struct timespec ts; + + sec = lhs.tv_sec - rhs.tv_sec; + nsec = lhs.tv_nsec - rhs.tv_nsec; + + while (nsec >= NSEC_PER_SEC) { + nsec -= NSEC_PER_SEC; + ++sec; + } + while (nsec < 0) { + nsec += NSEC_PER_SEC; + --sec; + } + ts.tv_sec = sec; + ts.tv_nsec = nsec; + return ts; +} + +static inline int +kgnilnd_count_list(struct list_head *q) +{ + struct list_head *e; + int n = 0; + + list_for_each(e, q) { + n++; + } + + return n; +} + +/* kgnilnd_find_net adds a reference to the net it finds + * this is so the net will not be removed before the calling function + * has time to use the data returned. 
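+ * (A sketch of the expected calling pattern - illustrative only, not part
+ *  of this patch; 'nid' is whatever NID the caller is resolving:
+ *
+ *	kgn_net_t *net;
+ *
+ *	if (kgnilnd_find_net(nid, &net) == 0) {
+ *		... use net ...
+ *		kgnilnd_net_decref(net);
+ *	}
+ * )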
This reference needs to be released + * by the calling function once it has finished using the returned net + */ + +static inline int +kgnilnd_find_net(lnet_nid_t nid, kgn_net_t **netp) +{ + kgn_net_t *net; + int rc; + + rc = down_read_trylock(&kgnilnd_data.kgn_net_rw_sem); + + if (!rc) { + return -ESHUTDOWN; + } + + list_for_each_entry(net, kgnilnd_netnum2netlist(LNET_NETNUM(LNET_NIDNET(nid))), gnn_list) { + if (!net->gnn_shutdown && LNET_NIDNET(net->gnn_ni->ni_nid) == LNET_NIDNET(nid)) { + kgnilnd_net_addref(net); + up_read(&kgnilnd_data.kgn_net_rw_sem); + *netp = net; + return 0; + } + } + + up_read(&kgnilnd_data.kgn_net_rw_sem); + + return -ENONET; +} + +#ifdef CONFIG_DEBUG_SLAB +#define KGNILND_POISON(ptr, c, s) do {} while(0) +#else +#define KGNILND_POISON(ptr, c, s) memset(ptr, c, s) +#endif + +int kgnilnd_dev_init(kgn_device_t *dev); +void kgnilnd_dev_fini(kgn_device_t *dev); +int kgnilnd_startup(lnet_ni_t *ni); +void kgnilnd_shutdown(lnet_ni_t *ni); +int kgnilnd_base_startup(void); +void kgnilnd_base_shutdown(void); + +int kgnilnd_allocate_phys_fmablk(kgn_device_t *device); +int kgnilnd_map_phys_fmablk(kgn_device_t *device); +void kgnilnd_unmap_phys_fmablk(kgn_device_t *device); +void kgnilnd_free_phys_fmablk(kgn_device_t *device); + +int kgnilnd_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg); +void kgnilnd_query(lnet_ni_t *ni, lnet_nid_t nid, cfs_time_t *when); +int kgnilnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg); +int kgnilnd_eager_recv(lnet_ni_t *ni, void *private, + lnet_msg_t *lntmsg, void **new_private); +int kgnilnd_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, + int delayed, unsigned int niov, + struct iovec *iov, lnet_kiov_t *kiov, + unsigned int offset, unsigned int mlen, unsigned int rlen); + +__u16 kgnilnd_cksum_kiov(unsigned int nkiov, lnet_kiov_t *kiov, unsigned int offset, unsigned int nob, int dump_blob); + +/* purgatory functions */ +void kgnilnd_add_purgatory_locked(kgn_conn_t *conn, kgn_peer_t *peer); +void kgnilnd_mark_for_detach_purgatory_all_locked(kgn_peer_t *peer); +void kgnilnd_detach_purgatory_locked(kgn_conn_t *conn, struct list_head *conn_list); +void kgnilnd_release_purgatory_list(struct list_head *conn_list); + +void kgnilnd_update_reaper_timeout(long timeout); +void kgnilnd_unmap_buffer(kgn_tx_t *tx, int error); +kgn_tx_t *kgnilnd_new_tx_msg(int type, lnet_nid_t source); +void kgnilnd_tx_done(kgn_tx_t *tx, int completion); +void kgnilnd_txlist_done(struct list_head *txlist, int error); +void kgnilnd_unlink_peer_locked(kgn_peer_t *peer); +void kgnilnd_schedule_conn(kgn_conn_t *conn); +void kgnilnd_schedule_process_conn(kgn_conn_t *conn, int sched_intent); + +void kgnilnd_schedule_dgram(kgn_device_t *dev); +int kgnilnd_create_peer_safe(kgn_peer_t **peerp, lnet_nid_t nid, kgn_net_t *net); +void kgnilnd_add_peer_locked(lnet_nid_t nid, kgn_peer_t *new_stub_peer, kgn_peer_t **peerp); +int kgnilnd_add_peer(kgn_net_t *net, lnet_nid_t nid, kgn_peer_t **peerp); + +kgn_peer_t *kgnilnd_find_peer_locked(lnet_nid_t nid); +int kgnilnd_del_conn_or_peer(kgn_net_t *net, lnet_nid_t nid, int command, int error); +void kgnilnd_peer_increase_reconnect_locked(kgn_peer_t *peer); +void kgnilnd_queue_reply(kgn_conn_t *conn, kgn_tx_t *tx); +void kgnilnd_queue_tx(kgn_conn_t *conn, kgn_tx_t *tx); +void kgnilnd_launch_tx(kgn_tx_t *tx, kgn_net_t *net, lnet_process_id_t *target); +int kgnilnd_send_mapped_tx(kgn_tx_t *tx, int try_map_if_full); +void kgnilnd_consume_rx(kgn_rx_t *rx); + +void kgnilnd_schedule_device(kgn_device_t *dev); +void 
kgnilnd_device_callback(__u32 devid, __u64 arg); +void kgnilnd_schedule_device_timer(unsigned long arg); + +int kgnilnd_reaper(void *arg); +int kgnilnd_scheduler(void *arg); +int kgnilnd_dgram_mover(void *arg); + +int kgnilnd_create_conn(kgn_conn_t **connp, kgn_device_t *dev); +int kgnilnd_conn_isdup_locked(kgn_peer_t *peer, kgn_conn_t *newconn); +kgn_conn_t *kgnilnd_find_conn_locked(kgn_peer_t *peer); +int kgnilnd_get_conn(kgn_conn_t **connp, kgn_peer_t); +kgn_conn_t *kgnilnd_find_or_create_conn_locked(kgn_peer_t *peer); +void kgnilnd_peer_cancel_tx_queue(kgn_peer_t *peer); +void kgnilnd_cancel_peer_connect_locked(kgn_peer_t *peer, struct list_head *zombies); +int kgnilnd_close_stale_conns_locked(kgn_peer_t *peer, kgn_conn_t *newconn); +void kgnilnd_peer_alive(kgn_peer_t *peer); +void kgnilnd_peer_notify(kgn_peer_t *peer, int error); +void kgnilnd_close_conn_locked(kgn_conn_t *conn, int error); +void kgnilnd_close_conn(kgn_conn_t *conn, int error); +void kgnilnd_complete_closed_conn(kgn_conn_t *conn); +void kgnilnd_destroy_conn_ep(kgn_conn_t *conn); + +int kgnilnd_close_peer_conns_locked(kgn_peer_t *peer, int why); + +int kgnilnd_tunables_init(void); +void kgnilnd_tunables_fini(void); +void kgnilnd_init_msg(kgn_msg_t *msg, int type, lnet_nid_t source); + +void kgnilnd_bump_timeouts(__u32 nap_time, char *reason); +void kgnilnd_pause_threads(void); +int kgnilnd_hw_in_quiesce(void); +int kgnilnd_check_hw_quiesce(void); +void kgnilnd_quiesce_wait(char *reason); +void kgnilnd_quiesce_end_callback(gni_nic_handle_t nic_handle, uint64_t msecs); +int kgnilnd_ruhroh_thread(void *arg); +void kgnilnd_reset_stack(void); +void kgnilnd_critical_error(gni_err_handle_t err_handle); + +void kgnilnd_insert_sysctl(void); +void kgnilnd_remove_sysctl(void); +void kgnilnd_proc_init(void); +void kgnilnd_proc_fini(void); + +/* gnilnd_conn.c */ +void kgnilnd_release_mbox(kgn_conn_t *conn, int purgatory_hold); + +int kgnilnd_find_and_cancel_dgram(kgn_device_t *dev, lnet_nid_t dst_nid); +void kgnilnd_cancel_dgram_locked(kgn_dgram_t *dgram); +void kgnilnd_release_dgram(kgn_device_t *dev, kgn_dgram_t *dgram); + +int kgnilnd_setup_wildcard_dgram(kgn_device_t *dev); +int kgnilnd_cancel_net_dgrams(kgn_net_t *net); +int kgnilnd_cancel_wc_dgrams(kgn_device_t *dev); +void kgnilnd_wait_for_canceled_dgrams(kgn_device_t *dev); + +int kgnilnd_dgram_waitq(void *arg); + +int kgnilnd_set_conn_params(kgn_dgram_t *dgram); + +/* struct2str functions - we don't use a default: case to cause the compile + * to fail if there is a missing case. 
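+ * (The assumption is a -Wswitch style warning promoted to an error by the
+ *  build flags: adding, say, a new GNILND_MSG_* value without a matching
+ *  DO_TYPE() line in kgnilnd_msgtype2str() below then breaks the build
+ *  instead of silently falling through to the catch-all return at runtime.)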
This allows us to hide these down here + * out of the way but ensure we'll catch any updates to the enum/types + * above */ + +#define DO_TYPE(x) case x: return #x; +static inline const char * +kgnilnd_fmablk_state2str(kgn_fmablk_state_t state) +{ + /* Only want single char string for this */ + switch (state) { + case GNILND_FMABLK_IDLE: + return "I"; + case GNILND_FMABLK_PHYS: + return "P"; + case GNILND_FMABLK_VIRT: + return "V"; + case GNILND_FMABLK_FREED: + return "F"; + } + return ""; +} + +static inline const char * +kgnilnd_msgtype2str(int type) +{ + switch (type) { + DO_TYPE(GNILND_MSG_NONE); + DO_TYPE(GNILND_MSG_NOOP); + DO_TYPE(GNILND_MSG_IMMEDIATE); + DO_TYPE(GNILND_MSG_PUT_REQ); + DO_TYPE(GNILND_MSG_PUT_NAK); + DO_TYPE(GNILND_MSG_PUT_ACK); + DO_TYPE(GNILND_MSG_PUT_DONE); + DO_TYPE(GNILND_MSG_GET_REQ); + DO_TYPE(GNILND_MSG_GET_NAK); + DO_TYPE(GNILND_MSG_GET_DONE); + DO_TYPE(GNILND_MSG_CLOSE); + } + return ""; +} + +static inline const char * +kgnilnd_tx_state2str(kgn_tx_list_state_t state) +{ + switch (state) { + DO_TYPE(GNILND_TX_IDLE); + DO_TYPE(GNILND_TX_ALLOCD); + DO_TYPE(GNILND_TX_PEERQ); + DO_TYPE(GNILND_TX_MAPQ); + DO_TYPE(GNILND_TX_FMAQ); + DO_TYPE(GNILND_TX_LIVE_FMAQ); + DO_TYPE(GNILND_TX_RDMAQ); + DO_TYPE(GNILND_TX_LIVE_RDMAQ); + DO_TYPE(GNILND_TX_DYING); + DO_TYPE(GNILND_TX_FREED); + } + return ""; +} + +static inline const char * +kgnilnd_conn_state2str(kgn_conn_t *conn) +{ + kgn_conn_state_t state = conn->gnc_state; + switch (state) { + DO_TYPE(GNILND_CONN_DUMMY); + DO_TYPE(GNILND_CONN_LISTEN); + DO_TYPE(GNILND_CONN_CONNECTING); + DO_TYPE(GNILND_CONN_ESTABLISHED); + DO_TYPE(GNILND_CONN_CLOSING); + DO_TYPE(GNILND_CONN_CLOSED); + DO_TYPE(GNILND_CONN_DONE); + DO_TYPE(GNILND_CONN_DESTROY_EP); + } + return ""; +} + +static inline const char * +kgnilnd_connreq_type2str(kgn_connreq_t *connreq) +{ + kgn_connreq_type_t type = connreq->gncr_type; + + switch (type) { + DO_TYPE(GNILND_CONNREQ_REQ); + DO_TYPE(GNILND_CONNREQ_NAK); + DO_TYPE(GNILND_CONNREQ_CLOSE); + } + return ""; +} + +static inline const char * +kgnilnd_dgram_state2str(kgn_dgram_t *dgram) +{ + kgn_dgram_state_t state = dgram->gndg_state; + + switch (state) { + DO_TYPE(GNILND_DGRAM_USED); + DO_TYPE(GNILND_DGRAM_POSTING); + DO_TYPE(GNILND_DGRAM_POSTED); + DO_TYPE(GNILND_DGRAM_PROCESSING); + DO_TYPE(GNILND_DGRAM_DONE); + DO_TYPE(GNILND_DGRAM_CANCELED); + } + return ""; +} + +static inline const char * +kgnilnd_dgram_type2str(kgn_dgram_t *dgram) +{ + kgn_dgram_type_t type = dgram->gndg_type; + + switch (type) { + DO_TYPE(GNILND_DGRAM_REQ); + DO_TYPE(GNILND_DGRAM_WC_REQ); + DO_TYPE(GNILND_DGRAM_NAK); + DO_TYPE(GNILND_DGRAM_CLOSE); + } + return ""; +} + + +#undef DO_TYPE + +/* API wrapper functions - include late to pick up all of the other defines */ +#include "gnilnd_api_wrap.h" + +#endif /* _GNILND_GNILND_H_ */ diff --git a/lnet/klnds/gnilnd/gnilnd_api_wrap.h b/lnet/klnds/gnilnd/gnilnd_api_wrap.h new file mode 100644 index 0000000..e7ba9ab --- /dev/null +++ b/lnet/klnds/gnilnd/gnilnd_api_wrap.h @@ -0,0 +1,1505 @@ +/* + * Copyright (C) 2009-2012 Cray, Inc. + * + * Author: Nic Henke + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ +#ifndef _GNILND_API_WRAP_H +#define _GNILND_API_WRAP_H + +/* LNet is allocated failure locations 0xe000 to 0xffff */ + +/* GNILND has 0xf0XX */ +#define CFS_FAIL_GNI 0xf000 +#define CFS_FAIL_GNI_PHYS_MAP 0xf001 +#define CFS_FAIL_GNI_VIRT_MAP 0xf002 +#define CFS_FAIL_GNI_GET_UNMAP 0xf003 +#define CFS_FAIL_GNI_PUT_UNMAP 0xf004 +#define CFS_FAIL_GNI_MAP_TX 0xf005 +#define CFS_FAIL_GNI_SMSG_SEND 0xf006 +#define CFS_FAIL_GNI_CLOSE_SEND 0xf007 +#define CFS_FAIL_GNI_CDM_CREATE 0xf008 +#define CFS_FAIL_GNI_CDM_DESTROY 0xf009 +#define CFS_FAIL_GNI_CDM_ATTACH 0xf00a +#define CFS_FAIL_GNI_CQ_CREATE 0xf00b +#define CFS_FAIL_GNI_CQ_DESTROY 0xf00c +#define CFS_FAIL_GNI_EP_BIND 0xf00d +#define CFS_FAIL_GNI_EP_UNBIND 0xf00e +#define CFS_FAIL_GNI_EP_SET_EVDATA 0xf00f +#define CFS_FAIL_GNI_SMSG_INIT 0xf010 +#define CFS_FAIL_GNI_SMSG_RELEASE 0xf011 +#define CFS_FAIL_GNI_POST_RDMA 0xf012 +#define CFS_FAIL_GNI_GET_COMPLETED 0xf013 +#define CFS_FAIL_GNI_EP_DESTROY 0xf015 +#define CFS_FAIL_GNI_VIRT_UNMAP 0xf016 +#define CFS_FAIL_GNI_MDD_RELEASE 0xf017 +#define CFS_FAIL_GNI_NOOP_SEND 0xf018 +#define CFS_FAIL_GNI_ERR_SUBSCRIBE 0xf01a +#define CFS_FAIL_GNI_QUIESCE_RACE 0xf01b +#define CFS_FAIL_GNI_DG_TERMINATE 0xf01c +#define CFS_FAIL_GNI_REG_QUIESCE 0xf01d +#define CFS_FAIL_GNI_IN_QUIESCE 0xf01e +#define CFS_FAIL_GNI_DELAY_RDMA 0xf01f +#define CFS_FAIL_GNI_SR_DOWN_RACE 0xf020 +#define CFS_FAIL_GNI_ALLOC_TX 0xf021 +#define CFS_FAIL_GNI_FMABLK_AVAIL 0xf022 +#define CFS_FAIL_GNI_EP_CREATE 0xf023 +#define CFS_FAIL_GNI_CQ_GET_EVENT 0xf024 +#define CFS_FAIL_GNI_PROBE 0xf025 +#define CFS_FAIL_GNI_EP_TEST 0xf026 +#define CFS_FAIL_GNI_CONNREQ_DROP 0xf027 +#define CFS_FAIL_GNI_CONNREQ_PROTO 0xf028 +#define CFS_FAIL_GNI_CONND_PILEUP 0xf029 +#define CFS_FAIL_GNI_PHYS_SETUP 0xf02a +#define CFS_FAIL_GNI_FIND_TARGET 0xf02b +#define CFS_FAIL_GNI_WC_DGRAM_FREE 0xf02c +#define CFS_FAIL_GNI_DROP_CLOSING 0xf02d +#define CFS_FAIL_GNI_RX_CLOSE_CLOSING 0xf02e +#define CFS_FAIL_GNI_RX_CLOSE_CLOSED 0xf02f +#define CFS_FAIL_GNI_EP_POST 0xf030 +#define CFS_FAIL_GNI_PACK_SRCNID 0xf031 +#define CFS_FAIL_GNI_PACK_DSTNID 0xf032 +#define CFS_FAIL_GNI_PROBE_WAIT 0xf033 +#define CFS_FAIL_GNI_SMSG_CKSUM1 0xf034 +#define CFS_FAIL_GNI_SMSG_CKSUM2 0xf035 +#define CFS_FAIL_GNI_SMSG_CKSUM3 0xf036 +#define CFS_FAIL_GNI_DROP_DESTROY_EP 0xf037 +#define CFS_FAIL_GNI_SMSG_GETNEXT 0xf038 +#define CFS_FAIL_GNI_FINISH_PURG 0xf039 +#define CFS_FAIL_GNI_PURG_REL_DELAY 0xf03a +#define CFS_FAIL_GNI_DONT_NOTIFY 0xf03b +#define CFS_FAIL_GNI_VIRT_SMALL_MAP 0xf03c +#define CFS_FAIL_GNI_DELAY_RDMAQ 0xf03d +#define CFS_FAIL_GNI_PAUSE_SHUTDOWN 0xf03e +#define CFS_FAIL_GNI_PAUSE_DGRAM_COMP 0xf03f +#define CFS_FAIL_GNI_NET_LOOKUP 0xf040 +#define CFS_FAIL_GNI_RECV_TIMEOUT 0xf041 +#define CFS_FAIL_GNI_SEND_TIMEOUT 0xf042 +#define CFS_FAIL_GNI_ONLY_NOOP 0xf043 +#define CFS_FAIL_GNI_FINISH_PURG2 0xf044 +#define CFS_FAIL_GNI_RACE_RESET 0xf045 +#define CFS_FAIL_GNI_GNP_CONNECTING1 0xf046 +#define CFS_FAIL_GNI_GNP_CONNECTING2 0xf047 +#define CFS_FAIL_GNI_GNP_CONNECTING3 0xf048 +#define CFS_FAIL_GNI_PUT_ACK_AGAIN 0xf050 +#define CFS_FAIL_GNI_GET_REQ_AGAIN 0xf051 + +/* helper macros */ +extern void +_kgnilnd_api_rc_lbug(const char *rcstr, int rc, struct libcfs_debug_msg_data *data, + const char *fmt, ...) 
+ __attribute__ ((format (printf, 4, 5))); + +#define kgnilnd_api_rc_lbug(msgdata, rc, fmt, a...) \ +do { \ + CFS_CHECK_STACK(msgdata, D_ERROR, NULL); \ + /* we don't mask this - it is always at D_ERROR */ \ + _kgnilnd_api_rc_lbug(kgnilnd_api_rc2str(rc), (rc), msgdata, fmt, ##a); \ +} while (0) + +#define DO_RETCODE(x) case x: return #x; +static inline const char * +kgnilnd_api_rc2str(gni_return_t rrc) +{ + + switch (rrc) { + DO_RETCODE(GNI_RC_SUCCESS) + DO_RETCODE(GNI_RC_NOT_DONE); + DO_RETCODE(GNI_RC_INVALID_PARAM); + DO_RETCODE(GNI_RC_ERROR_RESOURCE); + DO_RETCODE(GNI_RC_TIMEOUT); + DO_RETCODE(GNI_RC_PERMISSION_ERROR); + DO_RETCODE(GNI_RC_DESCRIPTOR_ERROR); + DO_RETCODE(GNI_RC_ALIGNMENT_ERROR); + DO_RETCODE(GNI_RC_INVALID_STATE); + DO_RETCODE(GNI_RC_NO_MATCH); + DO_RETCODE(GNI_RC_SIZE_ERROR); + DO_RETCODE(GNI_RC_TRANSACTION_ERROR); + DO_RETCODE(GNI_RC_ILLEGAL_OP); + DO_RETCODE(GNI_RC_ERROR_NOMEM); + } + LBUG(); +} +#undef DO_RETCODE + +/* log an error and LBUG for unhandled rc from gni api function + * the fmt should be something like: + * gni_api_call(arg1, arg2, arg3) + */ + +/* apick_fn and apick_fmt should be defined for each site */ +#undef apick_fn +#undef apick_fmt + +#define GNILND_API_RC_LBUG(args...) \ +do { \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, D_ERROR, NULL); \ + kgnilnd_api_rc_lbug(&msgdata, rrc, apick_fn"("apick_fmt")", ##args); \ +} while (0) + +#define GNILND_API_SWBUG(args...) \ +do { \ + CERROR("likely SOFTWARE BUG "apick_fn"("apick_fmt") rc %s\n", \ + ##args, kgnilnd_api_rc2str(rrc)); \ +} while (0) + +#define GNILND_API_EINVAL(args...) \ +do { \ + CERROR("invalid parameter to "apick_fn"("apick_fmt") rc %s\n", \ + ##args, kgnilnd_api_rc2str(rrc)); \ +} while (0) + +#define GNILND_API_RESOURCE(args...) \ +do { \ + CERROR("no resources for "apick_fn"("apick_fmt") rc %s\n", \ + ##args, kgnilnd_api_rc2str(rrc)); \ +} while (0) + +#define GNILND_API_BUSY(args...) \ +do { \ + CERROR("resources busy for "apick_fn"("apick_fmt") rc %s\n", \ + ##args, kgnilnd_api_rc2str(rrc)); \ +} while (0) + +#undef DEBUG_SMSG_CREDITS +#ifdef DEBUG_SMSG_CREDITS +#define CRAY_CONFIG_GHAL_GEMINI +#include +#define GNIDBG_SMSG_CREDS(level, conn) \ +do { \ + gni_ep_smsg_mbox_t *smsg = conn->gnc_ephandle->smsg; \ + CDEBUG(level, "SMSGDBG: conn %p mcred %d/%d bcred %d/%d " \ + "s_seq %d/%d/%d r_seq %d/%d/%d retr %d\n", \ + conn, smsg->mbox_credits, smsg->back_mbox_credits, \ + smsg->buffer_credits, smsg->back_buffer_credits, \ + smsg->s_seqno, smsg->s_seqno_back_mbox_credits, \ + smsg->s_seqno_back_buffer_credits, smsg->r_seqno, \ + smsg->r_seqno_back_mbox_credits, \ + smsg->r_seqno_back_buffer_credits, smsg->retransmit_count); \ +} while (0) +#else +#define GNIDBG_SMSG_CREDS(level, conn) do {} while(0) +#endif + +/* these are all wrappers around gni_XXX functions. + * This allows us to handle all the return codes and api checks without + * dirtying up the logic code */ + +/* TODO: RETURN wrapper that translates integer to GNI API RC string */ + +#define apick_fn "kgnilnd_cdm_create" +#define apick_fmt "%u, %u, %u, %u, 0x%p" +static inline gni_return_t kgnilnd_cdm_create( + IN uint32_t inst_id, + IN uint8_t ptag, + IN uint32_t cookie, + IN uint32_t modes, + OUT gni_cdm_handle_t *cdm_hndl + ) +{ + gni_return_t rrc; + + /* Error injection */ + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CDM_CREATE)) { + rrc = cfs_fail_val ? 
cfs_fail_val : GNI_RC_INVALID_PARAM; + } else { + rrc = gni_cdm_create(inst_id, ptag, cookie, modes, cdm_hndl); + } + + switch (rrc) { + case GNI_RC_SUCCESS: + break; + case GNI_RC_ERROR_RESOURCE: + case GNI_RC_INVALID_PARAM: + /* Try to bail gracefully */ + GNILND_API_SWBUG( + inst_id, ptag, cookie, modes, cdm_hndl); + break; + default: + GNILND_API_RC_LBUG( + inst_id, ptag, cookie, modes, cdm_hndl); + + /* LBUG never returns, but just for style and consistency */ + break; + } + RETURN(rrc); +} + +#undef apick_fn +#undef apick_fmt + +#define apick_fn "kgnilnd_cdm_attach" +#define apick_fmt "0x%p, %u, 0x%p, 0x%p" +static inline gni_return_t kgnilnd_cdm_attach( + IN gni_cdm_handle_t cdm_hndl, + IN uint32_t device_id, + OUT uint32_t *local_addr, + OUT gni_nic_handle_t *nic_hndl + ) +{ + gni_return_t rrc; + + /* Error injection */ + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CDM_ATTACH)) { + rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_INVALID_PARAM; + } else { + rrc = gni_cdm_attach(cdm_hndl, device_id, local_addr, nic_hndl); + } + + switch (rrc) { + case GNI_RC_SUCCESS: + break; + case GNI_RC_NO_MATCH: + case GNI_RC_INVALID_PARAM: + GNILND_API_SWBUG( + cdm_hndl, device_id, local_addr, nic_hndl); + break; + case GNI_RC_ERROR_RESOURCE: + case GNI_RC_INVALID_STATE: + GNILND_API_RESOURCE( + cdm_hndl, device_id, local_addr, nic_hndl); + break; + default: + GNILND_API_RC_LBUG( + cdm_hndl, device_id, local_addr, nic_hndl); + + /* LBUG never returns, but just for style and consistency */ + break; + } + RETURN(rrc); +} +#undef apick_fmt +#undef apick_fn + +#define apick_fn "kgnilnd_cdm_destroy" +#define apick_fmt "0x%p" +static inline gni_return_t kgnilnd_cdm_destroy( + IN gni_cdm_handle_t cdm_hndl + ) +{ + gni_return_t rrc; + + /* Error injection */ + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CQ_DESTROY)) { + rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_INVALID_PARAM; + } else { + rrc = gni_cdm_destroy( + cdm_hndl); + } + + switch (rrc) { + case GNI_RC_SUCCESS: + break; + case GNI_RC_INVALID_PARAM: + GNILND_API_SWBUG( + cdm_hndl); + break; + default: + GNILND_API_RC_LBUG( + cdm_hndl); + + /* LBUG never returns, but just for style and consistency */ + break; + } + RETURN(rrc); +} +#undef apick_fn +#undef apick_fmt + +#define apick_fn "kgnilnd_subscribe_errors" +#define apick_fmt "0x%p,%x,%u,0x%p,0x%p,0x%p" +static inline gni_return_t kgnilnd_subscribe_errors( + IN gni_nic_handle_t nic_handle, + IN gni_error_mask_t mask, + IN uint32_t EEQ_size, + IN void (*EQ_new_event)(gni_err_handle_t), + IN void (*app_crit_err)(gni_err_handle_t), + OUT gni_err_handle_t *err_handle + ) +{ + gni_return_t rrc; + + /* Error injection */ + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_ERR_SUBSCRIBE)) { + rrc = cfs_fail_val ? 
cfs_fail_val : GNI_RC_INVALID_PARAM; + } else { + rrc = gni_subscribe_errors( + nic_handle, mask, EEQ_size, EQ_new_event, app_crit_err, + err_handle); + } + + switch (rrc) { + case GNI_RC_SUCCESS: + break; + case GNI_RC_INVALID_PARAM: + GNILND_API_SWBUG( + nic_handle, mask, EEQ_size, EQ_new_event, app_crit_err, + err_handle); + break; + case GNI_RC_ERROR_RESOURCE: + GNILND_API_RESOURCE( + nic_handle, mask, EEQ_size, EQ_new_event, app_crit_err, + err_handle); + break; + default: + GNILND_API_RC_LBUG( + nic_handle, mask, EEQ_size, EQ_new_event, app_crit_err, + err_handle); + /* LBUG never returns, but just for style and consistency */ + break; + } + RETURN(rrc); +} +#undef apick_fn +#undef apick_fmt + +#define apick_fn "kgnilnd_release_errors" +#define apick_fmt "0x%p" +static inline gni_return_t kgnilnd_release_errors( + IN gni_err_handle_t err_handle + ) +{ + gni_return_t rrc; + + rrc = gni_release_errors( + err_handle); + + switch (rrc) { + case GNI_RC_SUCCESS: + break; + case GNI_RC_INVALID_PARAM: + case GNI_RC_NOT_DONE: + GNILND_API_SWBUG( + err_handle); + break; + default: + GNILND_API_RC_LBUG( + err_handle); + /* LBUG never returns, but just for style and consistency */ + break; + } + RETURN(rrc); +} +#undef apick_fn +#undef apick_fmt + +#define apick_fn "kgnilnd_set_quiesce_callback" +#define apick_fmt "0x%p,0x%p" +static inline gni_return_t kgnilnd_set_quiesce_callback( + IN gni_nic_handle_t nic_handle, + IN void (*qsce_func)(gni_nic_handle_t, uint64_t msecs) + ) +{ + gni_return_t rrc; + + /* Error injection */ + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_REG_QUIESCE)) { + rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_INVALID_PARAM; + } else { + rrc = gni_set_quiesce_callback( + nic_handle, qsce_func); + } + + switch (rrc) { + case GNI_RC_SUCCESS: + break; + case GNI_RC_INVALID_STATE: + case GNI_RC_INVALID_PARAM: + GNILND_API_SWBUG( + nic_handle, qsce_func); + break; + default: + GNILND_API_RC_LBUG( + nic_handle, qsce_func); + /* LBUG never returns, but just for style and consistency */ + break; + } + RETURN(rrc); +} +#undef apick_fn +#undef apick_fmt + +#define apick_fn "kgnilnd_get_quiesce_status" +#define apick_fmt "0x%p" +static inline gni_return_t kgnilnd_get_quiesce_status( + IN gni_nic_handle_t nic_handle + ) +{ + uint32_t rrc; + + /* this has weird RC - + * 0 - quiesce not in progress + * 1 - quiesce is turned on + */ + + /* Error injection */ + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_IN_QUIESCE)) { + rrc = 1; + } else { + rrc = gni_get_quiesce_status( + nic_handle); + } + + switch (rrc) { + case 1: + case 0: + break; + default: + GNILND_API_RC_LBUG( + nic_handle); + /* LBUG never returns, but just for style and consistency */ + break; + } + RETURN(rrc); +} +#undef apick_fn +#undef apick_fmt + +#define apick_fn "kgnilnd_cq_create" +#define apick_fmt "0x%p, %u, %u, 0x%p, "LPX64", 0x%p" +static inline gni_return_t kgnilnd_cq_create( + IN gni_nic_handle_t nic_hndl, + IN uint32_t entry_count, + IN uint32_t delay_index, + IN gni_cq_event_hndlr_f *event_handler, + IN uint64_t usr_event_data, + OUT gni_cq_handle_t *cq_hndl + ) +{ + gni_return_t rrc; + + /* Error injection */ + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CQ_CREATE)) { + rrc = cfs_fail_val ? 
cfs_fail_val : GNI_RC_INVALID_PARAM; + } else { + rrc = gni_cq_create( + nic_hndl, entry_count, delay_index, event_handler, + usr_event_data, cq_hndl); + } + + switch (rrc) { + case GNI_RC_SUCCESS: + break; + case GNI_RC_INVALID_PARAM: + GNILND_API_SWBUG( + nic_hndl, entry_count, delay_index, event_handler, + usr_event_data, cq_hndl); + break; + case GNI_RC_ERROR_RESOURCE: + GNILND_API_RESOURCE( + nic_hndl, entry_count, delay_index, event_handler, + usr_event_data, cq_hndl); + break; + default: + GNILND_API_RC_LBUG( + nic_hndl, entry_count, delay_index, event_handler, + usr_event_data, cq_hndl); + + /* LBUG never returns, but just for style and consistency */ + break; + } + RETURN(rrc); +} +#undef apick_fn +#undef apick_fmt + +#define apick_fn "kgnilnd_cq_destroy" +#define apick_fmt "0x%p" +static inline gni_return_t kgnilnd_cq_destroy( + IN gni_cq_handle_t cq_hndl + ) +{ + gni_return_t rrc; + + /* Error injection */ + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CQ_DESTROY)) { + rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_INVALID_PARAM; + } else { + + rrc = gni_cq_destroy( + cq_hndl); + } + + switch (rrc) { + case GNI_RC_SUCCESS: + break; + case GNI_RC_INVALID_PARAM: + GNILND_API_SWBUG( + cq_hndl); + break; + case GNI_RC_ERROR_RESOURCE: + GNILND_API_BUSY( + cq_hndl); + break; + default: + GNILND_API_RC_LBUG( + cq_hndl); + + /* LBUG never returns, but just for style and consistency */ + break; + } + RETURN(rrc); +} +#undef apick_fn +#undef apick_fmt + +#define apick_fn "kgnilnd_cq_get_event" +#define apick_fmt "0x%p, 0x%p" +static inline gni_return_t kgnilnd_cq_get_event( + IN gni_cq_handle_t cq_hndl, + OUT gni_cq_entry_t *event_data + ) +{ + gni_return_t rrc; + + /* no error injection - CQs are touchy about the data. + * where appropriate, we'll do this on the CQs that should be able to + * handle the various errors */ + rrc = gni_cq_get_event( + cq_hndl, event_data); + + switch (rrc) { + case GNI_RC_SUCCESS: + case GNI_RC_NOT_DONE: + case GNI_RC_TRANSACTION_ERROR: + break; + case GNI_RC_ERROR_RESOURCE: + LASSERTF(GNI_CQ_OVERRUN(*event_data), + "kgni returned ERROR_RESOURCE but cq_hndl 0x%p is not " + "overrun\n", cq_hndl); + break; + case GNI_RC_INVALID_PARAM: + GNILND_API_SWBUG( + cq_hndl, event_data); + break; + default: + GNILND_API_RC_LBUG( + cq_hndl, event_data); + + /* LBUG never returns, but just for style and consistency */ + break; + } + return rrc; +} +#undef apick_fn +#undef apick_fmt + +#define apick_fn "kgnilnd_smsg_init" +#define apick_fmt "0x%p, 0x%p, 0x%p" +static inline gni_return_t kgnilnd_smsg_init( + IN gni_ep_handle_t ep_hndl, + IN gni_smsg_attr_t *local_smsg_attr, + IN gni_smsg_attr_t *remote_smsg_attr + ) +{ + gni_return_t rrc; + + /* Error injection */ + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_SMSG_INIT)) { + rrc = cfs_fail_val ? 
cfs_fail_val : GNI_RC_ERROR_RESOURCE; + } else { + rrc = gni_smsg_init( + ep_hndl, local_smsg_attr, remote_smsg_attr); + } + + switch (rrc) { + /* both of these are OK, upper SW needs to handle */ + case GNI_RC_SUCCESS: + case GNI_RC_NOT_DONE: + break; + case GNI_RC_INVALID_PARAM: + case GNI_RC_INVALID_STATE: + GNILND_API_SWBUG( + ep_hndl, local_smsg_attr, remote_smsg_attr); + break; + case GNI_RC_ERROR_RESOURCE: + GNILND_API_RESOURCE( + ep_hndl, local_smsg_attr, remote_smsg_attr); + break; + default: + GNILND_API_RC_LBUG( + ep_hndl, local_smsg_attr, remote_smsg_attr); + + /* LBUG never returns, but just for style and consistency */ + break; + } + RETURN(rrc); +} +#undef apick_fn +#undef apick_fmt + +#define apick_fn "kgnilnd_smsg_send" +#define apick_fmt "0x%p, 0x%p, %d, 0x%p, %u %u" +static inline gni_return_t kgnilnd_smsg_send( + IN gni_ep_handle_t ep_hndl, + IN void *header, + IN uint32_t header_length, + IN void *data, + IN uint32_t data_length, + IN uint32_t msg_id + ) +{ + gni_return_t rrc; + + /* Error injection */ + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_SMSG_SEND)) { + if (cfs_fail_loc & CFS_FAIL_RAND) { + rrc = GNI_RC_NOT_DONE; + } else { + rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_NOT_DONE; + } + } else { + rrc = gni_smsg_send( + ep_hndl, header, header_length, data, data_length, msg_id); + } + + switch (rrc) { + /* both of these are OK, upper SW needs to handle */ + case GNI_RC_SUCCESS: + case GNI_RC_NOT_DONE: + break; + case GNI_RC_INVALID_PARAM: + GNILND_API_SWBUG( + ep_hndl, header, header_length, data, data_length, msg_id); + break; + case GNI_RC_ERROR_RESOURCE: + GNILND_API_RESOURCE( + ep_hndl, header, header_length, data, data_length, msg_id); + break; + default: + GNILND_API_RC_LBUG( + ep_hndl, header, header_length, data, data_length, msg_id); + + /* LBUG never returns, but just for style and consistency */ + break; + } + RETURN(rrc); +} +#undef apick_fn +#undef apick_fmt + +#define apick_fn "kgnilnd_smsg_getnext" +#define apick_fmt "0x%p,0x%p" +static inline gni_return_t kgnilnd_smsg_getnext( + IN gni_ep_handle_t ep_hndl, + OUT void **header + ) +{ + gni_return_t rrc; + + /* Error injection */ + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_SMSG_RELEASE)) { + rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_NOT_DONE; + } else { + rrc = gni_smsg_getnext( + ep_hndl, header); + } + + switch (rrc) { + /* both of these are OK, upper SW needs to handle */ + case GNI_RC_SUCCESS: + case GNI_RC_NOT_DONE: + case GNI_RC_INVALID_STATE: + break; + case GNI_RC_INVALID_PARAM: + GNILND_API_SWBUG( + ep_hndl, header); + break; + default: + GNILND_API_RC_LBUG( + ep_hndl, header); + + /* LBUG never returns, but just for style and consistency */ + break; + } + RETURN(rrc); +} +#undef apick_fn +#undef apick_fmt + +#define apick_fn "kgnilnd_smsg_release" +#define apick_fmt "0x%p" +static inline gni_return_t kgnilnd_smsg_release( + IN gni_ep_handle_t ep_hndl + ) +{ + gni_return_t rrc; + + /* Error injection */ + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_SMSG_RELEASE)) { + rrc = cfs_fail_val ? 
cfs_fail_val : GNI_RC_INVALID_PARAM; + } else { + rrc = gni_smsg_release( + ep_hndl); + } + + switch (rrc) { + /* both of these are OK, upper SW needs to handle */ + case GNI_RC_SUCCESS: + case GNI_RC_NOT_DONE: + break; + case GNI_RC_INVALID_PARAM: + GNILND_API_SWBUG( + ep_hndl); + break; + default: + GNILND_API_RC_LBUG( + ep_hndl); + + /* LBUG never returns, but just for style and consistency */ + break; + } + RETURN(rrc); +} +#undef apick_fn +#undef apick_fmt + +#define apick_fn "kgnilnd_ep_create" +#define apick_fmt "0x%p, 0x%p, 0x%p" +static inline gni_return_t kgnilnd_ep_create( + IN gni_nic_handle_t nic_hndl, + IN gni_cq_handle_t src_cq_hndl, + OUT gni_ep_handle_t *ep_hndl + ) +{ + gni_return_t rrc; + + /* error injection */ + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_EP_CREATE)) { + rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_ERROR_NOMEM; + } else { + rrc = gni_ep_create( + nic_hndl, src_cq_hndl, ep_hndl); + } + + switch (rrc) { + case GNI_RC_SUCCESS: + break; + case GNI_RC_INVALID_PARAM: + GNILND_API_SWBUG( + nic_hndl, src_cq_hndl, ep_hndl); + break; + case GNI_RC_ERROR_NOMEM: + GNILND_API_RESOURCE( + nic_hndl, src_cq_hndl, ep_hndl); + break; + default: + GNILND_API_RC_LBUG( + nic_hndl, src_cq_hndl, ep_hndl); + + /* lbug never returns, but just for style and consistency */ + break; + } + RETURN(rrc); +} +#undef apick_fn +#undef apick_fmt + +#define apick_fn "kgnilnd_ep_bind" +#define apick_fmt "0x%p, %x, %x" +static inline gni_return_t kgnilnd_ep_bind( + IN gni_ep_handle_t ep_hndl, + IN uint32_t remote_addr, + IN uint32_t remote_id + ) +{ + gni_return_t rrc; + + /* error injection */ + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_EP_BIND)) { + rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_NOT_DONE; + } else { + rrc = gni_ep_bind( + ep_hndl, remote_addr, remote_id); + } + + switch (rrc) { + /* both of these are ok, upper sw needs to handle */ + case GNI_RC_SUCCESS: + case GNI_RC_NOT_DONE: + break; + case GNI_RC_INVALID_PARAM: + GNILND_API_SWBUG( + ep_hndl, remote_addr, remote_id); + break; + default: + GNILND_API_RC_LBUG( + ep_hndl, remote_addr, remote_id); + + /* lbug never returns, but just for style and consistency */ + break; + } + RETURN(rrc); +} +#undef apick_fn +#undef apick_fmt + +#define apick_fn "kgnilnd_ep_set_eventdata" +#define apick_fmt "0x%p, %x, %x" +static inline gni_return_t kgnilnd_ep_set_eventdata( + IN gni_ep_handle_t ep_hndl, + IN uint32_t local_event, + IN uint32_t remote_event + ) +{ + gni_return_t rrc; + + /* Error injection */ + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_EP_SET_EVDATA)) { + rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_INVALID_PARAM; + } else { + rrc = gni_ep_set_eventdata( + ep_hndl, local_event, remote_event); + } + + switch (rrc) { + case GNI_RC_SUCCESS: + break; + case GNI_RC_INVALID_PARAM: + GNILND_API_SWBUG( + ep_hndl, local_event, remote_event); + break; + default: + GNILND_API_RC_LBUG( + ep_hndl, local_event, remote_event); + + /* LBUG never returns, but just for style and consistency */ + break; + } + RETURN(rrc); +} +#undef apick_fn +#undef apick_fmt + +#define apick_fn "kgnilnd_ep_unbind" +#define apick_fmt "0x%p" +static inline gni_return_t kgnilnd_ep_unbind( + IN gni_ep_handle_t ep_hndl + ) +{ + gni_return_t rrc; + + /* Error injection */ + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_EP_UNBIND)) { + rrc = cfs_fail_val ? 
cfs_fail_val : GNI_RC_NOT_DONE; + } else { + rrc = gni_ep_unbind( + ep_hndl); + } + + switch (rrc) { + /* both of these are OK, upper SW needs to handle */ + case GNI_RC_NOT_DONE: + case GNI_RC_SUCCESS: + break; + case GNI_RC_INVALID_PARAM: + GNILND_API_SWBUG( + ep_hndl); + break; + default: + GNILND_API_RC_LBUG( + ep_hndl); + + /* LBUG never returns, but just for style and consistency */ + break; + } + RETURN(rrc); +} +#undef apick_fn +#undef apick_fmt + +#define apick_fn "kgnilnd_ep_destroy" +#define apick_fmt "0x%p" +static inline gni_return_t kgnilnd_ep_destroy( + IN gni_ep_handle_t ep_hndl + ) +{ + gni_return_t rrc; + + /* Error injection */ + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_EP_DESTROY)) { + rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_NOT_DONE; + } else { + rrc = gni_ep_destroy( + ep_hndl); + } + + switch (rrc) { + case GNI_RC_SUCCESS: + break; + case GNI_RC_INVALID_PARAM: + GNILND_API_SWBUG( + ep_hndl); + break; + default: + GNILND_API_RC_LBUG( + ep_hndl); + + /* LBUG never returns, but just for style and consistency */ + break; + } + RETURN(rrc); +} +#undef apick_fn +#undef apick_fmt + +#define apick_fn "kgnilnd_ep_postdata_w_id" +#define apick_fmt "0x%p, 0x%p, %d, 0x%p, %d, "LPU64"" +static inline gni_return_t kgnilnd_ep_postdata_w_id( + IN gni_ep_handle_t ep_hndl, + IN void *in_data, + IN uint16_t data_len, + IN void *out_buf, + IN uint16_t buf_size, + IN uint64_t datagram_id + ) +{ + gni_return_t rrc; + + /* Error injection */ + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_EP_POST)) { + rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_SIZE_ERROR; + } else { + rrc = gni_ep_postdata_w_id( + ep_hndl, in_data, data_len, out_buf, buf_size, + datagram_id); + } + + switch (rrc) { + case GNI_RC_SUCCESS: + case GNI_RC_ERROR_NOMEM: + case GNI_RC_ERROR_RESOURCE: + break; + case GNI_RC_INVALID_PARAM: + case GNI_RC_SIZE_ERROR: + GNILND_API_SWBUG( + ep_hndl, in_data, data_len, out_buf, buf_size, + datagram_id); + break; + default: + GNILND_API_RC_LBUG( + ep_hndl, in_data, data_len, out_buf, buf_size, + datagram_id); + + /* LBUG never returns, but just for style and consistency */ + break; + } + RETURN(rrc); +} +#undef apick_fn +#undef apick_fmt + +#define apick_fn "kgnilnd_ep_postdata_test_by_id" +#define apick_fmt "0x%p, "LPU64", 0x%p, 0x%p, 0x%p" +static inline gni_return_t kgnilnd_ep_postdata_test_by_id( + IN gni_ep_handle_t ep_hndl, + IN uint64_t datagram_id, + OUT gni_post_state_t *post_state, + OUT uint32_t *remote_addr, + OUT uint32_t *remote_id + ) +{ + gni_return_t rrc; + + /* Error injection */ + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_EP_TEST)) { + rrc = cfs_fail_val ? 
cfs_fail_val : GNI_RC_ERROR_NOMEM; + } else { + rrc = gni_ep_postdata_test_by_id( + ep_hndl, datagram_id, post_state, remote_addr, + remote_id); + + /* we want to lie, but we need to do the actual work first + * so we don't keep getting the event saying a dgram is ready */ + if (rrc == GNI_RC_SUCCESS && CFS_FAIL_CHECK(CFS_FAIL_GNI_DG_TERMINATE)) { + /* don't use fail_val, allows us to do FAIL_SOME */ + *post_state = GNI_POST_TERMINATED; + } + } + + switch (rrc) { + case GNI_RC_SUCCESS: + case GNI_RC_NO_MATCH: + break; + case GNI_RC_SIZE_ERROR: + case GNI_RC_INVALID_PARAM: + GNILND_API_SWBUG( + ep_hndl, datagram_id, post_state, remote_addr, + remote_id); + break; + case GNI_RC_ERROR_NOMEM: + GNILND_API_RESOURCE( + ep_hndl, datagram_id, post_state, remote_addr, + remote_id); + break; + default: + GNILND_API_RC_LBUG( + ep_hndl, datagram_id, post_state, remote_addr, + remote_id); + + /* LBUG never returns, but just for style and consistency */ + break; + } + RETURN(rrc); +} +#undef apick_fn +#undef apick_fmt + +#define apick_fn "kgnilnd_ep_postdata_cancel_by_id" +#define apick_fmt "0x%p, "LPU64"" +static inline gni_return_t kgnilnd_ep_postdata_cancel_by_id( + IN gni_ep_handle_t ep_hndl, + IN uint64_t datagram_id + ) +{ + gni_return_t rrc; + + /* no error injection as the only thing we'd do is LBUG */ + + rrc = gni_ep_postdata_cancel_by_id( + ep_hndl, datagram_id); + + switch (rrc) { + case GNI_RC_SUCCESS: + case GNI_RC_NO_MATCH: + break; + case GNI_RC_INVALID_PARAM: + GNILND_API_SWBUG( + ep_hndl, datagram_id); + break; + default: + GNILND_API_RC_LBUG( + ep_hndl, datagram_id); + + /* LBUG never returns, but just for style and consistency */ + break; + } + RETURN(rrc); +} +#undef apick_fn +#undef apick_fmt + +#define apick_fn "kgnilnd_postdata_probe_by_id" +#define apick_fmt "0x%p, 0x%p" +static inline gni_return_t kgnilnd_postdata_probe_by_id( + IN gni_nic_handle_t nic_hndl, + OUT uint64_t *datagram_id + ) +{ + gni_return_t rrc; + + /* Error injection */ + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_PROBE)) { + rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_NO_MATCH; + } else { + rrc = gni_postdata_probe_by_id( + nic_hndl, datagram_id); + } + + switch (rrc) { + case GNI_RC_SUCCESS: + case GNI_RC_NO_MATCH: + break; + case GNI_RC_INVALID_PARAM: + GNILND_API_SWBUG( + nic_hndl, datagram_id); + break; + default: + GNILND_API_RC_LBUG( + nic_hndl, datagram_id); + + /* LBUG never returns, but just for style and consistency */ + break; + } + RETURN(rrc); +} +#undef apick_fn +#undef apick_fmt + +#define apick_fn "kgnilnd_postdata_probe_wait_by_id" +#define apick_fmt "0x%p, %d, 0x%p" +static inline gni_return_t kgnilnd_postdata_probe_wait_by_id( + IN gni_nic_handle_t nic_hndl, + IN uint32_t timeout, + OUT uint64_t *datagram_id + ) +{ + gni_return_t rrc; + + /* Error injection */ + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_PROBE_WAIT)) { + rrc = cfs_fail_val ? 
cfs_fail_val : GNI_RC_TIMEOUT; + } else { + rrc = gni_postdata_probe_wait_by_id( + nic_hndl, timeout, datagram_id); + } + + switch (rrc) { + case GNI_RC_SUCCESS: + case GNI_RC_TIMEOUT: + break; + case GNI_RC_INVALID_PARAM: + GNILND_API_SWBUG( + nic_hndl, timeout, datagram_id); + break; + default: + GNILND_API_RC_LBUG( + nic_hndl, timeout, datagram_id); + + /* LBUG never returns, but just for style and consistency */ + break; + } + RETURN(rrc); +} +#undef apick_fn +#undef apick_fmt + +#define apick_fn "kgnilnd_post_rdma" +#define apick_fmt "0x%p, 0x%p" +static inline gni_return_t kgnilnd_post_rdma( + IN gni_ep_handle_t ep_hndl, + IN gni_post_descriptor_t *post_descr + ) +{ + gni_return_t rrc; + + /* Error injection */ + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_POST_RDMA)) { + rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_INVALID_PARAM; + } else { + rrc = gni_post_rdma( + ep_hndl, post_descr); + } + + switch (rrc) { + case GNI_RC_SUCCESS: + break; + case GNI_RC_ALIGNMENT_ERROR: + case GNI_RC_INVALID_PARAM: + GNILND_API_SWBUG( + ep_hndl, post_descr); + break; + case GNI_RC_ERROR_RESOURCE: + GNILND_API_RESOURCE( + ep_hndl, post_descr); + break; + default: + GNILND_API_RC_LBUG( + ep_hndl, post_descr); + + /* LBUG never returns, but just for style and consistency */ + break; + } + RETURN(rrc); +} +#undef apick_fn +#undef apick_fmt + +#define apick_fn "kgnilnd_get_completed" +#define apick_fmt "0x%p,"LPX64",0x%p" +static inline gni_return_t kgnilnd_get_completed( + IN gni_cq_handle_t cq_hndl, + IN gni_cq_entry_t event_data, + OUT gni_post_descriptor_t **post_descr + ) +{ + gni_return_t rrc; + + + rrc = gni_get_completed(cq_hndl, event_data, post_descr); + + switch (rrc) { + case GNI_RC_TRANSACTION_ERROR: + case GNI_RC_SUCCESS: + break; + case GNI_RC_DESCRIPTOR_ERROR: + case GNI_RC_INVALID_PARAM: + GNILND_API_SWBUG(cq_hndl, event_data, post_descr); + break; + default: + GNILND_API_RC_LBUG(cq_hndl, event_data, post_descr); + /* LBUG never returns, but just for style and consistency */ + break; + } + + /* Error injection - we need a valid desc, so let kgni give us one + * - then we lie */ + if (rrc == GNI_RC_SUCCESS && + (CFS_FAIL_CHECK(CFS_FAIL_GNI_GET_COMPLETED))) { + /* We only trigger TRANSACTION_ERROR for now */ + gni_post_descriptor_t *desc; + rrc = GNI_RC_TRANSACTION_ERROR; + desc = *post_descr; + desc->status = rrc; + /* recoverable decision made from cfs_fail_val in + * kgnilnd_cq_error_str and + * kgnilnd_cq_error_recoverable */ + } + RETURN(rrc); +} +#undef apick_fn +#undef apick_fmt + +#define apick_fn "kgnilnd_cq_error_str" +#define apick_fmt LPX64",0x%p,%d" +static inline gni_return_t kgnilnd_cq_error_str( + IN gni_cq_entry_t entry, + IN void *buffer, + IN uint32_t len + ) +{ + gni_return_t rrc; + + /* Error injection - set string if we injected a + * TRANSACTION_ERROR earlier */ + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_GET_COMPLETED)) { + /* if we just set persistent error, we can't ever + * break in via ssh to clear, so use a count > 10 to indicate fatal */ + sprintf(buffer, "INJECT:%s", cfs_fail_val > 10 ? 
+ "FATAL" : "RECOVERABLE"); + rrc = GNI_RC_SUCCESS; + } else { + rrc = gni_cq_error_str( + entry, buffer, len); + } + + switch (rrc) { + case GNI_RC_SUCCESS: + break; + case GNI_RC_SIZE_ERROR: + case GNI_RC_INVALID_PARAM: + GNILND_API_SWBUG( + entry, buffer, len); + /* give them something to use */ + snprintf(buffer, len, "UNDEF:UNDEF"); + break; + default: + GNILND_API_RC_LBUG( + entry, buffer, len); + + /* LBUG never returns, but just for style and consistency */ + break; + } + RETURN(rrc); +} +#undef apick_fn +#undef apick_fmt + +#define apick_fn "kgnilnd_cq_error_recoverable" +#define apick_fmt LPX64",0x%p" +static inline gni_return_t kgnilnd_cq_error_recoverable( + IN gni_cq_entry_t entry, + IN uint32_t *recoverable + ) +{ + gni_return_t rrc; + + /* Error injection - set string if we injected a + * TRANSACTION_ERROR earlier */ + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_GET_COMPLETED)) { + *recoverable = cfs_fail_val > 10 ? 0 : 1; + rrc = GNI_RC_SUCCESS; + } else { + rrc = gni_cq_error_recoverable( + entry, recoverable); + } + + switch (rrc) { + case GNI_RC_SUCCESS: + break; + case GNI_RC_INVALID_STATE: + case GNI_RC_INVALID_PARAM: + GNILND_API_SWBUG( + entry, recoverable); + *recoverable = 0; + break; + default: + GNILND_API_RC_LBUG( + entry, recoverable); + + /* LBUG never returns, but just for style and consistency */ + break; + } + RETURN(rrc); +} +#undef apick_fn +#undef apick_fmt + +#define apick_fn "kgnilnd_mem_register_segments" +#define apick_fmt "0x%p,0x%p,%u,0x%p,%x,0x%p" +static inline gni_return_t +kgnilnd_mem_register_segments( + IN gni_nic_handle_t nic_hndl, + IN gni_mem_segment_t *mem_segments, + IN uint32_t segments_cnt, + IN gni_cq_handle_t dst_cq_hndl, + IN uint32_t flags, + OUT gni_mem_handle_t *mem_hndl + ) +{ + gni_return_t rrc; + + /* Error injection */ + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_PHYS_MAP)) { + rrc = GNI_RC_ERROR_RESOURCE; + } else { + rrc = gni_mem_register_segments( + nic_hndl, mem_segments, segments_cnt, + dst_cq_hndl, flags, mem_hndl); + } + + switch (rrc) { + case GNI_RC_SUCCESS: + case GNI_RC_ERROR_RESOURCE: + break; + case GNI_RC_INVALID_PARAM: + GNILND_API_SWBUG( + nic_hndl, mem_segments, segments_cnt, + dst_cq_hndl, flags, mem_hndl); + break; + default: + GNILND_API_RC_LBUG( + nic_hndl, mem_segments, segments_cnt, + dst_cq_hndl, flags, mem_hndl); + + /* LBUG never returns, but just for style and consistency */ + break; + } + RETURN(rrc); +} +#undef apick_fn +#undef apick_fmt + +#define apick_fn "kgnilnd_mem_register" +#define apick_fmt "0x%p,"LPX64","LPX64"0x%p,%u,0x%p" +static inline gni_return_t kgnilnd_mem_register( + IN gni_nic_handle_t nic_hndl, + IN uint64_t address, + IN uint64_t length, + IN gni_cq_handle_t dst_cq_hndl, + IN uint32_t flags, + OUT gni_mem_handle_t *mem_hndl + ) +{ + gni_return_t rrc; + + /* Error injection */ + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_VIRT_MAP)) { + rrc = GNI_RC_ERROR_RESOURCE; + } else if (CFS_FAIL_CHECK(CFS_FAIL_GNI_VIRT_SMALL_MAP) && + length <= *kgnilnd_tunables.kgn_max_immediate) { + rrc = GNI_RC_INVALID_PARAM; + } else { + rrc = gni_mem_register( + nic_hndl, address, length, + dst_cq_hndl, flags, mem_hndl); + } + + switch (rrc) { + case GNI_RC_SUCCESS: + case GNI_RC_ERROR_RESOURCE: + break; + case GNI_RC_INVALID_PARAM: + GNILND_API_SWBUG( + nic_hndl, address, length, + dst_cq_hndl, flags, mem_hndl); + break; + default: + GNILND_API_RC_LBUG( + nic_hndl, address, length, + dst_cq_hndl, flags, mem_hndl); + + /* LBUG never returns, but just for style and consistency */ + break; + } + RETURN(rrc); +} +#undef apick_fn 
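/*
 * The wrappers above all share one shape: an optional fault-injection hook
 * (CFS_FAIL_CHECK) in front of the real gni_* call, followed by a switch that
 * buckets the return code into "caller handles it", "software bug"
 * (GNILND_API_SWBUG), "resource exhaustion" (GNILND_API_RESOURCE), or
 * "unexpected -> LBUG".  The stand-alone sketch below only illustrates that
 * pattern; the names in it (wrap_rc_t, sample_api_call, fail_inject) are
 * invented for the example and are not kgni or gnilnd symbols.
 */
#include <stdio.h>
#include <stdlib.h>

typedef enum {
	RC_SUCCESS,
	RC_NOT_DONE,		/* transient: caller retries */
	RC_INVALID_PARAM,	/* our bug: bad arguments */
	RC_ERROR_RESOURCE,	/* out of resources: caller backs off */
	RC_UNEXPECTED
} wrap_rc_t;

static int fail_inject;		/* plays the role of CFS_FAIL_CHECK() */

static wrap_rc_t sample_api_call(void)
{
	return RC_SUCCESS;	/* plays the role of the real gni_* call */
}

static wrap_rc_t wrapped_api_call(void)
{
	wrap_rc_t rc;

	/* 1. error injection first, so failure paths can be exercised
	 *    without real hardware faults */
	if (fail_inject)
		rc = RC_ERROR_RESOURCE;
	else
		rc = sample_api_call();

	/* 2. classify the return code once, in one place */
	switch (rc) {
	case RC_SUCCESS:
	case RC_NOT_DONE:
		break;			/* upper layers handle these */
	case RC_INVALID_PARAM:
		fprintf(stderr, "software bug: bad parameters\n");
		break;
	case RC_ERROR_RESOURCE:
		fprintf(stderr, "resource exhaustion, backing off\n");
		break;
	default:
		abort();		/* unknown rc is fatal, like LBUG() */
	}
	return rc;
}

int main(void)
{
	fail_inject = 1;		/* exercise the injected failure path */
	return wrapped_api_call() == RC_SUCCESS ? 0 : 1;
}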
+#undef apick_fmt + +#define apick_fn "kgnilnd_mem_deregister" +#define apick_fmt "0x%p,0x%p,%d" +static inline gni_return_t kgnilnd_mem_deregister( + IN gni_nic_handle_t nic_hndl, + IN gni_mem_handle_t *mem_hndl, + IN int hold_timeout + ) +{ + gni_return_t rrc; + + /* Error injection */ + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_VIRT_UNMAP)) { + rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_INVALID_PARAM; + } else { + rrc = gni_mem_deregister( + nic_hndl, mem_hndl, hold_timeout); + } + + switch (rrc) { + case GNI_RC_SUCCESS: + break; + case GNI_RC_INVALID_PARAM: + GNILND_API_SWBUG( + nic_hndl, mem_hndl, hold_timeout); + break; + default: + GNILND_API_RC_LBUG( + nic_hndl, mem_hndl, hold_timeout); + + /* LBUG never returns, but just for style and consistency */ + break; + } + RETURN(rrc); +} +#undef apick_fn +#undef apick_fmt + +#define apick_fn "kgnilnd_mem_mdd_release" +#define apick_fmt "0x%p,0x%p" +static inline gni_return_t kgnilnd_mem_mdd_release( + IN gni_nic_handle_t nic_hndl, + IN gni_mem_handle_t *mem_hndl + ) +{ + gni_return_t rrc; + + /* Error injection */ + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_MDD_RELEASE)) { + rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_NO_MATCH; + } else { + rrc = gni_mem_mdd_release( + nic_hndl, mem_hndl); + } + + switch (rrc) { + case GNI_RC_SUCCESS: + case GNI_RC_NO_MATCH: + break; + default: + GNILND_API_RC_LBUG( + nic_hndl, mem_hndl); + + /* LBUG never returns, but just for style and consistency */ + break; + } + RETURN(rrc); +} +#undef apick_fn +#undef apick_fmt + +#endif /* _GNILND_API_WRAP_H */ diff --git a/lnet/klnds/gnilnd/gnilnd_cb.c b/lnet/klnds/gnilnd/gnilnd_cb.c new file mode 100644 index 0000000..56be88a --- /dev/null +++ b/lnet/klnds/gnilnd/gnilnd_cb.c @@ -0,0 +1,4366 @@ +/* + * Copyright (C) 2004 Cluster File Systems, Inc. + * + * Copyright (C) 2009-2012 Cray, Inc. + * + * Derived from work by Eric Barton + * Author: Nic Henke + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +#include +#include "gnilnd.h" + +/* this is useful when needed to debug wire corruption. 
*/ +static void +kgnilnd_dump_blob(int level, char *prefix, void *buf, int len) { + __u64 *ptr; + + ptr = (__u64 *) buf; + + while (len > 0) { + if (len >= 32) { + CDEBUG(level, + "%s 0x%p: 0x%16.16llx 0x%16.16llx 0x%16.16llx 0x%16.16llx\n", + prefix, ptr, *(ptr), *(ptr + 1), *(ptr + 2), *(ptr + 3)); + ptr += 4; + len -= 32; + } else if (len >= 16) { + CDEBUG(level, + "%s 0x%p: 0x%16.16llx 0x%16.16llx\n", + prefix, ptr, *(ptr), *(ptr + 1)); + ptr += 2; + len -= 16; + } else { + CDEBUG(level, "%s 0x%p: 0x%16.16llx\n", + prefix, ptr, *(ptr)); + ptr++; + len -= 8; + } + } +} + +static void +kgnilnd_dump_msg(int mask, kgn_msg_t *msg) +{ + CDEBUG(mask, "0x%8.8x 0x%4.4x 0x%4.4x 0x%16.16llx" + " 0x%16.16llx 0x%8.8x 0x%4.4x 0x%4.4x 0x%8.8x\n", + msg->gnm_magic, msg->gnm_version, + msg->gnm_type, msg->gnm_srcnid, + msg->gnm_connstamp, msg->gnm_seq, + msg->gnm_cksum, msg->gnm_payload_cksum, + msg->gnm_payload_len); +} + +void +kgnilnd_schedule_device(kgn_device_t *dev) +{ + short already_live = 0; + + /* we'll only want to wake if the scheduler thread + * has come around and set ready to zero */ + already_live = cmpxchg(&dev->gnd_ready, GNILND_DEV_IDLE, GNILND_DEV_IRQ); + + if (!already_live) { + wake_up_all(&dev->gnd_waitq); + } + return; +} + +void kgnilnd_schedule_device_timer(unsigned long arg) +{ + kgn_device_t *dev = (kgn_device_t *) arg; + + kgnilnd_schedule_device(dev); +} + +void +kgnilnd_device_callback(__u32 devid, __u64 arg) +{ + kgn_device_t *dev; + int index = (int) arg; + + if (index >= kgnilnd_data.kgn_ndevs) { + /* use _EMERG instead of an LBUG to prevent LBUG'ing in + * interrupt context. */ + LCONSOLE_EMERG("callback for unknown device %d->%d\n", + devid, index); + return; + } + + dev = &kgnilnd_data.kgn_devices[index]; + /* just basic sanity */ + if (dev->gnd_id == devid) { + kgnilnd_schedule_device(dev); + } else { + LCONSOLE_EMERG("callback for bad device %d devid %d\n", + dev->gnd_id, devid); + } +} + +/* sched_intent values: + * < 0 : do not reschedule under any circumstances + * == 0: reschedule if someone marked him WANTS_SCHED + * > 0 : force a reschedule */ + +void +kgnilnd_schedule_process_conn(kgn_conn_t *conn, int sched_intent) +{ + int conn_sched; + + /* move back to IDLE but save previous state. + * if we see WANTS_SCHED, we'll call kgnilnd_schedule_conn and + * let the xchg there handle any racing callers to get it + * onto gnd_ready_conns */ + + conn_sched = xchg(&conn->gnc_scheduled, GNILND_CONN_IDLE); + LASSERTF(conn_sched == GNILND_CONN_WANTS_SCHED || + conn_sched == GNILND_CONN_PROCESS, + "conn %p after process in bad state: %d\n", + conn, conn_sched); + + if (sched_intent >= 0) { + if ((sched_intent > 0 || (conn_sched == GNILND_CONN_WANTS_SCHED))) { + kgnilnd_schedule_conn(conn); + } + } +} + +void +kgnilnd_schedule_conn(kgn_conn_t *conn) +{ + kgn_device_t *dev = conn->gnc_device; + int sched; + + sched = xchg(&conn->gnc_scheduled, GNILND_CONN_WANTS_SCHED); + + /* if we are IDLE, add to list - only one guy sees IDLE and "wins" + * the chance to put it onto gnd_ready_conns. 
+ * otherwise, leave marked as WANTS_SCHED and the thread that "owns" + * the conn in process_conns will take care of moving it back to + * SCHED when it is done processing */ + + if (sched == GNILND_CONN_IDLE) { + /* if the conn is already scheduled, we've already requested + * the scheduler thread wakeup */ + kgnilnd_conn_addref(conn); /* +1 ref for scheduler */ + + LASSERTF(list_empty(&conn->gnc_schedlist), "conn %p already sched state %d\n", + conn, sched); + + CDEBUG(D_INFO, "scheduling conn 0x%p\n", conn); + + spin_lock(&dev->gnd_lock); + list_add_tail(&conn->gnc_schedlist, &dev->gnd_ready_conns); + spin_unlock(&dev->gnd_lock); + set_mb(conn->gnc_last_sched_ask, jiffies); + + } else { + CDEBUG(D_INFO, "not scheduling conn 0x%p: %d\n", conn, sched); + } + + /* make sure thread(s) going to process conns - but let it make + * separate decision from conn schedule */ + kgnilnd_schedule_device(dev); +} + +void +kgnilnd_schedule_dgram(kgn_device_t *dev) +{ + int wake; + + wake = xchg(&dev->gnd_dgram_ready, GNILND_DGRAM_SCHED); + if (wake != GNILND_DGRAM_SCHED) { + wake_up(&dev->gnd_dgram_waitq); + } else { + CDEBUG(D_NETTRACE, "not waking: %d\n", wake); + } +} + +void +kgnilnd_free_tx(kgn_tx_t *tx) +{ + /* taken from kgnilnd_tx_add_state_locked */ + + LASSERTF((tx->tx_list_p == NULL && + tx->tx_list_state == GNILND_TX_ALLOCD) && + list_empty(&tx->tx_list), + "tx %p with bad state %s (list_p %p) tx_list %s\n", + tx, kgnilnd_tx_state2str(tx->tx_list_state), tx->tx_list_p, + list_empty(&tx->tx_list) ? "empty" : "not empty"); + + atomic_dec(&kgnilnd_data.kgn_ntx); + + /* we only allocate this if we need to */ + if (tx->tx_phys != NULL) { + cfs_mem_cache_free(kgnilnd_data.kgn_tx_phys_cache, tx->tx_phys); + CDEBUG(D_MALLOC, "slab-freed 'tx_phys': %lu at %p.\n", + LNET_MAX_IOV * sizeof(gni_mem_segment_t), tx->tx_phys); + } +#if 0 + KGNILND_POISON(tx, 0x5a, sizeof(kgn_tx_t)); +#endif + cfs_mem_cache_free(kgnilnd_data.kgn_tx_cache, tx); + CDEBUG(D_MALLOC, "slab-freed 'tx': %lu at %p.\n", + sizeof(*tx), tx); +} + +kgn_tx_t * +kgnilnd_alloc_tx(void) +{ + kgn_tx_t *tx = NULL; + + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_ALLOC_TX)) + return tx; + + tx = cfs_mem_cache_alloc(kgnilnd_data.kgn_tx_cache, CFS_ALLOC_ATOMIC); + if (tx == NULL) { + CERROR("failed to allocate tx\n"); + return NULL; + } + CDEBUG(D_MALLOC, "slab-alloced 'tx': %lu at %p.\n", + sizeof(*tx), tx); + + /* need this memset, cache alloc'd memory is not cleared */ + memset(tx, 0, sizeof(*tx)); + + /* setup everything here to minimize time under the lock */ + tx->tx_buftype = GNILND_BUF_NONE; + tx->tx_msg.gnm_type = GNILND_MSG_NONE; + INIT_LIST_HEAD(&tx->tx_list); + INIT_LIST_HEAD(&tx->tx_map_list); + tx->tx_list_state = GNILND_TX_ALLOCD; + + atomic_inc(&kgnilnd_data.kgn_ntx); + + return tx; +} + +/* csum_fold needs to be run on the return value before shipping over the wire */ +#define _kgnilnd_cksum(seed, ptr, nob) csum_partial(ptr, nob, seed) + +/* we don't use offset as every one is passing a buffer reference that already + * includes the offset into the base address - + * see kgnilnd_setup_virt_buffer and kgnilnd_setup_immediate_buffer */ +static inline __u16 +kgnilnd_cksum(void *ptr, size_t nob) +{ + __u16 sum; + + sum = csum_fold(_kgnilnd_cksum(0, ptr, nob)); + + /* don't use magic 'no checksum' value */ + if (sum == 0) + sum = 1; + + CDEBUG(D_INFO, "cksum 0x%x for ptr 0x%p sz %zu\n", + sum, ptr, nob); + + return sum; +} + +inline __u16 +kgnilnd_cksum_kiov(unsigned int nkiov, lnet_kiov_t *kiov, + unsigned int offset, unsigned int nob, int 
dump_blob) +{ + __wsum cksum = 0; + __wsum tmpck; + __u16 retsum; + void *addr; + unsigned int fraglen; + int i, odd; + + LASSERT(nkiov > 0); + LASSERT(nob > 0); + + CDEBUG(D_BUFFS, "calc cksum for kiov 0x%p nkiov %u offset %u nob %u, dump %d\n", + kiov, nkiov, offset, nob, dump_blob); + + /* if loops changes, please change kgnilnd_setup_phys_buffer */ + + while (offset >= kiov->kiov_len) { + offset -= kiov->kiov_len; + nkiov--; + kiov++; + LASSERT(nkiov > 0); + } + + /* ignore nob here, if nob < (kiov_len - offset), kiov == 1 */ + odd = (unsigned long) (kiov[0].kiov_len - offset) & 1; + + if ((odd || *kgnilnd_tunables.kgn_vmap_cksum) && nkiov > 1) { + struct page **pages = kgnilnd_data.kgn_cksum_map_pages[get_cpu()]; + + LASSERTF(pages != NULL, "NULL pages for cpu %d map_pages 0x%p\n", + get_cpu(), kgnilnd_data.kgn_cksum_map_pages); + + CDEBUG(D_BUFFS, "odd %d len %u offset %u nob %u\n", + odd, kiov[0].kiov_len, offset, nob); + + for (i = 0; i < nkiov; i++) { + pages[i] = kiov[i].kiov_page; + } + + addr = vmap(pages, nkiov, VM_MAP, PAGE_KERNEL); + if (addr == NULL) { + CNETERR("Couldn't vmap %d frags on %d bytes to avoid odd length fragment in cksum\n", + nkiov, nob); + /* return zero to avoid killing tx - we'll just get warning on console + * when remote end sees zero checksum */ + RETURN(0); + } + atomic_inc(&kgnilnd_data.kgn_nvmap_cksum); + + tmpck = _kgnilnd_cksum(0, (void *) addr + kiov[0].kiov_offset + offset, nob); + cksum = tmpck; + + if (dump_blob) { + kgnilnd_dump_blob(D_BUFFS, "flat kiov RDMA payload", + (void *)addr + kiov[0].kiov_offset + offset, nob); + } + CDEBUG(D_BUFFS, "cksum 0x%x (+0x%x) for addr 0x%p+%u len %u offset %u\n", + cksum, tmpck, addr, kiov[0].kiov_offset, nob, offset); + vunmap(addr); + } else { + do { + fraglen = min(kiov->kiov_len - offset, nob); + + /* make dang sure we don't send a bogus checksum if somehow we get + * an odd length fragment on anything but the last entry in a kiov - + * we know from kgnilnd_setup_rdma_buffer that we can't have non + * PAGE_SIZE pages in the middle, so if nob < PAGE_SIZE, it is the last one */ + LASSERTF(!(fraglen&1) || (nob < PAGE_SIZE), + "odd fraglen %u on nkiov %d, nob %u kiov_len %u offset %u kiov 0x%p\n", + fraglen, nkiov, nob, kiov->kiov_len, offset, kiov); + + addr = (void *)kmap(kiov->kiov_page) + kiov->kiov_offset + offset; + tmpck = _kgnilnd_cksum(cksum, addr, fraglen); + + CDEBUG(D_BUFFS, + "cksum 0x%x (+0x%x) for page 0x%p+%u (0x%p) len %u offset %u\n", + cksum, tmpck, kiov->kiov_page, kiov->kiov_offset, addr, + fraglen, offset); + + cksum = tmpck; + + if (dump_blob) + kgnilnd_dump_blob(D_BUFFS, "kiov cksum", addr, fraglen); + + kunmap(kiov->kiov_page); + + kiov++; + nkiov--; + nob -= fraglen; + offset = 0; + + /* iov must not run out before end of data */ + LASSERTF(nob == 0 || nkiov > 0, "nob %u nkiov %u\n", nob, nkiov); + + } while (nob > 0); + } + + retsum = csum_fold(cksum); + + /* don't use magic 'no checksum' value */ + if (retsum == 0) + retsum = 1; + + CDEBUG(D_BUFFS, "retsum 0x%x from cksum 0x%x\n", retsum, cksum); + + return retsum; +} + +void +kgnilnd_init_msg(kgn_msg_t *msg, int type, lnet_nid_t source) +{ + msg->gnm_magic = GNILND_MSG_MAGIC; + msg->gnm_version = GNILND_MSG_VERSION; + msg->gnm_type = type; + msg->gnm_payload_len = 0; + msg->gnm_srcnid = source; + /* gnm_connstamp gets set when FMA is sent */ + /* gnm_srcnid is set on creation via function argument + * The right interface/net and nid is passed in when the message + * is created. 
+ */ +} + +kgn_tx_t * +kgnilnd_new_tx_msg(int type, lnet_nid_t source) +{ + kgn_tx_t *tx = kgnilnd_alloc_tx(); + + if (tx != NULL) { + kgnilnd_init_msg(&tx->tx_msg, type, source); + } else { + CERROR("couldn't allocate new tx type %s!\n", + kgnilnd_msgtype2str(type)); + } + + return tx; +} + +static void +kgnilnd_nak_rdma(kgn_conn_t *conn, int type, int error, __u64 cookie, lnet_nid_t source) { + kgn_tx_t *tx; + + /* only allow NAK on error and truncate to zero */ + LASSERTF(error <= 0, "error %d conn 0x%p, cookie "LPU64"\n", + error, conn, cookie); + + tx = kgnilnd_new_tx_msg(type, source); + if (tx == NULL) { + CNETERR("can't get TX to NAK RDMA to %s\n", + libcfs_nid2str(conn->gnc_peer->gnp_nid)); + return; + } + + tx->tx_msg.gnm_u.completion.gncm_retval = error; + tx->tx_msg.gnm_u.completion.gncm_cookie = cookie; + kgnilnd_queue_tx(conn, tx); +} + +int +kgnilnd_setup_immediate_buffer(kgn_tx_t *tx, unsigned int niov, struct iovec *iov, + lnet_kiov_t *kiov, unsigned int offset, unsigned int nob) + +{ + kgn_msg_t *msg = &tx->tx_msg; + int i; + + /* To help save on MDDs for short messages, we'll vmap a kiov to allow + * gni_smsg_send to send that as the payload */ + + LASSERT(tx->tx_buftype == GNILND_BUF_NONE); + LASSERT(nob >= 0); + + if (nob == 0) { + tx->tx_buffer = NULL; + } else if (kiov != NULL) { + LASSERTF(niov > 0 && niov < GNILND_MAX_IMMEDIATE/PAGE_SIZE, + "bad niov %d\n", niov); + + while (offset >= kiov->kiov_len) { + offset -= kiov->kiov_len; + niov--; + kiov++; + LASSERT(niov > 0); + } + for (i = 0; i < niov; i++) { + /* We can't have a kiov_offset on anything but the first entry, + * otherwise we'll have a hole at the end of the mapping as we only map + * whole pages. + * Also, if we have a kiov_len < PAGE_SIZE but we need to map more + * than kiov_len, we will also have a hole at the end of that page + * which isn't allowed */ + if ((kiov[i].kiov_offset != 0 && i > 0) || + (kiov[i].kiov_offset + kiov[i].kiov_len != CFS_PAGE_SIZE && i < niov - 1)) { + CNETERR("Can't make payload contiguous in I/O VM:" + "page %d, offset %u, nob %u, kiov_offset %u kiov_len %u \n", + i, offset, nob, kiov->kiov_offset, kiov->kiov_len); + RETURN(-EINVAL); + } + tx->tx_imm_pages[i] = kiov[i].kiov_page; + } + + /* hijack tx_phys for the later unmap */ + if (niov == 1) { + /* tx->tx_phys being equal to NULL is the signal for unmap to discern between kmap and vmap */ + tx->tx_phys = NULL; + tx->tx_buffer = (void *)kmap(tx->tx_imm_pages[0]) + kiov[0].kiov_offset + offset; + atomic_inc(&kgnilnd_data.kgn_nkmap_short); + GNIDBG_TX(D_NET, tx, "kmapped page for %d bytes for kiov 0x%p, buffer 0x%p", + nob, kiov, tx->tx_buffer); + } else { + tx->tx_phys = vmap(tx->tx_imm_pages, niov, VM_MAP, PAGE_KERNEL); + if (tx->tx_phys == NULL) { + CNETERR("Couldn't vmap %d frags on %d bytes\n", niov, nob); + RETURN(-ENOMEM); + + } + atomic_inc(&kgnilnd_data.kgn_nvmap_short); + /* make sure we take into account the kiov offset as the start of the buffer */ + tx->tx_buffer = (void *)tx->tx_phys + kiov[0].kiov_offset + offset; + GNIDBG_TX(D_NET, tx, "mapped %d pages for %d bytes from kiov 0x%p to 0x%p, buffer 0x%p", + niov, nob, kiov, tx->tx_phys, tx->tx_buffer); + } + tx->tx_buftype = GNILND_BUF_IMMEDIATE_KIOV; + tx->tx_nob = nob; + + } else { + /* For now this is almost identical to kgnilnd_setup_virt_buffer, but we + * could "flatten" the payload into a single contiguous buffer ready + * for sending direct over an FMA if we ever needed to. 
*/ + + LASSERT(niov > 0); + + while (offset >= iov->iov_len) { + offset -= iov->iov_len; + niov--; + iov++; + LASSERT(niov > 0); + } + + if (nob > iov->iov_len - offset) { + CERROR("Can't handle multiple vaddr fragments\n"); + return -EMSGSIZE; + } + + tx->tx_buffer = (void *)(((unsigned long)iov->iov_base) + offset); + + tx->tx_buftype = GNILND_BUF_IMMEDIATE; + tx->tx_nob = nob; + } + + /* checksum payload early - it shouldn't be changing after lnd_send */ + if (*kgnilnd_tunables.kgn_checksum >= 2) { + msg->gnm_payload_cksum = kgnilnd_cksum(tx->tx_buffer, nob); + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_SMSG_CKSUM2)) { + msg->gnm_payload_cksum += 0xe00e; + } + if (*kgnilnd_tunables.kgn_checksum_dump > 1) { + kgnilnd_dump_blob(D_BUFFS, "payload checksum", + tx->tx_buffer, nob); + } + } else { + msg->gnm_payload_cksum = 0; + } + + return 0; +} + +int +kgnilnd_setup_virt_buffer(kgn_tx_t *tx, + unsigned int niov, struct iovec *iov, + unsigned int offset, unsigned int nob) + +{ + LASSERT(nob > 0); + LASSERT(niov > 0); + LASSERT(tx->tx_buftype == GNILND_BUF_NONE); + + while (offset >= iov->iov_len) { + offset -= iov->iov_len; + niov--; + iov++; + LASSERT(niov > 0); + } + + if (nob > iov->iov_len - offset) { + CERROR("Can't handle multiple vaddr fragments\n"); + return -EMSGSIZE; + } + + tx->tx_buftype = GNILND_BUF_VIRT_UNMAPPED; + tx->tx_nob = nob; + tx->tx_buffer = (void *)(((unsigned long)iov->iov_base) + offset); + return 0; +} + +int +kgnilnd_setup_phys_buffer(kgn_tx_t *tx, int nkiov, lnet_kiov_t *kiov, + unsigned int offset, unsigned int nob) +{ + gni_mem_segment_t *phys; + int rc = 0; + unsigned int fraglen; + + GNIDBG_TX(D_NET, tx, "niov %d kiov 0x%p offset %u nob %u", nkiov, kiov, offset, nob); + + LASSERT(nob > 0); + LASSERT(nkiov > 0); + LASSERT(tx->tx_buftype == GNILND_BUF_NONE); + + /* only allocate this if we are going to use it */ + tx->tx_phys = cfs_mem_cache_alloc(kgnilnd_data.kgn_tx_phys_cache, + CFS_ALLOC_ATOMIC); + if (tx->tx_phys == NULL) { + CERROR("failed to allocate tx_phys\n"); + rc = -ENOMEM; + GOTO(error, rc); + } + + CDEBUG(D_MALLOC, "slab-alloced 'tx->tx_phys': %lu at %p.\n", + LNET_MAX_IOV * sizeof(gni_mem_segment_t), tx->tx_phys); + + /* if loops changes, please change kgnilnd_cksum_kiov + * and kgnilnd_setup_immediate_buffer */ + + while (offset >= kiov->kiov_len) { + offset -= kiov->kiov_len; + nkiov--; + kiov++; + LASSERT(nkiov > 0); + } + + /* at this point, kiov points to the first page that we'll actually map + * now that we've seeked into the koiv for offset and dropped any + * leading pages that fall entirely within the offset */ + tx->tx_buftype = GNILND_BUF_PHYS_UNMAPPED; + tx->tx_nob = nob; + + /* kiov_offset is start of 'valid' buffer, so index offset past that */ + tx->tx_buffer = (void *)((unsigned long)(kiov->kiov_offset + offset)); + phys = tx->tx_phys; + + CDEBUG(D_NET, "tx 0x%p buffer 0x%p map start kiov 0x%p+%u niov %d offset %u\n", + tx, tx->tx_buffer, kiov, kiov->kiov_offset, nkiov, offset); + + do { + fraglen = min(kiov->kiov_len - offset, nob); + + /* We can't have a kiov_offset on anything but the first entry, + * otherwise we'll have a hole at the end of the mapping as we only map + * whole pages. Only the first page is allowed to have an offset - + * we'll add that into tx->tx_buffer and that will get used when we + * map in the segments (see kgnilnd_map_buffer). 
+ * Also, if we have a kiov_len < PAGE_SIZE but we need to map more + * than kiov_len, we will also have a whole at the end of that page + * which isn't allowed */ + if ((phys != tx->tx_phys) && + ((kiov->kiov_offset != 0) || + ((kiov->kiov_len < PAGE_SIZE) && (nob > kiov->kiov_len)))) { + CERROR("Can't make payload contiguous in I/O VM:" + "page %d, offset %u, nob %u, kiov_offset %u kiov_len %u \n", + (int)(phys - tx->tx_phys), + offset, nob, kiov->kiov_offset, kiov->kiov_len); + rc = -EINVAL; + GOTO(error, rc); + } + + if ((phys - tx->tx_phys) == LNET_MAX_IOV) { + CERROR ("payload too big (%d)\n", (int)(phys - tx->tx_phys)); + rc = -EMSGSIZE; + GOTO(error, rc); + } + + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_PHYS_SETUP)) { + rc = -EINVAL; + GOTO(error, rc); + } + + CDEBUG(D_BUFFS, "page 0x%p kiov_offset %u kiov_len %u nob %u " + "nkiov %u offset %u\n", + kiov->kiov_page, kiov->kiov_offset, kiov->kiov_len, nob, nkiov, offset); + + phys->address = lnet_page2phys(kiov->kiov_page); + phys++; + kiov++; + nkiov--; + nob -= fraglen; + offset = 0; + + /* iov must not run out before end of data */ + LASSERTF(nob == 0 || nkiov > 0, "nob %u nkiov %u\n", nob, nkiov); + + } while (nob > 0); + + tx->tx_phys_npages = phys - tx->tx_phys; + + return 0; + +error: + if (tx->tx_phys != NULL) { + cfs_mem_cache_free(kgnilnd_data.kgn_tx_phys_cache, tx->tx_phys); + CDEBUG(D_MALLOC, "slab-freed 'tx_phys': %lu at %p.\n", + sizeof(*tx->tx_phys), tx->tx_phys); + tx->tx_phys = NULL; + } + return rc; +} + +static inline int +kgnilnd_setup_rdma_buffer(kgn_tx_t *tx, unsigned int niov, + struct iovec *iov, lnet_kiov_t *kiov, + unsigned int offset, unsigned int nob) +{ + int rc; + + LASSERT((iov == NULL) != (kiov == NULL)); + + if (kiov != NULL) { + rc = kgnilnd_setup_phys_buffer(tx, niov, kiov, offset, nob); + } else { + rc = kgnilnd_setup_virt_buffer(tx, niov, iov, offset, nob); + } + return rc; +} + +static void +kgnilnd_parse_lnet_rdma(lnet_msg_t *lntmsg, unsigned int *niov, unsigned int *offset, + unsigned int *nob, lnet_kiov_t **kiov) +{ + /* GETs are weird, see kgnilnd_send */ + if (lntmsg->msg_type == LNET_MSG_GET) { + if ((lntmsg->msg_md->md_options & LNET_MD_KIOV) == 0) { + *kiov = NULL; + } else { + *kiov = lntmsg->msg_md->md_iov.kiov; + } + *niov = lntmsg->msg_md->md_niov; + *nob = lntmsg->msg_md->md_length; + *offset = 0; + } else { + *kiov = lntmsg->msg_kiov; + *niov = lntmsg->msg_niov; + *nob = lntmsg->msg_len; + *offset = lntmsg->msg_offset; + } +} + +static inline void +kgnilnd_compute_rdma_cksum(kgn_tx_t *tx) +{ + unsigned int niov, offset, nob; + lnet_kiov_t *kiov; + lnet_msg_t *lntmsg = tx->tx_lntmsg[0]; + int dump_cksum = (*kgnilnd_tunables.kgn_checksum_dump > 1); + + GNITX_ASSERTF(tx, ((tx->tx_msg.gnm_type == GNILND_MSG_PUT_DONE) || + (tx->tx_msg.gnm_type == GNILND_MSG_GET_DONE)), + "bad type %s", kgnilnd_msgtype2str(tx->tx_msg.gnm_type)); + + + if (*kgnilnd_tunables.kgn_checksum < 3) { + tx->tx_msg.gnm_payload_cksum = 0; + return; + } + + GNITX_ASSERTF(tx, lntmsg, "no LNet message!", NULL); + + kgnilnd_parse_lnet_rdma(lntmsg, &niov, &offset, &nob, &kiov); + + if (kiov != NULL) { + tx->tx_msg.gnm_payload_cksum = kgnilnd_cksum_kiov(niov, kiov, offset, nob, dump_cksum); + } else { + tx->tx_msg.gnm_payload_cksum = kgnilnd_cksum(tx->tx_buffer, nob); + if (dump_cksum) { + kgnilnd_dump_blob(D_BUFFS, "peer RDMA payload", tx->tx_buffer, nob); + } + } + + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_SMSG_CKSUM3)) { + tx->tx_msg.gnm_payload_cksum += 0xd00d; + } +} + +static inline int +kgnilnd_verify_rdma_cksum(kgn_tx_t *tx, 
__u16 rx_cksum) +{ + int rc = 0; + __u16 cksum; + unsigned int niov, offset, nob; + lnet_kiov_t *kiov; + lnet_msg_t *lntmsg = tx->tx_lntmsg[0]; + int dump_on_err = *kgnilnd_tunables.kgn_checksum_dump; + + /* we can only match certain requests */ + GNITX_ASSERTF(tx, ((tx->tx_msg.gnm_type == GNILND_MSG_GET_REQ) || + (tx->tx_msg.gnm_type == GNILND_MSG_PUT_ACK)), + "bad type %s", kgnilnd_msgtype2str(tx->tx_msg.gnm_type)); + + if (rx_cksum == 0) { + if (*kgnilnd_tunables.kgn_checksum >= 3) { + GNIDBG_MSG(D_WARNING, &tx->tx_msg, + "no RDMA payload checksum when enabled"); + } + return 0; + } + + GNITX_ASSERTF(tx, lntmsg, "no LNet message!", NULL); + + kgnilnd_parse_lnet_rdma(lntmsg, &niov, &offset, &nob, &kiov); + + if (kiov != NULL) { + cksum = kgnilnd_cksum_kiov(niov, kiov, offset, nob, 0); + } else { + cksum = kgnilnd_cksum(tx->tx_buffer, nob); + } + + if (cksum != rx_cksum) { + GNIDBG_MSG(D_NETERROR, &tx->tx_msg, + "Bad RDMA payload checksum (%x expected %x); " + "kiov 0x%p niov %d nob %u offset %u", + cksum, rx_cksum, kiov, niov, nob, offset); + switch (dump_on_err) { + case 2: + if (kiov != NULL) { + kgnilnd_cksum_kiov(niov, kiov, offset, nob, 1); + } else { + kgnilnd_dump_blob(D_BUFFS, "RDMA payload", + tx->tx_buffer, nob); + } + /* fall through to dump log */ + case 1: + libcfs_debug_dumplog(); + break; + default: + break; + } + rc = -ENOKEY; + /* kgnilnd_check_fma_rx will close conn, kill tx with error */ + } + return rc; +} + +void +kgnilnd_mem_add_map_list(kgn_device_t *dev, kgn_tx_t *tx) +{ + int bytes; + + GNITX_ASSERTF(tx, list_empty(&tx->tx_map_list), + "already mapped!", NULL); + + spin_lock(&dev->gnd_map_lock); + switch (tx->tx_buftype) { + default: + GNIDBG_TX(D_EMERG, tx, + "SOFTWARE BUG: invalid mapping %d", tx->tx_buftype); + spin_unlock(&dev->gnd_map_lock); + LBUG(); + break; + + case GNILND_BUF_PHYS_MAPPED: + bytes = tx->tx_phys_npages * PAGE_SIZE; + dev->gnd_map_nphys++; + dev->gnd_map_physnop += tx->tx_phys_npages; + break; + + case GNILND_BUF_VIRT_MAPPED: + bytes = tx->tx_nob; + dev->gnd_map_nvirt++; + dev->gnd_map_virtnob += tx->tx_nob; + break; + } + + if (tx->tx_msg.gnm_type == GNILND_MSG_PUT_ACK || + tx->tx_msg.gnm_type == GNILND_MSG_GET_REQ) { + atomic64_add(bytes, &dev->gnd_rdmaq_bytes_out); + GNIDBG_TX(D_NETTRACE, tx, "rdma ++ %d to "LPD64"", + bytes, atomic64_read(&dev->gnd_rdmaq_bytes_out)); + } + + atomic_inc(&dev->gnd_n_mdd); + atomic64_add(bytes, &dev->gnd_nbytes_map); + + /* clear retrans to prevent any SMSG goofiness as that code uses the same counter */ + tx->tx_retrans = 0; + + /* we only get here in the valid cases */ + list_add_tail(&tx->tx_map_list, &dev->gnd_map_list); + dev->gnd_map_version++; + spin_unlock(&dev->gnd_map_lock); +} + +void +kgnilnd_mem_del_map_list(kgn_device_t *dev, kgn_tx_t *tx) +{ + int bytes; + + GNITX_ASSERTF(tx, !list_empty(&tx->tx_map_list), + "not mapped!", NULL); + spin_lock(&dev->gnd_map_lock); + + switch (tx->tx_buftype) { + default: + GNIDBG_TX(D_EMERG, tx, + "SOFTWARE BUG: invalid mapping %d", tx->tx_buftype); + spin_unlock(&dev->gnd_map_lock); + LBUG(); + break; + + case GNILND_BUF_PHYS_UNMAPPED: + bytes = tx->tx_phys_npages * PAGE_SIZE; + dev->gnd_map_nphys--; + dev->gnd_map_physnop -= tx->tx_phys_npages; + break; + + case GNILND_BUF_VIRT_UNMAPPED: + bytes = tx->tx_nob; + dev->gnd_map_nvirt--; + dev->gnd_map_virtnob -= tx->tx_nob; + break; + } + + if (tx->tx_msg.gnm_type == GNILND_MSG_PUT_ACK || + tx->tx_msg.gnm_type == GNILND_MSG_GET_REQ) { + atomic64_sub(bytes, &dev->gnd_rdmaq_bytes_out); + 
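/* gnd_rdmaq_bytes_out was bumped by this same amount when the buffer was
 * mapped in kgnilnd_mem_add_map_list(); giving the bytes back on unmap means
 * the counter can never legitimately go negative, which is what the assert
 * below checks */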
LASSERTF(atomic64_read(&dev->gnd_rdmaq_bytes_out) >= 0, + "bytes_out negative! %ld\n", atomic64_read(&dev->gnd_rdmaq_bytes_out)); + GNIDBG_TX(D_NETTRACE, tx, "rdma -- %d to "LPD64"", + bytes, atomic64_read(&dev->gnd_rdmaq_bytes_out)); + } + + atomic_dec(&dev->gnd_n_mdd); + atomic64_sub(bytes, &dev->gnd_nbytes_map); + + /* we only get here in the valid cases */ + list_del_init(&tx->tx_map_list); + dev->gnd_map_version++; + spin_unlock(&dev->gnd_map_lock); +} + +int +kgnilnd_map_buffer(kgn_tx_t *tx) +{ + kgn_conn_t *conn = tx->tx_conn; + kgn_device_t *dev = conn->gnc_device; + __u32 flags = GNI_MEM_READWRITE; + gni_return_t rrc; + + /* The kgnilnd_mem_register(_segments) Gemini Driver functions can + * be called concurrently as there are internal locks that protect + * any data structures or HW resources. We just need to ensure + * that our concurrency doesn't result in the kgn_device_t + * getting nuked while we are in here */ + + LASSERTF(conn != NULL, "tx %p with NULL conn, someone forgot" + " to set tx_conn before calling %s\n", tx, __FUNCTION__); + + if (unlikely(CFS_FAIL_CHECK(CFS_FAIL_GNI_MAP_TX))) + RETURN(-ENOMEM); + + if (*kgnilnd_tunables.kgn_bte_relaxed_ordering) { + flags |= GNI_MEM_RELAXED_PI_ORDERING; + } + + switch (tx->tx_buftype) { + default: + LBUG(); + + case GNILND_BUF_NONE: + case GNILND_BUF_IMMEDIATE: + case GNILND_BUF_IMMEDIATE_KIOV: + case GNILND_BUF_PHYS_MAPPED: + case GNILND_BUF_VIRT_MAPPED: + return 0; + + case GNILND_BUF_PHYS_UNMAPPED: + GNITX_ASSERTF(tx, tx->tx_phys != NULL, "physical buffer not there!", NULL); + rrc = kgnilnd_mem_register_segments(dev->gnd_handle, + tx->tx_phys, tx->tx_phys_npages, NULL, + GNI_MEM_PHYS_SEGMENTS | flags, + &tx->tx_map_key); + /* could race with other uses of the map counts, but this is ok + * - this needs to turn into a non-fatal error soon to allow + * GART resource, etc starvation handling */ + if (rrc != GNI_RC_SUCCESS) { + GNIDBG_TX(D_NET, tx, "Can't map %d pages: dev %d " + "phys %u pp %u, virt %u nob "LPU64"", + tx->tx_phys_npages, dev->gnd_id, + dev->gnd_map_nphys, dev->gnd_map_physnop, + dev->gnd_map_nvirt, dev->gnd_map_virtnob); + RETURN(rrc == GNI_RC_ERROR_RESOURCE ? -ENOMEM : -EINVAL); + } + + tx->tx_buftype = GNILND_BUF_PHYS_MAPPED; + kgnilnd_mem_add_map_list(dev, tx); + return 0; + + case GNILND_BUF_VIRT_UNMAPPED: + rrc = kgnilnd_mem_register(dev->gnd_handle, + (__u64)tx->tx_buffer, tx->tx_nob, + NULL, flags, &tx->tx_map_key); + if (rrc != GNI_RC_SUCCESS) { + GNIDBG_TX(D_NET, tx, "Can't map %u bytes: dev %d " + "phys %u pp %u, virt %u nob "LPU64"", + tx->tx_nob, dev->gnd_id, + dev->gnd_map_nphys, dev->gnd_map_physnop, + dev->gnd_map_nvirt, dev->gnd_map_virtnob); + RETURN(rrc == GNI_RC_ERROR_RESOURCE ? 
-ENOMEM : -EINVAL); + } + + tx->tx_buftype = GNILND_BUF_VIRT_MAPPED; + kgnilnd_mem_add_map_list(dev, tx); + if (tx->tx_msg.gnm_type == GNILND_MSG_PUT_ACK || + tx->tx_msg.gnm_type == GNILND_MSG_GET_REQ) { + atomic64_add(tx->tx_nob, &dev->gnd_rdmaq_bytes_out); + GNIDBG_TX(D_NETTRACE, tx, "rdma ++ %d to %ld\n", + tx->tx_nob, atomic64_read(&dev->gnd_rdmaq_bytes_out)); + } + + return 0; + } +} + +void +kgnilnd_add_purgatory_tx(kgn_tx_t *tx) +{ + kgn_conn_t *conn = tx->tx_conn; + kgn_mdd_purgatory_t *gmp; + + LIBCFS_ALLOC(gmp, sizeof(*gmp)); + LASSERTF(gmp != NULL, "couldn't allocate MDD purgatory member;" + " asserting to avoid data corruption\n"); + + gmp->gmp_map_key = tx->tx_map_key; + atomic_inc(&conn->gnc_device->gnd_n_mdd_held); + + /* ensure that we don't have a blank purgatory - indicating the + * conn is not already on purgatory lists - we'd never recover these + * MDD if that were the case */ + GNITX_ASSERTF(tx, conn->gnc_in_purgatory, + "conn 0x%p->%s with NULL purgatory", + conn, libcfs_nid2str(conn->gnc_peer->gnp_nid)); + + /* link 'er up! - only place we really need to lock for + * concurrent access */ + spin_lock(&conn->gnc_list_lock); + list_add_tail(&gmp->gmp_list, &conn->gnc_mdd_list); + spin_unlock(&conn->gnc_list_lock); +} + +void +kgnilnd_unmap_buffer(kgn_tx_t *tx, int error) +{ + kgn_device_t *dev; + gni_return_t rrc; + int hold_timeout = 0; + + /* code below relies on +1 relationship ... */ + CLASSERT(GNILND_BUF_PHYS_MAPPED == (GNILND_BUF_PHYS_UNMAPPED + 1)); + CLASSERT(GNILND_BUF_VIRT_MAPPED == (GNILND_BUF_VIRT_UNMAPPED + 1)); + + switch (tx->tx_buftype) { + default: + LBUG(); + + case GNILND_BUF_NONE: + case GNILND_BUF_IMMEDIATE: + case GNILND_BUF_PHYS_UNMAPPED: + case GNILND_BUF_VIRT_UNMAPPED: + break; + case GNILND_BUF_IMMEDIATE_KIOV: + if (tx->tx_phys != NULL) { + vunmap(tx->tx_phys); + } else if (tx->tx_phys == NULL && tx->tx_buffer != NULL) { + kunmap(tx->tx_imm_pages[0]); + } + /* clear to prevent kgnilnd_free_tx from thinking + * this is a RDMA descriptor */ + tx->tx_phys = NULL; + break; + + case GNILND_BUF_PHYS_MAPPED: + case GNILND_BUF_VIRT_MAPPED: + LASSERT(tx->tx_conn != NULL); + + dev = tx->tx_conn->gnc_device; + + /* only want to hold if we are closing conn without + * verified peer notification - the theory is that + * a TX error can be communicated in all other cases */ + if (tx->tx_conn->gnc_state != GNILND_CONN_ESTABLISHED && + kgnilnd_check_purgatory_conn(tx->tx_conn)) { + kgnilnd_add_purgatory_tx(tx); + + /* The timeout we give to kgni is a deadman stop only. 
+ * we are setting high to ensure we don't have the kgni timer + * fire before ours fires _and_ is handled */ + hold_timeout = GNILND_TIMEOUT2DEADMAN; + + GNIDBG_TX(D_NET, tx, + "dev %p delaying MDD release for %dms key "LPX64"."LPX64"", + tx->tx_conn->gnc_device, hold_timeout, + tx->tx_map_key.qword1, tx->tx_map_key.qword2); + } + + rrc = kgnilnd_mem_deregister(dev->gnd_handle, &tx->tx_map_key, hold_timeout); + + LASSERTF(rrc == GNI_RC_SUCCESS, "rrc %d\n", rrc); + + tx->tx_buftype--; + kgnilnd_mem_del_map_list(dev, tx); + break; + } +} + +void +kgnilnd_tx_done(kgn_tx_t *tx, int completion) +{ + lnet_msg_t *lntmsg0, *lntmsg1; + int status0, status1; + lnet_ni_t *ni = NULL; + kgn_conn_t *conn = tx->tx_conn; + + LASSERT(!in_interrupt()); + + lntmsg0 = tx->tx_lntmsg[0]; tx->tx_lntmsg[0] = NULL; + lntmsg1 = tx->tx_lntmsg[1]; tx->tx_lntmsg[1] = NULL; + + if (completion && + !(tx->tx_state & GNILND_TX_QUIET_ERROR) && + !kgnilnd_conn_clean_errno(completion)) { + GNIDBG_TOMSG(D_NETERROR, &tx->tx_msg, + "error %d on tx 0x%p->%s id %u/%d state %s age %ds", + completion, tx, conn ? + libcfs_nid2str(conn->gnc_peer->gnp_nid) : "", + tx->tx_id.txe_smsg_id, tx->tx_id.txe_idx, + kgnilnd_tx_state2str(tx->tx_list_state), + cfs_duration_sec((long)jiffies - tx->tx_qtime)); + } + + /* The error codes determine if we hold onto the MDD */ + kgnilnd_unmap_buffer(tx, completion); + + /* we have to deliver a reply on lntmsg[1] for the GET, so make sure + * we play nice with the error codes to avoid delivering a failed + * REQUEST and then a REPLY event as well */ + + /* return -EIO to lnet - it is the magic value for failed sends */ + if (tx->tx_msg.gnm_type == GNILND_MSG_GET_REQ) { + status0 = 0; + status1 = completion; + } else { + status0 = status1 = completion; + } + + tx->tx_buftype = GNILND_BUF_NONE; + tx->tx_msg.gnm_type = GNILND_MSG_NONE; + + /* lnet_finalize doesn't do anything with the *ni, so ok for us to + * set NULL when we are a tx without a conn */ + if (conn != NULL) { + ni = conn->gnc_peer->gnp_net->gnn_ni; + + spin_lock(&conn->gnc_tx_lock); + + LASSERTF(test_and_clear_bit(tx->tx_id.txe_idx, + (volatile unsigned long *)&conn->gnc_tx_bits), + "conn %p tx %p bit %d already cleared\n", + conn, tx, tx->tx_id.txe_idx); + + LASSERTF(conn->gnc_tx_ref_table[tx->tx_id.txe_idx] != NULL, + "msg_id %d already NULL\n", tx->tx_id.txe_idx); + + conn->gnc_tx_ref_table[tx->tx_id.txe_idx] = NULL; + spin_unlock(&conn->gnc_tx_lock); + } + + kgnilnd_free_tx(tx); + + /* finalize AFTER freeing lnet msgs */ + + /* warning - we should hold no locks here - calling lnet_finalize + * could free up lnet credits, resulting in a call chain back into + * the LND via kgnilnd_send and friends */ + lnet_finalize(ni, lntmsg0, status0); + + if (lntmsg1 != NULL) { + lnet_finalize(ni, lntmsg1, status1); + } +} + +void +kgnilnd_txlist_done(struct list_head *txlist, int error) +{ + kgn_tx_t *tx, *txn; + int err_printed = 0; + + if (list_empty(txlist)) + return; + + list_for_each_entry_safe(tx, txn, txlist, tx_list) { + /* only print the first error */ + if (err_printed) + tx->tx_state |= GNILND_TX_QUIET_ERROR; + list_del_init(&tx->tx_list); + kgnilnd_tx_done(tx, error); + err_printed++; + } +} +int +kgnilnd_set_tx_id(kgn_tx_t *tx, kgn_conn_t *conn) +{ + int id; + + spin_lock(&conn->gnc_tx_lock); + + /* ID zero is NOT ALLOWED!!! 
*/ + +search_again: + id = find_next_zero_bit((unsigned long *)&conn->gnc_tx_bits, + GNILND_MAX_MSG_ID, conn->gnc_next_tx); + if (id == GNILND_MAX_MSG_ID) { + if (conn->gnc_next_tx != 1) { + /* we only searched from next_tx to end and didn't find + * one, so search again from start */ + conn->gnc_next_tx = 1; + goto search_again; + } + /* couldn't find one! */ + spin_unlock(&conn->gnc_tx_lock); + return -E2BIG; + } + + /* bump next_tx to prevent immediate reuse */ + conn->gnc_next_tx = id + 1; + + set_bit(id, (volatile unsigned long *)&conn->gnc_tx_bits); + LASSERTF(conn->gnc_tx_ref_table[id] == NULL, + "tx 0x%p already at id %d\n", + conn->gnc_tx_ref_table[id], id); + + /* delay these until we have a valid ID - prevents bad clear of the bit + * in kgnilnd_tx_done */ + tx->tx_conn = conn; + tx->tx_id.txe_cqid = conn->gnc_cqid; + + tx->tx_id.txe_idx = id; + conn->gnc_tx_ref_table[id] = tx; + + /* Using jiffies to help differentiate against TX reuse - with + * the usual minimum of a 250HZ clock, we wrap jiffies on the same TX + * if we are sending to the same node faster than 256000/sec. + * To help guard against this, we OR in the tx_seq - that is 32 bits */ + + tx->tx_id.txe_chips = (__u32)(jiffies | conn->gnc_tx_seq); + + GNIDBG_TX(D_NET, tx, "set cookie/id/bits", NULL); + + spin_unlock(&conn->gnc_tx_lock); + return 0; +} + +static inline int +kgnilnd_tx_should_retry(kgn_conn_t *conn, kgn_tx_t *tx) +{ + int max_retrans = *kgnilnd_tunables.kgn_max_retransmits; + int log_retrans; + int log_retrans_level; + + /* I need kgni credits to send this. Replace tx at the head of the + * fmaq and I'll get rescheduled when credits appear */ + tx->tx_state = 0; + tx->tx_retrans++; + conn->gnc_tx_retrans++; + log_retrans = ((tx->tx_retrans < 25) || ((tx->tx_retrans % 25) == 0) || + (tx->tx_retrans > (max_retrans / 2))); + log_retrans_level = tx->tx_retrans < (max_retrans / 2) ? D_NET : D_NETERROR; + + /* Decision time - either error, warn or just retransmit */ + + /* we don't care about TX timeout - it could be that the network is slower + * or throttled. We'll keep retransmitting - so if the network is so slow + * that we fill up our mailbox, we'll keep trying to resend that msg + * until we exceed the max_retrans _or_ gnc_last_rx expires, indicating + * that he hasn't sent us any traffic in return */ + + if (tx->tx_retrans > max_retrans) { + /* this means we are not backing off the retransmits + * in a healthy manner and are likely chewing up the + * CPU cycles quite badly */ + GNIDBG_TOMSG(D_ERROR, &tx->tx_msg, + "SOFTWARE BUG: too many retransmits (%d) for tx id %x " + "conn 0x%p->%s\n", + tx->tx_retrans, tx->tx_id, conn, + libcfs_nid2str(conn->gnc_peer->gnp_nid)); + + /* yes - double errors to help debug this condition */ + GNIDBG_TOMSG(D_NETERROR, &tx->tx_msg, "connection dead. " + "unable to send to %s for %lu secs (%d tries)", + libcfs_nid2str(tx->tx_conn->gnc_peer->gnp_nid), + cfs_duration_sec(jiffies - tx->tx_cred_wait), + tx->tx_retrans); + + kgnilnd_close_conn(conn, -ETIMEDOUT); + + /* caller should terminate */ + RETURN(0); + } else { + /* some reasonable throttling of the debug message */ + if (log_retrans) { + unsigned long now = jiffies; + /* XXX Nic: Mystical TX debug here... 
*/ + GNIDBG_SMSG_CREDS(log_retrans_level, conn); + GNIDBG_TOMSG(log_retrans_level, &tx->tx_msg, + "NOT_DONE on conn 0x%p->%s id %x retrans %d wait %dus" + " last_msg %uus/%uus last_cq %uus/%uus", + conn, libcfs_nid2str(conn->gnc_peer->gnp_nid), + tx->tx_id, tx->tx_retrans, + jiffies_to_usecs(now - tx->tx_cred_wait), + jiffies_to_usecs(now - conn->gnc_last_tx), + jiffies_to_usecs(now - conn->gnc_last_rx), + jiffies_to_usecs(now - conn->gnc_last_tx_cq), + jiffies_to_usecs(now - conn->gnc_last_rx_cq)); + } + /* caller should retry */ + RETURN(1); + } +} + +/* caller must be holding gnd_cq_mutex and not unlock it afterwards, as we need to drop it + * to avoid bad ordering with state_lock */ + +static inline int +kgnilnd_sendmsg_nolock(kgn_tx_t *tx, void *immediate, unsigned int immediatenob, + spinlock_t *state_lock, kgn_tx_list_state_t state) +{ + kgn_conn_t *conn = tx->tx_conn; + kgn_msg_t *msg = &tx->tx_msg; + int retry_send; + gni_return_t rrc; + unsigned long newest_last_rx, timeout; + unsigned long now; + + LASSERTF((msg->gnm_type == GNILND_MSG_IMMEDIATE) ? + immediatenob <= *kgnilnd_tunables.kgn_max_immediate : + immediatenob == 0, + "msg 0x%p type %d wrong payload size %d\n", + msg, msg->gnm_type, immediatenob); + + /* make sure we catch all the cases where we'd send on a dirty old mbox + * but allow case for sending CLOSE. Since this check is within the CQ + * mutex barrier and the close message is only sent through + * kgnilnd_send_conn_close the last message out the door will be the + * close message. + */ + if (atomic_read(&conn->gnc_peer->gnp_dirty_eps) != 0 && msg->gnm_type != GNILND_MSG_CLOSE) { + mutex_unlock(&conn->gnc_device->gnd_cq_mutex); + /* Return -ETIME, we are closing the connection already so we dont want to + * have this tx hit the wire. The tx will be killed by the calling function. + * Once the EP is marked dirty the close message will be the last + * thing to hit the wire */ + return -ETIME; + } + + now = jiffies; + timeout = cfs_time_seconds(conn->gnc_timeout); + + newest_last_rx = GNILND_LASTRX(conn); + + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_SEND_TIMEOUT)) { + now = now + (GNILND_TIMEOUTRX(timeout) * 2); + } + + if (time_after_eq(now, newest_last_rx + GNILND_TIMEOUTRX(timeout))) { + GNIDBG_CONN(D_NETERROR|D_CONSOLE, conn, "Cant send to %s after timeout lapse of %lu; TO %lu", + libcfs_nid2str(conn->gnc_peer->gnp_nid), + cfs_duration_sec(now - newest_last_rx), + cfs_duration_sec(GNILND_TIMEOUTRX(timeout))); + mutex_unlock(&conn->gnc_device->gnd_cq_mutex); + return -ETIME; + } + + GNITX_ASSERTF(tx, (conn != NULL) && (tx->tx_id.txe_idx != 0), "tx id unset!", NULL); + /* msg->gnm_srcnid is set when the message is initialized by whatever function is + * creating the message this allows the message to contain the correct LNET NID/NET needed + * instead of the one that the peer/conn uses for sending the data. 
+ */ + msg->gnm_connstamp = conn->gnc_my_connstamp; + msg->gnm_payload_len = immediatenob; + msg->gnm_seq = conn->gnc_tx_seq; + + /* always init here - kgn_checksum is a /sys module tunable + * and can be flipped at any point, even between msg init and sending */ + msg->gnm_cksum = 0; + if (*kgnilnd_tunables.kgn_checksum) { + /* We must set here and not in kgnilnd_init_msg, + * we could resend this msg many times + * (NOT_DONE from gni_smsg_send below) and wouldn't pass + * through init_msg again */ + msg->gnm_cksum = kgnilnd_cksum(msg, sizeof(kgn_msg_t)); + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_SMSG_CKSUM1)) { + msg->gnm_cksum += 0xf00f; + } + } + + GNIDBG_TOMSG(D_NET, msg, "tx 0x%p conn 0x%p->%s sending SMSG sz %u id %x/%d [%p for %u]", + tx, conn, libcfs_nid2str(conn->gnc_peer->gnp_nid), + sizeof(kgn_msg_t), tx->tx_id.txe_smsg_id, + tx->tx_id.txe_idx, immediate, immediatenob); + + if (unlikely(tx->tx_state & GNILND_TX_FAIL_SMSG)) { + rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_NOT_DONE; + } else { + rrc = kgnilnd_smsg_send(conn->gnc_ephandle, + msg, sizeof(*msg), immediate, immediatenob, + tx->tx_id.txe_smsg_id); + } + + switch (rrc) { + case GNI_RC_SUCCESS: + conn->gnc_tx_seq++; + conn->gnc_last_tx = jiffies; + /* no locking here as LIVE isn't a list */ + kgnilnd_tx_add_state_locked(tx, NULL, conn, GNILND_TX_LIVE_FMAQ, 1); + + /* this needs to be checked under lock as it might be freed from a completion + * event. + */ + if (msg->gnm_type == GNILND_MSG_NOOP) { + set_mb(conn->gnc_last_noop_sent, jiffies); + } + + /* serialize with seeing CQ events for completion on this, as well as + * tx_seq */ + mutex_unlock(&conn->gnc_device->gnd_cq_mutex); + + atomic_inc(&conn->gnc_device->gnd_short_ntx); + atomic64_add(immediatenob, &conn->gnc_device->gnd_short_txbytes); + kgnilnd_peer_alive(conn->gnc_peer); + GNIDBG_SMSG_CREDS(D_NET, conn); + return 0; + + case GNI_RC_NOT_DONE: + /* XXX Nic: We need to figure out how to track this + * - there are bound to be good reasons for it, + * but we want to know when it happens */ + + mutex_unlock(&conn->gnc_device->gnd_cq_mutex); + /* We'll handle this error inline - makes the calling logic much more + * clean */ + + /* If no lock, caller doesn't want us to retry */ + if (state_lock == NULL) { + return -EAGAIN; + } + + retry_send = kgnilnd_tx_should_retry(conn, tx); + if (retry_send) { + /* add to head of list for the state and retries */ + spin_lock(state_lock); + kgnilnd_tx_add_state_locked(tx, conn->gnc_peer, conn, state, 0); + spin_unlock(state_lock); + + /* We only reschedule for a certain number of retries, then + * we will wait for the CQ events indicating a release of SMSG + * credits */ + if (tx->tx_retrans < (*kgnilnd_tunables.kgn_max_retransmits/4)) { + kgnilnd_schedule_conn(conn); + return 0; + } else { + /* CQ event coming in signifies either TX completed or + * RX receive. 
Either of these *could* free up credits + * in the SMSG mbox and we should try sending again */ + GNIDBG_TX(D_NET, tx, "waiting for CQID %u event to resend", + tx->tx_conn->gnc_cqid); + /* use +ve return code to let upper layers know they + * should stop looping on sends */ + return EAGAIN; + } + } else { + return -EAGAIN; + } + default: + /* handle bad retcode gracefully */ + mutex_unlock(&conn->gnc_device->gnd_cq_mutex); + return -EIO; + } +} + +/* kgnilnd_sendmsg has hard wait on gnd_cq_mutex */ +static inline int +kgnilnd_sendmsg(kgn_tx_t *tx, void *immediate, unsigned int immediatenob, + spinlock_t *state_lock, kgn_tx_list_state_t state) +{ + kgn_device_t *dev = tx->tx_conn->gnc_device; + unsigned long timestamp; + int rc; + + timestamp = jiffies; + mutex_lock(&dev->gnd_cq_mutex); + /* delay in jiffies - we are really concerned only with things that + * result in a schedule() or really holding this off for long times . + * NB - mutex_lock could spin for 2 jiffies before going to sleep to wait */ + dev->gnd_mutex_delay += (long) jiffies - timestamp; + + rc = kgnilnd_sendmsg_nolock(tx, immediate, immediatenob, state_lock, state); + + RETURN(rc); +} + + +/* returns -EAGAIN for lock miss, anything else < 0 is hard error, >=0 for success */ +static inline int +kgnilnd_sendmsg_trylock(kgn_tx_t *tx, void *immediate, unsigned int immediatenob, + spinlock_t *state_lock, kgn_tx_list_state_t state) +{ + kgn_conn_t *conn = tx->tx_conn; + kgn_device_t *dev = conn->gnc_device; + unsigned long timestamp; + int rc; + + timestamp = jiffies; + + /* technically we are doing bad things with the read_lock on the peer_conn + * table, but we shouldn't be sleeping inside here - and we don't sleep/block + * for the mutex. I bet lockdep is gonna flag this one though... */ + + /* there are a few cases where we don't want the immediate send - like + * when we are in the scheduler thread and it'd harm the latency of + * getting messages up to LNet */ + + /* rmb for gnd_ready */ + smp_rmb(); + if (conn->gnc_device->gnd_ready == GNILND_DEV_LOOP) { + rc = 0; + atomic_inc(&conn->gnc_device->gnd_fast_block); + } else if (unlikely(kgnilnd_data.kgn_quiesce_trigger)) { + /* dont hit HW during quiesce */ + rc = 0; + } else if (unlikely(atomic_read(&conn->gnc_peer->gnp_dirty_eps))) { + /* dont hit HW if stale EPs and conns left to close */ + rc = 0; + } else { + atomic_inc(&conn->gnc_device->gnd_fast_try); + rc = mutex_trylock(&conn->gnc_device->gnd_cq_mutex); + } + if (!rc) { + rc = -EAGAIN; + } else { + /* we got the mutex and weren't blocked */ + + /* delay in jiffies - we are really concerned only with things that + * result in a schedule() or really holding this off for long times . 
+ * NB - mutex_lock could spin for 2 jiffies before going to sleep to wait */ + dev->gnd_mutex_delay += (long) jiffies - timestamp; + + atomic_inc(&conn->gnc_device->gnd_fast_ok); + tx->tx_qtime = jiffies; + tx->tx_state = GNILND_TX_WAITING_COMPLETION; + rc = kgnilnd_sendmsg_nolock(tx, tx->tx_buffer, tx->tx_nob, &conn->gnc_list_lock, GNILND_TX_FMAQ); + /* _nolock unlocks the mutex for us */ + } + + RETURN(rc); +} + +/* lets us know if we can push this RDMA through now */ +inline int +kgnilnd_auth_rdma_bytes(kgn_device_t *dev, kgn_tx_t *tx) +{ + long bytes_left; + + bytes_left = atomic64_sub_return(tx->tx_nob, &dev->gnd_rdmaq_bytes_ok); + + if (bytes_left < 0) { + atomic64_add(tx->tx_nob, &dev->gnd_rdmaq_bytes_ok); + atomic_inc(&dev->gnd_rdmaq_nstalls); + smp_wmb(); + + CDEBUG(D_NET, "no bytes to send, turning on timer for %lu\n", + dev->gnd_rdmaq_deadline); + mod_timer(&dev->gnd_rdmaq_timer, dev->gnd_rdmaq_deadline); + /* we never del this timer - at worst it schedules us.. */ + return -EAGAIN; + } else { + return 0; + } +} + +/* this adds a TX to the queue pending throttling authorization before + * we allow our remote peer to launch a PUT at us */ +void +kgnilnd_queue_rdma(kgn_conn_t *conn, kgn_tx_t *tx) +{ + int rc; + + /* we cannot go into send_mapped_tx from here as we are holding locks + * and mem registration might end up allocating memory in kgni. + * That said, we'll push this as far as we can into the queue process */ + rc = kgnilnd_auth_rdma_bytes(conn->gnc_device, tx); + + if (rc < 0) { + spin_lock(&conn->gnc_device->gnd_rdmaq_lock); + kgnilnd_tx_add_state_locked(tx, NULL, conn, GNILND_TX_RDMAQ, 0); + /* lets us know how delayed RDMA is */ + tx->tx_qtime = jiffies; + spin_unlock(&conn->gnc_device->gnd_rdmaq_lock); + } else { + /* we have RDMA authorized, now it just needs a MDD and to hit the wire */ + spin_lock(&tx->tx_conn->gnc_device->gnd_lock); + kgnilnd_tx_add_state_locked(tx, NULL, tx->tx_conn, GNILND_TX_MAPQ, 0); + /* lets us know how delayed mapping is */ + tx->tx_qtime = jiffies; + spin_unlock(&tx->tx_conn->gnc_device->gnd_lock); + } + + /* make sure we wake up sched to run this */ + kgnilnd_schedule_device(tx->tx_conn->gnc_device); +} + +/* push TX through state machine */ +void +kgnilnd_queue_tx(kgn_conn_t *conn, kgn_tx_t *tx) +{ + int rc; + int add_tail = 1; + + /* set the tx_id here, we delay it until we have an actual conn + * to fiddle with + * in some cases, the tx_id is already set to provide for things + * like RDMA completion cookies, etc */ + if (tx->tx_id.txe_idx == 0) { + rc = kgnilnd_set_tx_id(tx, conn); + if (rc != 0) { + kgnilnd_tx_done(tx, rc); + return; + } + } + + CDEBUG(D_NET, "%s to conn %p for %s\n", kgnilnd_msgtype2str(tx->tx_msg.gnm_type), + conn, libcfs_nid2str(conn->gnc_peer->gnp_nid)); + + /* Only let NOOPs to be sent while fail loc is set, otherwise kill the tx. + */ + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_ONLY_NOOP) && (tx->tx_msg.gnm_type != GNILND_MSG_NOOP)) { + kgnilnd_tx_done(tx, rc); + return; + } + + switch (tx->tx_msg.gnm_type) { + case GNILND_MSG_PUT_ACK: + case GNILND_MSG_GET_REQ: + /* hijacking time! 
If this messages will authorize our peer to + * send his dirty little bytes in an RDMA, we need to get permission */ + kgnilnd_queue_rdma(conn, tx); + break; + case GNILND_MSG_IMMEDIATE: + /* try to send right now, can help reduce latency */ + rc = kgnilnd_sendmsg_trylock(tx, tx->tx_buffer, tx->tx_nob, &conn->gnc_list_lock, GNILND_TX_FMAQ); + + if (rc >= 0) { + /* it was sent, break out of switch to avoid default case of queueing */ + break; + } else if (rc == -EAGAIN) { + /* needs to queue to try again, so fall through to default case */ + } else { + /* bail: it wasnt sent and we didn't get EAGAIN indicating + * we should retrans - We do not close the conn due to locking + * we let the reaper thread take care of it. There are no hard + * errors from send_msg that would require close to be called + */ + kgnilnd_tx_done(tx, rc); + break; + } + case GNILND_MSG_NOOP: + /* Just make sure this goes out first for this conn */ + add_tail = 0; + /* fall through... */ + default: + spin_lock(&conn->gnc_list_lock); + kgnilnd_tx_add_state_locked(tx, conn->gnc_peer, conn, GNILND_TX_FMAQ, add_tail); + tx->tx_qtime = jiffies; + spin_unlock(&conn->gnc_list_lock); + kgnilnd_schedule_conn(conn); + } +} + +void +kgnilnd_launch_tx(kgn_tx_t *tx, kgn_net_t *net, lnet_process_id_t *target) +{ + kgn_peer_t *peer; + kgn_peer_t *new_peer = NULL; + kgn_conn_t *conn = NULL; + int rc; + + ENTRY; + + /* If I get here, I've committed to send, so I complete the tx with + * failure on any problems */ + + GNITX_ASSERTF(tx, tx->tx_conn == NULL, + "tx already has connection %p", tx->tx_conn); + + /* do all of the peer & conn searching in one swoop - this avoids + * nastiness when dropping locks and needing to maintain a sane state + * in the face of stack reset or something else nuking peers & conns */ + + /* I expect to find him, so only take a read lock */ + read_lock(&kgnilnd_data.kgn_peer_conn_lock); + + peer = kgnilnd_find_peer_locked(target->nid); + if (peer != NULL) { + conn = kgnilnd_find_conn_locked(peer); + /* this could be NULL during quiesce */ + if (conn != NULL) { + /* Connection exists; queue message on it */ + kgnilnd_queue_tx(conn, tx); + read_unlock(&kgnilnd_data.kgn_peer_conn_lock); + RETURN_EXIT; + } + } + + /* creating peer or conn; I'll need a write lock... */ + read_unlock(&kgnilnd_data.kgn_peer_conn_lock); + + CFS_RACE(CFS_FAIL_GNI_FIND_TARGET); + + /* NB - this will not block during normal operations - + * the only writer of this is in the startup/shutdown path. */ + rc = down_read_trylock(&kgnilnd_data.kgn_net_rw_sem); + if (!rc) { + rc = -ESHUTDOWN; + GOTO(no_peer, rc); + } + + /* ignore previous peer entirely - we cycled the lock, so we + * will create new peer and at worst drop it if peer is still + * in the tables */ + rc = kgnilnd_create_peer_safe(&new_peer, target->nid, net); + if (rc != 0) { + up_read(&kgnilnd_data.kgn_net_rw_sem); + GOTO(no_peer, rc); + } + + write_lock(&kgnilnd_data.kgn_peer_conn_lock); + up_read(&kgnilnd_data.kgn_net_rw_sem); + + /* search for peer again now that we have the lock + * if we don't find it, add our new one to the list */ + kgnilnd_add_peer_locked(target->nid, new_peer, &peer); + + conn = kgnilnd_find_or_create_conn_locked(peer); + if (conn != NULL) { + /* oh hey, found a conn now... 
magical */ + kgnilnd_queue_tx(conn, tx); + } else { + /* no conn, must be trying to connect - so we queue for now */ + tx->tx_qtime = jiffies; + kgnilnd_tx_add_state_locked(tx, peer, NULL, GNILND_TX_PEERQ, 1); + } + write_unlock(&kgnilnd_data.kgn_peer_conn_lock); + RETURN_EXIT; +no_peer: + kgnilnd_tx_done(tx, rc); + RETURN_EXIT; +} + +void +kgnilnd_rdma(kgn_tx_t *tx, int type, + kgn_rdma_desc_t *sink, unsigned int nob, __u64 cookie) +{ + kgn_conn_t *conn = tx->tx_conn; + unsigned long timestamp; + gni_return_t rrc; + + LASSERTF(kgnilnd_tx_mapped(tx), + "unmapped tx %p\n", tx); + LASSERTF(conn != NULL, + "NULL conn on tx %p, naughty, naughty\n", tx); + LASSERTF(nob <= sink->gnrd_nob, + "nob %u > sink->gnrd_nob %d (%p)\n", + nob, sink->gnrd_nob, sink); + LASSERTF(nob <= tx->tx_nob, + "nob %d > tx(%p)->tx_nob %d\n", + nob, tx, tx->tx_nob); + + memset(&tx->tx_rdma_desc, 0, sizeof(tx->tx_rdma_desc)); + tx->tx_rdma_desc.post_id = tx->tx_id.txe_cookie; + tx->tx_rdma_desc.type = GNI_POST_RDMA_PUT; + tx->tx_rdma_desc.cq_mode = GNI_CQMODE_GLOBAL_EVENT; + tx->tx_rdma_desc.local_addr = (__u64)((unsigned long)tx->tx_buffer); + tx->tx_rdma_desc.local_mem_hndl = tx->tx_map_key; + tx->tx_rdma_desc.remote_addr = sink->gnrd_addr; + tx->tx_rdma_desc.remote_mem_hndl = sink->gnrd_key; + tx->tx_rdma_desc.length = nob; + if (!*kgnilnd_tunables.kgn_bte_hash) + tx->tx_rdma_desc.dlvr_mode |= GNI_DLVMODE_NO_HASH; + if (!*kgnilnd_tunables.kgn_bte_adapt) + tx->tx_rdma_desc.dlvr_mode |= (GNI_DLVMODE_NO_ADAPT | GNI_DLVMODE_NO_RADAPT); + + /* prep final completion message */ + kgnilnd_init_msg(&tx->tx_msg, type, tx->tx_msg.gnm_srcnid); + tx->tx_msg.gnm_u.completion.gncm_cookie = cookie; + /* send actual size RDMA'd in retval */ + tx->tx_msg.gnm_u.completion.gncm_retval = nob; + + kgnilnd_compute_rdma_cksum(tx); + + if (nob == 0) { + kgnilnd_queue_tx(conn, tx); + return; + } + + /* Don't lie (CLOSE == RDMA idle) */ + LASSERTF(!conn->gnc_close_sent, "tx %p on conn %p after close sent %d\n", + tx, conn, conn->gnc_close_sent); + + GNIDBG_TX(D_NET, tx, "Post RDMA type 0x%02x dlvr_mode 0x%x", + type, tx->tx_rdma_desc.dlvr_mode); + + /* set CQ dedicated for RDMA */ + tx->tx_rdma_desc.src_cq_hndl = conn->gnc_device->gnd_snd_rdma_cqh; + + timestamp = jiffies; + mutex_lock(&conn->gnc_device->gnd_cq_mutex); + /* delay in jiffies - we are really concerned only with things that + * result in a schedule() or really holding this off for long times . 
+ * NB - mutex_lock could spin for 2 jiffies before going to sleep to wait */ + conn->gnc_device->gnd_mutex_delay += (long) jiffies - timestamp; + + rrc = kgnilnd_post_rdma(conn->gnc_ephandle, &tx->tx_rdma_desc); + + spin_lock(&conn->gnc_list_lock); + kgnilnd_tx_add_state_locked(tx, conn->gnc_peer, conn, GNILND_TX_LIVE_RDMAQ, 1); + tx->tx_qtime = jiffies; + spin_unlock(&conn->gnc_list_lock); + + mutex_unlock(&conn->gnc_device->gnd_cq_mutex); + + /* XXX Nic: is this a place we should handle more errors for + * robustness sake */ + LASSERT(rrc == GNI_RC_SUCCESS); + +} + +kgn_rx_t * +kgnilnd_alloc_rx(void) +{ + kgn_rx_t *rx; + + rx = cfs_mem_cache_alloc(kgnilnd_data.kgn_rx_cache, CFS_ALLOC_ATOMIC); + if (rx == NULL) { + CERROR("failed to allocate rx\n"); + return NULL; + } + CDEBUG(D_MALLOC, "slab-alloced 'rx': %lu at %p.\n", + sizeof(*rx), rx); + + /* no memset to zero, we'll always fill all members */ + return rx; +} + +/* release is to just free connection resources + * we use this for the eager path after copying */ +void +kgnilnd_release_msg(kgn_conn_t *conn) +{ + gni_return_t rrc; + unsigned long timestamp; + + CDEBUG(D_NET, "consuming %p\n", conn); + + timestamp = jiffies; + mutex_lock(&conn->gnc_device->gnd_cq_mutex); + /* delay in jiffies - we are really concerned only with things that + * result in a schedule() or really holding this off for long times . + * NB - mutex_lock could spin for 2 jiffies before going to sleep to wait */ + conn->gnc_device->gnd_mutex_delay += (long) jiffies - timestamp; + + rrc = kgnilnd_smsg_release(conn->gnc_ephandle); + mutex_unlock(&conn->gnc_device->gnd_cq_mutex); + + LASSERTF(rrc == GNI_RC_SUCCESS, "bad rrc %d\n", rrc); + GNIDBG_SMSG_CREDS(D_NET, conn); + + return; +} + +void +kgnilnd_consume_rx(kgn_rx_t *rx) +{ + kgn_conn_t *conn = rx->grx_conn; + kgn_msg_t *rxmsg = rx->grx_msg; + + /* if we are eager, free the cache alloc'd msg */ + if (unlikely(rx->grx_eager)) { + LIBCFS_FREE(rxmsg, sizeof(*rxmsg) + *kgnilnd_tunables.kgn_max_immediate); + + /* release ref from eager_recv */ + kgnilnd_conn_decref(conn); + } else { + GNIDBG_MSG(D_NET, rxmsg, "rx %p processed", rx); + kgnilnd_release_msg(conn); + } + + cfs_mem_cache_free(kgnilnd_data.kgn_rx_cache, rx); + CDEBUG(D_MALLOC, "slab-freed 'rx': %lu at %p.\n", + sizeof(*rx), rx); + + return; +} + +int +kgnilnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg) +{ + lnet_hdr_t *hdr = &lntmsg->msg_hdr; + int type = lntmsg->msg_type; + lnet_process_id_t target = lntmsg->msg_target; + int target_is_router = lntmsg->msg_target_is_router; + int routing = lntmsg->msg_routing; + unsigned int niov = lntmsg->msg_niov; + struct iovec *iov = lntmsg->msg_iov; + lnet_kiov_t *kiov = lntmsg->msg_kiov; + unsigned int offset = lntmsg->msg_offset; + unsigned int nob = lntmsg->msg_len; + unsigned int msg_vmflush = lntmsg->msg_vmflush; + kgn_net_t *net = ni->ni_data; + kgn_tx_t *tx; + int rc = 0; + int mpflag = 0; + + /* NB 'private' is different depending on what we're sending.... 
*/ + LASSERT(!in_interrupt()); + + CDEBUG(D_NET, "sending msg type %d with %d bytes in %d frags to %s\n", + type, nob, niov, libcfs_id2str(target)); + + LASSERTF(nob == 0 || niov > 0, + "lntmsg %p nob %d niov %d\n", lntmsg, nob, niov); + LASSERTF(niov <= LNET_MAX_IOV, + "lntmsg %p niov %d\n", lntmsg, niov); + + /* payload is either all vaddrs or all pages */ + LASSERTF(!(kiov != NULL && iov != NULL), + "lntmsg %p kiov %p iov %p\n", lntmsg, kiov, iov); + + if (msg_vmflush) + mpflag = cfs_memory_pressure_get_and_set(); + + switch (type) { + default: + CERROR("lntmsg %p with unexpected type %d\n", + lntmsg, type); + LBUG(); + + case LNET_MSG_ACK: + LASSERTF(nob == 0, "lntmsg %p nob %d\n", + lntmsg, nob); + break; + + case LNET_MSG_GET: + LASSERT(niov == 0); + LASSERT(nob == 0); + + if (routing || target_is_router) + break; /* send IMMEDIATE */ + + /* it is safe to do direct GET with out mapping buffer for RDMA as we + * check the eventual sink buffer here - if small enough, remote + * end is perfectly capable of returning data in short message - + * The magic is that we call lnet_parse in kgnilnd_recv with rdma_req=0 + * for IMMEDIATE messages which will have it send a real reply instead + * of doing kgnilnd_recv to have the RDMA continued */ + if (lntmsg->msg_md->md_length <= *kgnilnd_tunables.kgn_max_immediate) + break; + + tx = kgnilnd_new_tx_msg(GNILND_MSG_GET_REQ, ni->ni_nid); + if (tx == NULL) { + rc = -ENOMEM; + goto out; + } + + /* slightly different options as we might actually have a GET with a + * MD_KIOV set but a non-NULL md_iov.iov */ + if ((lntmsg->msg_md->md_options & LNET_MD_KIOV) == 0) + rc = kgnilnd_setup_rdma_buffer(tx, lntmsg->msg_md->md_niov, + lntmsg->msg_md->md_iov.iov, NULL, + 0, lntmsg->msg_md->md_length); + else + rc = kgnilnd_setup_rdma_buffer(tx, lntmsg->msg_md->md_niov, + NULL, lntmsg->msg_md->md_iov.kiov, + 0, lntmsg->msg_md->md_length); + if (rc != 0) { + CERROR("unable to setup buffer: %d\n", rc); + kgnilnd_tx_done(tx, rc); + rc = -EIO; + goto out; + } + + tx->tx_lntmsg[1] = lnet_create_reply_msg(ni, lntmsg); + if (tx->tx_lntmsg[1] == NULL) { + CERROR("Can't create reply for GET to %s\n", + libcfs_nid2str(target.nid)); + kgnilnd_tx_done(tx, rc); + rc = -EIO; + goto out; + } + + tx->tx_lntmsg[0] = lntmsg; + tx->tx_msg.gnm_u.get.gngm_hdr = *hdr; + /* rest of tx_msg is setup just before it is sent */ + kgnilnd_launch_tx(tx, net, &target); + goto out; + + case LNET_MSG_REPLY: + case LNET_MSG_PUT: + /* to save on MDDs, we'll handle short kiov by vmap'ing + * and sending via SMSG */ + if (nob <= *kgnilnd_tunables.kgn_max_immediate) + break; + + tx = kgnilnd_new_tx_msg(GNILND_MSG_PUT_REQ, ni->ni_nid); + if (tx == NULL) { + rc = -ENOMEM; + goto out; + } + + rc = kgnilnd_setup_rdma_buffer(tx, niov, iov, kiov, offset, nob); + if (rc != 0) { + kgnilnd_tx_done(tx, rc); + rc = -EIO; + goto out; + } + + tx->tx_lntmsg[0] = lntmsg; + tx->tx_msg.gnm_u.putreq.gnprm_hdr = *hdr; + /* rest of tx_msg is setup just before it is sent */ + kgnilnd_launch_tx(tx, net, &target); + goto out; + } + + /* send IMMEDIATE */ + + LASSERTF(nob <= *kgnilnd_tunables.kgn_max_immediate, + "lntmsg 0x%p too large %d\n", lntmsg, nob); + + tx = kgnilnd_new_tx_msg(GNILND_MSG_IMMEDIATE, ni->ni_nid); + if (tx == NULL) { + rc = -ENOMEM; + goto out; + } + + rc = kgnilnd_setup_immediate_buffer(tx, niov, iov, kiov, offset, nob); + if (rc != 0) { + kgnilnd_tx_done(tx, rc); + goto out; + } + + tx->tx_msg.gnm_u.immediate.gnim_hdr = *hdr; + tx->tx_lntmsg[0] = lntmsg; + kgnilnd_launch_tx(tx, net, &target); + +out: 
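+	/* All of the send cases above converge here: GET, PUT and REPLY payloads
+	 * larger than kgn_max_immediate were handed to kgnilnd_launch_tx() as RDMA
+	 * requests, while ACKs and small payloads fell through to the IMMEDIATE
+	 * SMSG send just before this label. */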
+ /* use stored value as we could have already finalized lntmsg here from a failed launch */ + if (msg_vmflush) + cfs_memory_pressure_restore(mpflag); + return rc; +} + +void +kgnilnd_reply(lnet_ni_t *ni, kgn_rx_t *rx, lnet_msg_t *lntmsg) +{ + kgn_conn_t *conn = rx->grx_conn; + kgn_msg_t *rxmsg = rx->grx_msg; + unsigned int niov = lntmsg->msg_niov; + struct iovec *iov = lntmsg->msg_iov; + lnet_kiov_t *kiov = lntmsg->msg_kiov; + unsigned int offset = lntmsg->msg_offset; + unsigned int nob = lntmsg->msg_len; + kgn_tx_t *tx; + int rc = 0; + + tx = kgnilnd_new_tx_msg(GNILND_MSG_GET_DONE, ni->ni_nid); + if (tx == NULL) + goto failed_0; + + rc = kgnilnd_set_tx_id(tx, conn); + if (rc != 0) + goto failed_1; + + rc = kgnilnd_setup_rdma_buffer(tx, niov, iov, kiov, offset, nob); + if (rc != 0) + goto failed_1; + + tx->tx_lntmsg[0] = lntmsg; + tx->tx_getinfo = rxmsg->gnm_u.get; + + /* we only queue from kgnilnd_recv - we might get called from other contexts + * and we don't want to block the mutex in those cases */ + + spin_lock(&tx->tx_conn->gnc_device->gnd_lock); + kgnilnd_tx_add_state_locked(tx, NULL, tx->tx_conn, GNILND_TX_MAPQ, 1); + spin_unlock(&tx->tx_conn->gnc_device->gnd_lock); + kgnilnd_schedule_device(tx->tx_conn->gnc_device); + + return; + + failed_1: + kgnilnd_tx_done(tx, rc); + kgnilnd_nak_rdma(conn, GNILND_MSG_GET_NAK, rc, rxmsg->gnm_u.get.gngm_cookie, ni->ni_nid); + failed_0: + lnet_finalize(ni, lntmsg, rc); +} + +int +kgnilnd_eager_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, + void **new_private) +{ + kgn_rx_t *rx = private; + kgn_conn_t *conn = rx->grx_conn; + kgn_msg_t *rxmsg = rx->grx_msg; + kgn_msg_t *eagermsg = NULL; + + GNIDBG_MSG(D_NET, rxmsg, "eager recv for conn %p, rxmsg %p, lntmsg %p", + conn, rxmsg, lntmsg); + + if (rxmsg->gnm_payload_len > *kgnilnd_tunables.kgn_max_immediate) { + GNIDBG_MSG(D_ERROR, rxmsg, "payload too large %d", + rxmsg->gnm_payload_len); + return -EPROTO; + } + + /* we have no credits or buffers for this message, so copy it + * somewhere for a later kgnilnd_recv */ + LIBCFS_ALLOC(eagermsg, sizeof(*eagermsg) + *kgnilnd_tunables.kgn_max_immediate); + if (eagermsg == NULL) { + CERROR("couldn't allocate eager rx message for conn %p to %s\n", + conn, libcfs_nid2str(conn->gnc_peer->gnp_nid)); + return -ENOMEM; + } + + /* copy msg and payload */ + memcpy(eagermsg, rxmsg, sizeof(*rxmsg) + rxmsg->gnm_payload_len); + rx->grx_msg = eagermsg; + rx->grx_eager = 1; + + /* stash this for lnet_finalize on cancel-on-conn-close */ + rx->grx_lntmsg = lntmsg; + + /* add conn ref to ensure it doesn't go away until all eager messages processed */ + kgnilnd_conn_addref(conn); + + /* keep the same rx_t, it just has a new grx_msg now */ + *new_private = private; + + /* release SMSG buffer */ + kgnilnd_release_msg(conn); + + return 0; +} + +int +kgnilnd_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, + int delayed, unsigned int niov, + struct iovec *iov, lnet_kiov_t *kiov, + unsigned int offset, unsigned int mlen, unsigned int rlen) +{ + kgn_rx_t *rx = private; + kgn_conn_t *conn = rx->grx_conn; + kgn_msg_t *rxmsg = rx->grx_msg; + kgn_tx_t *tx; + int rc = 0; + __u32 pload_cksum; + ENTRY; + + LASSERT(!in_interrupt()); + LASSERTF(mlen <= rlen, "%d <= %d\n", mlen, rlen); + /* Either all pages or all vaddrs */ + LASSERTF(!(kiov != NULL && iov != NULL), "kiov %p iov %p\n", + kiov, iov); + + GNIDBG_MSG(D_NET, rxmsg, "conn %p, rxmsg %p, lntmsg %p" + " niov=%d kiov=%p iov=%p offset=%d mlen=%d rlen=%d", + conn, rxmsg, lntmsg, + niov, kiov, iov, offset, mlen, rlen); + + 
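+	/* The switch below dispatches on the SMSG type: IMMEDIATE payloads are
+	 * copied straight out of the mailbox into the LNet iov/kiov, PUT_REQ is
+	 * answered with a PUT_ACK describing our sink buffer so the peer can RDMA
+	 * into it, and GET_REQ either triggers a GET_DONE reply or a NAK when LNet
+	 * has no matching buffer. */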
/* we need to lock here as recv can be called from any context */ + read_lock(&kgnilnd_data.kgn_peer_conn_lock); + if (rx->grx_eager && conn->gnc_state != GNILND_CONN_ESTABLISHED) { + read_unlock(&kgnilnd_data.kgn_peer_conn_lock); + + /* someone closed the conn after we copied this out, nuke it */ + kgnilnd_consume_rx(rx); + lnet_finalize(ni, lntmsg, conn->gnc_error); + RETURN(0); + } + read_unlock(&kgnilnd_data.kgn_peer_conn_lock); + + switch (rxmsg->gnm_type) { + default: + LBUG(); + + case GNILND_MSG_IMMEDIATE: + if (mlen > rxmsg->gnm_payload_len) { + GNIDBG_MSG(D_ERROR, rxmsg, + "Immediate message from %s too big: %d > %d", + libcfs_nid2str(conn->gnc_peer->gnp_nid), mlen, + rxmsg->gnm_payload_len); + rc = -EINVAL; + kgnilnd_consume_rx(rx); + RETURN(rc); + } + + /* rxmsg[1] is a pointer to the payload, sitting in the buffer + * right after the kgn_msg_t header - so just 'cute' way of saying + * rxmsg + sizeof(kgn_msg_t) */ + + /* check payload checksum if sent */ + + if (*kgnilnd_tunables.kgn_checksum >= 2 && + !rxmsg->gnm_payload_cksum && + rxmsg->gnm_payload_len != 0) + GNIDBG_MSG(D_WARNING, rxmsg, "no msg payload checksum when enabled"); + + if (rxmsg->gnm_payload_cksum != 0) { + /* gnm_payload_len set in kgnilnd_sendmsg from tx->tx_nob, + * which is what is used to calculate the cksum on the TX side */ + pload_cksum = kgnilnd_cksum(&rxmsg[1], rxmsg->gnm_payload_len); + + if (rxmsg->gnm_payload_cksum != pload_cksum) { + GNIDBG_MSG(D_NETERROR, rxmsg, + "Bad payload checksum (%x expected %x)", + pload_cksum, rxmsg->gnm_payload_cksum); + switch (*kgnilnd_tunables.kgn_checksum_dump) { + case 2: + kgnilnd_dump_blob(D_BUFFS, "bad payload checksum", + &rxmsg[1], rxmsg->gnm_payload_len); + /* fall through to dump */ + case 1: + libcfs_debug_dumplog(); + break; + default: + break; + } + rc = -ENOKEY; + /* checksum problems are fatal, kill the conn */ + kgnilnd_consume_rx(rx); + kgnilnd_close_conn(conn, rc); + RETURN(rc); + } + } + + if (kiov != NULL) + lnet_copy_flat2kiov( + niov, kiov, offset, + *kgnilnd_tunables.kgn_max_immediate, + &rxmsg[1], 0, mlen); + else + lnet_copy_flat2iov( + niov, iov, offset, + *kgnilnd_tunables.kgn_max_immediate, + &rxmsg[1], 0, mlen); + + kgnilnd_consume_rx(rx); + lnet_finalize(ni, lntmsg, 0); + RETURN(0); + + case GNILND_MSG_PUT_REQ: + /* LNET wants to truncate or drop transaction, sending NAK */ + if (mlen == 0) { + kgnilnd_consume_rx(rx); + lnet_finalize(ni, lntmsg, 0); + + /* only error if lntmsg == NULL, otherwise we are just + * short circuiting the rdma process of 0 bytes */ + kgnilnd_nak_rdma(conn, GNILND_MSG_PUT_NAK, + lntmsg == NULL ? -ENOENT : 0, + rxmsg->gnm_u.get.gngm_cookie, + ni->ni_nid); + RETURN(0); + } + /* sending ACK with sink buff. 
info */ + tx = kgnilnd_new_tx_msg(GNILND_MSG_PUT_ACK, ni->ni_nid); + if (tx == NULL) { + kgnilnd_consume_rx(rx); + RETURN(-ENOMEM); + } + + rc = kgnilnd_set_tx_id(tx, conn); + if (rc != 0) { + GOTO(nak_put_req, rc); + } + + rc = kgnilnd_setup_rdma_buffer(tx, niov, iov, kiov, offset, mlen); + if (rc != 0) { + GOTO(nak_put_req, rc); + } + + tx->tx_msg.gnm_u.putack.gnpam_src_cookie = + rxmsg->gnm_u.putreq.gnprm_cookie; + tx->tx_msg.gnm_u.putack.gnpam_dst_cookie = tx->tx_id.txe_cookie; + tx->tx_msg.gnm_u.putack.gnpam_desc.gnrd_addr = + (__u64)((unsigned long)tx->tx_buffer); + tx->tx_msg.gnm_u.putack.gnpam_desc.gnrd_nob = mlen; + + tx->tx_lntmsg[0] = lntmsg; /* finalize this on RDMA_DONE */ + + /* we only queue from kgnilnd_recv - we might get called from other contexts + * and we don't want to block the mutex in those cases */ + + spin_lock(&tx->tx_conn->gnc_device->gnd_lock); + kgnilnd_tx_add_state_locked(tx, NULL, tx->tx_conn, GNILND_TX_MAPQ, 1); + spin_unlock(&tx->tx_conn->gnc_device->gnd_lock); + kgnilnd_schedule_device(tx->tx_conn->gnc_device); + + kgnilnd_consume_rx(rx); + RETURN(0); + +nak_put_req: + /* make sure we send an error back when the PUT fails */ + kgnilnd_nak_rdma(conn, GNILND_MSG_PUT_NAK, rc, rxmsg->gnm_u.get.gngm_cookie, ni->ni_nid); + kgnilnd_tx_done(tx, rc); + kgnilnd_consume_rx(rx); + + /* return magic LNet network error */ + RETURN(-EIO); + + case GNILND_MSG_GET_REQ: + if (lntmsg != NULL) { + /* Matched! */ + kgnilnd_reply(ni, rx, lntmsg); + } else { + /* No match */ + kgnilnd_nak_rdma(conn, GNILND_MSG_GET_NAK, + -ENOENT, + rxmsg->gnm_u.get.gngm_cookie, + ni->ni_nid); + } + kgnilnd_consume_rx(rx); + RETURN(0); + } + RETURN(0); +} + +/* needs write_lock on kgn_peer_conn_lock held */ +int +kgnilnd_check_conn_timeouts_locked(kgn_conn_t *conn) +{ + unsigned long timeout, keepalive; + unsigned long now = jiffies; + unsigned long newest_last_rx; + kgn_tx_t *tx; + + /* given that we found this conn hanging off a peer, it better damned + * well be connected */ + LASSERTF(conn->gnc_state == GNILND_CONN_ESTABLISHED, + "conn 0x%p->%s with bad state%s\n", conn, + conn->gnc_peer ? libcfs_nid2str(conn->gnc_peer->gnp_nid) + : "", + kgnilnd_conn_state2str(conn)); + + CDEBUG(D_NET, "checking conn %p->%s timeout %d keepalive %d " + "rx_diff %lu tx_diff %lu\n", + conn, libcfs_nid2str(conn->gnc_peer->gnp_nid), + conn->gnc_timeout, GNILND_TO2KA(conn->gnc_timeout), + cfs_duration_sec(now - conn->gnc_last_rx_cq), + cfs_duration_sec(now - conn->gnc_last_tx)); + + timeout = cfs_time_seconds(conn->gnc_timeout); + keepalive = cfs_time_seconds(GNILND_TO2KA(conn->gnc_timeout)); + + /* just in case our lack of RX msg processing is gumming up the works - give the + * remove an extra chance */ + + newest_last_rx = GNILND_LASTRX(conn); + + if (time_after_eq(now, newest_last_rx + timeout)) { + GNIDBG_CONN(D_CONSOLE|D_NETERROR, conn, "No gnilnd traffic received from %s for %lu " + "seconds, terminating connection. Is node down? ", + libcfs_nid2str(conn->gnc_peer->gnp_nid), + cfs_duration_sec(now - newest_last_rx)); + return -ETIMEDOUT; + } + + /* we don't timeout on last_tx stalls - we are going to trust the + * underlying network to let us know when sends are failing. + * At worst, the peer will timeout our RX stamp and drop the connection + * at that point. 
We'll then see his CLOSE or at worst his RX + * stamp stop and drop the connection on our end */ + + if (time_after_eq(now, conn->gnc_last_tx + keepalive)) { + CDEBUG(D_NET, "sending NOOP -> %s (%p idle %lu(%lu)) " + "last %lu/%lu/%lu %lus/%lus/%lus\n", + libcfs_nid2str(conn->gnc_peer->gnp_nid), conn, + cfs_duration_sec(jiffies - conn->gnc_last_tx), + keepalive, + conn->gnc_last_noop_want, conn->gnc_last_noop_sent, + conn->gnc_last_noop_cq, + cfs_duration_sec(jiffies - conn->gnc_last_noop_want), + cfs_duration_sec(jiffies - conn->gnc_last_noop_sent), + cfs_duration_sec(jiffies - conn->gnc_last_noop_cq)); + set_mb(conn->gnc_last_noop_want, jiffies); + atomic_inc(&conn->gnc_reaper_noop); + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_NOOP_SEND)) + return 0; + + tx = kgnilnd_new_tx_msg(GNILND_MSG_NOOP, conn->gnc_peer->gnp_net->gnn_ni->ni_nid); + if (tx == NULL) + return 0; + kgnilnd_queue_tx(conn, tx); + } + + return 0; +} + +/* needs write_lock on kgn_peer_conn_lock held */ +void +kgnilnd_check_peer_timeouts_locked(kgn_peer_t *peer, struct list_head *todie, + struct list_head *souls) +{ + unsigned long timeout; + kgn_conn_t *conn, *connN = NULL; + kgn_tx_t *tx, *txN; + int rc = 0; + int count = 0; + int reconnect; + short releaseconn = 0; + unsigned long first_rx = 0; + + CDEBUG(D_NET, "checking peer 0x%p->%s for timeouts; interval %lus\n", + peer, libcfs_nid2str(peer->gnp_nid), + peer->gnp_reconnect_interval); + + timeout = cfs_time_seconds(MAX(*kgnilnd_tunables.kgn_timeout, + GNILND_MIN_TIMEOUT)); + + conn = kgnilnd_find_conn_locked(peer); + if (conn) { + /* if there is a valid conn, check the queues for timeouts */ + rc = kgnilnd_check_conn_timeouts_locked(conn); + if (rc) { + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_RX_CLOSE_CLOSING)) { + /* simulate a RX CLOSE after the timeout but before + * the scheduler thread gets it */ + conn->gnc_close_recvd = GNILND_CLOSE_INJECT1; + conn->gnc_peer_error = -ETIMEDOUT; + } + /* Once we mark closed, any of the scheduler threads could + * get it and move through before we hit the fail loc code */ + kgnilnd_close_conn_locked(conn, rc); + } else { + /* first_rx is used to decide when to release a conn from purgatory. + */ + first_rx = conn->gnc_first_rx; + } + } + + /* now regardless of starting new conn, find tx on peer queue that + * are old and smell bad - do this first so we don't trigger + * reconnect on empty queue if we timeout all */ + list_for_each_entry_safe(tx, txN, &peer->gnp_tx_queue, tx_list) { + if (time_after_eq(jiffies, tx->tx_qtime + timeout)) { + if (count == 0) { + LCONSOLE_INFO("could not send to %s due to connection" + " setup failure after %lu seconds\n", + libcfs_nid2str(peer->gnp_nid), + cfs_duration_sec(jiffies - tx->tx_qtime)); + } + kgnilnd_tx_del_state_locked(tx, peer, NULL, + GNILND_TX_ALLOCD); + list_add_tail(&tx->tx_list, todie); + count++; + } + } + + if (count || peer->gnp_connecting == GNILND_PEER_KILL) { + CDEBUG(D_NET, "canceling %d tx for peer 0x%p->%s\n", + count, peer, libcfs_nid2str(peer->gnp_nid)); + /* if we nuked all the TX, stop peer connection attempt (if there is one..) */ + if (list_empty(&peer->gnp_tx_queue) || + peer->gnp_connecting == GNILND_PEER_KILL) { + /* we pass down todie to use a common function - but we know there are + * no TX to add */ + kgnilnd_cancel_peer_connect_locked(peer, todie); + } + } + + /* Don't reconnect if we are still trying to clear out old conns. 
+ * This prevents us sending traffic on the new mbox before ensuring we are done + * with the old one */ + reconnect = (atomic_read(&peer->gnp_dirty_eps) == 0); + + /* if we are not connected and there are tx on the gnp_tx_queue waiting + * to be sent, we'll check the reconnect interval and fire up a new + * connection request */ + + if ((peer->gnp_connecting == GNILND_PEER_IDLE) && + (time_after_eq(jiffies, peer->gnp_reconnect_time)) && + !list_empty(&peer->gnp_tx_queue) && reconnect) { + + CDEBUG(D_NET, "starting connect to %s\n", + libcfs_nid2str(peer->gnp_nid)); + LASSERTF(peer->gnp_connecting == GNILND_PEER_IDLE, "Peer was idle and we" + "have a write_lock, state issue %d\n", peer->gnp_connecting); + + peer->gnp_connecting = GNILND_PEER_CONNECT; + kgnilnd_peer_addref(peer); /* extra ref for connd */ + + spin_lock(&peer->gnp_net->gnn_dev->gnd_connd_lock); + list_add_tail(&peer->gnp_connd_list, + &peer->gnp_net->gnn_dev->gnd_connd_peers); + spin_unlock(&peer->gnp_net->gnn_dev->gnd_connd_lock); + + kgnilnd_schedule_dgram(peer->gnp_net->gnn_dev); + } + + /* fail_loc to allow us to delay release of purgatory */ + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_PURG_REL_DELAY)) + return; + + /* This check allows us to verify that the new conn is actually being used. This allows us to + * pull the old conns out of purgatory if they have actually seen traffic. + * We only release a conn from purgatory during stack reset, admin command, or when a peer reconnects + */ + if (first_rx && + time_after(jiffies, first_rx + cfs_time_seconds(*kgnilnd_tunables.kgn_hardware_timeout))) { + CDEBUG(D_NET,"We can release conn %p from purgatory %lu\n", + conn, first_rx + cfs_time_seconds(*kgnilnd_tunables.kgn_hardware_timeout)); + releaseconn = 1; + } + + list_for_each_entry_safe (conn, connN, &peer->gnp_conns, gnc_list) { + /* check for purgatory timeouts */ + if (conn->gnc_in_purgatory) { + /* We cannot detach this conn from purgatory if it has not been closed so we reschedule it + * that way the next time we check it we can detach it from purgatory + */ + + if (conn->gnc_state != GNILND_CONN_DONE) { + /* Skip over conns that are currently not DONE. If they arent already scheduled + * for completion something in the state machine is broken. + */ + continue; + } + + /* We only detach a conn that is in purgatory if we have received a close message, + * we have a new valid connection that has successfully received data, or an admin + * command tells us we need to detach. 
+ */ + + if (conn->gnc_close_recvd || releaseconn || conn->gnc_needs_detach) { + unsigned long waiting; + + waiting = (long) jiffies - conn->gnc_last_rx_cq; + + /* C.E: The remote peer is expected to close the + * connection (see kgnilnd_check_conn_timeouts) + * via the reaper thread and nuke out the MDD and + * FMA resources after conn->gnc_timeout has expired + * without an FMA RX */ + CDEBUG(D_NET, "Reconnected to %s in %lds or admin forced detach, dropping " + " held resources\n", + libcfs_nid2str(conn->gnc_peer->gnp_nid), + cfs_duration_sec(waiting)); + + kgnilnd_detach_purgatory_locked(conn, souls); + } + } + } + + return; +} + +void +kgnilnd_reaper_check(int idx) +{ + struct list_head *peers = &kgnilnd_data.kgn_peers[idx]; + struct list_head *ctmp, *ctmpN; + struct list_head geriatrics; + struct list_head souls; + + INIT_LIST_HEAD(&geriatrics); + INIT_LIST_HEAD(&souls); + + write_lock(&kgnilnd_data.kgn_peer_conn_lock); + + list_for_each_safe(ctmp, ctmpN, peers) { + kgn_peer_t *peer = NULL; + + /* don't timeout stuff if the network is mucked or shutting down */ + if (kgnilnd_check_hw_quiesce()) { + break; + } + peer = list_entry(ctmp, kgn_peer_t, gnp_list); + + kgnilnd_check_peer_timeouts_locked(peer, &geriatrics, &souls); + } + + write_unlock(&kgnilnd_data.kgn_peer_conn_lock); + + kgnilnd_txlist_done(&geriatrics, -EHOSTUNREACH); + kgnilnd_release_purgatory_list(&souls); +} + +void +kgnilnd_update_reaper_timeout(long timeout) +{ + LASSERT(timeout > 0); + + spin_lock(&kgnilnd_data.kgn_reaper_lock); + + if (timeout < kgnilnd_data.kgn_new_min_timeout) + kgnilnd_data.kgn_new_min_timeout = timeout; + + spin_unlock(&kgnilnd_data.kgn_reaper_lock); +} + +static void +kgnilnd_reaper_poke_with_stick(unsigned long arg) +{ + wake_up(&kgnilnd_data.kgn_reaper_waitq); +} + +int +kgnilnd_reaper(void *arg) +{ + long timeout; + int i; + int hash_index = 0; + unsigned long next_check_time = jiffies; + long current_min_timeout = MAX_SCHEDULE_TIMEOUT; + struct timer_list timer; + DEFINE_WAIT(wait); + + cfs_daemonize("kgnilnd_rpr"); + cfs_block_allsigs(); + + /* all gnilnd threads need to run fairly urgently */ + set_user_nice(current, *kgnilnd_tunables.kgn_nice); + spin_lock(&kgnilnd_data.kgn_reaper_lock); + + while (!kgnilnd_data.kgn_shutdown) { + /* I wake up every 'p' seconds to check for timeouts on some + * more peers. I try to check every connection 'n' times + * within the global minimum of all keepalive and timeout + * intervals, to ensure I attend to every connection within + * (n+1)/n times its timeout intervals. */ + const int p = GNILND_REAPER_THREAD_WAKE; + const int n = GNILND_REAPER_NCHECKS; + int chunk; + /* to quiesce or to not quiesce, that is the question */ + if (unlikely(kgnilnd_data.kgn_quiesce_trigger)) { + spin_unlock(&kgnilnd_data.kgn_reaper_lock); + KGNILND_SPIN_QUIESCE; + spin_lock(&kgnilnd_data.kgn_reaper_lock); + } + + /* careful with the jiffy wrap... 
*/ + timeout = (long)(next_check_time - jiffies); + + if (timeout > 0) { + prepare_to_wait(&kgnilnd_data.kgn_reaper_waitq, &wait, + TASK_INTERRUPTIBLE); + spin_unlock(&kgnilnd_data.kgn_reaper_lock); + setup_timer(&timer, kgnilnd_reaper_poke_with_stick, + next_check_time); + mod_timer(&timer, (long) jiffies + timeout); + + /* check flag variables before comitting */ + if (!kgnilnd_data.kgn_shutdown && + !kgnilnd_data.kgn_quiesce_trigger) { + CDEBUG(D_INFO, "schedule timeout %ld (%lu sec)\n", + timeout, cfs_duration_sec(timeout)); + schedule(); + CDEBUG(D_INFO, "awake after schedule\n"); + } + + del_singleshot_timer_sync(&timer); + spin_lock(&kgnilnd_data.kgn_reaper_lock); + finish_wait(&kgnilnd_data.kgn_reaper_waitq, &wait); + continue; + } + + /* new_min_timeout is set from the conn timeouts and keepalive + * this should end up with a min timeout of + * GNILND_TIMEOUT2KEEPALIVE(t) or roughly LND_TIMEOUT/2 */ + if (kgnilnd_data.kgn_new_min_timeout < current_min_timeout) { + current_min_timeout = kgnilnd_data.kgn_new_min_timeout; + CDEBUG(D_NET, "Set new min timeout %ld\n", + current_min_timeout); + } + + spin_unlock(&kgnilnd_data.kgn_reaper_lock); + + /* Compute how many table entries to check now so I get round + * the whole table fast enough given that I do this at fixed + * intervals of 'p' seconds) */ + chunk = *kgnilnd_tunables.kgn_peer_hash_size; + if (kgnilnd_data.kgn_new_min_timeout > n * p) + chunk = (chunk * n * p) / + kgnilnd_data.kgn_new_min_timeout; + if (chunk == 0) + chunk = 1; + for (i = 0; i < chunk; i++) { + kgnilnd_reaper_check(hash_index); + hash_index = (hash_index + 1) % + *kgnilnd_tunables.kgn_peer_hash_size; + } + next_check_time = (long) jiffies + cfs_time_seconds(p); + CDEBUG(D_INFO, "next check at %lu or in %d sec\n", next_check_time, p); + + spin_lock(&kgnilnd_data.kgn_reaper_lock); + } + + spin_unlock(&kgnilnd_data.kgn_reaper_lock); + + kgnilnd_thread_fini(); + return 0; +} + +int +kgnilnd_check_rdma_cq(kgn_device_t *dev) +{ + gni_return_t rrc; + gni_post_descriptor_t *desc; + __u64 event_data; + kgn_tx_ev_id_t ev_id; + char err_str[256]; + int should_retry, rc; + long num_processed = 0; + kgn_conn_t *conn = NULL; + kgn_tx_t *tx = NULL; + + for (;;) { + /* make sure we don't keep looping if we need to reset */ + if (unlikely(kgnilnd_data.kgn_quiesce_trigger)) { + return num_processed; + } + rc = kgnilnd_mutex_trylock(&dev->gnd_cq_mutex); + if (!rc) { + /* we didn't get the mutex, so return that there is still work + * to be done */ + return 1; + } + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_DELAY_RDMA)) { + /* a bit gross - but we need a good way to test for + * delayed RDMA completions and the easiest way to do + * that is to delay the RDMA CQ events */ + rrc = GNI_RC_NOT_DONE; + } else { + rrc = kgnilnd_cq_get_event(dev->gnd_snd_rdma_cqh, &event_data); + } + + if (rrc == GNI_RC_NOT_DONE) { + mutex_unlock(&dev->gnd_cq_mutex); + CDEBUG(D_INFO, "SEND RDMA CQ %d empty processed %ld\n", + dev->gnd_id, num_processed); + return num_processed; + } + dev->gnd_sched_alive = jiffies; + num_processed++; + + LASSERTF(!GNI_CQ_OVERRUN(event_data), + "this is bad, somehow our credits didn't protect us" + " from CQ overrun\n"); + LASSERTF(GNI_CQ_GET_TYPE(event_data) == GNI_CQ_EVENT_TYPE_POST, + "rrc %d, GNI_CQ_GET_TYPE("LPX64") = "LPX64"\n", rrc, + event_data, GNI_CQ_GET_TYPE(event_data)); + + rrc = kgnilnd_get_completed(dev->gnd_snd_rdma_cqh, event_data, + &desc); + mutex_unlock(&dev->gnd_cq_mutex); + + /* XXX Nic: Need better error handling here... 
*/ + LASSERTF((rrc == GNI_RC_SUCCESS) || + (rrc == GNI_RC_TRANSACTION_ERROR), + "rrc %d\n", rrc); + + ev_id.txe_cookie = desc->post_id; + + kgnilnd_validate_tx_ev_id(&ev_id, &tx, &conn); + + if (conn == NULL || tx == NULL) { + /* either conn or tx was already nuked and this is a "late" + * completion, so drop it */ + continue; + } + + GNITX_ASSERTF(tx, tx->tx_msg.gnm_type == GNILND_MSG_PUT_DONE || + tx->tx_msg.gnm_type == GNILND_MSG_GET_DONE, + "tx %p with type %d\n", tx, tx->tx_msg.gnm_type); + + GNIDBG_TX(D_NET, tx, "RDMA completion for %d bytes", tx->tx_nob); + + /* remove from rdmaq */ + spin_lock(&conn->gnc_list_lock); + kgnilnd_tx_del_state_locked(tx, NULL, conn, GNILND_TX_ALLOCD); + spin_unlock(&conn->gnc_list_lock); + + if (likely(desc->status == GNI_RC_SUCCESS)) { + atomic_inc(&dev->gnd_rdma_ntx); + atomic64_add(tx->tx_nob, &dev->gnd_rdma_txbytes); + /* transaction succeeded, add into fmaq */ + kgnilnd_queue_tx(conn, tx); + kgnilnd_peer_alive(conn->gnc_peer); + + /* drop ref from kgnilnd_validate_tx_ev_id */ + kgnilnd_conn_decref(conn); + continue; + } + + /* fall through to the TRANSACTION_ERROR case */ + tx->tx_retrans++; + + /* get stringified version for log messages */ + kgnilnd_cq_error_str(event_data, &err_str, 256); + kgnilnd_cq_error_recoverable(event_data, &should_retry); + + /* make sure we are not off in the weeds with this tx */ + if (tx->tx_retrans > + *kgnilnd_tunables.kgn_max_retransmits) { + GNIDBG_TX(D_NETERROR, tx, + "giving up on TX, too many retries", NULL); + should_retry = 0; + } + + GNIDBG_TX(D_NETERROR, tx, "RDMA %s error (%s)", + should_retry ? "transient" : "unrecoverable", err_str); + + if (tx->tx_msg.gnm_type == GNILND_MSG_PUT_DONE) { + if (should_retry) { + kgnilnd_rdma(tx, GNILND_MSG_PUT_DONE, + &tx->tx_putinfo.gnpam_desc, + tx->tx_putinfo.gnpam_desc.gnrd_nob, + tx->tx_putinfo.gnpam_dst_cookie); + } else { + kgnilnd_nak_rdma(conn, GNILND_MSG_PUT_NAK, + -EFAULT, + tx->tx_putinfo.gnpam_dst_cookie, + tx->tx_msg.gnm_srcnid); + kgnilnd_tx_done(tx, -EFAULT); + } + } else { + if (should_retry) { + kgnilnd_rdma(tx, GNILND_MSG_GET_DONE, + &tx->tx_getinfo.gngm_desc, + tx->tx_lntmsg[0]->msg_len, + tx->tx_getinfo.gngm_cookie); + } else { + kgnilnd_nak_rdma(conn, GNILND_MSG_GET_NAK, + -EFAULT, + tx->tx_getinfo.gngm_cookie, + tx->tx_msg.gnm_srcnid); + kgnilnd_tx_done(tx, -EFAULT); + } + } + + /* drop ref from kgnilnd_validate_tx_ev_id */ + kgnilnd_conn_decref(conn); + } +} + +int +kgnilnd_check_fma_send_cq(kgn_device_t *dev) +{ + gni_return_t rrc; + __u64 event_data; + kgn_tx_ev_id_t ev_id; + kgn_tx_t *tx = NULL; + kgn_conn_t *conn = NULL; + int queued_fma, saw_reply, rc; + long num_processed = 0; + + for (;;) { + /* make sure we don't keep looping if we need to reset */ + if (unlikely(kgnilnd_data.kgn_quiesce_trigger)) { + return num_processed; + } + + rc = kgnilnd_mutex_trylock(&dev->gnd_cq_mutex); + if (!rc) { + /* we didn't get the mutex, so return that there is still work + * to be done */ + return 1; + } + + rrc = kgnilnd_cq_get_event(dev->gnd_snd_fma_cqh, &event_data); + mutex_unlock(&dev->gnd_cq_mutex); + + if (rrc == GNI_RC_NOT_DONE) { + CDEBUG(D_INFO, + "SMSG send CQ %d not ready (data "LPX64") " + "processed %ld\n", dev->gnd_id, event_data, + num_processed); + return num_processed; + } + + dev->gnd_sched_alive = jiffies; + num_processed++; + + LASSERTF(!GNI_CQ_OVERRUN(event_data), + "this is bad, somehow our credits didn't " + "protect us from CQ overrun\n"); + LASSERTF(GNI_CQ_GET_TYPE(event_data) == GNI_CQ_EVENT_TYPE_SMSG, + "rrc %d, 
GNI_CQ_GET_TYPE("LPX64") = "LPX64"\n", rrc, + event_data, GNI_CQ_GET_TYPE(event_data)); + + /* if SMSG couldn't handle an error, time for conn to die */ + if (unlikely(rrc == GNI_RC_TRANSACTION_ERROR)) { + char err_str[256]; + + /* need to take the write_lock to ensure atomicity + * on the conn state if we need to close it */ + write_lock(&kgnilnd_data.kgn_peer_conn_lock); + conn = kgnilnd_cqid2conn_locked(GNI_CQ_GET_INST_ID(event_data)); + if (conn == NULL) { + /* Conn was destroyed? */ + CDEBUG(D_NET, + "SMSG CQID lookup "LPX64" failed\n", + GNI_CQ_GET_INST_ID(event_data)); + write_unlock(&kgnilnd_data.kgn_peer_conn_lock); + continue; + } + + kgnilnd_cq_error_str(event_data, &err_str, 256); + CNETERR("SMSG send error to %s: rc %d (%s)\n", + libcfs_nid2str(conn->gnc_peer->gnp_nid), + rrc, err_str); + kgnilnd_close_conn_locked(conn, -ECOMM); + + write_unlock(&kgnilnd_data.kgn_peer_conn_lock); + + /* no need to process rest of this tx - + * it is getting canceled */ + continue; + } + + /* fall through to GNI_RC_SUCCESS case */ + ev_id.txe_smsg_id = GNI_CQ_GET_MSG_ID(event_data); + + kgnilnd_validate_tx_ev_id(&ev_id, &tx, &conn); + if (conn == NULL || tx == NULL) { + /* either conn or tx was already nuked and this is a "late" + * completion, so drop it */ + continue; + } + + tx->tx_conn->gnc_last_tx_cq = jiffies; + if (tx->tx_msg.gnm_type == GNILND_MSG_NOOP) { + set_mb(conn->gnc_last_noop_cq, jiffies); + } + + /* lock tx_list_state and tx_state */ + spin_lock(&tx->tx_conn->gnc_list_lock); + + GNITX_ASSERTF(tx, tx->tx_list_state == GNILND_TX_LIVE_FMAQ, + "state not GNILND_TX_LIVE_FMAQ", NULL); + GNITX_ASSERTF(tx, tx->tx_state & GNILND_TX_WAITING_COMPLETION, + "not waiting for completion", NULL); + + GNIDBG_TX(D_NET, tx, "SMSG complete tx_state %x rc %d", + tx->tx_state, rrc); + + tx->tx_state &= ~GNILND_TX_WAITING_COMPLETION; + + /* This will trigger other FMA sends that were + * pending this completion */ + queued_fma = !list_empty(&tx->tx_conn->gnc_fmaq); + + /* we either did not expect reply or we already got it */ + saw_reply = !(tx->tx_state & GNILND_TX_WAITING_REPLY); + + spin_unlock(&tx->tx_conn->gnc_list_lock); + + if (queued_fma) { + CDEBUG(D_NET, "scheduling conn 0x%p->%s for fmaq\n", + conn, + libcfs_nid2str(conn->gnc_peer->gnp_nid)); + kgnilnd_schedule_conn(conn); + } + + /* If saw_reply is false as soon as gnc_list_lock is dropped the tx could be nuked + * If saw_reply is true we know that the tx is safe to use as the other thread + * is already finished with it. 
+ */ + + if (saw_reply) { + /* no longer need to track on the live_fmaq */ + kgnilnd_tx_del_state_locked(tx, NULL, tx->tx_conn, GNILND_TX_ALLOCD); + + if (tx->tx_state & GNILND_TX_PENDING_RDMA) { + /* we already got reply & were waiting for + * completion of initial send */ + /* to initiate RDMA transaction */ + GNIDBG_TX(D_NET, tx, + "Pending RDMA 0x%p type 0x%02x", + tx->tx_msg.gnm_type); + tx->tx_state &= ~GNILND_TX_PENDING_RDMA; + rc = kgnilnd_send_mapped_tx(tx, 0); + GNITX_ASSERTF(tx, rc == 0, "RDMA send failed: %d\n", rc); + } else { + /* we are done with this tx */ + GNIDBG_TX(D_NET, tx, + "Done with tx type 0x%02x", + tx->tx_msg.gnm_type); + kgnilnd_tx_done(tx, tx->tx_rc); + } + } + + /* drop ref from kgnilnd_validate_tx_ev_id */ + kgnilnd_conn_decref(conn); + + /* if we are waiting for a REPLY, we'll handle the tx then */ + } /* end for loop */ +} + +int +kgnilnd_check_fma_rcv_cq(kgn_device_t *dev) +{ + kgn_conn_t *conn; + gni_return_t rrc; + __u64 event_data; + long num_processed = 0; + struct list_head *conns; + struct list_head *tmp; + int rc; + + for (;;) { + /* make sure we don't keep looping if we need to reset */ + if (unlikely(kgnilnd_data.kgn_quiesce_trigger)) { + return num_processed; + } + + rc = kgnilnd_mutex_trylock(&dev->gnd_cq_mutex); + if (!rc) { + /* we didn't get the mutex, so return that there is still work + * to be done */ + return 1; + } + rrc = kgnilnd_cq_get_event(dev->gnd_rcv_fma_cqh, &event_data); + mutex_unlock(&dev->gnd_cq_mutex); + + if (rrc == GNI_RC_NOT_DONE) { + CDEBUG(D_INFO, "SMSG RX CQ %d empty data "LPX64" " + "processed %ld\n", + dev->gnd_id, event_data, num_processed); + return num_processed; + } + dev->gnd_sched_alive = jiffies; + num_processed++; + + /* this is the only CQ that can really handle transient + * CQ errors */ + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CQ_GET_EVENT)) { + rrc = cfs_fail_val ? cfs_fail_val + : GNI_RC_ERROR_RESOURCE; + if (rrc == GNI_RC_ERROR_RESOURCE) { + /* set overrun too */ + event_data |= (1UL << 63); + LASSERTF(GNI_CQ_OVERRUN(event_data), + "(1UL << 63) is no longer the bit to" + "set to indicate CQ_OVERRUN\n"); + } + } + /* sender should get error event too and take care + of failed transaction by re-transmitting */ + if (rrc == GNI_RC_TRANSACTION_ERROR) { + CDEBUG(D_NET, "SMSG RX CQ error "LPX64"\n", event_data); + continue; + } + + if (likely(!GNI_CQ_OVERRUN(event_data))) { + read_lock(&kgnilnd_data.kgn_peer_conn_lock); + conn = kgnilnd_cqid2conn_locked( + GNI_CQ_GET_INST_ID(event_data)); + if (conn == NULL) { + CDEBUG(D_NET, "SMSG RX CQID lookup "LPU64" " + "failed, dropping event "LPX64"\n", + GNI_CQ_GET_INST_ID(event_data), + event_data); + } else { + CDEBUG(D_NET, "SMSG RX: CQID "LPU64" " + "conn %p->%s\n", + GNI_CQ_GET_INST_ID(event_data), + conn, conn->gnc_peer ? + libcfs_nid2str(conn->gnc_peer->gnp_nid) : + ""); + + conn->gnc_last_rx_cq = jiffies; + + /* stash first rx so we can clear out purgatory. 
+ */ + if (conn->gnc_first_rx == 0) { + conn->gnc_first_rx = jiffies; + } + kgnilnd_peer_alive(conn->gnc_peer); + kgnilnd_schedule_conn(conn); + } + read_unlock(&kgnilnd_data.kgn_peer_conn_lock); + continue; + } + + /* FMA CQ has overflowed: check ALL conns */ + CNETERR("SMSG RX CQ overflow: scheduling ALL " + "conns on device %d\n", dev->gnd_id); + + for (rc = 0; rc < *kgnilnd_tunables.kgn_peer_hash_size; rc++) { + + read_lock(&kgnilnd_data.kgn_peer_conn_lock); + conns = &kgnilnd_data.kgn_conns[rc]; + + list_for_each(tmp, conns) { + conn = list_entry(tmp, kgn_conn_t, + gnc_hashlist); + + if (conn->gnc_device == dev) { + kgnilnd_schedule_conn(conn); + conn->gnc_last_rx_cq = jiffies; + } + } + + /* don't block write lockers for too long... */ + read_unlock(&kgnilnd_data.kgn_peer_conn_lock); + } + } +} + +/* try_map_if_full should only be used when processing TX from list of + * backlog TX waiting on mappings to free up + * + * Return Codes: + * try_map_if_full = 0: 0 (sent or queued), (-|+)errno failure of kgnilnd_sendmsg + * try_map_if_full = 1: 0 (sent), -ENOMEM for caller to requeue, (-|+)errno failure of kgnilnd_sendmsg */ + +int +kgnilnd_send_mapped_tx(kgn_tx_t *tx, int try_map_if_full) +{ + /* slight bit of race if multiple people calling, but at worst we'll have + * order altered just a bit... which would not be determenistic anyways */ + int rc = atomic_read(&tx->tx_conn->gnc_device->gnd_nq_map); + + GNIDBG_TX(D_NET, tx, "try %d nq_map %d", try_map_if_full, rc); + + /* We know that we have a GART reservation that should guarantee forward progress. + * This means we don't need to take any extraordinary efforts if we are failing + * mappings here - even if we are holding a very small number of these. */ + + if (try_map_if_full || (rc == 0)) { + rc = kgnilnd_map_buffer(tx); + } + + /* rc should be 0 if we mapped succesfully here, if non-zero we are queueing */ + if (rc != 0) { + /* if try_map_if_full set, they handle requeuing */ + if (unlikely(try_map_if_full)) { + RETURN(rc); + } else { + spin_lock(&tx->tx_conn->gnc_device->gnd_lock); + kgnilnd_tx_add_state_locked(tx, NULL, tx->tx_conn, GNILND_TX_MAPQ, 1); + spin_unlock(&tx->tx_conn->gnc_device->gnd_lock); + /* make sure we wake up sched to run this */ + kgnilnd_schedule_device(tx->tx_conn->gnc_device); + /* return 0 as this is now queued for later sending */ + RETURN(0); + } + } + + switch (tx->tx_msg.gnm_type) { + default: + LBUG(); + break; + /* GET_REQ and PUT_ACK are outbound messages sending our mapping key to + * remote node where the RDMA will be started + * Special case -EAGAIN logic - this should just queued as if the mapping couldn't + * be satisified. 
The rest of the errors are "hard" errors that require + * upper layers to handle themselves */ + case GNILND_MSG_GET_REQ: + tx->tx_msg.gnm_u.get.gngm_desc.gnrd_key = tx->tx_map_key; + tx->tx_msg.gnm_u.get.gngm_cookie = tx->tx_id.txe_cookie; + tx->tx_msg.gnm_u.get.gngm_desc.gnrd_addr = (__u64)((unsigned long)tx->tx_buffer); + tx->tx_msg.gnm_u.get.gngm_desc.gnrd_nob = tx->tx_nob; + tx->tx_state = GNILND_TX_WAITING_COMPLETION | GNILND_TX_WAITING_REPLY; + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_GET_REQ_AGAIN)) { + tx->tx_state |= GNILND_TX_FAIL_SMSG; + } + /* redirect to FMAQ on failure, no need to infinite loop here in MAPQ */ + rc = kgnilnd_sendmsg(tx, NULL, 0, &tx->tx_conn->gnc_list_lock, GNILND_TX_FMAQ); + break; + case GNILND_MSG_PUT_ACK: + tx->tx_msg.gnm_u.putack.gnpam_desc.gnrd_key = tx->tx_map_key; + tx->tx_state = GNILND_TX_WAITING_COMPLETION | GNILND_TX_WAITING_REPLY; + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_PUT_ACK_AGAIN)) { + tx->tx_state |= GNILND_TX_FAIL_SMSG; + } + /* redirect to FMAQ on failure, no need to infinite loop here in MAPQ */ + rc = kgnilnd_sendmsg(tx, NULL, 0, &tx->tx_conn->gnc_list_lock, GNILND_TX_FMAQ); + break; + + /* PUT_REQ and GET_DONE are where we do the actual RDMA */ + case GNILND_MSG_PUT_REQ: + kgnilnd_rdma(tx, GNILND_MSG_PUT_DONE, + &tx->tx_putinfo.gnpam_desc, + tx->tx_putinfo.gnpam_desc.gnrd_nob, + tx->tx_putinfo.gnpam_dst_cookie); + break; + case GNILND_MSG_GET_DONE: + kgnilnd_rdma(tx, GNILND_MSG_GET_DONE, + &tx->tx_getinfo.gngm_desc, + tx->tx_lntmsg[0]->msg_len, + tx->tx_getinfo.gngm_cookie); + + break; + } + + RETURN(rc); +} + +void +kgnilnd_process_fmaq(kgn_conn_t *conn) +{ + int more_to_do = 0; + kgn_tx_t *tx = NULL; + void *buffer = NULL; + unsigned int nob = 0; + int rc; + + /* NB 1. kgnilnd_sendmsg() may fail if I'm out of credits right now. + * However I will be rescheduled by an FMA completion event + * when I eventually get some. + * NB 2. Sampling gnc_state here races with setting it elsewhere. + * But it doesn't matter if I try to send a "real" message just + * as I start closing because I'll get scheduled to send the + * close anyway. */ + + /* Short circuit if the ep_handle is null we cant send anyway. 
*/ + if (conn->gnc_ephandle == NULL) + return; + + LASSERTF(!conn->gnc_close_sent, "Conn %p close was sent\n", conn); + + spin_lock(&conn->gnc_list_lock); + + if (list_empty(&conn->gnc_fmaq)) { + int keepalive = GNILND_TO2KA(conn->gnc_timeout); + + spin_unlock(&conn->gnc_list_lock); + + if (time_after_eq(jiffies, conn->gnc_last_tx + cfs_time_seconds(keepalive))) { + CDEBUG(D_NET, "sending NOOP -> %s (%p idle %lu(%d)) " + "last %lu/%lu/%lu %lus/%lus/%lus\n", + libcfs_nid2str(conn->gnc_peer->gnp_nid), conn, + cfs_duration_sec(jiffies - conn->gnc_last_tx), + keepalive, + conn->gnc_last_noop_want, conn->gnc_last_noop_sent, + conn->gnc_last_noop_cq, + cfs_duration_sec(jiffies - conn->gnc_last_noop_want), + cfs_duration_sec(jiffies - conn->gnc_last_noop_sent), + cfs_duration_sec(jiffies - conn->gnc_last_noop_cq)); + atomic_inc(&conn->gnc_sched_noop); + set_mb(conn->gnc_last_noop_want, jiffies); + + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_NOOP_SEND)) + return; + + tx = kgnilnd_new_tx_msg(GNILND_MSG_NOOP, conn->gnc_peer->gnp_net->gnn_ni->ni_nid); + if (tx != NULL) { + int rc; + + rc = kgnilnd_set_tx_id(tx, conn); + if (rc != 0) { + kgnilnd_tx_done(tx, rc); + return; + } + } + } + } else { + tx = list_first_entry(&conn->gnc_fmaq, kgn_tx_t, tx_list); + /* move from fmaq to allocd, kgnilnd_sendmsg will move to live_fmaq */ + kgnilnd_tx_del_state_locked(tx, NULL, conn, GNILND_TX_ALLOCD); + more_to_do = !list_empty(&conn->gnc_fmaq); + spin_unlock(&conn->gnc_list_lock); + } + + /* if there is no real TX or no NOOP to send, bail */ + if (tx == NULL) { + return; + } + + if (!tx->tx_retrans) + tx->tx_cred_wait = jiffies; + + GNITX_ASSERTF(tx, tx->tx_id.txe_smsg_id != 0, + "tx with zero id", NULL); + + CDEBUG(D_NET, "sending regular msg: %p, type %s(0x%02x), cookie "LPX64"\n", + tx, kgnilnd_msgtype2str(tx->tx_msg.gnm_type), + tx->tx_msg.gnm_type, tx->tx_id.txe_cookie); + + rc = 0; + + switch (tx->tx_msg.gnm_type) { + default: + LBUG(); + + case GNILND_MSG_NOOP: + case GNILND_MSG_CLOSE: + case GNILND_MSG_IMMEDIATE: + tx->tx_state = GNILND_TX_WAITING_COMPLETION; + buffer = tx->tx_buffer; + nob = tx->tx_nob; + break; + + case GNILND_MSG_GET_DONE: + case GNILND_MSG_PUT_DONE: + case GNILND_MSG_PUT_NAK: + case GNILND_MSG_GET_NAK: + tx->tx_state = GNILND_TX_WAITING_COMPLETION; + break; + + case GNILND_MSG_PUT_REQ: + tx->tx_msg.gnm_u.putreq.gnprm_cookie = tx->tx_id.txe_cookie; + + case GNILND_MSG_PUT_ACK: + case GNILND_MSG_GET_REQ: + /* This is really only to handle the retransmit of SMSG once these + * two messages are setup in send_mapped_tx */ + tx->tx_state = GNILND_TX_WAITING_COMPLETION | GNILND_TX_WAITING_REPLY; + break; + } + + if (likely(rc == 0)) { + rc = kgnilnd_sendmsg(tx, buffer, nob, &conn->gnc_list_lock, GNILND_TX_FMAQ); + } + + if (rc > 0) { + /* don't explicitly reschedule here - we are short credits and will rely on + * kgnilnd_sendmsg to resched the conn if need be */ + more_to_do = 0; + } else if (rc < 0) { + /* bail: it wasn't sent and we didn't get EAGAIN indicating we should retrans + * almost certainly a software bug, but lets play nice with the other kids */ + kgnilnd_tx_done(tx, rc); + /* just for fun, kick peer in arse - resetting conn might help to correct + * this almost certainly buggy software caused return code */ + kgnilnd_close_conn(conn, rc); + } + + if (more_to_do) { + CDEBUG(D_NET, "Rescheduling %p (more to do)\n", conn); + kgnilnd_schedule_conn(conn); + } +} + +int +kgnilnd_process_rdmaq(kgn_device_t *dev) +{ + int found_work = 0; + kgn_tx_t *tx; + + if 
(CFS_FAIL_CHECK(CFS_FAIL_GNI_DELAY_RDMAQ)) { + RETURN(found_work); + } + + if (time_after_eq(jiffies, dev->gnd_rdmaq_deadline)) { + unsigned long dead_bump; + long new_ok; + + /* if we think we need to adjust, take lock to serialize and recheck */ + spin_lock(&dev->gnd_rdmaq_lock); + if (time_after_eq(jiffies, dev->gnd_rdmaq_deadline)) { + del_singleshot_timer_sync(&dev->gnd_rdmaq_timer); + + dead_bump = cfs_time_seconds(1) / *kgnilnd_tunables.kgn_rdmaq_intervals; + + /* roll the bucket forward */ + dev->gnd_rdmaq_deadline = jiffies + dead_bump; + + if (kgnilnd_data.kgn_rdmaq_override && + (*kgnilnd_tunables.kgn_rdmaq_intervals != 0)) { + new_ok = kgnilnd_data.kgn_rdmaq_override / *kgnilnd_tunables.kgn_rdmaq_intervals; + } else { + new_ok = ~0UL >> 1; + } + + /* roll current outstanding forward to make sure we carry outstanding + * committment forward + * new_ok starts out as the whole interval value + * - first subtract bytes_out from last interval, as that would push us over + * strict limits for this interval + * - second, set bytes_ok to new_ok to ensure it doesn't exceed the current auth + * + * there is a small race here if someone is actively processing mappings and + * adding to rdmaq_bytes_out, but it should be small as the mappings are triggered + * quite quickly after kgnilnd_auth_rdma_bytes gives us the go-ahead + * - if this gives us problems in the future, we could use a read/write lock + * to protect the resetting of these values */ + new_ok -= atomic64_read(&dev->gnd_rdmaq_bytes_out); + atomic64_set(&dev->gnd_rdmaq_bytes_ok, new_ok); + + CDEBUG(D_NET, "resetting rdmaq bytes to %ld, deadline +%lu -> %lu, " + "current out %ld\n", + atomic64_read(&dev->gnd_rdmaq_bytes_ok), dead_bump, dev->gnd_rdmaq_deadline, + atomic64_read(&dev->gnd_rdmaq_bytes_out)); + } + spin_unlock(&dev->gnd_rdmaq_lock); + } + + spin_lock(&dev->gnd_rdmaq_lock); + while (!list_empty(&dev->gnd_rdmaq)) { + int rc; + + /* make sure we break out early on quiesce */ + if (unlikely(kgnilnd_data.kgn_quiesce_trigger)) { + /* always break with lock held - we unlock outside loop */ + break; + } + + tx = list_first_entry(&dev->gnd_rdmaq, kgn_tx_t, tx_list); + kgnilnd_tx_del_state_locked(tx, NULL, tx->tx_conn, GNILND_TX_ALLOCD); + found_work++; + + /* sample with lock held, serializing with kgnilnd_complete_closed_conn */ + if (tx->tx_conn->gnc_state != GNILND_CONN_ESTABLISHED) { + /* if conn is dying, mark tx in tx_ref_table for + * kgnilnd_complete_closed_conn to finish up */ + kgnilnd_tx_add_state_locked(tx, NULL, tx->tx_conn, GNILND_TX_DYING, 1); + + /* tx was moved to DYING, get next */ + continue; + } + spin_unlock(&dev->gnd_rdmaq_lock); + + rc = kgnilnd_auth_rdma_bytes(dev, tx); + spin_lock(&dev->gnd_rdmaq_lock); + + if (rc < 0) { + /* no ticket! 
add back to head */ + kgnilnd_tx_add_state_locked(tx, NULL, tx->tx_conn, GNILND_TX_RDMAQ, 0); + /* clear found_work so scheduler threads wait for timer */ + found_work = 0; + break; + } else { + /* TX is GO for launch */ + tx->tx_qtime = jiffies; + kgnilnd_send_mapped_tx(tx, 0); + found_work++; + } + } + spin_unlock(&dev->gnd_rdmaq_lock); + + RETURN(found_work); +} + +static inline void +kgnilnd_swab_rdma_desc(kgn_rdma_desc_t *d) +{ + __swab64s(&d->gnrd_key.qword1); + __swab64s(&d->gnrd_key.qword2); + __swab64s(&d->gnrd_addr); + __swab32s(&d->gnrd_nob); +} + +#define kgnilnd_match_reply_either(w, x, y, z) _kgnilnd_match_reply(w, x, y, z) +#define kgnilnd_match_reply(x, y, z) _kgnilnd_match_reply(x, y, GNILND_MSG_NONE, z) + +kgn_tx_t * +_kgnilnd_match_reply(kgn_conn_t *conn, int type1, int type2, __u64 cookie) +{ + kgn_tx_ev_id_t ev_id; + kgn_tx_t *tx; + + /* we use the cookie from the original TX, so we can find the match + * by parsing that and using the txe_idx */ + ev_id.txe_cookie = cookie; + + tx = conn->gnc_tx_ref_table[ev_id.txe_idx]; + + if (tx != NULL) { + /* check tx to make sure kgni didn't eat it */ + GNITX_ASSERTF(tx, tx->tx_msg.gnm_magic == GNILND_MSG_MAGIC, + "came back from kgni with bad magic %x\n", tx->tx_msg.gnm_magic); + + GNITX_ASSERTF(tx, ((tx->tx_id.txe_idx == ev_id.txe_idx) && + (tx->tx_id.txe_cookie = cookie)), + "conn 0x%p->%s tx_ref_table hosed: wanted " + "txe_cookie "LPX64" txe_idx %d " + "found tx %p cookie "LPX64" txe_idx %d\n", + conn, libcfs_nid2str(conn->gnc_peer->gnp_nid), + cookie, ev_id.txe_idx, + tx, tx->tx_id.txe_cookie, tx->tx_id.txe_idx); + + LASSERTF((((tx->tx_msg.gnm_type == type1) || (tx->tx_msg.gnm_type == type2)) && + (tx->tx_state & GNILND_TX_WAITING_REPLY)), + "Unexpected TX type (%x, %x or %x) " + "or state (%x, expected +%x) " + "matched reply from %s\n", + tx->tx_msg.gnm_type, type1, type2, + tx->tx_state, GNILND_TX_WAITING_REPLY, + libcfs_nid2str(conn->gnc_peer->gnp_nid)); + } else { + CWARN("Unmatched reply %02x, or %02x/"LPX64" from %s\n", + type1, type2, cookie, libcfs_nid2str(conn->gnc_peer->gnp_nid)); + } + return tx; +} + +static inline void +kgnilnd_complete_tx(kgn_tx_t *tx, int rc) +{ + int complete = 0; + kgn_conn_t *conn = tx->tx_conn; + + spin_lock(&conn->gnc_list_lock); + + GNITX_ASSERTF(tx, tx->tx_state & GNILND_TX_WAITING_REPLY, + "not waiting for reply", NULL); + + tx->tx_rc = rc; + tx->tx_state &= ~GNILND_TX_WAITING_REPLY; + + if (!(tx->tx_state & GNILND_TX_WAITING_COMPLETION)) { + kgnilnd_tx_del_state_locked(tx, NULL, conn, GNILND_TX_ALLOCD); + /* sample under lock as follow on steps require gnc_list_lock + * - or call kgnilnd_tx_done which requires no locks held over + * call to lnet_finalize */ + complete = 1; + } + spin_unlock(&conn->gnc_list_lock); + + if (complete) { + kgnilnd_tx_done(tx, tx->tx_rc); + } +} + +static inline void +kgnilnd_finalize_rx_done(kgn_tx_t *tx, kgn_msg_t *msg) +{ + int rc; + kgn_conn_t *conn = tx->tx_conn; + + atomic_inc(&conn->gnc_device->gnd_rdma_nrx); + atomic64_add(tx->tx_nob, &conn->gnc_device->gnd_rdma_rxbytes); + + rc = kgnilnd_verify_rdma_cksum(tx, msg->gnm_payload_cksum); + + kgnilnd_complete_tx(tx, rc); +} + +void +kgnilnd_check_fma_rx(kgn_conn_t *conn) +{ + __u32 seq; + kgn_tx_t *tx; + kgn_rx_t *rx; + kgn_msg_t *msg; + void *prefix; + gni_return_t rrc; + kgn_peer_t *peer = conn->gnc_peer; + kgn_net_t *net; + int rc = 0; + __u16 tmp_cksum = 0, msg_cksum = 0; + int repost = 1, saw_complete; + unsigned long timestamp, newest_last_rx, timeout; + int last_seq; + void *memory = NULL; + 
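+	/* Pull the next SMSG off this connection's mailbox: close the conn instead
+	 * if the RX side has been silent past its timeout, then validate the header
+	 * (checksum, magic/byte-swap, version, source NID, connstamp and sequence
+	 * number) before dispatching on the message type. */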
ENTRY; + + /* Short circuit if the ep_handle is null. + * It's likely that its about to be closed as stale. + */ + if (conn->gnc_ephandle == NULL) + RETURN_EXIT; + + timestamp = jiffies; + mutex_lock(&conn->gnc_device->gnd_cq_mutex); + /* delay in jiffies - we are really concerned only with things that + * result in a schedule() or really holding this off for long times . + * NB - mutex_lock could spin for 2 jiffies before going to sleep to wait */ + conn->gnc_device->gnd_mutex_delay += (long) jiffies - timestamp; + + /* Resample current time as we have no idea how long it took to get the mutex */ + timestamp = jiffies; + + /* We check here when the last time we received an rx, we do this before + * we call getnext in case the thread has been blocked for a while. If we + * havent received an rx since our timeout value we close the connection + * as we should assume the other side has closed the connection. This will + * stop us from sending replies to a mailbox that is already in purgatory. + */ + + timeout = cfs_time_seconds(conn->gnc_timeout); + newest_last_rx = GNILND_LASTRX(conn); + + /* Error injection to validate that timestamp checking works and closing the conn */ + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_RECV_TIMEOUT)) { + timestamp = timestamp + (GNILND_TIMEOUTRX(timeout) * 2); + } + + if (time_after_eq(timestamp, newest_last_rx + (GNILND_TIMEOUTRX(timeout)))) { + GNIDBG_CONN(D_NETERROR|D_CONSOLE, conn, "Cant receive from %s after timeout lapse of %lu; TO %lu", + libcfs_nid2str(conn->gnc_peer->gnp_nid), + cfs_duration_sec(timestamp - newest_last_rx), + cfs_duration_sec(GNILND_TIMEOUTRX(timeout))); + mutex_unlock(&conn->gnc_device->gnd_cq_mutex); + rc = -ETIME; + kgnilnd_close_conn(conn, rc); + RETURN_EXIT; + } + + rrc = kgnilnd_smsg_getnext(conn->gnc_ephandle, &prefix); + + if (rrc == GNI_RC_NOT_DONE) { + mutex_unlock(&conn->gnc_device->gnd_cq_mutex); + CDEBUG(D_INFO, "SMSG RX empty\n"); + RETURN_EXIT; + } + + if (rrc == GNI_RC_INVALID_STATE) { + LIBCFS_ALLOC(memory, conn->gnpr_smsg_attr.buff_size); + if (memory == NULL) { + memory = (void *)0xdeadbeef; + } else { + memcpy(memory, conn->gnpr_smsg_attr.msg_buffer + conn->gnpr_smsg_attr.mbox_offset, conn->gnpr_smsg_attr.buff_size); + } + } + + LASSERTF(rrc == GNI_RC_SUCCESS, + "bad rc %d on conn %p from peer %s mailbox copy %p\n", + rrc, conn, libcfs_nid2str(peer->gnp_nid), memory); + + msg = (kgn_msg_t *)prefix; + + rx = kgnilnd_alloc_rx(); + if (rx == NULL) { + mutex_unlock(&conn->gnc_device->gnd_cq_mutex); + kgnilnd_release_msg(conn); + GNIDBG_MSG(D_NETERROR, msg, "Dropping SMSG RX from 0x%p->%s, no RX memory", + conn, libcfs_nid2str(peer->gnp_nid)); + RETURN_EXIT; + } + + GNIDBG_MSG(D_INFO, msg, "SMSG RX on %p from %s", + conn, libcfs_nid2str(peer->gnp_nid)); + + timestamp = conn->gnc_last_rx; + last_seq = conn->gnc_rx_seq; + + conn->gnc_last_rx = jiffies; + /* stash first rx so we can clear out purgatory + */ + if (conn->gnc_first_rx == 0) + conn->gnc_first_rx = jiffies; + + seq = conn->gnc_rx_seq++; + + /* needs to linger to protect gnc_rx_seq like we do with gnc_tx_seq */ + mutex_unlock(&conn->gnc_device->gnd_cq_mutex); + kgnilnd_peer_alive(conn->gnc_peer); + + rx->grx_msg = msg; + rx->grx_conn = conn; + rx->grx_eager = 0; + rx->grx_received = current_kernel_time(); + + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_NET_LOOKUP)) { + rc = -ENONET; + } else { + rc = kgnilnd_find_net(msg->gnm_srcnid, &net); + } + + if (rc < 0) { + GOTO(out, rc); + } else { + kgnilnd_net_decref(net); + } + + if (*kgnilnd_tunables.kgn_checksum && !msg->gnm_cksum) + 
GNIDBG_MSG(D_WARNING, msg, "no msg header checksum when enabled"); + + /* XXX Nic: Do we need to swab cksum */ + if (msg->gnm_cksum != 0) { + msg_cksum = msg->gnm_cksum; + msg->gnm_cksum = 0; + tmp_cksum = kgnilnd_cksum(msg, sizeof(kgn_msg_t)); + + if (tmp_cksum != msg_cksum) { + GNIDBG_MSG(D_NETERROR, msg, "Bad hdr checksum (%x expected %x)", + tmp_cksum, msg_cksum); + kgnilnd_dump_msg(D_BUFFS, msg); + rc = -ENOKEY; + GOTO(out, rc); + } + } + /* restore checksum for future debug messages */ + msg->gnm_cksum = tmp_cksum; + + if (msg->gnm_magic != GNILND_MSG_MAGIC) { + if (__swab32(msg->gnm_magic) != GNILND_MSG_MAGIC) { + GNIDBG_MSG(D_NETERROR, msg, "Unexpected magic %08x from %s", + msg->gnm_magic, libcfs_nid2str(peer->gnp_nid)); + rc = -EPROTO; + GOTO(out, rc); + } + + __swab32s(&msg->gnm_magic); + __swab16s(&msg->gnm_version); + __swab16s(&msg->gnm_type); + __swab64s(&msg->gnm_srcnid); + __swab64s(&msg->gnm_connstamp); + __swab32s(&msg->gnm_seq); + + /* NB message type checked below; NOT here... */ + switch (msg->gnm_type) { + case GNILND_MSG_PUT_ACK: + kgnilnd_swab_rdma_desc(&msg->gnm_u.putack.gnpam_desc); + break; + + case GNILND_MSG_GET_REQ: + kgnilnd_swab_rdma_desc(&msg->gnm_u.get.gngm_desc); + break; + + default: + break; + } + } + + if (msg->gnm_version != GNILND_MSG_VERSION) { + GNIDBG_MSG(D_NETERROR, msg, "Unexpected protocol version %d from %s", + msg->gnm_version, libcfs_nid2str(peer->gnp_nid)); + rc = -EPROTO; + GOTO(out, rc); + } + + if (LNET_NIDADDR(msg->gnm_srcnid) != LNET_NIDADDR(peer->gnp_nid)) { + GNIDBG_MSG(D_NETERROR, msg, "Unexpected peer %s from %s", + libcfs_nid2str(msg->gnm_srcnid), + libcfs_nid2str(peer->gnp_nid)); + rc = -EPROTO; + GOTO(out, rc); + } + + if (msg->gnm_connstamp != conn->gnc_peer_connstamp) { + GNIDBG_MSG(D_NETERROR, msg, "Unexpected connstamp "LPX64"("LPX64 + " expected) from %s", + msg->gnm_connstamp, conn->gnc_peer_connstamp, + libcfs_nid2str(peer->gnp_nid)); + rc = -EPROTO; + GOTO(out, rc); + } + + if (msg->gnm_seq != seq) { + GNIDBG_MSG(D_NETERROR, msg, "Unexpected sequence number %d(%d expected) from %s", + msg->gnm_seq, seq, libcfs_nid2str(peer->gnp_nid)); + rc = -EPROTO; + GOTO(out, rc); + } + + atomic_inc(&conn->gnc_device->gnd_short_nrx); + + if (msg->gnm_type == GNILND_MSG_CLOSE) { + CDEBUG(D_NETTRACE, "%s sent us CLOSE msg\n", + libcfs_nid2str(conn->gnc_peer->gnp_nid)); + write_lock(&kgnilnd_data.kgn_peer_conn_lock); + conn->gnc_close_recvd = GNILND_CLOSE_RX; + conn->gnc_peer_error = msg->gnm_u.completion.gncm_retval; + /* double check state with lock held */ + if (conn->gnc_state == GNILND_CONN_ESTABLISHED) { + /* only error if we are not already closing */ + if (conn->gnc_peer_error == -ETIMEDOUT) { + unsigned long now = jiffies; + CNETERR("peer 0x%p->%s closed connection 0x%p due to timeout. " + "Is node down? 
" + "RX %d @ %lus/%lus; TX %d @ %lus/%lus; " + "NOOP %lus/%lus/%lus; sched %lus/%lus/%lus ago\n", + conn->gnc_peer, libcfs_nid2str(conn->gnc_peer->gnp_nid), + conn, last_seq, + cfs_duration_sec(now - timestamp), + cfs_duration_sec(now - conn->gnc_last_rx_cq), + conn->gnc_tx_seq, + cfs_duration_sec(now - conn->gnc_last_tx), + cfs_duration_sec(now - conn->gnc_last_tx_cq), + cfs_duration_sec(now - conn->gnc_last_noop_want), + cfs_duration_sec(now - conn->gnc_last_noop_sent), + cfs_duration_sec(now - conn->gnc_last_noop_cq), + cfs_duration_sec(now - conn->gnc_last_sched_ask), + cfs_duration_sec(now - conn->gnc_last_sched_do), + cfs_duration_sec(now - conn->gnc_device->gnd_sched_alive)); + } + kgnilnd_close_conn_locked(conn, -ECONNRESET); + } + write_unlock(&kgnilnd_data.kgn_peer_conn_lock); + GOTO(out, rc); + } + + if (conn->gnc_close_recvd) { + GNIDBG_MSG(D_NETERROR, msg, "Unexpected message %s(%d/%d) after CLOSE from %s", + kgnilnd_msgtype2str(msg->gnm_type), + msg->gnm_type, conn->gnc_close_recvd, + libcfs_nid2str(conn->gnc_peer->gnp_nid)); + rc = -EPROTO; + GOTO(out, rc); + } + + if (conn->gnc_state != GNILND_CONN_ESTABLISHED) { + /* XXX Nic: log message received on bad connection state */ + GOTO(out, rc); + } + + switch (msg->gnm_type) { + case GNILND_MSG_NOOP: + /* Nothing to do; just a keepalive */ + break; + + case GNILND_MSG_IMMEDIATE: + /* only get SMSG payload for IMMEDIATE */ + atomic64_add(msg->gnm_payload_len, &conn->gnc_device->gnd_short_rxbytes); + rc = lnet_parse(net->gnn_ni, &msg->gnm_u.immediate.gnim_hdr, + msg->gnm_srcnid, rx, 0); + repost = rc < 0; + break; + + case GNILND_MSG_PUT_REQ: + rc = lnet_parse(net->gnn_ni, &msg->gnm_u.putreq.gnprm_hdr, + msg->gnm_srcnid, rx, 1); + repost = rc < 0; + break; + + case GNILND_MSG_PUT_NAK: + tx = kgnilnd_match_reply_either(conn, GNILND_MSG_PUT_REQ, GNILND_MSG_PUT_ACK, + msg->gnm_u.completion.gncm_cookie); + if (tx == NULL) + break; + + kgnilnd_complete_tx(tx, msg->gnm_u.completion.gncm_retval); + break; + + case GNILND_MSG_PUT_ACK: + tx = kgnilnd_match_reply(conn, GNILND_MSG_PUT_REQ, + msg->gnm_u.putack.gnpam_src_cookie); + if (tx == NULL) + break; + + /* store putack data for later: deferred rdma or re-try */ + tx->tx_putinfo = msg->gnm_u.putack; + + saw_complete = 0; + spin_lock(&tx->tx_conn->gnc_list_lock); + + GNITX_ASSERTF(tx, tx->tx_state & GNILND_TX_WAITING_REPLY, + "not waiting for reply", NULL); + + tx->tx_state &= ~GNILND_TX_WAITING_REPLY; + + if (likely(!(tx->tx_state & GNILND_TX_WAITING_COMPLETION))) { + kgnilnd_tx_del_state_locked(tx, NULL, conn, GNILND_TX_ALLOCD); + /* sample under lock as follow on steps require gnc_list_lock + * - or call kgnilnd_tx_done which requires no locks held over + * call to lnet_finalize */ + saw_complete = 1; + } else { + /* cannot launch rdma if still waiting for fma-msg completion */ + CDEBUG(D_NET, "tx 0x%p type 0x%02x will need to " + "wait for SMSG completion\n", tx, tx->tx_msg.gnm_type); + tx->tx_state |= GNILND_TX_PENDING_RDMA; + } + spin_unlock(&tx->tx_conn->gnc_list_lock); + + if (saw_complete) { + rc = kgnilnd_send_mapped_tx(tx, 0); + if (rc < 0) + kgnilnd_tx_done(tx, rc); + } + break; + + case GNILND_MSG_PUT_DONE: + tx = kgnilnd_match_reply(conn, GNILND_MSG_PUT_ACK, + msg->gnm_u.completion.gncm_cookie); + if (tx == NULL) + break; + + GNITX_ASSERTF(tx, tx->tx_buftype == GNILND_BUF_PHYS_MAPPED || + tx->tx_buftype == GNILND_BUF_VIRT_MAPPED, + "bad tx buftype %d", tx->tx_buftype); + + kgnilnd_finalize_rx_done(tx, msg); + break; + + case GNILND_MSG_GET_REQ: + rc = 
lnet_parse(net->gnn_ni, &msg->gnm_u.get.gngm_hdr, + msg->gnm_srcnid, rx, 1); + repost = rc < 0; + break; + + case GNILND_MSG_GET_NAK: + tx = kgnilnd_match_reply(conn, GNILND_MSG_GET_REQ, + msg->gnm_u.completion.gncm_cookie); + if (tx == NULL) + break; + + GNITX_ASSERTF(tx, tx->tx_buftype == GNILND_BUF_PHYS_MAPPED || + tx->tx_buftype == GNILND_BUF_VIRT_MAPPED, + "bad tx buftype %d", tx->tx_buftype); + + kgnilnd_complete_tx(tx, msg->gnm_u.completion.gncm_retval); + break; + + case GNILND_MSG_GET_DONE: + tx = kgnilnd_match_reply(conn, GNILND_MSG_GET_REQ, + msg->gnm_u.completion.gncm_cookie); + if (tx == NULL) + break; + + GNITX_ASSERTF(tx, tx->tx_buftype == GNILND_BUF_PHYS_MAPPED || + tx->tx_buftype == GNILND_BUF_VIRT_MAPPED, + "bad tx buftype %d", tx->tx_buftype); + + lnet_set_reply_msg_len(net->gnn_ni, tx->tx_lntmsg[1], + msg->gnm_u.completion.gncm_retval); + + kgnilnd_finalize_rx_done(tx, msg); + break; + } + + out: + if (rc < 0) /* protocol/comms error */ + kgnilnd_close_conn(conn, rc); + + if (repost && rx != NULL) { + kgnilnd_consume_rx(rx); + } + + /* we got an event so assume more there and call for reschedule */ + if (rc >= 0) + kgnilnd_schedule_conn(conn); + EXIT; +} + +/* Do the failure injections that we need to affect conn processing in the following function. + * When writing tests that use this function make sure to use a fail_loc with a fail mask. + * If you dont you can cause the scheduler threads to spin on the conn without it leaving + * process_conns. + * + * intent is used to signal the calling function whether or not the conn needs to be rescheduled. + */ + +static inline int +kgnilnd_check_conn_fail_loc(kgn_device_t *dev, kgn_conn_t *conn, int *intent) +{ + int rc = 0; + + /* short circuit out when not set */ + if (likely(!cfs_fail_loc)) { + RETURN(rc); + } + + /* failure injection to test for stack reset clean ups */ + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_DROP_CLOSING)) { + /* we can't rely on busy loops being nice enough to get the + * stack reset triggered - it'd just spin on this conn */ + CFS_RACE(CFS_FAIL_GNI_DROP_CLOSING); + rc = 1; + *intent = 1; + GOTO(did_fail_loc, rc); + } + + if (conn->gnc_state == GNILND_CONN_DESTROY_EP) { + /* DESTROY_EP set in kgnilnd_conn_decref on gnc_refcount = 1 */ + + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_DROP_DESTROY_EP)) { + CFS_RACE(CFS_FAIL_GNI_DROP_DESTROY_EP); + rc = 1; + *intent = 1; + GOTO(did_fail_loc, rc); + } + } + + /* CFS_FAIL_GNI_FINISH_PURG2 is used to stop a connection from fully closing. This scheduler + * will spin on the CFS_FAIL_TIMEOUT until the fail_loc is cleared at which time the connection + * will be closed by kgnilnd_complete_closed_conn. 
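kgnilnd_check_conn_fail_loc shows the usual shape of fail_loc style fault injection: a single global knob that is almost free to test when unset, and that lets a test force rarely taken error paths on demand. A hedged standalone imitation of that shape (FAIL_CHECK and the id values below are stand-ins, not the libcfs implementation):

#include <stdint.h>
#include <stdio.h>

/* one global knob, normally 0; tests poke a site id into it */
static volatile uint32_t fail_loc;

/* cheap check: a single load and compare in the common (unset) case */
#define FAIL_CHECK(id)  (__builtin_expect(fail_loc != 0, 0) && \
			 fail_loc == (id))

#define FAIL_DROP_CLOSING  0x1401

static int process_conn(int conn_state)
{
	/* failure injection sits ahead of the real work so tests can
	 * force the rarely taken error/cleanup paths on demand */
	if (FAIL_CHECK(FAIL_DROP_CLOSING))
		return -1;              /* pretend the close was dropped */

	return conn_state;              /* normal processing */
}

int main(void)
{
	printf("normal:   %d\n", process_conn(3));
	fail_loc = FAIL_DROP_CLOSING;   /* what a test harness would set */
	printf("injected: %d\n", process_conn(3));
	return 0;
}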
+ */ + if ((conn->gnc_state == GNILND_CONN_CLOSED) && CFS_FAIL_CHECK(CFS_FAIL_GNI_FINISH_PURG2)) { + while (CFS_FAIL_TIMEOUT(CFS_FAIL_GNI_FINISH_PURG2, 1)) {}; + rc = 1; + *intent = 1; + GOTO(did_fail_loc, rc); + } + + /* this one is a bit gross - we can't hold the mutex from process_conns + * across a CFS_RACE here - it'd block the conn threads from doing an ep_bind + * and moving onto finish_connect + * so, we'll just set the rc - kgnilnd_process_conns will clear + * found_work on a fail_loc, getting the scheduler thread to call schedule() + * and effectively getting this thread to sleep */ + if ((conn->gnc_state == GNILND_CONN_CLOSED) && CFS_FAIL_CHECK(CFS_FAIL_GNI_FINISH_PURG)) { + rc = 1; + *intent = 1; + GOTO(did_fail_loc, rc); + } + +did_fail_loc: + RETURN(rc); +} + +static inline void +kgnilnd_send_conn_close(kgn_conn_t *conn) +{ + kgn_tx_t *tx; + + /* we are closing the conn - we will try to send the CLOSE msg + * but will not wait for anything else to flush */ + + /* send the close if not already done so or received one */ + if (!conn->gnc_close_sent && !conn->gnc_close_recvd) { + /* set close_sent regardless of the success of the + * CLOSE message. We are going to try once and then + * kick him out of the sandbox */ + conn->gnc_close_sent = 1; + mb(); + + /* EP might be null already if remote side initiated a new connection. + * kgnilnd_finish_connect destroys existing ep_handles before wiring up the new connection, + * so this check is here to make sure we dont attempt to send with a null ep_handle. + */ + if (conn->gnc_ephandle != NULL) { + int rc = 0; + + tx = kgnilnd_new_tx_msg(GNILND_MSG_CLOSE, conn->gnc_peer->gnp_net->gnn_ni->ni_nid); + if (tx != NULL) { + tx->tx_msg.gnm_u.completion.gncm_retval = conn->gnc_error; + tx->tx_state = GNILND_TX_WAITING_COMPLETION; + tx->tx_qtime = jiffies; + + if (tx->tx_id.txe_idx == 0) { + rc = kgnilnd_set_tx_id(tx, conn); + if (rc != 0) { + kgnilnd_tx_done(tx, rc); + } + } + + CDEBUG(D_NETTRACE, "sending close with errno %d\n", + conn->gnc_error); + + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CLOSE_SEND)) { + kgnilnd_tx_done(tx, -EAGAIN); + } else if (!rc) { + rc = kgnilnd_sendmsg(tx, NULL, 0, NULL, GNILND_TX_FMAQ); + if (rc) { + /* It wasnt sent and we dont care. 
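kgnilnd_send_conn_close is deliberately best effort: gnc_close_sent is latched before the attempt, the CLOSE is tried once, and a send failure is simply ignored. A standalone sketch of the same one-shot intent, here expressed with a C11 atomic exchange instead of the driver's flag plus mb() (names are hypothetical):

#include <stdatomic.h>
#include <stdio.h>

struct fake_conn {
	atomic_int close_sent;   /* latch: CLOSE attempted at most once */
	int        error;        /* errno to report to the peer */
};

/* pretend transmit that may fail; we do not retry or wait on it */
static int send_close_msg(struct fake_conn *c)
{
	printf("sending CLOSE with errno %d (best effort)\n", c->error);
	return 0;
}

static void conn_close(struct fake_conn *c, int error)
{
	c->error = error;

	/* the atomic exchange makes the latch race-free: only the first
	 * caller sees 0 and gets to try the CLOSE; everyone else skips */
	if (atomic_exchange(&c->close_sent, 1) == 0)
		(void)send_close_msg(c);   /* failure is ignored */
}

int main(void)
{
	struct fake_conn c = { .close_sent = 0 };

	conn_close(&c, -1);   /* pretend timeout error */
	conn_close(&c, -1);   /* latch already set, no second send */
	return 0;
}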
*/ + kgnilnd_tx_done(tx, rc); + } + } + + } + } + } + + conn->gnc_state = GNILND_CONN_CLOSED; + /* mark this conn as CLOSED now that we processed it + * do after TX, so we can use CLOSING in asserts */ + + mb(); + + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_RX_CLOSE_CLOSED)) { + /* simulate a RX CLOSE after the timeout but before + * the scheduler thread gets it */ + conn->gnc_close_recvd = GNILND_CLOSE_INJECT2; + conn->gnc_peer_error = -ETIMEDOUT; + } + /* schedule to allow potential CLOSE and get the complete phase run */ + kgnilnd_schedule_conn(conn); +} + +int +kgnilnd_process_mapped_tx(kgn_device_t *dev) +{ + int found_work = 0; + int rc = 0; + kgn_tx_t *tx; + int max_retrans = *kgnilnd_tunables.kgn_max_retransmits; + int log_retrans, log_retrans_level; + static int last_map_version; + ENTRY; + + spin_lock(&dev->gnd_lock); + if (list_empty(&dev->gnd_map_tx)) { + spin_unlock(&dev->gnd_lock); + RETURN(0); + } + + dev->gnd_sched_alive = jiffies; + + /* we'll retry as fast as possible up to 25% of the limit, then we start + * backing off until our map version changes - indicating we unmapped + * something */ + tx = list_first_entry(&dev->gnd_map_tx, kgn_tx_t, tx_list); + if ((tx->tx_retrans > (max_retrans / 4)) && + (last_map_version == dev->gnd_map_version)) { + GNIDBG_TX(D_NET, tx, "waiting for mapping event event to retry", NULL); + spin_unlock(&dev->gnd_lock); + RETURN(0); + } + + /* stash the last map version to let us know when a good one was seen */ + last_map_version = dev->gnd_map_version; + + /* we need to to take the lock and continually refresh the head of the list as + * kgnilnd_complete_closed_conn might be nuking stuff and we are cycling the lock + * allowing them to squeeze in */ + + while (!list_empty(&dev->gnd_map_tx)) { + /* make sure we break out early on quiesce */ + if (unlikely(kgnilnd_data.kgn_quiesce_trigger)) { + /* always break with lock held - we unlock outside loop */ + break; + } + + tx = list_first_entry(&dev->gnd_map_tx, kgn_tx_t, tx_list); + + kgnilnd_tx_del_state_locked(tx, NULL, tx->tx_conn, GNILND_TX_ALLOCD); + found_work++; + + /* sample with lock held, serializing with kgnilnd_complete_closed_conn */ + if (tx->tx_conn->gnc_state != GNILND_CONN_ESTABLISHED) { + /* if conn is dying, mark tx in tx_ref_table for + * kgnilnd_complete_closed_conn to finish up */ + kgnilnd_tx_add_state_locked(tx, NULL, tx->tx_conn, GNILND_TX_DYING, 1); + found_work++; + + /* tx was moved to DYING, get next */ + continue; + } + + spin_unlock(&dev->gnd_lock); + rc = kgnilnd_send_mapped_tx(tx, 1); + + /* We made it! skip error handling.. */ + if (rc >= 0) { + /* OK to continue on +ve errors as it won't get seen until + * this function is called again - we operate on a copy of the original + * list and not the live list */ + spin_lock(&dev->gnd_lock); + continue; + } else if (rc != -ENOMEM) { + /* carp, failure we can't handle */ + kgnilnd_tx_done(tx, rc); + spin_lock(&dev->gnd_lock); + continue; + } + + /* time to handle the retry cases.. */ + tx->tx_retrans++; + if (tx->tx_retrans == 1) + tx->tx_qtime = jiffies; + + /* only log occasionally once we've retried max / 2 */ + log_retrans = (tx->tx_retrans >= (max_retrans / 2)) && + ((tx->tx_retrans % 32) == 0); + log_retrans_level = log_retrans ? 
D_NETERROR : D_NET; + + /* make sure we are not off in the weeds with this tx */ + if (tx->tx_retrans > *kgnilnd_tunables.kgn_max_retransmits) { + GNIDBG_TX(D_NETERROR, tx, + "giving up on TX, too many retries", NULL); + kgnilnd_tx_done(tx, -ENOMEM); + GOTO(get_out_mapped, rc); + } else { + GNIDBG_TX(log_retrans_level, tx, + "transient map failure #%d %d pages/%d bytes phys %u@%u " + "virt %u@"LPU64" " + "nq_map %d mdd# %d/%d GART %ld", + tx->tx_retrans, tx->tx_phys_npages, tx->tx_nob, + dev->gnd_map_nphys, dev->gnd_map_physnop * PAGE_SIZE, + dev->gnd_map_nvirt, dev->gnd_map_virtnob, + atomic_read(&dev->gnd_nq_map), + atomic_read(&dev->gnd_n_mdd), atomic_read(&dev->gnd_n_mdd_held), + atomic64_read(&dev->gnd_nbytes_map)); + } + + /* we need to stop processing the rest of the list, so add it back in */ + spin_lock(&dev->gnd_lock); + kgnilnd_tx_add_state_locked(tx, NULL, tx->tx_conn, GNILND_TX_MAPQ, 0); + spin_unlock(&dev->gnd_lock); + GOTO(get_out_mapped, rc); + } + spin_unlock(&dev->gnd_lock); +get_out_mapped: + RETURN(found_work); +} + +int +kgnilnd_process_conns(kgn_device_t *dev) +{ + int found_work = 0; + int conn_sched; + int intent = 0; + kgn_conn_t *conn; + + spin_lock(&dev->gnd_lock); + while (!list_empty(&dev->gnd_ready_conns)) { + dev->gnd_sched_alive = jiffies; + + if (unlikely(kgnilnd_data.kgn_quiesce_trigger)) { + /* break with lock held */ + break; + } + + conn = list_first_entry(&dev->gnd_ready_conns, kgn_conn_t, gnc_schedlist); + list_del_init(&conn->gnc_schedlist); + spin_unlock(&dev->gnd_lock); + + conn_sched = xchg(&conn->gnc_scheduled, GNILND_CONN_PROCESS); + + LASSERTF(conn_sched != GNILND_CONN_IDLE && + conn_sched != GNILND_CONN_PROCESS, + "conn %p on ready list but in bad state: %d\n", + conn, conn_sched); + + CDEBUG(D_INFO, "conn %p@%s for processing\n", + conn, kgnilnd_conn_state2str(conn)); + + found_work++; + set_mb(conn->gnc_last_sched_do, jiffies); + + if (kgnilnd_check_conn_fail_loc(dev, conn, &intent)) { + + /* based on intent see if we should run again. */ + kgnilnd_schedule_process_conn(conn, intent); + + /* drop ref from gnd_ready_conns */ + kgnilnd_conn_decref(conn); + /* clear this so that scheduler thread doesn't spin */ + found_work = 0; + /* break with lock held... 
*/ + spin_lock(&dev->gnd_lock); + break; + } + + if (unlikely(conn->gnc_state == GNILND_CONN_CLOSED)) { + /* CONN_CLOSED set in procces_fmaq when CLOSE is sent */ + kgnilnd_complete_closed_conn(conn); + } else if (unlikely(conn->gnc_state == GNILND_CONN_DESTROY_EP)) { + /* DESTROY_EP set in kgnilnd_conn_decref on gnc_refcount = 1 */ + /* serialize SMSG CQs with ep_bind and smsg_release */ + kgnilnd_destroy_conn_ep(conn); + } else if (unlikely(conn->gnc_state == GNILND_CONN_CLOSING)) { + /* if we need to do some CLOSE sending, etc done here do it */ + kgnilnd_send_conn_close(conn); + kgnilnd_check_fma_rx(conn); + } else if (atomic_read(&conn->gnc_peer->gnp_dirty_eps) == 0) { + /* start moving traffic if the old conns are cleared out */ + kgnilnd_check_fma_rx(conn); + kgnilnd_process_fmaq(conn); + } + + kgnilnd_schedule_process_conn(conn, 0); + + /* drop ref from gnd_ready_conns */ + kgnilnd_conn_decref(conn); + + /* check list again with lock held */ + spin_lock(&dev->gnd_lock); + } + spin_unlock(&dev->gnd_lock); + + RETURN(found_work); +} + +int +kgnilnd_scheduler(void *arg) +{ + int threadno = (long)arg; + kgn_device_t *dev; + char name[16]; + int busy_loops = 0; + DEFINE_WAIT(wait); + + dev = &kgnilnd_data.kgn_devices[(threadno + 1) % kgnilnd_data.kgn_ndevs]; + + snprintf(name, sizeof(name), "kgnilnd_sd_%02d", threadno); + cfs_daemonize(name); + cfs_block_allsigs(); + + /* all gnilnd threads need to run fairly urgently */ + set_user_nice(current, *kgnilnd_tunables.kgn_nice); + + while (!kgnilnd_data.kgn_shutdown) { + int found_work = 0; + /* Safe: kgn_shutdown only set when quiescent */ + + /* to quiesce or to not quiesce, that is the question */ + + if (unlikely(kgnilnd_data.kgn_quiesce_trigger)) { + KGNILND_SPIN_QUIESCE; + } + + /* tracking for when thread goes AWOL */ + dev->gnd_sched_alive = jiffies; + + /* let folks know we are up and kicking + * - they can use this for latency savings, etc + * - only change if IRQ, if IDLE leave alone as that + * schedule_device calls to put us back to IRQ */ + (void)cmpxchg(&dev->gnd_ready, GNILND_DEV_IRQ, GNILND_DEV_LOOP); + + /* always check these - they are super low cost */ + found_work += kgnilnd_check_fma_send_cq(dev); + found_work += kgnilnd_check_fma_rcv_cq(dev); + + /* rdma CQ doesn't care about eps */ + found_work += kgnilnd_check_rdma_cq(dev); + + /* move some RDMA ? */ + found_work += kgnilnd_process_rdmaq(dev); + + /* map some pending RDMA requests ? */ + found_work += kgnilnd_process_mapped_tx(dev); + + /* the EP for a conn is not destroyed until all the references + * to it are gone, so these checks should be safe + * even if run in parallel with the CQ checking functions + * _AND_ a thread that processes the CLOSED->DONE + * transistion + * ...should.... */ + + /* process all conns ready now */ + found_work += kgnilnd_process_conns(dev); + + /* do an eager check to avoid the IRQ disabling in + * prepare_to_wait and friends */ + + if (found_work && busy_loops++ < *kgnilnd_tunables.kgn_loops) { + found_work = 0; + if ((busy_loops % 10) == 0) { + /* tickle heartbeat and watchdog to ensure our + * piggishness doesn't turn into heartbeat failure */ + touch_nmi_watchdog(); + if (kgnilnd_hssops.hb_to_l0 != NULL) { + kgnilnd_hssops.hb_to_l0(); + } + } + continue; + } + + /* if we got here, found_work was zero or busy_loops means we + * need to take a break. 
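The scheduler loop here follows a common poll-then-sleep structure: accumulate found_work from several cheap checks, keep spinning while work keeps arriving and a loop budget remains, and only then block. A compressed standalone illustration of that loop shape (LOOP_BUDGET and the check_source_* helpers are invented for the sketch):

#include <stdio.h>

#define LOOP_BUDGET 100   /* stand-in for the kgn_loops tunable */

/* pretend work sources; each returns how much work it found */
static int check_source_a(void) { static int n = 3; return n ? n-- : 0; }
static int check_source_b(void) { return 0; }

static void scheduler_loop(void)
{
	int busy_loops = 0;

	for (;;) {
		int found_work = 0;

		found_work += check_source_a();
		found_work += check_source_b();

		/* eager path: keep polling while work keeps arriving
		 * and the loop budget has not been burned through */
		if (found_work && busy_loops++ < LOOP_BUDGET)
			continue;

		if (busy_loops >= LOOP_BUDGET) {
			/* give other threads a turn, then poll again */
			busy_loops = 0;
			continue;
		}

		/* idle: the real driver blocks on a waitqueue here */
		printf("idle, going to sleep\n");
		break;
	}
}

int main(void)
{
	scheduler_loop();
	return 0;
}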
We'll clear gnd_ready but we'll check + * one last time if there is an IRQ that needs processing */ + + prepare_to_wait(&dev->gnd_waitq, &wait, TASK_INTERRUPTIBLE); + + /* the first time this will go LOOP -> IDLE and let us do one final check + * during which we might get an IRQ, then IDLE->IDLE and schedule() + * - this might allow other threads to block us for a bit if they + * try to get the mutex, but that is good as we'd need to wake + * up soon to handle the CQ or other processing anyways */ + + found_work += xchg(&dev->gnd_ready, GNILND_DEV_IDLE); + + if (busy_loops >= *kgnilnd_tunables.kgn_loops) { + CDEBUG(D_INFO, + "yeilding: found_work %d busy_loops %d\n", + found_work, busy_loops); + busy_loops = 0; + /* use yield if we are bailing due to busy_loops + * - this will ensure we wake up soonish. This closes + * a race with kgnilnd_device_callback - where it'd + * not call wake_up() because gnd_ready == 1, but then + * we come down and schedule() because of busy_loops. + * We'd not be woken up until something poked our waitq + * again. yield() ensures we wake up without another + * waitq poke in that case */ + atomic_inc(&dev->gnd_n_yield); + yield(); + CDEBUG(D_INFO, "awake after yeild\n"); + } else if (found_work == GNILND_DEV_IDLE) { + /* busy_loops is low and there is nothing to do, + * go to sleep and wait for a waitq poke */ + CDEBUG(D_INFO, + "scheduling: found_work %d busy_loops %d\n", + found_work, busy_loops); + atomic_inc(&dev->gnd_n_schedule); + schedule(); + CDEBUG(D_INFO, "awake after schedule\n"); + } + finish_wait(&dev->gnd_waitq, &wait); + } + + kgnilnd_thread_fini(); + return 0; +} diff --git a/lnet/klnds/gnilnd/gnilnd_conn.c b/lnet/klnds/gnilnd/gnilnd_conn.c new file mode 100644 index 0000000..38aee5b --- /dev/null +++ b/lnet/klnds/gnilnd/gnilnd_conn.c @@ -0,0 +1,2408 @@ +/* + * Copyright (C) 2012 Cray, Inc. + * + * Author: Igor Gorodetsky + * Author: Nic Henke + * Author: James Shimek + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ * + */ + +#include "gnilnd.h" + +void +kgnilnd_setup_smsg_attr(gni_smsg_attr_t *smsg_attr) +{ + smsg_attr->mbox_maxcredit = *kgnilnd_tunables.kgn_mbox_credits; + smsg_attr->msg_maxsize = GNILND_MAX_MSG_SIZE; + smsg_attr->msg_type = GNI_SMSG_TYPE_MBOX_AUTO_RETRANSMIT; +} + +int +kgnilnd_map_fmablk(kgn_device_t *device, kgn_fma_memblock_t *fma_blk) +{ + gni_return_t rrc; + __u32 flags = GNI_MEM_READWRITE; + + if (fma_blk->gnm_state == GNILND_FMABLK_PHYS) { + flags |= GNI_MEM_PHYS_CONT; + } + + /* make sure we are mapping a clean block */ + LASSERTF(fma_blk->gnm_hndl.qword1 == 0UL, "fma_blk %p dirty\n", fma_blk); + + rrc = kgnilnd_mem_register(device->gnd_handle, (__u64)fma_blk->gnm_block, + fma_blk->gnm_blk_size, device->gnd_rcv_fma_cqh, + flags, &fma_blk->gnm_hndl); + if (rrc != GNI_RC_SUCCESS) { + /* XXX Nic: need a way to silence this for runtime stuff that is ok to fail + * -- like when under MDD or GART pressure on big systems + */ + CNETERR("register fmablk failed 0x%p mbox_size %d flags %u\n", + fma_blk, fma_blk->gnm_mbox_size, flags); + RETURN(-ENOMEM); + } + + /* PHYS_CONT memory isn't really mapped, at least not in GART - + * but all mappings chew up a MDD + */ + if (fma_blk->gnm_state != GNILND_FMABLK_PHYS) { + atomic64_add(fma_blk->gnm_blk_size, &device->gnd_nbytes_map); + } + + atomic_inc(&device->gnd_n_mdd); + /* nfmablk is live (mapped) blocks */ + atomic_inc(&device->gnd_nfmablk); + + RETURN(0); +} + +int +kgnilnd_alloc_fmablk(kgn_device_t *device, int use_phys) +{ + int rc = 0; + int num_mbox; + kgn_fma_memblock_t *fma_blk; + gni_smsg_attr_t smsg_attr; + unsigned long fmablk_vers; + + /* we'll use fmablk_vers and the gnd_fmablk_sem to gate access + * to this allocation code. Everyone will sample the version + * before and after getting the semaphore. If it has changed, + * we'll bail out to check the lists again - this indicates that + * some sort of change was made to the lists and it is possible + * that there is a mailbox for us to find now. This should prevent + * a ton of spinning in the case where there are lots of threads + * that need a yet-to-be-allocated mailbox for a connection. */ + + fmablk_vers = atomic_read(&device->gnd_fmablk_vers); + down(&device->gnd_fmablk_sem); + + if (fmablk_vers != atomic_read(&device->gnd_fmablk_vers)) { + /* version changed while we were waiting for semaphore, + * we'll recheck the lists assuming something nice happened */ + up(&device->gnd_fmablk_sem); + return 0; + } + + LIBCFS_ALLOC(fma_blk, sizeof(kgn_fma_memblock_t)); + if (fma_blk == NULL) { + CNETERR("could not allocate fma block descriptor\n"); + rc = -ENOMEM; + GOTO(out, rc); + } + + INIT_LIST_HEAD(&fma_blk->gnm_bufflist); + + kgnilnd_setup_smsg_attr(&smsg_attr); + + gni_smsg_buff_size_needed(&smsg_attr, &fma_blk->gnm_mbox_size); + + LASSERTF(fma_blk->gnm_mbox_size, "mbox size %d\n", fma_blk->gnm_mbox_size); + + /* gni_smsg_buff_size_needed calculates the base mailbox size and since + * we want to hold kgn_peer_credits worth of messages in both directions, + * we add PAYLOAD to grow the mailbox size + */ + + fma_blk->gnm_mbox_size += GNILND_MBOX_PAYLOAD; + + /* we'll only use physical during preallocate at startup -- this keeps it nice and + * clean for runtime decisions. 
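kgnilnd_alloc_fmablk gates allocation on a version counter sampled before and after taking the semaphore, so a thread that slept while someone else changed the block lists goes back and rechecks instead of allocating redundantly. A standalone pthread sketch of that double-check pattern (alloc_block_gated and block_version are illustrative names):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t alloc_lock = PTHREAD_MUTEX_INITIALIZER;
static int block_version;     /* bumped whenever the block list changes */
static int blocks_allocated;

/* allocate a new block only if nobody else changed the list while we
 * were waiting for the lock; otherwise tell the caller to re-scan */
static int alloc_block_gated(void)
{
	int seen = __atomic_load_n(&block_version, __ATOMIC_ACQUIRE);

	pthread_mutex_lock(&alloc_lock);
	if (seen != __atomic_load_n(&block_version, __ATOMIC_ACQUIRE)) {
		/* someone allocated/freed while we slept on the lock:
		 * recheck the lists before allocating another block */
		pthread_mutex_unlock(&alloc_lock);
		return 1;
	}

	blocks_allocated++;
	__atomic_add_fetch(&block_version, 1, __ATOMIC_RELEASE);
	pthread_mutex_unlock(&alloc_lock);
	return 0;
}

int main(void)
{
	printf("first alloc rc=%d (allocated=%d)\n",
	       alloc_block_gated(), blocks_allocated);
	printf("second alloc rc=%d (allocated=%d)\n",
	       alloc_block_gated(), blocks_allocated);
	return 0;
}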
We'll keep the PHYS ones around until shutdown + * as reallocating them is tough if there is memory fragmentation */ + + if (use_phys) { + fma_blk->gnm_block = cfs_mem_cache_alloc(kgnilnd_data.kgn_mbox_cache, CFS_ALLOC_ATOMIC); + if (fma_blk->gnm_block == NULL) { + CNETERR("could not allocate physical SMSG mailbox memory\n"); + rc = -ENOMEM; + GOTO(free_desc, rc); + } + fma_blk->gnm_blk_size = KMALLOC_MAX_SIZE; + num_mbox = fma_blk->gnm_blk_size / fma_blk->gnm_mbox_size; + + LASSERTF(num_mbox >= 1, + "num_mbox %d blk_size %u mbox_size %d\n", + num_mbox, fma_blk->gnm_blk_size, fma_blk->gnm_mbox_size); + + fma_blk->gnm_state = GNILND_FMABLK_PHYS; + + } else { + num_mbox = *kgnilnd_tunables.kgn_mbox_per_block; + fma_blk->gnm_blk_size = num_mbox * fma_blk->gnm_mbox_size; + + LASSERTF(num_mbox >= 1 && num_mbox >= *kgnilnd_tunables.kgn_mbox_per_block, + "num_mbox %d blk_size %u mbox_size %d tunable %d\n", + num_mbox, fma_blk->gnm_blk_size, fma_blk->gnm_mbox_size, + *kgnilnd_tunables.kgn_mbox_per_block); + + LIBCFS_ALLOC(fma_blk->gnm_block, fma_blk->gnm_blk_size); + if (fma_blk->gnm_block == NULL) { + CNETERR("could not allocate virtual SMSG mailbox memory, %d bytes\n", fma_blk->gnm_blk_size); + rc = -ENOMEM; + GOTO(free_desc, rc); + } + + fma_blk->gnm_state = GNILND_FMABLK_VIRT; + } + + /* allocate just enough space for the bits to track the mailboxes */ + LIBCFS_ALLOC(fma_blk->gnm_bit_array, BITS_TO_LONGS(num_mbox) * sizeof(unsigned long)); + if (fma_blk->gnm_bit_array == NULL) { + CNETERR("could not allocate mailbox bitmask, %lu bytes for %d mbox\n", + sizeof(unsigned long) * BITS_TO_LONGS(num_mbox), num_mbox); + rc = -ENOMEM; + GOTO(free_blk, rc); + } + bitmap_zero(fma_blk->gnm_bit_array, num_mbox); + + /* now that the num_mbox is set based on allocation type, get debug info setup */ + LIBCFS_ALLOC(fma_blk->gnm_mbox_info, sizeof(kgn_mbox_info_t) * num_mbox); + if (fma_blk->gnm_mbox_info == NULL) { + CNETERR("could not allocate mailbox debug, %lu bytes for %d mbox\n", + sizeof(kgn_mbox_info_t) * num_mbox, num_mbox); + rc = -ENOMEM; + GOTO(free_bit, rc); + } + + rc = kgnilnd_map_fmablk(device, fma_blk); + if (rc) { + GOTO(free_info, rc); + } + + fma_blk->gnm_next_avail_mbox = 0; + fma_blk->gnm_avail_mboxs = fma_blk->gnm_num_mboxs = num_mbox; + + CDEBUG(D_MALLOC, "alloc fmablk 0x%p num %d msg_maxsize %d credits %d " + "mbox_size %d MDD "LPX64"."LPX64"\n", + fma_blk, num_mbox, smsg_attr.msg_maxsize, smsg_attr.mbox_maxcredit, + fma_blk->gnm_mbox_size, fma_blk->gnm_hndl.qword1, + fma_blk->gnm_hndl.qword2); + + /* lock Is protecting data structures, not semaphore */ + + spin_lock(&device->gnd_fmablk_lock); + list_add_tail(&fma_blk->gnm_bufflist, &device->gnd_fma_buffs); + + /* toggle under the lock so once they change the list is also + * ready for others to traverse */ + atomic_inc(&device->gnd_fmablk_vers); + + spin_unlock(&device->gnd_fmablk_lock); + + up(&device->gnd_fmablk_sem); + + return 0; + +free_info: + LIBCFS_FREE(fma_blk->gnm_mbox_info, sizeof(kgn_mbox_info_t)*num_mbox); +free_bit: + LIBCFS_FREE(fma_blk->gnm_bit_array, BITS_TO_LONGS(num_mbox) * sizeof (unsigned long)); +free_blk: + if (fma_blk->gnm_state == GNILND_FMABLK_VIRT) { + LIBCFS_FREE(fma_blk->gnm_block, fma_blk->gnm_blk_size); + } else { + cfs_mem_cache_free(kgnilnd_data.kgn_mbox_cache, fma_blk->gnm_block); + } +free_desc: + LIBCFS_FREE(fma_blk, sizeof(kgn_fma_memblock_t)); +out: + up(&device->gnd_fmablk_sem); + return rc; +} + +void +kgnilnd_unmap_fmablk(kgn_device_t *dev, kgn_fma_memblock_t *fma_blk) +{ + gni_return_t rrc; + + 
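The free_info/free_bit/free_blk/free_desc tail above is the standard kernel-style goto unwind: each acquisition gets a label, and a failure jumps to the label that releases exactly what was obtained before it. A compact standalone example of the same ladder (setup_block and struct block are invented for the sketch):

#include <stdio.h>
#include <stdlib.h>

struct block {
	char *mem;
	char *bitmap;
	char *info;
};

/* each allocation gets a cleanup label; a failure jumps to the label
 * that releases only what was acquired before it */
static int setup_block(struct block *b, size_t nbytes)
{
	int rc = -1;

	b->mem = malloc(nbytes);
	if (b->mem == NULL)
		goto out;

	b->bitmap = malloc(nbytes / 8 + 1);
	if (b->bitmap == NULL)
		goto free_mem;

	b->info = malloc(nbytes / 64 + 1);
	if (b->info == NULL)
		goto free_bitmap;

	return 0;          /* success: caller owns all three buffers */

free_bitmap:
	free(b->bitmap);
free_mem:
	free(b->mem);
out:
	return rc;
}

int main(void)
{
	struct block b;

	if (setup_block(&b, 4096) == 0) {
		printf("block ready\n");
		free(b.info);
		free(b.bitmap);
		free(b.mem);
	}
	return 0;
}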
/* if some held, set hold_timeout from conn timeouts used in this block + * but not during shutdown, then just nuke and pave */ + if (fma_blk->gnm_held_mboxs && (!kgnilnd_data.kgn_shutdown)) { + fma_blk->gnm_hold_timeout = GNILND_TIMEOUT2DEADMAN; + } + + /* we are changing the state of a block, tickle version to tell + * proc code list is stale now */ + atomic_inc(&dev->gnd_fmablk_vers); + + rrc = kgnilnd_mem_deregister(dev->gnd_handle, &fma_blk->gnm_hndl, fma_blk->gnm_hold_timeout); + + CDEBUG(rrc == GNI_RC_SUCCESS ? D_MALLOC : D_CONSOLE|D_NETERROR, + "unmap fmablk 0x%p@%s sz %u total %d avail %d held %d mbox_size %d " + "hold_timeout %d\n", + fma_blk, kgnilnd_fmablk_state2str(fma_blk->gnm_state), + fma_blk->gnm_blk_size, fma_blk->gnm_num_mboxs, + fma_blk->gnm_avail_mboxs, fma_blk->gnm_held_mboxs, + fma_blk->gnm_mbox_size, fma_blk->gnm_hold_timeout); + + LASSERTF(rrc == GNI_RC_SUCCESS, + "tried to double unmap or something bad, fma_blk %p (rrc %d)\n", + fma_blk, rrc); + + if (fma_blk->gnm_hold_timeout) { + atomic_inc(&dev->gnd_n_mdd_held); + } else { + atomic_dec(&dev->gnd_n_mdd); + } + + /* PHYS blocks don't get mapped */ + if (fma_blk->gnm_state != GNILND_FMABLK_PHYS) { + atomic64_sub(fma_blk->gnm_blk_size, &dev->gnd_nbytes_map); + } else if (kgnilnd_data.kgn_in_reset) { + /* in stack reset, clear MDD handle for PHYS blocks, as we'll + * re-use the fma_blk after reset so we don't have to drop/allocate + * all of those physical blocks */ + fma_blk->gnm_hndl.qword1 = fma_blk->gnm_hndl.qword2 = 0UL; + } + + /* Decrement here as this is the # of mapped blocks */ + atomic_dec(&dev->gnd_nfmablk); +} + + +/* needs lock on gnd_fmablk_lock to cover gnd_fma_buffs */ +void +kgnilnd_free_fmablk_locked(kgn_device_t *dev, kgn_fma_memblock_t *fma_blk) +{ + LASSERTF(fma_blk->gnm_avail_mboxs == fma_blk->gnm_num_mboxs, + "fma_blk %p@%d free in bad state (%d): blk total %d avail %d held %d\n", + fma_blk, fma_blk->gnm_state, fma_blk->gnm_hold_timeout, fma_blk->gnm_num_mboxs, + fma_blk->gnm_avail_mboxs, fma_blk->gnm_held_mboxs); + + atomic_inc(&dev->gnd_fmablk_vers); + + if (fma_blk->gnm_hold_timeout) { + CDEBUG(D_MALLOC, "mdd release fmablk 0x%p sz %u avail %d held %d " + "mbox_size %d\n", + fma_blk, fma_blk->gnm_blk_size, fma_blk->gnm_avail_mboxs, + fma_blk->gnm_held_mboxs, fma_blk->gnm_mbox_size); + + /* We leave MDD dangling over stack reset */ + if (!kgnilnd_data.kgn_in_reset) { + kgnilnd_mem_mdd_release(dev->gnd_handle, &fma_blk->gnm_hndl); + } + /* ignoring the return code - if kgni/ghal can't find it + * it must be released already */ + atomic_dec(&dev->gnd_n_mdd_held); + atomic_dec(&dev->gnd_n_mdd); + } + + /* we cant' free the gnm_block until all the conns have released their + * purgatory holds. While we have purgatory holds, we might check the conn + * RX mailbox during the CLOSING process. 
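The hold-timeout choice above boils down to one decision: if mailboxes are still parked for closing peers and this is not a full shutdown, ask for the memory registration to be kept alive for a grace period, otherwise drop it immediately. A trivial standalone sketch of that decision (DEADMAN_TIMEOUT is a stand-in value):

#include <stdio.h>

#define DEADMAN_TIMEOUT 300   /* seconds; stand-in for the deadman value */

/* pick the hold timeout for deregistering a mailbox block: keep the
 * registration alive while any mailbox is parked for a closing peer,
 * unless the whole stack is being torn down anyway */
static int dereg_hold_timeout(int held_mboxs, int shutting_down)
{
	if (held_mboxs && !shutting_down)
		return DEADMAN_TIMEOUT;
	return 0;
}

int main(void)
{
	printf("held, running:  hold %d s\n", dereg_hold_timeout(3, 0));
	printf("held, shutdown: hold %d s\n", dereg_hold_timeout(3, 1));
	printf("idle, running:  hold %d s\n", dereg_hold_timeout(0, 0));
	return 0;
}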
It is possible that kgni might + * try to look into the RX side for credits when sending the CLOSE msg too */ + CDEBUG(D_MALLOC, "fmablk %p free buffer %p mbox_size %d\n", + fma_blk, fma_blk->gnm_block, fma_blk->gnm_mbox_size); + + if (fma_blk->gnm_state == GNILND_FMABLK_PHYS) { + cfs_mem_cache_free(kgnilnd_data.kgn_mbox_cache, fma_blk->gnm_block); + } else { + LIBCFS_FREE(fma_blk->gnm_block, fma_blk->gnm_blk_size); + } + fma_blk->gnm_state = GNILND_FMABLK_FREED; + + list_del(&fma_blk->gnm_bufflist); + + LIBCFS_FREE(fma_blk->gnm_mbox_info, sizeof(kgn_mbox_info_t)*fma_blk->gnm_num_mboxs); + LIBCFS_FREE(fma_blk->gnm_bit_array, BITS_TO_LONGS(fma_blk->gnm_num_mboxs) * sizeof (unsigned long)); + LIBCFS_FREE(fma_blk, sizeof(kgn_fma_memblock_t)); +} + +void +kgnilnd_find_free_mbox(kgn_conn_t *conn) +{ + kgn_device_t *dev = conn->gnc_device; + gni_smsg_attr_t *smsg_attr = &conn->gnpr_smsg_attr; + kgn_fma_memblock_t *fma_blk; + kgn_mbox_info_t *mbox = NULL; + int id; + + spin_lock(&dev->gnd_fmablk_lock); + + list_for_each_entry(fma_blk, &conn->gnc_device->gnd_fma_buffs, + gnm_bufflist) { + if (fma_blk->gnm_avail_mboxs <= 0 || + fma_blk->gnm_state <= GNILND_FMABLK_IDLE) { + continue; + } + /* look in bitarray for available mailbox */ + do { + id = find_next_zero_bit( + fma_blk->gnm_bit_array, + fma_blk->gnm_num_mboxs, + fma_blk->gnm_next_avail_mbox); + if (id == fma_blk->gnm_num_mboxs && + fma_blk->gnm_next_avail_mbox != 0) { + /* wrap around */ + fma_blk->gnm_next_avail_mbox = 0; + } else { + break; + } + } while (1); + + LASSERTF(id < fma_blk->gnm_num_mboxs, "id %d max %d\n", + id, fma_blk->gnm_num_mboxs); + set_bit(id, (volatile unsigned long *)fma_blk->gnm_bit_array); + conn->gnc_mbox_id = id; + + fma_blk->gnm_next_avail_mbox = + (id == (fma_blk->gnm_num_mboxs - 1)) ? 0 : (id + 1); + fma_blk->gnm_avail_mboxs--; + conn->gnc_fma_blk = fma_blk; + + kgnilnd_setup_smsg_attr(smsg_attr); + + smsg_attr->msg_buffer = fma_blk->gnm_block; + smsg_attr->mbox_offset = fma_blk->gnm_mbox_size * id; + smsg_attr->mem_hndl = fma_blk->gnm_hndl; + smsg_attr->buff_size = fma_blk->gnm_mbox_size; + + /* We'll set the hndl to zero for PHYS blocks unmapped during stack + * reset and re-use the same fma_blk after stack reset. This ensures we've + * properly mapped it before we use it */ + LASSERTF(fma_blk->gnm_hndl.qword1 != 0UL, "unmapped fma_blk %p, state %d\n", + fma_blk, fma_blk->gnm_state); + + CDEBUG(D_NET, "conn %p smsg %p fmablk %p " + "allocating SMSG mbox %d buf %p " + "offset %u hndl "LPX64"."LPX64"\n", + conn, smsg_attr, fma_blk, id, + smsg_attr->msg_buffer, smsg_attr->mbox_offset, + fma_blk->gnm_hndl.qword1, + fma_blk->gnm_hndl.qword2); + + mbox = &fma_blk->gnm_mbox_info[id]; + mbox->mbx_create_conn_memset = jiffies; + + /* zero mbox to remove any old data from our last use. 
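kgnilnd_find_free_mbox allocates mailboxes out of a bitmap using a rotating next-free hint with a single wrap back to zero, which spreads reuse across the block instead of always handing out the lowest slot. A standalone sketch of that allocator (mbox_alloc/mbox_free and the 16-slot bitmap are illustrative):

#include <stdio.h>

#define NUM_MBOX 16

static unsigned int bitmap;          /* bit i set => mailbox i in use */
static int next_avail;               /* rotating search hint */

/* find a free mailbox starting at the hint, wrapping once to 0;
 * returns the slot id or -1 if every mailbox is taken */
static int mbox_alloc(void)
{
	int i, id;

	for (i = 0; i < NUM_MBOX; i++) {
		id = (next_avail + i) % NUM_MBOX;
		if (!(bitmap & (1u << id))) {
			bitmap |= 1u << id;
			next_avail = (id + 1) % NUM_MBOX;
			return id;
		}
	}
	return -1;
}

static void mbox_free(int id)
{
	bitmap &= ~(1u << id);
}

int main(void)
{
	int a = mbox_alloc(), b = mbox_alloc();

	printf("got %d and %d\n", a, b);
	mbox_free(a);
	printf("after freeing %d, next alloc gives %d\n", a, mbox_alloc());
	return 0;
}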
+ * this better be safe, if not our purgatory timers + * are too short or a peer really is misbehaving */ + memset(smsg_attr->msg_buffer + smsg_attr->mbox_offset, + 0, smsg_attr->buff_size); + break; + } + + spin_unlock(&dev->gnd_fmablk_lock); +} + +int +kgnilnd_setup_mbox(kgn_conn_t *conn) +{ + gni_smsg_attr_t *smsg_attr = &conn->gnpr_smsg_attr; + int err = 0; + + smsg_attr->msg_buffer = NULL; + /* Look for available mbox */ + do { + kgnilnd_find_free_mbox(conn); + + /* nothing in the existing buffers, make a new one */ + if (smsg_attr->msg_buffer == NULL) { + /* for runtime allocations, we only want vmalloc */ + err = kgnilnd_alloc_fmablk(conn->gnc_device, 0); + if (err) { + break; + } + } + } while (smsg_attr->msg_buffer == NULL); + + if (err) + CNETERR("couldn't allocate SMSG mbox for conn %p Error: %d\n", + conn, err); + return err; +} + +void +kgnilnd_release_mbox(kgn_conn_t *conn, int purgatory_hold) +{ + kgn_device_t *dev = conn->gnc_device; + gni_smsg_attr_t *smsg_attr = &conn->gnpr_smsg_attr; + kgn_fma_memblock_t *fma_blk = NULL; + kgn_mbox_info_t *mbox = NULL; + int found = 0; + int id; + + /* if we failed to setup mbox and now destroying conn */ + if (smsg_attr->msg_buffer == NULL) { + return; + } + + id = conn->gnc_mbox_id; + + spin_lock(&dev->gnd_fmablk_lock); + /* make sure our conn points at a valid fma_blk + * We use this instead of a mem block search out of smsg_attr + * because we could have freed a block for fma_blk #1 but the fma_blk + * is still in the list for a purgatory hold. This would induce a false + * match if that same block gets reallocated to fma_blk #2 */ + list_for_each_entry(fma_blk, &dev->gnd_fma_buffs, gnm_bufflist) { + if (fma_blk == conn->gnc_fma_blk) { + found = 1; + break; + } + } + LASSERTF(found, "unable to find conn 0x%p with gnc_fma_blk %p " + "anywhere in the world\n", conn, conn->gnc_fma_blk); + + LASSERTF(id < fma_blk->gnm_num_mboxs, + "bad id %d max %d\n", + id, fma_blk->gnm_num_mboxs); + + /* < 0 - was held, now free it + * == 0 - just free it + * > 0 - hold it for now */ + if (purgatory_hold == 0) { + CDEBUG(D_NET, "conn %p smsg %p fmablk %p freeing SMSG mbox %d " + "hndl "LPX64"."LPX64"\n", + conn, smsg_attr, fma_blk, id, + fma_blk->gnm_hndl.qword1, fma_blk->gnm_hndl.qword2); + fma_blk->gnm_avail_mboxs++; + + } else if (purgatory_hold > 0) { + CDEBUG(D_NET, "conn %p smsg %p fmablk %p holding SMSG mbox %d " + "hndl "LPX64"."LPX64"\n", + conn, smsg_attr, fma_blk, id, + fma_blk->gnm_hndl.qword1, fma_blk->gnm_hndl.qword2); + + fma_blk->gnm_held_mboxs++; + fma_blk->gnm_max_timeout = MAX(fma_blk->gnm_max_timeout, + conn->gnc_timeout); + } else { + CDEBUG(D_NET, "conn %p smsg %p fmablk %p release SMSG mbox %d " + "hndl "LPX64"."LPX64"\n", + conn, smsg_attr, fma_blk, id, + fma_blk->gnm_hndl.qword1, fma_blk->gnm_hndl.qword2); + + fma_blk->gnm_held_mboxs--; + fma_blk->gnm_avail_mboxs++; + } + + if (purgatory_hold <= 0) { + /* if kgni is retransmitting, freeing the smsg block before the EP + * is destroyed gets messy. Bug 768295. */ + LASSERTF(conn->gnc_ephandle == NULL, + "can't release mbox before EP is nuked. 
conn 0x%p\n", conn); + + mbox = &fma_blk->gnm_mbox_info[id]; + mbox->mbx_release_from_purgatory = jiffies; + + /* clear conn gnc_fmablk if it is gone - this allows us to + * not worry about state so much in kgnilnd_destroy_conn + * and makes the guaranteed cleanup of the resources easier */ + LASSERTF(test_and_clear_bit(id, fma_blk->gnm_bit_array), + "conn %p bit %d already cleared in fma_blk %p\n", + conn, id, fma_blk); + conn->gnc_fma_blk = NULL; + } + + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_FMABLK_AVAIL)) { + CERROR("LBUGs in your future: forcibly marking fma_blk %p " + "as mapped\n", fma_blk); + fma_blk->gnm_state = GNILND_FMABLK_VIRT; + } + + /* we don't release or unmap PHYS blocks as part of the normal cycle -- + * those are controlled manually from startup/shutdown */ + if (fma_blk->gnm_state != GNILND_FMABLK_PHYS) { + /* we can unmap once all are unused (held or avail) + * but check hold_timeout to make sure we are not trying to double + * unmap this buffer. If there was no hold_timeout set due to + * held_mboxs, we'll free the mobx here shortly and won't have to + * worry about catching a double free for a 'clean' fma_blk */ + if (((fma_blk->gnm_avail_mboxs + fma_blk->gnm_held_mboxs) == fma_blk->gnm_num_mboxs) && + (!fma_blk->gnm_hold_timeout)) { + kgnilnd_unmap_fmablk(dev, fma_blk); + } + + /* But we can only free once they are all avail */ + if (fma_blk->gnm_avail_mboxs == fma_blk->gnm_num_mboxs && + fma_blk->gnm_held_mboxs == 0) { + /* all mailboxes are released, free fma_blk */ + kgnilnd_free_fmablk_locked(dev, fma_blk); + } + } + + spin_unlock(&dev->gnd_fmablk_lock); +} + +int +kgnilnd_count_phys_mbox(kgn_device_t *device) +{ + int i = 0; + kgn_fma_memblock_t *fma_blk; + + spin_lock(&device->gnd_fmablk_lock); + + list_for_each_entry(fma_blk, &device->gnd_fma_buffs, gnm_bufflist) { + if (fma_blk->gnm_state == GNILND_FMABLK_PHYS) + i += fma_blk->gnm_num_mboxs; + } + spin_unlock(&device->gnd_fmablk_lock); + + RETURN(i); +} + +int +kgnilnd_allocate_phys_fmablk(kgn_device_t *device) +{ + int rc; + + while (kgnilnd_count_phys_mbox(device) < *kgnilnd_tunables.kgn_nphys_mbox) { + + rc = kgnilnd_alloc_fmablk(device, 1); + if (rc) { + CERROR("failed phys mbox allocation, stopping at %d, rc %d\n", + kgnilnd_count_phys_mbox(device), rc); + RETURN(rc); + } + } + RETURN(0); +} + +int +kgnilnd_map_phys_fmablk(kgn_device_t *device) +{ + + int rc = 0; + kgn_fma_memblock_t *fma_blk; + + /* use sem to gate access to single thread, just in case */ + down(&device->gnd_fmablk_sem); + + spin_lock(&device->gnd_fmablk_lock); + + list_for_each_entry(fma_blk, &device->gnd_fma_buffs, gnm_bufflist) { + if (fma_blk->gnm_state == GNILND_FMABLK_PHYS) + rc = kgnilnd_map_fmablk(device, fma_blk); + if (rc) + break; + } + spin_unlock(&device->gnd_fmablk_lock); + + up(&device->gnd_fmablk_sem); + + RETURN(rc); +} + +void +kgnilnd_unmap_phys_fmablk(kgn_device_t *device) +{ + + kgn_fma_memblock_t *fma_blk; + + /* use sem to gate access to single thread, just in case */ + down(&device->gnd_fmablk_sem); + + spin_lock(&device->gnd_fmablk_lock); + + list_for_each_entry(fma_blk, &device->gnd_fma_buffs, gnm_bufflist) { + if (fma_blk->gnm_state == GNILND_FMABLK_PHYS) + kgnilnd_unmap_fmablk(device, fma_blk); + } + spin_unlock(&device->gnd_fmablk_lock); + + up(&device->gnd_fmablk_sem); +} + +void +kgnilnd_free_phys_fmablk(kgn_device_t *device) +{ + + kgn_fma_memblock_t *fma_blk, *fma_blkN; + + /* use sem to gate access to single thread, just in case */ + down(&device->gnd_fmablk_sem); + + spin_lock(&device->gnd_fmablk_lock); + + 
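kgnilnd_release_mbox keys off a tri-state purgatory argument (greater than zero means hold, zero means plain release, less than zero means a former hold is now really free) and uses the avail/held counters to decide when a block may be unmapped and when it may finally be freed. A simplified standalone model of that accounting (struct blk and mbox_release are invented; the real code also tracks a deregistration hold timeout):

#include <stdio.h>

struct blk {
	int num;     /* total mailboxes in the block */
	int avail;   /* free for reuse */
	int held;    /* parked in "purgatory" for a closing peer */
};

/* purgatory argument mirrors the tri-state convention:
 *   > 0  keep the mailbox held for a while (peer may still write)
 *   == 0 straight release back to the free pool
 *   < 0  a previously held mailbox is now really free            */
static void mbox_release(struct blk *b, int purgatory)
{
	if (purgatory > 0) {
		b->held++;
	} else if (purgatory < 0) {
		b->held--;
		b->avail++;
	} else {
		b->avail++;
	}

	/* the block may be unmapped once nothing is outstanding, and
	 * freed only once even the held mailboxes have drained */
	if (b->avail + b->held == b->num)
		printf("all returned: safe to unmap (avail %d held %d)\n",
		       b->avail, b->held);
	if (b->avail == b->num)
		printf("all available: safe to free the block\n");
}

int main(void)
{
	struct blk b = { .num = 2 };

	mbox_release(&b, 1);    /* conn closed, mailbox goes to purgatory */
	mbox_release(&b, 0);    /* clean release of the other mailbox */
	mbox_release(&b, -1);   /* purgatory hold expires */
	return 0;
}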
list_for_each_entry_safe(fma_blk, fma_blkN, &device->gnd_fma_buffs, gnm_bufflist) { + if (fma_blk->gnm_state == GNILND_FMABLK_PHYS) + kgnilnd_free_fmablk_locked(device, fma_blk); + } + spin_unlock(&device->gnd_fmablk_lock); + + up(&device->gnd_fmablk_sem); +} + +/* kgnilnd dgram nid->struct managment */ + +static inline struct list_head * +kgnilnd_nid2dgramlist(kgn_device_t *dev, lnet_nid_t nid) +{ + unsigned int hash = ((unsigned int)nid) % *kgnilnd_tunables.kgn_peer_hash_size; + + RETURN(&dev->gnd_dgrams[hash]); +} + + +/* needs dev->gnd_dgram_lock held */ +kgn_dgram_t * +kgnilnd_find_dgram_locked(kgn_device_t *dev, lnet_nid_t dst_nid) +{ + struct list_head *dgram_list = kgnilnd_nid2dgramlist(dev, dst_nid); + kgn_dgram_t *dgram; + + list_for_each_entry(dgram, dgram_list, gndg_list) { + + /* if state > POSTED, we are already handling cancel/completion */ + if ((dgram->gndg_conn_out.gncr_dstnid != dst_nid) || + dgram->gndg_state > GNILND_DGRAM_POSTED) + continue; + + CDEBUG(D_NET, "got dgram [%p] -> %s\n", + dgram, libcfs_nid2str(dst_nid)); + return dgram; + } + return NULL; +} + +int +kgnilnd_find_and_cancel_dgram(kgn_device_t *dev, lnet_nid_t dst_nid) +{ + kgn_dgram_t *dgram; + + spin_lock(&dev->gnd_dgram_lock); + dgram = kgnilnd_find_dgram_locked(dev, dst_nid); + + if (dgram) { + kgnilnd_cancel_dgram_locked(dgram); + } + spin_unlock(&dev->gnd_dgram_lock); + + RETURN(!!(dgram == NULL)); +} + +int +kgnilnd_pack_connreq(kgn_connreq_t *connreq, kgn_conn_t *conn, + lnet_nid_t srcnid, lnet_nid_t dstnid, + kgn_connreq_type_t type) +{ + int err = 0; + + /* ensure we haven't violated max datagram size */ + CLASSERT(sizeof(kgn_connreq_t) <= GNI_DATAGRAM_MAXSIZE); + + /* no need to zero out, we do that when allocating dgram */ + connreq->gncr_magic = GNILND_MSG_MAGIC; + + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_PACK_SRCNID)) { + srcnid = 0xABADBABE; + } else if (CFS_FAIL_CHECK(CFS_FAIL_GNI_PACK_DSTNID)) { + dstnid = 0xDEFEC8ED; + } + + connreq->gncr_srcnid = srcnid; + connreq->gncr_dstnid = dstnid; + + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CONNREQ_PROTO)) { + connreq->gncr_version = 99; + } else { + connreq->gncr_version = GNILND_CONNREQ_VERSION; + } + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CONNREQ_PROTO)) { + connreq->gncr_type = 99; + } else { + connreq->gncr_type = type; + } + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CONNREQ_PROTO)) { + connreq->gncr_peerstamp = 0; + } else { + connreq->gncr_peerstamp = kgnilnd_data.kgn_peerstamp; + } + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CONNREQ_PROTO)) { + connreq->gncr_connstamp = 0; + } else { + connreq->gncr_connstamp = conn->gnc_my_connstamp; + } + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CONNREQ_PROTO)) { + connreq->gncr_timeout = 0; + } else { + connreq->gncr_timeout = conn->gnc_timeout; + } + + /* the rest pack the data into the payload in other places */ + if (type == GNILND_CONNREQ_REQ) { + kgn_gniparams_t *req_params = &connreq->gncr_gnparams; + req_params->gnpr_host_id = conn->gnc_device->gnd_host_id; + req_params->gnpr_cqid = conn->gnc_cqid; + + /* allocate mailbox for this connection */ + err = kgnilnd_setup_mbox(conn); + if (err != 0) { + CERROR("Failed to setup FMA mailbox (%d)\n", err); + } + req_params->gnpr_smsg_attr = conn->gnpr_smsg_attr; + } + + /* XXX Nic: TBD - checksum computation */ + + return err; +} + +int +kgnilnd_unpack_connreq(kgn_dgram_t *dgram) +{ + kgn_connreq_t *connreq = &dgram->gndg_conn_in; + int swab, rc = 0; + kgn_net_t *net; + + /* the following fields must be handled in a backwards compatible + * manner to ensure we can always send and interpret NAKs */ + + if 
(connreq->gncr_magic != GNILND_MSG_MAGIC && + connreq->gncr_magic != __swab32(GNILND_MSG_MAGIC)) { + /* Unexpected magic! */ + CERROR("Unexpected magic %08x\n", + connreq->gncr_magic); + return -EBADF; + } + + swab = (connreq->gncr_magic == __swab32(GNILND_MSG_MAGIC)); + if (swab) { + __swab32s(&connreq->gncr_magic); + __swab32s(&connreq->gncr_cksum); + __swab16s(&connreq->gncr_type); + __swab16s(&connreq->gncr_version); + __swab32s(&connreq->gncr_timeout); + __swab64s(&connreq->gncr_srcnid); + __swab64s(&connreq->gncr_dstnid); + __swab64s(&connreq->gncr_peerstamp); + __swab64s(&connreq->gncr_connstamp); + } + + /* Do NOT return anything but -EBADF before we munge + * connreq->gncr_srcnid - we need that to send the nak */ + + if (dgram->gndg_conn_out.gncr_dstnid != LNET_NID_ANY) { + lnet_nid_t incoming = connreq->gncr_srcnid; + + /* even if the incoming packet is hosed, we know who we sent + * the original and can set the srcnid so that we can properly + * look up our peer to close the loop on this connreq. We still use + * -EBADF to prevent a NAK - just in case there are issues with + * the payload coming from a random spot, etc. */ + connreq->gncr_srcnid = dgram->gndg_conn_out.gncr_dstnid; + + if (LNET_NIDADDR(dgram->gndg_conn_out.gncr_dstnid) != + LNET_NIDADDR(incoming)) { + /* we got a datagram match for the wrong nid... */ + CERROR("matched datagram 0x%p with srcnid %s " + "(%x), expecting %s (%x)\n", + dgram, + libcfs_nid2str(incoming), + LNET_NIDADDR(incoming), + libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid), + LNET_NIDADDR(dgram->gndg_conn_out.gncr_dstnid)); + return -EBADF; + } + } else { + /* if we have a wildcard datagram it should match an + * incoming "active" datagram that should have a fully formed + * srcnid and dstnid. If we couldn't unpack it, we drop as + * corrupted packet, otherwise we'll just verify that the dstnid + * matches the NID for the NET that the dgram was posted */ + + /* make sure their wildcard didn't match ours, that is unpossible */ + LASSERTF(connreq->gncr_dstnid != LNET_NID_ANY, + "dgram 0x%p from %s, connreq 0x%p; " + "wildcard matched wildcard \n", dgram, + libcfs_nid2str(connreq->gncr_srcnid), connreq); + + rc = kgnilnd_find_net(connreq->gncr_dstnid, &net); + + if (rc == -ESHUTDOWN) { + CERROR("Looking up network: device is in shutdown"); + return rc; + } else if (rc == -ENONET) { + CERROR("Connection data from %s: she sent " + "dst_nid %s, but net lookup failed on " + "dgram 0x%p@%s\n", + libcfs_nid2str(connreq->gncr_srcnid), + libcfs_nid2str(connreq->gncr_dstnid), + dgram, kgnilnd_dgram_type2str(dgram)); + return rc; + } + + if (net->gnn_ni->ni_nid != connreq->gncr_dstnid) { + CERROR("Bad connection data from %s: she sent " + "dst_nid %s, but I am %s with dgram 0x%p@%s\n", + libcfs_nid2str(connreq->gncr_srcnid), + libcfs_nid2str(connreq->gncr_dstnid), + libcfs_nid2str(net->gnn_ni->ni_nid), + dgram, kgnilnd_dgram_type2str(dgram)); + kgnilnd_net_decref(net); + return -EBADSLT; + } + + /* kgnilnd_find_net takes a ref on the net it finds, You need to decref it when not needed. 
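The magic handling in kgnilnd_unpack_connreq is the classic endianness dance: accept either the native magic or its byte-swapped form, and if the swapped form is seen, byte-swap every multi-byte header field before trusting it. A standalone sketch of that dance (REQ_MAGIC and struct wire_req are stand-ins, not the GNILND wire format):

#include <stdint.h>
#include <stdio.h>

#define REQ_MAGIC 0x0be91b91u   /* stand-in value, not the real constant */

struct wire_req {
	uint32_t magic;
	uint16_t type;
	uint16_t version;
	uint64_t srcnid;
};

/* decide whether the sender had the opposite endianness and, if so,
 * byte-swap every multi-byte field before interpreting the request */
static int unpack_req(struct wire_req *r)
{
	if (r->magic != REQ_MAGIC) {
		if (__builtin_bswap32(r->magic) != REQ_MAGIC)
			return -1;              /* not ours at all */
		r->magic   = __builtin_bswap32(r->magic);
		r->type    = __builtin_bswap16(r->type);
		r->version = __builtin_bswap16(r->version);
		r->srcnid  = __builtin_bswap64(r->srcnid);
	}
	return 0;
}

int main(void)
{
	struct wire_req swapped = {
		.magic   = __builtin_bswap32(REQ_MAGIC),
		.type    = __builtin_bswap16(1),
		.version = __builtin_bswap16(2),
		.srcnid  = __builtin_bswap64(42),
	};

	if (unpack_req(&swapped) == 0)
		printf("type %u version %u srcnid %llu\n",
		       (unsigned)swapped.type, (unsigned)swapped.version,
		       (unsigned long long)swapped.srcnid);
	return 0;
}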
*/ + kgnilnd_net_decref(net); + } + + if (connreq->gncr_version != GNILND_CONNREQ_VERSION) { + CERROR("Unexpected version %d\n", connreq->gncr_version); + return -EPROTO; + } + + /* XXX Nic: TBD - checksum validation */ + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CONNREQ_DROP)) { + return -EBADF; + } + + if (swab && connreq->gncr_type == GNILND_CONNREQ_REQ) { + __u64 msg_addr = (__u64) connreq->gncr_gnparams.gnpr_smsg_attr.msg_buffer; + + __swab32s(&connreq->gncr_gnparams.gnpr_host_id); + __swab32s(&connreq->gncr_gnparams.gnpr_cqid); + __swab32s(&connreq->gncr_gnparams.gnpr_smsg_attr.buff_size); + __swab16s(&connreq->gncr_gnparams.gnpr_smsg_attr.mbox_maxcredit); + __swab32s(&connreq->gncr_gnparams.gnpr_smsg_attr.mbox_offset); + __swab64s(&connreq->gncr_gnparams.gnpr_smsg_attr.mem_hndl.qword1); + __swab64s(&connreq->gncr_gnparams.gnpr_smsg_attr.mem_hndl.qword2); + __swab64s(&msg_addr); + __swab32s(&connreq->gncr_gnparams.gnpr_smsg_attr.msg_maxsize); + __swab32s(&connreq->gncr_gnparams.gnpr_smsg_attr.msg_type); + } else if (swab && connreq->gncr_type == GNILND_CONNREQ_NAK) { + __swab32s(&connreq->gncr_nakdata.gnnd_errno); + } + + /* since we use a unique instance ID for each network, the driver + * will take care of dropping datagrams if we don't have that network. + */ + + /* few more idiot software or configuration checks */ + + switch (connreq->gncr_type) { + case GNILND_CONNREQ_REQ: + /* wire up EP and SMSG block - this will check the incoming data + * and barf a NAK back if need to */ + rc = kgnilnd_set_conn_params(dgram); + if (rc) + return rc; + break; + case GNILND_CONNREQ_NAK: + case GNILND_CONNREQ_CLOSE: + break; + default: + CERROR("unknown connreq packet type %d\n", connreq->gncr_type); + return -EPROTO; + } + + if (connreq->gncr_peerstamp == 0 || connreq->gncr_connstamp == 0) { + CERROR("Recived bad timestamps peer "LPU64" conn "LPU64"\n", + connreq->gncr_peerstamp, connreq->gncr_connstamp); + return -EPROTO; + } + + if (connreq->gncr_timeout < GNILND_MIN_TIMEOUT) { + CERROR("Received timeout %d < MIN %d\n", + connreq->gncr_timeout, GNILND_MIN_TIMEOUT); + return -EPROTO; + } + + return 0; +} + +int +kgnilnd_alloc_dgram(kgn_dgram_t **dgramp, kgn_device_t *dev, kgn_dgram_type_t type) +{ + kgn_dgram_t *dgram; + + dgram = cfs_mem_cache_alloc(kgnilnd_data.kgn_dgram_cache, + CFS_ALLOC_ATOMIC); + if (dgram == NULL) + return -ENOMEM; + + /* cache alloc'd memory is not zeroed */ + memset((void *)dgram, 0, sizeof(*dgram)) ; + + INIT_LIST_HEAD(&dgram->gndg_list); + dgram->gndg_state = GNILND_DGRAM_USED; + dgram->gndg_type = type; + dgram->gndg_magic = GNILND_DGRAM_MAGIC; + + atomic_inc(&dev->gnd_ndgrams); + + CDEBUG(D_MALLOC|D_NETTRACE, "slab-alloced 'dgram': %lu at %p.\n", + sizeof(*dgram), dgram); + + *dgramp = dgram; + return 0; +} + +/* call this on a dgram that came back from kgnilnd_ep_postdata_test_by_id + * returns < 0 on dgram to be cleaned up + * > 0 on dgram that isn't done yet + * == 0 on dgram that is ok and needs connreq processing */ +int +kgnilnd_process_dgram(kgn_dgram_t *dgram, gni_post_state_t post_state) +{ + int rc = 0; + + switch (post_state) { + case GNI_POST_COMPLETED: + /* normal state for dgrams that need actual processing */ + /* GOTO to avoid processing dgram as canceled/done */ + GOTO(process_out, rc); + + case GNI_POST_PENDING: + /* we should only see this if we are testing a WC dgram after a + * cancel - it means that it needs a full cycle of waiting + * for kgni_sm_task to finish moving it to TERMINATED */ + LASSERTF((dgram->gndg_type == GNILND_DGRAM_WC_REQ) && + 
(dgram->gndg_state == GNILND_DGRAM_CANCELED), + "POST_PENDING dgram 0x%p with bad type %d(%s) or state %d(%s)\n", + dgram, dgram->gndg_type, kgnilnd_dgram_type2str(dgram), + dgram->gndg_state, kgnilnd_dgram_state2str(dgram)); + + /* positive RC as this dgram isn't done yet */ + rc = EINPROGRESS; + + /* GOTO as this isn't done yet */ + GOTO(process_out, rc); + break; + + case GNI_POST_TERMINATED: + /* we've called cancel and it is done or remote guy called cancel and + * we've receved it on a WC dgram */ +#if 0 + /* we are seeing weird terminations on non WC dgrams when we have not + * canceled them */ + + LASSERTF(dgram->gndg_state == GNILND_DGRAM_CANCELED || + dgram->gndg_conn_out.gncr_dstnid == LNET_NID_ANY, + "dgram 0x%p with bad state %d(%s) or dst nid %s\n", + dgram, dgram->gndg_state, kgnilnd_dgram_state2str(dgram), + libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid)); +#endif + + CDEBUG(D_NETTRACE, "dgram 0x%p saw %s, cleaning it up\n", dgram, + dgram->gndg_state == GNILND_DGRAM_CANCELED ? "canceled" : "terminated"); + + rc = -ECANCELED; + break; + + case GNI_POST_TIMEOUT: + /* we could have a timeout on a wildcard dgram too - if + * we got the incoming request but the remote node beefed + * before kgni could send the match data back. We'll just error + * on the active case and bail out gracefully */ + if (dgram->gndg_conn_out.gncr_dstnid != LNET_NID_ANY) { + CNETERR("hardware timeout for connect to " + "%s after %lu seconds. Is node dead?\n", + libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid), + cfs_duration_sec(jiffies - dgram->gndg_post_time)); + } + + rc = -ETIMEDOUT; + break; + + default: + CERROR("dgram 0x%p with bad post_state %d\n", dgram, post_state); + LBUG(); + } + + /* now finish cleaning up a dgram that is canceled/terminated and needs to + * go away */ + + /* If this was actively canceled, drop the count now that we are processing */ + if (dgram->gndg_state == GNILND_DGRAM_CANCELED) { + atomic_dec(&dgram->gndg_conn->gnc_device->gnd_canceled_dgrams); + /* caller responsible for gndg_list removal */ + } + +process_out: + + RETURN(rc); +} + +/* needs dev->gnd_dgram_lock held */ +void +kgnilnd_cancel_dgram_locked(kgn_dgram_t *dgram) +{ + gni_return_t grc; + + if (dgram->gndg_state != GNILND_DGRAM_POSTED) { + return; + } + + LASSERTF(dgram->gndg_conn != NULL, + "dgram 0x%p with NULL conn\n", dgram); + + /* C.E - WC dgrams could be canceled immediately but + * if there was some match pending, we need to call + * test_by_id to clear it out. If that test returns + * POST_PENDING, it is half done and needs to go along + * with the rest of dgrams and go through a kgni_sm_task cycle + * and deliver a GNI_POST_TERMINATED event before they + * are actually canceled */ + + dgram->gndg_state = GNILND_DGRAM_CANCELED; + + if (dgram->gndg_conn->gnc_state >= GNILND_CONN_ESTABLISHED) { + /* we don't need to cancel_by_id if the datagram was good */ + return; + } + + /* let folks know there are outstanding cancels */ + atomic_inc(&dgram->gndg_conn->gnc_device->gnd_canceled_dgrams); + /* leave on nid list until cancel is done for debugging fun */ + grc = kgnilnd_ep_postdata_cancel_by_id(dgram->gndg_conn->gnc_ephandle, (__u64) dgram); + + /* if we don't get success here, we have hosed up the dgram tracking + * code and need to bail out */ + LASSERTF(grc == GNI_RC_SUCCESS, + "postdata_cancel returned %d for conn 0x%p to %s\n", + grc, dgram->gndg_conn, + dgram->gndg_conn->gnc_peer ? 
+ libcfs_nid2str(dgram->gndg_conn->gnc_peer->gnp_nid) + : ""); + + CDEBUG(D_NETTRACE, + "canceled dgram 0x%p conn 0x%p ephandle 0x%p\n", + dgram, dgram->gndg_conn, + dgram->gndg_conn->gnc_ephandle); + + if (dgram->gndg_type == GNILND_DGRAM_WC_REQ) { + gni_post_state_t post_state; + int rc = 0; + __u32 remote_addr = 0, remote_id = 0; + + grc = kgnilnd_ep_postdata_test_by_id(dgram->gndg_conn->gnc_ephandle, + (__u64)dgram, &post_state, + &remote_addr, &remote_id); + + LASSERTF(grc == GNI_RC_NO_MATCH || grc == GNI_RC_SUCCESS, + "bad grc %d from test_by_id on dgram 0x%p\n", + grc, dgram); + + /* if WC was canceled immediately, we get NO_MATCH, if needs to go + * through full cycle, we get SUCCESS and need to parse post_state */ + + CDEBUG(D_NET, "grc %d dgram 0x%p type %s post_state %d " + "remote_addr %u remote_id %u\n", grc, dgram, + kgnilnd_dgram_type2str(dgram), + post_state, remote_addr, remote_id); + + if (grc == GNI_RC_NO_MATCH) { + /* she's gone, reduce count and move along */ + dgram->gndg_state = GNILND_DGRAM_DONE; + atomic_dec(&dgram->gndg_conn->gnc_device->gnd_canceled_dgrams); + RETURN_EXIT; + } + + rc = kgnilnd_process_dgram(dgram, post_state); + + if (rc <= 0) { + /* if for some weird reason we get a valid dgram back, just mark as done + * so we can drop it and move along. + * C.E - if it was completed, we'll just release the conn/mbox + * back into the pool and it'll get reused. That said, we should only + * be canceling a WC dgram on stack rest or shutdown, so that is moot */ + dgram->gndg_state = GNILND_DGRAM_DONE; + atomic_dec(&dgram->gndg_conn->gnc_device->gnd_canceled_dgrams); + + /* caller context responsible for calling kgnilnd_release_dgram() */ + } else { + /* still pending, let it simmer until golden brown and delicious */ + } + } + + /* for non WC dgrams, they are still on the nid list but marked canceled waiting + * for kgni to return their ID to us via probe - that is when we'll complete their + * cancel processing */ +} + +void +kgnilnd_cleanup_dgram(kgn_dgram_t *dgram) +{ + /* release the dgram ref on conn */ + if (dgram->gndg_conn) { + kgnilnd_conn_decref(dgram->gndg_conn); + dgram->gndg_conn = NULL; + } +} + +void +kgnilnd_free_dgram(kgn_device_t *dev, kgn_dgram_t *dgram) +{ + LASSERTF(dgram->gndg_state == GNILND_DGRAM_USED || + dgram->gndg_state == GNILND_DGRAM_DONE, + "dgram 0x%p with bad state %s\n", + dgram, kgnilnd_dgram_state2str(dgram)); + + /* bit of poisoning to help detect bad driver data */ + dgram->gndg_magic = 0x6f5a6b5f; + atomic_dec(&dev->gnd_ndgrams); + + cfs_mem_cache_free(kgnilnd_data.kgn_dgram_cache, dgram); + CDEBUG(D_MALLOC|D_NETTRACE, "slab-freed 'dgram': %lu at %p.\n", + sizeof(*dgram), dgram); +} + +int +kgnilnd_post_dgram(kgn_device_t *dev, lnet_nid_t dstnid, kgn_connreq_type_t type, + int data_rc) +{ + int rc = 0; + kgn_dgram_t *dgram = NULL; + kgn_dgram_t *tmpdgram; + kgn_dgram_type_t dgtype; + gni_return_t grc; + __u64 srcnid; + ENTRY; + + switch (type) { + case GNILND_CONNREQ_REQ: + if (dstnid == LNET_NID_ANY) + dgtype = GNILND_DGRAM_WC_REQ; + else + dgtype = GNILND_DGRAM_REQ; + break; + case GNILND_CONNREQ_NAK: + LASSERTF(dstnid != LNET_NID_ANY, "can't NAK to LNET_NID_ANY\n"); + dgtype = GNILND_DGRAM_NAK; + break; + default: + CERROR("unknown connreq type %d\n", type); + LBUG(); + } + + rc = kgnilnd_alloc_dgram(&dgram, dev, dgtype); + if (rc < 0) { + rc = -ENOMEM; + GOTO(post_failed, rc); + } + + rc = kgnilnd_create_conn(&dgram->gndg_conn, dev); + if (rc) { + GOTO(post_failed, rc); + } + + if (dgram->gndg_type == 
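
The type-selection switch at the top of kgnilnd_post_dgram() reduces to a small table, summarized here for reference:

/*
 * connreq type          dstnid          resulting dgram type
 * ------------------------------------------------------------------
 * GNILND_CONNREQ_REQ    LNET_NID_ANY    GNILND_DGRAM_WC_REQ (wildcard)
 * GNILND_CONNREQ_REQ    real NID        GNILND_DGRAM_REQ    (active connect)
 * GNILND_CONNREQ_NAK    real NID        GNILND_DGRAM_NAK
 * GNILND_CONNREQ_NAK    LNET_NID_ANY    LASSERTF - never valid
 * anything else         -               LBUG()
 */
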
GNILND_DGRAM_WC_REQ) { + /* clear buffer for sanity on reuse of wildcard */ + memset(&dgram->gndg_conn_in, 0, sizeof(kgn_connreq_t)); + } + + if (dstnid == LNET_NID_ANY) { + /* set here to reset any dgram re-use */ + dgram->gndg_conn->gnc_state = GNILND_CONN_LISTEN; + } else { + __u32 host_id; + + rc = kgnilnd_nid_to_nicaddrs(LNET_NIDADDR(dstnid), 1, &host_id); + if (rc <= 0) { + rc = -ESRCH; + GOTO(post_failed, rc); + } + + dgram->gndg_conn->gnc_state = GNILND_CONN_CONNECTING; + + /* don't need to serialize, there are no CQs for the dgram + * EP on the kgn_net_t */ + grc = kgnilnd_ep_bind(dgram->gndg_conn->gnc_ephandle, host_id, dev->gnd_id); + + if (grc != GNI_RC_SUCCESS) { + rc = -ECONNABORTED; + GOTO(post_failed, rc); + } + + } + + /* If we are posting wildcards post using a net of 0, otherwise we'll use the + * net of the destination node. + */ + + if (dstnid == LNET_NID_ANY) { + srcnid = LNET_MKNID(LNET_MKNET(GNILND, 0), dev->gnd_nid); + } else { + srcnid = LNET_MKNID(LNET_NIDNET(dstnid), dev->gnd_nid); + } + + rc = kgnilnd_pack_connreq(&dgram->gndg_conn_out, dgram->gndg_conn, + srcnid, dstnid, type); + if (rc) { + GOTO(post_failed, rc); + } + + if (type == GNILND_CONNREQ_NAK) + dgram->gndg_conn_out.gncr_nakdata.gnnd_errno = data_rc; + + dgram->gndg_post_time = jiffies; + + /* XXX Nic: here is where we'd add in logical network multiplexing */ + + CDEBUG(D_NETTRACE, "dgram 0x%p type %s %s->%s cdm %d\n", + dgram, kgnilnd_dgram_type2str(dgram), + libcfs_nid2str(srcnid), + libcfs_nid2str(dstnid), dev->gnd_id); + + /* this allocates memory, can't hold locks across */ + grc = kgnilnd_ep_postdata_w_id(dgram->gndg_conn->gnc_ephandle, + &dgram->gndg_conn_out, sizeof(kgn_connreq_t), + &dgram->gndg_conn_in, sizeof(kgn_connreq_t), + (__u64)dgram); + + if (grc != GNI_RC_SUCCESS) { + CNETERR("dropping failed dgram post id 0x%p type %s" + " reqtype %s to %s: rc %d\n", + dgram, kgnilnd_dgram_type2str(dgram), + kgnilnd_connreq_type2str(&dgram->gndg_conn_out), + libcfs_nid2str(dstnid), grc); + rc = (grc == GNI_RC_ERROR_NOMEM) ? -ENOMEM : -EBADR; + GOTO(post_failed, rc); + } + + /* we don't need to add earlier - if someone does del_peer during post, + * that peer will get marked as unlinked and the callers wil take care of it. + * The dgram code is largely kgn_peer_t ignorant, so at worst, we'll just drop + * the completed dgram later when we cant find a peer to stuff it into */ + + spin_lock(&dev->gnd_dgram_lock); + + /* make sure we are not double posting targeted dgrams + * - we can multiple post WC dgrams to help with processing speed */ + if (dstnid != LNET_NID_ANY) { + tmpdgram = kgnilnd_find_dgram_locked(dev, dstnid); + + LASSERTF(tmpdgram == NULL, + "dgram 0x%p->%s already posted\n", + dgram, libcfs_nid2str(dstnid)); + } + + /* unmunge dstnid to help processing code cope... 
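
The source-NID rule used above (wildcards go out on net number 0, targeted dgrams on the destination's net) can be written as a one-liner. A hedged sketch with an assumed helper name:

/* illustration only: mirrors the srcnid selection in kgnilnd_post_dgram() */
static lnet_nid_t
example_pick_srcnid(kgn_device_t *dev, lnet_nid_t dstnid)
{
        if (dstnid == LNET_NID_ANY)
                /* wildcards are posted on net number 0 */
                return LNET_MKNID(LNET_MKNET(GNILND, 0), dev->gnd_nid);

        /* targeted dgrams use the destination's net so the peer can match it */
        return LNET_MKNID(LNET_NIDNET(dstnid), dev->gnd_nid);
}
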
*/ + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_PACK_DSTNID)) { + dgram->gndg_conn_out.gncr_dstnid = dstnid; + } + + list_add_tail(&dgram->gndg_list, kgnilnd_nid2dgramlist(dev, dstnid)); + dgram->gndg_state = GNILND_DGRAM_POSTED; + spin_unlock(&dev->gnd_dgram_lock); + +post_failed: + if (rc < 0 && dgram != NULL) { + kgnilnd_cleanup_dgram(dgram); + kgnilnd_free_dgram(dev, dgram); + } + + RETURN(rc); +} + +void +kgnilnd_release_dgram(kgn_device_t *dev, kgn_dgram_t *dgram) +{ + spin_lock(&dev->gnd_dgram_lock); + kgnilnd_cancel_dgram_locked(dgram); + spin_unlock(&dev->gnd_dgram_lock); + + kgnilnd_cleanup_dgram(dgram); + + /* if the dgram is 'canceled' it needs to be wait until the event + * comes up from kgni that tells us it is safe to release */ + if (dgram->gndg_state != GNILND_DGRAM_CANCELED) { + dgram->gndg_state = GNILND_DGRAM_DONE; + + LASSERTF(list_empty(&dgram->gndg_list), "dgram 0x%p on list\n", dgram); + + /* if it is a wildcard and we are in an appropriate state, repost + * the wildcard */ + + if ((dgram->gndg_type == GNILND_DGRAM_WC_REQ) && + (!kgnilnd_data.kgn_wc_kill && !kgnilnd_data.kgn_in_reset)) { + int rerc; + + rerc = kgnilnd_post_dgram(dev, LNET_NID_ANY, GNILND_CONNREQ_REQ, 0); + LASSERTF(rerc == 0, + "error %d: dev %d could not repost wildcard datagram id 0x%p\n", + rerc, dev->gnd_id, dgram); + } + + /* always free the old dgram */ + kgnilnd_free_dgram(dev, dgram); + } +} + + +int +kgnilnd_probe_for_dgram(kgn_device_t *dev, kgn_dgram_t **dgramp) +{ + kgn_dgram_t *dgram = NULL; + gni_post_state_t post_state; + gni_return_t grc; + int rc = 0; + __u64 readyid; + __u32 remote_addr = 0, remote_id = 0; + ENTRY; + + /* Probe with the lock held. That way if we get a dgram we dont have it canceled + * between finding the ready dgram and grabbing the lock to remove it from the + * list. Otherwise we could be left in an inconsistent state. We own the dgram + * once its off the list so we don't need to worry about others changing it at + * that point. */ + spin_lock(&dev->gnd_dgram_lock); + grc = kgnilnd_postdata_probe_by_id(dev->gnd_handle, &readyid); + if (grc != GNI_RC_SUCCESS) { + spin_unlock(&dev->gnd_dgram_lock); + /* return 0 to indicate nothing happened */ + RETURN(0); + } + + CDEBUG(D_NET, "ready "LPX64" on device 0x%p\n", + readyid, dev); + + dgram = (kgn_dgram_t *)readyid; + + LASSERTF(dgram->gndg_magic == GNILND_DGRAM_MAGIC, + "dgram 0x%p from id "LPX64" with bad magic %x\n", + dgram, readyid, dgram->gndg_magic); + + LASSERTF(dgram->gndg_state == GNILND_DGRAM_POSTED || + dgram->gndg_state == GNILND_DGRAM_CANCELED, + "dgram 0x%p with bad state %s\n", + dgram, kgnilnd_dgram_state2str(dgram)); + + LASSERTF(!list_empty(&dgram->gndg_list), + "dgram 0x%p with bad list state %s\n", + dgram, kgnilnd_dgram_state2str(dgram)); + + /* now we know that the datagram structure is ok, so pull off list */ + list_del_init(&dgram->gndg_list); + + /* while we have the gnn_dgram_lock and BEFORE we call test_by_id + * change the state from POSTED to PROCESSING to ensure that + * nobody cancels it after we've pulled it from the wire */ + if (dgram->gndg_state == GNILND_DGRAM_POSTED) { + dgram->gndg_state = GNILND_DGRAM_PROCESSING; + } + + spin_unlock(&dev->gnd_dgram_lock); + + /* we now "own" this datagram */ + + LASSERTF(dgram->gndg_conn != NULL, + "dgram 0x%p with NULL conn\n", dgram); + + grc = kgnilnd_ep_postdata_test_by_id(dgram->gndg_conn->gnc_ephandle, + (__u64)dgram, &post_state, + &remote_addr, &remote_id); + + LASSERTF(grc != GNI_RC_NO_MATCH, "kgni lied! 
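
Because the post id handed to kgni is just the dgram pointer cast to a __u64, the id returned by the probe converts straight back into the dgram; the magic field is the sanity check. A condensed sketch (hypothetical helper name):

/* illustration only: readyid -> dgram recovery as done in probe_for_dgram */
static kgn_dgram_t *
example_id_to_dgram(__u64 readyid)
{
        kgn_dgram_t *dgram = (kgn_dgram_t *)readyid;

        if (dgram->gndg_magic != GNILND_DGRAM_MAGIC)
                return NULL;    /* stale or corrupt id */

        return dgram;
}
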
probe_by_id told us that" + " id "LPU64" was ready\n", readyid); + + CDEBUG(D_NET, "grc %d dgram 0x%p type %s post_state %d " + "remote_addr %u remote_id %u\n", grc, dgram, + kgnilnd_dgram_type2str(dgram), + post_state, remote_addr, remote_id); + + if (unlikely(grc != GNI_RC_SUCCESS)) { + CNETERR("getting data for dgram 0x%p->%s failed rc %d. Dropping it\n", + dgram, libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid), + grc); + rc = -EINVAL; + GOTO(probe_for_out, rc); + } + + rc = kgnilnd_process_dgram(dgram, post_state); + + /* we should never get probe finding a dgram for us and then it + * being a WC dgram that is still in the middle of processing */ + LASSERTF(rc <= 0, "bad rc %d from process_dgram 0x%p state %d\n", + rc, dgram, post_state); + + if (rc == 0) { + /* dgram is good enough for the data to be used */ + dgram->gndg_state = GNILND_DGRAM_PROCESSING; + /* fake rc to mark that we've done something */ + rc = 1; + } else { + /* bring out your dead! */ + dgram->gndg_state = GNILND_DGRAM_DONE; + } + + *dgramp = dgram; + RETURN(rc); + +probe_for_out: + + kgnilnd_release_dgram(dev, dgram); + RETURN(rc); +} + +int +kgnilnd_setup_wildcard_dgram(kgn_device_t *dev) +{ + /* if kgn_wildcard is zero, return error */ + int rc = -ENOENT, i; + ENTRY; + + for (i = 0; i < *kgnilnd_tunables.kgn_nwildcard; i++) { + rc = kgnilnd_post_dgram(dev, LNET_NID_ANY, GNILND_CONNREQ_REQ, 0); + if (rc < 0) { + CERROR("error %d: could not post wildcard datagram # %d\n", + rc, i); + rc = -EINVAL; + GOTO(failed, rc); + } + } + +failed: + RETURN(rc); +} + +int +kgnilnd_cancel_net_dgrams(kgn_net_t *net) +{ + kgn_dgram_t *dg, *dgN; + struct list_head zombies; + int i; + ENTRY; + + /* we want to cancel any outstanding dgrams - we don't want to rely + * on del_peer_or_conn catching all of them. This helps protect us in cases + * where we don't quite keep the peer->dgram mapping in sync due to some + * race conditions */ + + LASSERTF(net->gnn_shutdown || kgnilnd_data.kgn_in_reset, + "called with LND invalid state: net shutdown %d " + "in reset %d\n", net->gnn_shutdown, + kgnilnd_data.kgn_in_reset); + + INIT_LIST_HEAD(&zombies); + + spin_lock(&net->gnn_dev->gnd_dgram_lock); + + for (i = 0; i < *kgnilnd_tunables.kgn_peer_hash_size; i++) { + list_for_each_entry_safe(dg, dgN, &net->gnn_dev->gnd_dgrams[i], gndg_list) { + + /* skip nids not on our net or are wildcards */ + + + if (dg->gndg_type == GNILND_DGRAM_WC_REQ || + net->gnn_netnum != LNET_NETNUM(LNET_NIDNET(dg->gndg_conn_out.gncr_dstnid))) + continue; + + kgnilnd_cancel_dgram_locked(dg); + } + } + + spin_unlock(&net->gnn_dev->gnd_dgram_lock); + + RETURN(0); +} + +int +kgnilnd_cancel_wc_dgrams(kgn_device_t *dev) +{ + kgn_dgram_t *dg, *dgN; + struct list_head zombies; + ENTRY; + + /* Time to kill the outstanding WC's + * WC's exist on net 0 only but match on any net... 
+ */ + + LASSERTF(kgnilnd_data.kgn_in_reset || kgnilnd_data.kgn_wc_kill, + "called with LND invalid state: WC shutdown %d " + "in reset %d\n", kgnilnd_data.kgn_wc_kill, + kgnilnd_data.kgn_in_reset); + + INIT_LIST_HEAD(&zombies); + spin_lock(&dev->gnd_dgram_lock); + + do { + dg = kgnilnd_find_dgram_locked(dev, LNET_NID_ANY); + if (dg != NULL) { + LASSERTF(dg->gndg_type == GNILND_DGRAM_WC_REQ, + "dgram 0x%p->%s with bad type %d (%s)\n", + dg, libcfs_nid2str(dg->gndg_conn_out.gncr_dstnid), + dg->gndg_type, kgnilnd_dgram_type2str(dg)); + + kgnilnd_cancel_dgram_locked(dg); + + /* WC could be DONE already, check and if so add to list to be released */ + if (dg->gndg_state == GNILND_DGRAM_DONE) { + list_del_init(&dg->gndg_list); + list_add_tail(&dg->gndg_list, &zombies); + } + } + } while (dg != NULL); + + spin_unlock(&dev->gnd_dgram_lock); + + list_for_each_entry_safe(dg, dgN, &zombies, gndg_list) { + list_del_init(&dg->gndg_list); + kgnilnd_release_dgram(dev, dg); + } + RETURN(0); + +} + +void +kgnilnd_wait_for_canceled_dgrams(kgn_device_t *dev) +{ + int i = 4; + int rc; + gni_return_t grc; + __u64 readyid; + kgn_dgram_t *dgram; + + /* use do while to get at least one check run to allow + * regression test for 762072 to hit bug if there */ + + /* This function races with the dgram mover during shutdown so it is possible for + * a dgram to be seen in kgnilnd_postdata_probe_wait_by_id but be handled in the + * dgram mover thread instead of inside of this function. + */ + + /* This should only be called from within shutdown, baseshutdown, or stack reset. + * there are no assertions here to verify since base_shutdown has nothing in it we can check + * the net is gone by then. + */ + + do { + i++; + CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, + "Waiting for %d canceled datagrams to clear on device %d\n", + atomic_read(&dev->gnd_canceled_dgrams), dev->gnd_id); + + /* check once a second */ + grc = kgnilnd_postdata_probe_wait_by_id(dev->gnd_handle, + 250, &readyid); + + if (grc != GNI_RC_SUCCESS) + continue; + + CDEBUG(D_NET, "ready "LPX64" on device %d->0x%p\n", + readyid, dev->gnd_id, dev); + + rc = kgnilnd_probe_for_dgram(dev, &dgram); + if (rc != 0) { + /* if we got a valid dgram or one that is now done, clean up */ + kgnilnd_release_dgram(dev, dgram); + } + } while (atomic_read(&dev->gnd_canceled_dgrams)); +} + +int +kgnilnd_start_connect(kgn_peer_t *peer) +{ + int rc = 0; + /* sync point for kgnilnd_del_peer_locked - do an early check to + * catch the most common hits where del_peer is done by the + * time we get here */ + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_GNP_CONNECTING1)) { + while (CFS_FAIL_TIMEOUT(CFS_FAIL_GNI_GNP_CONNECTING1, 1)) {}; + } + + write_lock(&kgnilnd_data.kgn_peer_conn_lock); + if (!kgnilnd_peer_active(peer) || peer->gnp_connecting != GNILND_PEER_CONNECT) { + /* raced with peer getting unlinked */ + write_unlock(&kgnilnd_data.kgn_peer_conn_lock); + rc = ESTALE; + GOTO(out, rc); + } + peer->gnp_connecting = GNILND_PEER_POSTING; + write_unlock(&kgnilnd_data.kgn_peer_conn_lock); + + set_mb(peer->gnp_last_dgram_time, jiffies); + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_GNP_CONNECTING2)) { + while (CFS_FAIL_TIMEOUT(CFS_FAIL_GNI_GNP_CONNECTING2, 1)) {}; + } + + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_GNP_CONNECTING3)) { + while (CFS_FAIL_TIMEOUT(CFS_FAIL_GNI_GNP_CONNECTING3, 1)) {}; + rc = cfs_fail_val ? 
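
The ((i & (-i)) == i) test in the wait loop above is a branch-free power-of-two check, used to promote every 2^n-th pass of the "waiting for canceled datagrams" message to D_WARNING while the rest stay at D_NET:

/* illustration only: i & -i isolates the lowest set bit, so the test is
 * true exactly when i has a single bit set (1, 2, 4, 8, ...) */
static int
example_is_power_of_two(int i)
{
        return i > 0 && (i & (-i)) == i;
}
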
cfs_fail_val : -ENOMEM; + } else { + rc = kgnilnd_post_dgram(peer->gnp_net->gnn_dev, + peer->gnp_nid, GNILND_CONNREQ_REQ, 0); + } + if (rc < 0) { + set_mb(peer->gnp_last_dgram_errno, rc); + GOTO(failed, rc); + } + + /* while we're posting someone could have decided this peer/dgram needed to + * die a quick death, so we check for state change and process accordingly */ + + write_lock(&kgnilnd_data.kgn_peer_conn_lock); + if (!kgnilnd_peer_active(peer) || peer->gnp_connecting == GNILND_PEER_NEEDS_DEATH) { + if (peer->gnp_connecting == GNILND_PEER_NEEDS_DEATH) { + peer->gnp_connecting = GNILND_PEER_KILL; + } + write_unlock(&kgnilnd_data.kgn_peer_conn_lock); + /* positive RC to avoid dgram cleanup - we'll have to + * wait for the kgni GNI_POST_TERMINATED event to + * finish cleaning up */ + rc = ESTALE; + kgnilnd_find_and_cancel_dgram(peer->gnp_net->gnn_dev, peer->gnp_nid); + GOTO(out, rc); + } + peer->gnp_connecting = GNILND_PEER_POSTED; + write_unlock(&kgnilnd_data.kgn_peer_conn_lock); + /* reaper thread will take care of any timeouts */ + CDEBUG(D_NET, "waiting for connect to finish to %s rc %d\n", + libcfs_nid2str(peer->gnp_nid), rc); + + RETURN(rc); + +failed: + CDEBUG(D_NET, "connect to %s failed: rc %d \n", + libcfs_nid2str(peer->gnp_nid), rc); +out: + RETURN(rc); +} + +int +kgnilnd_finish_connect(kgn_dgram_t *dgram) +{ + kgn_conn_t *conn = dgram->gndg_conn; + lnet_nid_t her_nid = dgram->gndg_conn_in.gncr_srcnid; + kgn_peer_t *new_peer, *peer = NULL; + kgn_tx_t *tx; + kgn_tx_t *txn; + kgn_mbox_info_t *mbox; + int rc; + int nstale; + + /* try to find a peer that matches the nid we got in the connreq + * kgnilnd_unpack_connreq makes sure that conn_in.gncr_srcnid is + * HER and conn_out.gncr_srcnid is ME for both active and WC dgrams */ + + /* assume this is a new peer - it makes locking cleaner when it isn't */ + /* no holding kgn_net_rw_sem - already are at the kgnilnd_dgram_mover level */ + + rc = kgnilnd_create_peer_safe(&new_peer, her_nid, NULL); + if (rc != 0) { + CERROR("Can't create peer for %s\n", libcfs_nid2str(her_nid)); + return rc; + } + + write_lock(&kgnilnd_data.kgn_peer_conn_lock); + + /* this transfers ref from create_peer to the kgn_peer table */ + kgnilnd_add_peer_locked(her_nid, new_peer, &peer); + + /* if we found an existing peer, is it really ready for a new conn ? 
*/ + if (peer != new_peer) { + /* if this was an active connect attempt but we can't find a peer waiting for it + * we will dump in the trash */ + + if (peer->gnp_connecting == GNILND_PEER_IDLE && dgram->gndg_conn_out.gncr_dstnid != LNET_NID_ANY) { + CDEBUG(D_NET, "dropping completed connreq for %s peer 0x%p->%s\n", + libcfs_nid2str(her_nid), peer, libcfs_nid2str(peer->gnp_nid)); + write_unlock(&kgnilnd_data.kgn_peer_conn_lock); + rc = ECANCELED; + GOTO(out, rc); + } + + /* check to see if we can catch a connecting peer before it is + * removed from the connd_peers list - if not, we need to + * let the connreqs race and be handled by kgnilnd_conn_isdup_locked() */ + if (peer->gnp_connecting != GNILND_PEER_IDLE) { + spin_lock(&peer->gnp_net->gnn_dev->gnd_connd_lock); + if (!list_empty(&peer->gnp_connd_list)) { + list_del_init(&peer->gnp_connd_list); + /* drop connd ref */ + kgnilnd_peer_decref(peer); + } + spin_unlock(&peer->gnp_net->gnn_dev->gnd_connd_lock); + /* clear rc to make sure we don't have fake error */ + rc = 0; + } + + /* no matter what, we are no longer waiting to connect this peer now */ + peer->gnp_connecting = GNILND_PEER_IDLE; + + /* Refuse to duplicate an existing connection (both sides might try to + * connect at once). NB we return success! We _are_ connected so we + * _don't_ have any blocked txs to complete with failure. */ + rc = kgnilnd_conn_isdup_locked(peer, conn); + if (rc != 0) { + write_unlock(&kgnilnd_data.kgn_peer_conn_lock); + CDEBUG(D_NET, "Not creating duplicate connection to %s: %d\n", + libcfs_nid2str(her_nid), rc); + rc = EALREADY; + GOTO(out, rc); + } + } + + nstale = kgnilnd_close_stale_conns_locked(peer, conn); + + /* either way with peer (new or existing), we are ok with ref counts here as the + * kgnilnd_add_peer_locked will use our ref on new_peer (from create_peer_safe) as the + * ref for the peer table. 
*/ + + /* at this point, the connection request is a winner */ + + /* mark 'DONE' to avoid cancel being called from release */ + dgram->gndg_state = GNILND_DGRAM_DONE; + + /* initialise timestamps before reaper looks at them */ + conn->gnc_last_rx = conn->gnc_last_rx_cq = jiffies; + + /* last_tx is initialized to jiffies - (keepalive*2) so that if the NOOP fails it will + * immediatly send a NOOP in the reaper thread during the call to + * kgnilnd_check_conn_timeouts_locked + */ + conn->gnc_last_tx = jiffies - (cfs_time_seconds(GNILND_TO2KA(conn->gnc_timeout)) * 2); + conn->gnc_state = GNILND_CONN_ESTABLISHED; + + /* refs are not transferred from dgram to tables, so increment to + * take ownership */ + kgnilnd_conn_addref(conn); + kgnilnd_peer_addref(peer); + conn->gnc_peer = peer; + list_add_tail(&conn->gnc_list, &peer->gnp_conns); + + kgnilnd_conn_addref(conn); /* +1 ref for conn table */ + list_add_tail(&conn->gnc_hashlist, + kgnilnd_cqid2connlist(conn->gnc_cqid)); + kgnilnd_data.kgn_conn_version++; + + /* Dont send NOOP if fail_loc is set + */ + if (!CFS_FAIL_CHECK(CFS_FAIL_GNI_ONLY_NOOP)) { + tx = kgnilnd_new_tx_msg(GNILND_MSG_NOOP, peer->gnp_net->gnn_ni->ni_nid); + if (tx == NULL) { + CNETERR("can't get TX to initiate NOOP to %s\n", + libcfs_nid2str(peer->gnp_nid)); + } else { + kgnilnd_queue_tx(conn, tx); + } + } + + /* Schedule all packets blocking for a connection */ + list_for_each_entry_safe(tx, txn, &peer->gnp_tx_queue, tx_list) { + /* lock held here is the peer_conn lock */ + kgnilnd_tx_del_state_locked(tx, peer, NULL, GNILND_TX_ALLOCD); + kgnilnd_queue_tx(conn, tx); + } + + /* If this is an active connection lets mark its timestamp on the MBoX */ + if (dgram->gndg_conn_out.gncr_dstnid != LNET_NID_ANY) { + mbox = &conn->gnc_fma_blk->gnm_mbox_info[conn->gnc_mbox_id]; + /* conn->gnc_last_rx is jiffies it better exist as it was just set */ + mbox->mbx_release_purg_active_dgram = conn->gnc_last_rx; + } + + /* Bug 765042: wake up scheduler for a race with finish_connect and + * complete_conn_closed with a conn in purgatory + * since we can't use CFS_RACE due to mutex_holds in kgnilnd_process_conns, + * we just check for set and then clear */ + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_FINISH_PURG)) { + cfs_fail_loc = 0x0; + /* get scheduler thread moving again */ + kgnilnd_schedule_device(conn->gnc_device); + } + + CDEBUG(D_NET, "New conn 0x%p->%s dev %d\n", + conn, libcfs_nid2str(her_nid), conn->gnc_device->gnd_id); + + /* make sure we reset peer reconnect interval now that we have a good conn */ + kgnilnd_peer_alive(peer); + peer->gnp_reconnect_interval = 0; + + /* clear the unlink attribute if we dont clear it kgnilnd_del_conn_or_peer will wait + * on the atomic forever + */ + if (peer->gnp_pending_unlink) { + peer->gnp_pending_unlink = 0; + kgnilnd_admin_decref(kgnilnd_data.kgn_npending_unlink); + CDEBUG(D_NET, "Clearing peer unlink %p\n",peer); + } + + /* add ref to make it hang around until after we drop the lock */ + kgnilnd_conn_addref(conn); + + /* Once the peer_conn lock is dropped, the conn could actually move into + * CLOSING->CLOSED->DONE in the scheduler thread, so hold the + * lock until we are really done */ + write_unlock(&kgnilnd_data.kgn_peer_conn_lock); + + /* Notify LNET that we now have a working connection to this peer. + * This is a Cray extension to the "standard" LND behavior. 
*/ + lnet_notify(peer->gnp_net->gnn_ni, peer->gnp_nid, + 1, cfs_time_current()); + + /* schedule the conn to pick up any SMSG sent by peer before we could + * process this dgram */ + kgnilnd_schedule_conn(conn); + + /* drop our 'hold' ref */ + kgnilnd_conn_decref(conn); + +out: + RETURN(rc); +} + +void +kgnilnd_send_nak(kgn_device_t *dev, lnet_nid_t dst_nid, int error) +{ + int rc = 0; + ENTRY; + + LASSERTF(dst_nid != LNET_NID_ANY, "bad dst_nid %s\n", libcfs_nid2str(dst_nid)); + + CDEBUG(D_NET, "NAK to %s errno %d\n", libcfs_nid2str(dst_nid), error); + + rc = kgnilnd_post_dgram(dev, dst_nid, GNILND_CONNREQ_NAK, error); + + if (rc < 0) { + CDEBUG(D_NET, "NAK to %s failed: rc %d \n", libcfs_nid2str(dst_nid), rc); + } + EXIT; +} + +int +kgnilnd_process_nak(kgn_dgram_t *dgram) +{ + kgn_connreq_t *connreq = &dgram->gndg_conn_in; + lnet_nid_t src_nid = connreq->gncr_srcnid; + int errno = connreq->gncr_nakdata.gnnd_errno; + kgn_peer_t *peer; + int rc = 0; + + write_lock(&kgnilnd_data.kgn_peer_conn_lock); + + peer = kgnilnd_find_peer_locked(src_nid); + if (peer == NULL) { + /* we likely dropped him from bad data when we processed + * the original REQ */ + write_unlock(&kgnilnd_data.kgn_peer_conn_lock); + return -EBADSLT; + } + + /* need to check peerstamp/connstamp against the ones we find + * to make sure we don't close new (and good?) conns that we + * formed after this connreq failed */ + if (peer->gnp_connecting == GNILND_PEER_IDLE) { + kgn_conn_t conn; + + if (list_empty(&peer->gnp_conns)) { + /* assume already procced datagram and it barfed up + * on this side too */ + CDEBUG(D_NET, "dropping NAK from %s; " + "peer %s is already not connected\n", + libcfs_nid2str(connreq->gncr_srcnid), + libcfs_nid2str(connreq->gncr_dstnid)); + write_unlock(&kgnilnd_data.kgn_peer_conn_lock); + return 0; + } + + /* stub up a connection with the connreq XXX_stamps to allow + * use to use close_stale_conns_locked */ + conn.gnc_peerstamp = connreq->gncr_peerstamp; + conn.gnc_my_connstamp = connreq->gncr_connstamp; + conn.gnc_peer_connstamp = connreq->gncr_connstamp; + conn.gnc_device = peer->gnp_net->gnn_dev; + + rc = kgnilnd_close_stale_conns_locked(peer, &conn); + + LCONSOLE_INFO("Received NAK from %s for %s errno %d; " + "closed %d connections\n", + libcfs_nid2str(connreq->gncr_srcnid), + libcfs_nid2str(connreq->gncr_dstnid), errno, rc); + } else { + rc = 0; + spin_lock(&dgram->gndg_conn->gnc_device->gnd_connd_lock); + + if (list_empty(&peer->gnp_connd_list)) { + /* if peer isn't on waiting list, try to find one to nuke */ + rc = kgnilnd_find_and_cancel_dgram(peer->gnp_net->gnn_dev, + peer->gnp_nid); + + if (rc) { + LCONSOLE_INFO("Received NAK from %s for %s errno %d; " + "canceled pending connect request\n", + libcfs_nid2str(connreq->gncr_srcnid), + libcfs_nid2str(connreq->gncr_dstnid), errno); + } + + /* if we can't find a waiting dgram, we just drop the nak - the conn + * connect must have failed (didn't find conn above and clear connecting + * -- so nothing to do besides drop */ + } else { + /* peer is on list, meaning it is a new connect attempt from the one + * we started that generated the NAK - so just drop NAK */ + + /* use negative to prevent error message */ + rc = -EAGAIN; + } + spin_unlock(&dgram->gndg_conn->gnc_device->gnd_connd_lock); + } + + /* success! 
we found a peer and at least marked pending_nak */ + write_unlock(&kgnilnd_data.kgn_peer_conn_lock); + + return 0; +} + +int +kgnilnd_process_connreq(kgn_dgram_t *dgram, int *needs_nak) +{ + int rc; + + rc = kgnilnd_unpack_connreq(dgram); + if (rc < 0) { + if (rc != -EBADF) { + /* only NAK if we have good srcnid to use */ + *needs_nak = 1; + } + goto connreq_out; + } + + switch (dgram->gndg_conn_in.gncr_type) { + case GNILND_CONNREQ_REQ: + /* wire up peer & conn, send queued TX */ + rc = kgnilnd_finish_connect(dgram); + + /* don't nak when the nid is hosed */ + if ((rc < 0)) { + *needs_nak = 1; + } + + break; + case GNILND_CONNREQ_NAK: + rc = kgnilnd_process_nak(dgram); + /* return early to prevent reconnect bump */ + return rc; + default: + CERROR("unexpected connreq type %s (%d) from %s\n", + kgnilnd_connreq_type2str(&dgram->gndg_conn_in), + dgram->gndg_conn_in.gncr_type, + libcfs_nid2str(dgram->gndg_conn_in.gncr_srcnid)); + rc = -EINVAL; + *needs_nak = 1; + break; + } + +connreq_out: + RETURN(rc); +} + +int +kgnilnd_probe_and_process_dgram(kgn_device_t *dev) +{ + int rc; + int needs_nak = 0; + lnet_nid_t nak_dstnid = LNET_NID_ANY; + lnet_nid_t orig_dstnid; + kgn_dgram_t *dgram = NULL; + kgn_peer_t *peer; + ENTRY; + + if (CFS_FAIL_CHECK(CFS_FAIL_GNI_PAUSE_DGRAM_COMP)) { + rc = 0; + } else { + rc = kgnilnd_probe_for_dgram(dev, &dgram); + } + + if (rc == 0) { + RETURN(0); + } else if (rc < 0) { + GOTO(inform_peer, rc); + } else { + /* rc > 1 means it did something, reset for this func */ + rc = 0; + } + + switch (dgram->gndg_type) { + case GNILND_DGRAM_WC_REQ: + case GNILND_DGRAM_REQ: + rc = kgnilnd_process_connreq(dgram, &needs_nak); + break; + case GNILND_DGRAM_NAK: + CDEBUG(D_NETTRACE, "NAK to %s done\n", + libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid)); + break; + default: + CERROR("unknown datagram type %s (%d)\n", + kgnilnd_dgram_type2str(dgram), dgram->gndg_type); + break; + } + + /* stash data to use after releasing current datagram */ + /* don't stash net - we are operating on a net already, + * so the lock on rw_net_lock is sufficient */ + + nak_dstnid = dgram->gndg_conn_in.gncr_srcnid; + +inform_peer: + LASSERTF(dgram != NULL, "dgram 0x%p rc %d needs_nak %d\n", dgram, rc, needs_nak); + + orig_dstnid = dgram->gndg_conn_out.gncr_dstnid; + + kgnilnd_release_dgram(dev, dgram); + + CDEBUG(D_NET, "cleaning up dgram to %s, rc %d\n", + libcfs_nid2str(orig_dstnid), rc); + + /* if this was a WC_REQ that matched an existing peer, it'll get marked done + * in kgnilnd_finish_connect - if errors are from before we get to there, + * we just drop as it is a WC_REQ - the peer CAN'T be waiting for it */ + if ((orig_dstnid != LNET_NID_ANY) && (rc < 0)) { + /* if we have a negative rc, we want to find a peer to inform about + * the bad connection attempt. Sorry buddy, better luck next time! */ + + write_lock(&kgnilnd_data.kgn_peer_conn_lock); + peer = kgnilnd_find_peer_locked(orig_dstnid); + + if (peer != NULL) { + /* add ref to make sure he stays around past the possible unlink + * so we can tell LNet about him */ + kgnilnd_peer_addref(peer); + + /* if he still cares about the outstanding connect */ + if (peer->gnp_connecting >= GNILND_PEER_CONNECT) { + /* check if he is on the connd list and remove.. 
*/ + spin_lock(&peer->gnp_net->gnn_dev->gnd_connd_lock); + if (!list_empty(&peer->gnp_connd_list)) { + list_del_init(&peer->gnp_connd_list); + /* drop connd ref */ + kgnilnd_peer_decref(peer); + } + spin_unlock(&peer->gnp_net->gnn_dev->gnd_connd_lock); + + /* clear gnp_connecting so we don't have a non-connecting peer + * on gnd_connd_list */ + peer->gnp_connecting = GNILND_PEER_IDLE; + + set_mb(peer->gnp_last_dgram_errno, rc); + + kgnilnd_peer_increase_reconnect_locked(peer); + } + } + write_unlock(&kgnilnd_data.kgn_peer_conn_lock); + + /* now that we are outside the lock, tell Mommy */ + if (peer != NULL) { + kgnilnd_peer_notify(peer, rc); + kgnilnd_peer_decref(peer); + } + } + + if (needs_nak) { + kgnilnd_send_nak(dev, nak_dstnid, rc); + } + + RETURN(1); +} + +void +kgnilnd_reaper_dgram_check(kgn_device_t *dev) +{ + kgn_dgram_t *dgram, *tmp; + int i; + + spin_lock(&dev->gnd_dgram_lock); + + for (i = 0; i < (*kgnilnd_tunables.kgn_peer_hash_size - 1); i++) { + list_for_each_entry_safe(dgram, tmp, &dev->gnd_dgrams[i], gndg_list) { + unsigned long now = jiffies; + unsigned long timeout; + + /* don't timeout stuff if the network is mucked or shutting down */ + if (kgnilnd_check_hw_quiesce()) { + break; + } + + if ((dgram->gndg_state != GNILND_DGRAM_POSTED) || + (dgram->gndg_type == GNILND_DGRAM_WC_REQ)) { + continue; + } + CDEBUG(D_NETTRACE, "checking dgram 0x%p type %s " + "state %s conn 0x%p to %s age %lus\n", + dgram, kgnilnd_dgram_type2str(dgram), + kgnilnd_dgram_state2str(dgram), dgram->gndg_conn, + libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid), + cfs_duration_sec(now - dgram->gndg_post_time)); + + timeout = cfs_time_seconds(*kgnilnd_tunables.kgn_timeout); + + if (time_before(now, (dgram->gndg_post_time + timeout))) + continue; + + CNETERR("%s datagram to %s timed out @ %lus dgram " + "0x%p state %s conn 0x%p\n", + kgnilnd_dgram_type2str(dgram), + libcfs_nid2str(dgram->gndg_conn_out.gncr_dstnid), + cfs_duration_sec(now - dgram->gndg_post_time), + dgram, kgnilnd_dgram_state2str(dgram), + dgram->gndg_conn); + + kgnilnd_cancel_dgram_locked(dgram); + } + } + spin_unlock(&dev->gnd_dgram_lock); +} + + +/* use a thread for the possibly long-blocking wait_by_id to prevent + * stalling the global workqueues */ +int +kgnilnd_dgram_waitq(void *arg) +{ + kgn_device_t *dev = (kgn_device_t *) arg; + char name[16]; + gni_return_t grc; + __u64 readyid; + DEFINE_WAIT(mover_done); + + snprintf(name, sizeof(name), "kgnilnd_dgn_%02d", dev->gnd_id); + cfs_daemonize(name); + cfs_block_allsigs(); + + /* all gnilnd threads need to run fairly urgently */ + set_user_nice(current, *kgnilnd_tunables.kgn_nice); + + /* we dont shut down until the device shuts down ... 
*/ + while (!kgnilnd_data.kgn_shutdown) { + /* to quiesce or to not quiesce, that is the question */ + if (unlikely(kgnilnd_data.kgn_quiesce_trigger)) { + KGNILND_SPIN_QUIESCE; + } + + while (CFS_FAIL_TIMEOUT(CFS_FAIL_GNI_PAUSE_DGRAM_COMP, 1)) {} + + /* check once a second */ + grc = kgnilnd_postdata_probe_wait_by_id(dev->gnd_handle, + 1000, &readyid); + + if (grc == GNI_RC_SUCCESS) { + CDEBUG(D_INFO, "waking up dgram mover thread\n"); + kgnilnd_schedule_dgram(dev); + + /* wait for dgram thread to ping us before spinning again */ + prepare_to_wait(&dev->gnd_dgping_waitq, &mover_done, + TASK_INTERRUPTIBLE); + + /* don't sleep if we need to quiesce */ + if (likely(!kgnilnd_data.kgn_quiesce_trigger)) { + schedule(); + } + finish_wait(&dev->gnd_dgping_waitq, &mover_done); + } + } + + kgnilnd_thread_fini(); + return 0; +} + +int +kgnilnd_start_outbound_dgrams(kgn_device_t *dev) +{ + int did_something = 0, rc; + kgn_peer_t *peer = NULL; + + spin_lock(&dev->gnd_connd_lock); + + /* Active connect - we added this in kgnilnd_launch_tx */ + while (!list_empty(&dev->gnd_connd_peers)) { + peer = list_first_entry(&dev->gnd_connd_peers, + kgn_peer_t, gnp_connd_list); + + /* ref for connd removed in if/else below */ + list_del_init(&peer->gnp_connd_list); + + /* gnp_connecting and membership on gnd_connd_peers should be + * done coherently to avoid double adding, etc */ + /* don't need kgnilnd_data.kgn_peer_conn_lock here as that is only needed + * to get the peer to gnp_connecting in the first place. We just need to + * rely on gnd_connd_lock to serialize someone pulling him from the list + * BEFORE clearing gnp_connecting */ + LASSERTF(peer->gnp_connecting != GNILND_PEER_IDLE, "peer 0x%p->%s not connecting\n", + peer, libcfs_nid2str(peer->gnp_nid)); + + spin_unlock(&dev->gnd_connd_lock); + + CDEBUG(D_NET, "processing connect to %s\n", + libcfs_nid2str(peer->gnp_nid)); + + did_something += 1; + rc = kgnilnd_start_connect(peer); + + if (likely(rc >= 0)) { + /* 0 on success, positive on 'just drop peer' errors */ + kgnilnd_peer_decref(peer); + } else if (rc == -ENOMEM) { + /* if we are out of wildcards, add back to + * connd_list - then break out and we'll try later + * if other errors, we'll bail & cancel pending tx */ + write_lock(&kgnilnd_data.kgn_peer_conn_lock); + if (peer->gnp_connecting == GNILND_PEER_POSTING) { + peer->gnp_connecting = GNILND_PEER_CONNECT; + spin_lock(&dev->gnd_connd_lock); + list_add_tail(&peer->gnp_connd_list, + &dev->gnd_connd_peers); + } else { + /* connecting changed while we were posting */ + + LASSERTF(peer->gnp_connecting == GNILND_PEER_NEEDS_DEATH, "Peer is in invalid" + " state 0x%p->%s, connecting %d\n", + peer, libcfs_nid2str(peer->gnp_nid), peer->gnp_connecting); + peer->gnp_connecting = GNILND_PEER_KILL; + spin_lock(&dev->gnd_connd_lock); + /* remove the peer ref frrom the cond list */ + kgnilnd_peer_decref(peer); + /* let the system handle itself */ + } + write_unlock(&kgnilnd_data.kgn_peer_conn_lock); + /* the datagrams are a global pool, + * so break out of trying and hope some free + * up soon */ + did_something -= 1; + break; + } else { + /* something bad happened, you lose */ + CNETERR("could not start connecting to %s " + "rc %d: Will retry until TX timeout\n", + libcfs_nid2str(peer->gnp_nid), rc); + /* It didnt post so just set connecting back to zero now. + * The reaper will reattempt the connection if it needs too. + * If the peer needs death set it so the reaper will cleanup. 
+ */ + write_lock(&kgnilnd_data.kgn_peer_conn_lock); + if (peer->gnp_connecting == GNILND_PEER_POSTING) { + peer->gnp_connecting = GNILND_PEER_IDLE; + kgnilnd_peer_increase_reconnect_locked(peer); + } else { + LASSERTF(peer->gnp_connecting == GNILND_PEER_NEEDS_DEATH, "Peer is in invalid" + " state 0x%p->%s, connecting %d\n", + peer, libcfs_nid2str(peer->gnp_nid), peer->gnp_connecting); + peer->gnp_connecting = GNILND_PEER_KILL; + } + write_unlock(&kgnilnd_data.kgn_peer_conn_lock); + + /* hold onto ref until we are really done - if it was + * unlinked this could result in a destroy */ + kgnilnd_peer_decref(peer); + } + spin_lock(&dev->gnd_connd_lock); + } + + spin_unlock(&dev->gnd_connd_lock); + RETURN(did_something); +} + +static void +kgnilnd_dgram_poke_with_stick(unsigned long arg) +{ + int dev_id = arg; + kgn_device_t *dev = &kgnilnd_data.kgn_devices[dev_id]; + + wake_up(&dev->gnd_dgram_waitq); +} + +/* use single thread for dgrams - should be sufficient for performance */ +int +kgnilnd_dgram_mover(void *arg) +{ + kgn_device_t *dev = (kgn_device_t *)arg; + char name[16]; + int rc, did_something; + unsigned long next_purge_check = jiffies - 1; + unsigned long timeout; + struct timer_list timer; + DEFINE_WAIT(wait); + + snprintf(name, sizeof(name), "kgnilnd_dg_%02d", dev->gnd_id); + cfs_daemonize(name); + cfs_block_allsigs(); + /* all gnilnd threads need to run fairly urgently */ + set_user_nice(current, *kgnilnd_tunables.kgn_nice); + + /* we are ok not locking for these variables as the dgram waitq threads + * will block both due to tying up net (kgn_shutdown) and the completion + * event for the dgram_waitq (kgn_quiesce_trigger) */ + + while (!kgnilnd_data.kgn_shutdown) { + /* Safe: kgn_shutdown only set when quiescent */ + + /* race with stack reset - we want to hold off seeing any new incoming dgrams + * so we can force a dirty WC dgram for Bug 762072 - put right before + * quiesce check so that it'll go right into that and not do any + * dgram mucking */ + CFS_RACE(CFS_FAIL_GNI_WC_DGRAM_FREE); + + /* to quiesce or to not quiesce, that is the question */ + if (unlikely(kgnilnd_data.kgn_quiesce_trigger)) { + KGNILND_SPIN_QUIESCE; + } + did_something = 0; + + CFS_RACE(CFS_FAIL_GNI_QUIESCE_RACE); + + /* process any newly completed dgrams */ + down_read(&kgnilnd_data.kgn_net_rw_sem); + + rc = kgnilnd_probe_and_process_dgram(dev); + if (rc > 0) { + did_something += rc; + } + + up_read(&kgnilnd_data.kgn_net_rw_sem); + + /* start new outbound dgrams */ + did_something += kgnilnd_start_outbound_dgrams(dev); + + /* find dead dgrams */ + if (time_after_eq(jiffies, next_purge_check)) { + /* these don't need to be checked that often */ + kgnilnd_reaper_dgram_check(dev); + + next_purge_check = (long) jiffies + + cfs_time_seconds(kgnilnd_data.kgn_new_min_timeout / 4); + } + + /* careful with the jiffy wrap... 
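
The mover bounds the schedule() sleep below by arming a one-shot timer that fires kgnilnd_dgram_poke_with_stick(). The generic shape of that pattern, as a hedged standalone sketch (names assumed, quiesce/did_something checks omitted; the timer and waitqueue APIs come in via gnilnd.h):

/* illustration only: timer-bounded interruptible sleep */
static void
example_poke(unsigned long arg)
{
        wake_up((wait_queue_head_t *)arg);
}

static void
example_bounded_sleep(wait_queue_head_t *wq, long timeout_jiffies)
{
        struct timer_list timer;
        DEFINE_WAIT(wait);

        setup_timer(&timer, example_poke, (unsigned long)wq);
        mod_timer(&timer, jiffies + timeout_jiffies);

        prepare_to_wait(wq, &wait, TASK_INTERRUPTIBLE);
        schedule();
        finish_wait(wq, &wait);

        del_singleshot_timer_sync(&timer);
}
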
*/ + timeout = (long)(next_purge_check - jiffies); + + CDEBUG(D_INFO, "did %d timeout %lu next %lu jiffies %lu\n", + did_something, timeout, next_purge_check, jiffies); + + if (did_something || timeout <= 0) { + did_something = 0; + continue; + } + + prepare_to_wait(&dev->gnd_dgram_waitq, &wait, TASK_INTERRUPTIBLE); + + setup_timer(&timer, kgnilnd_dgram_poke_with_stick, dev->gnd_id); + mod_timer(&timer, (long) jiffies + timeout); + + /* last second chance for others to poke us */ + did_something += xchg(&dev->gnd_dgram_ready, GNILND_DGRAM_IDLE); + + /* check flag variables before comitting */ + if (!did_something && + !kgnilnd_data.kgn_shutdown && + !kgnilnd_data.kgn_quiesce_trigger) { + CDEBUG(D_INFO, "schedule timeout %ld (%lu sec)\n", + timeout, cfs_duration_sec(timeout)); + wake_up_all(&dev->gnd_dgping_waitq); + schedule(); + CDEBUG(D_INFO, "awake after schedule\n"); + } + + del_singleshot_timer_sync(&timer); + finish_wait(&dev->gnd_dgram_waitq, &wait); + } + + kgnilnd_thread_fini(); + return 0; +} + diff --git a/lnet/klnds/gnilnd/gnilnd_debug.c b/lnet/klnds/gnilnd/gnilnd_debug.c new file mode 100644 index 0000000..8230d98 --- /dev/null +++ b/lnet/klnds/gnilnd/gnilnd_debug.c @@ -0,0 +1,108 @@ +/* + * Copyright (C) 2009-2012 Cray, Inc. + * + * Author: Nic Henke + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ +#include "gnilnd.h" + +void +_kgnilnd_debug_msg(kgn_msg_t *msg, struct libcfs_debug_msg_data *msgdata, + const char *fmt, ...) +{ + va_list args; + + va_start(args, fmt); + /* XXX Nic TBD: add handling of gnm_u ? */ + libcfs_debug_vmsg2(msgdata, fmt, args, + " msg@0x%p m/v/ty/ck/pck/pl %08x/%d/%d/%x/%x/%d x%d:%s\n", + msg, msg->gnm_magic, msg->gnm_version, msg->gnm_type, + msg->gnm_cksum, msg->gnm_payload_cksum, + msg->gnm_payload_len, msg->gnm_seq, + kgnilnd_msgtype2str(msg->gnm_type)); + va_end(args); +} + +void +_kgnilnd_debug_conn(kgn_conn_t *conn, struct libcfs_debug_msg_data *msgdata, + const char *fmt, ...) +{ + va_list args; + + va_start(args, fmt); + libcfs_debug_vmsg2(msgdata, fmt, args, + " conn@0x%p->%s:%s cq %u, to %ds, " + " RX %d @ %lu/%lus; TX %d @ %lus/%lus; " + " NOOP %lus/%lu/%lus; sched %lus/%lus/%lus ago \n", + conn, conn->gnc_peer ? 
libcfs_nid2str(conn->gnc_peer->gnp_nid) : + "", kgnilnd_conn_state2str(conn), + conn->gnc_cqid, conn->gnc_timeout, + conn->gnc_rx_seq, + cfs_duration_sec(jiffies - conn->gnc_last_rx), + cfs_duration_sec(jiffies - conn->gnc_last_rx_cq), + conn->gnc_tx_seq, + cfs_duration_sec(jiffies - conn->gnc_last_tx), + cfs_duration_sec(jiffies - conn->gnc_last_tx_cq), + cfs_duration_sec(jiffies - conn->gnc_last_noop_want), + cfs_duration_sec(jiffies - conn->gnc_last_noop_sent), + cfs_duration_sec(jiffies - conn->gnc_last_noop_cq), + cfs_duration_sec(jiffies - conn->gnc_last_sched_ask), + cfs_duration_sec(jiffies - conn->gnc_last_sched_do), + cfs_duration_sec(jiffies - conn->gnc_device->gnd_sched_alive)); + + + va_end(args); +} + +void +_kgnilnd_debug_tx(kgn_tx_t *tx, struct libcfs_debug_msg_data *msgdata, + const char *fmt, ...) +{ + kgn_tx_ev_id_t *id = &tx->tx_id; + char *nid = ""; + va_list args; + + if (tx->tx_conn && tx->tx_conn->gnc_peer) { + nid = libcfs_nid2str(tx->tx_conn->gnc_peer->gnp_nid); + } + + va_start(args, fmt); + libcfs_debug_vmsg2(msgdata, fmt, args, + " tx@0x%p->%s id "LPX64"/%u/%d:%d msg %x/%s/%d q %s@%lds->0x%p f %x re %d\n", + tx, nid, id->txe_cookie, id->txe_smsg_id, id->txe_cqid, + id->txe_idx, tx->tx_msg.gnm_type, + kgnilnd_msgtype2str(tx->tx_msg.gnm_type), tx->tx_buftype, + kgnilnd_tx_state2str(tx->tx_list_state), + cfs_duration_sec((long)jiffies - tx->tx_qtime), tx->tx_list_p, + tx->tx_state, tx->tx_retrans); + va_end(args); +} + +void +_kgnilnd_api_rc_lbug(const char* rcstr, int rc, struct libcfs_debug_msg_data *msgdata, + const char *fmt, ...) +{ + va_list args; + + va_start(args, fmt); + libcfs_debug_vmsg2(msgdata, fmt, args, + " GNI API violated? Unexpected rc %s(%d)!\n", + rcstr, rc); + va_end(args); + LBUG(); +} diff --git a/lnet/klnds/gnilnd/gnilnd_hss_ops.h b/lnet/klnds/gnilnd/gnilnd_hss_ops.h new file mode 100644 index 0000000..ec75177 --- /dev/null +++ b/lnet/klnds/gnilnd/gnilnd_hss_ops.h @@ -0,0 +1,284 @@ +/* + * Copyright (C) 2010-2012 Cray, Inc. + * Author: Nic Henke + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ * + */ +#ifndef _GNILND_HSS_OPS_H +#define _GNILND_HSS_OPS_H + +/* for krca nid & nic translation */ +#include +#include + +/* the SimNow nodes can't load rca.ko, so we need to detect this + * and fake a table that'd work for lookups there */ + +typedef struct kgn_nid_entry { + __u32 nid; + __u32 nicaddr; +} kgn_nid_entry_t; + +typedef struct kgn_hssops +{ + /* function pointers for nid and nic conversion */ + /* from krca_lib.h */ + int (*nid_to_nicaddr)(__u32 nid, int numnic, __u32 *nicaddr); + int (*nicaddr_to_nid)(__u32 nicaddr, __u32 *nid); + void (*hb_to_l0)(void); +} kgn_hssops_t; + +/* pull in static store in gnilnd.c */ +extern kgn_hssops_t kgnilnd_hssops; + +#define GNILND_NO_RCA 0xdeadbeef +#define GNILND_NO_QUIESCE 0xdeadbeef + +static inline int +kgnilnd_lookup_rca_funcs(void) +{ + void *funcp; + + funcp = __symbol_get("send_hb_2_l0"); + if (funcp == 0) { + CERROR("couldn't find send_hb_2_l0\n"); + /* not fatal for now */ + } else { + kgnilnd_hssops.hb_to_l0 = funcp; + } + + /* if we find one, we should get the other */ + + funcp = __symbol_get("krca_nid_to_nicaddrs"); + if (funcp == 0) { + kgnilnd_hssops.nid_to_nicaddr = (void *)GNILND_NO_RCA; + kgnilnd_hssops.nicaddr_to_nid = (void *)GNILND_NO_RCA; + LCONSOLE_INFO("using SimNow nid table for RCA translation\n"); + return 0; + } + kgnilnd_hssops.nid_to_nicaddr = funcp; + + funcp = __symbol_get("krca_nicaddr_to_nid"); + if (funcp == 0) { + CERROR("found krca_nid_to_nicaddrs but not " + "krca_nicaddr_to_nid\n"); + return -ESRCH; + } + kgnilnd_hssops.nicaddr_to_nid = funcp; + return 0; +} + +#if defined(CONFIG_CRAY_GEMINI) +/* Gemini SimNow has a hard coded table to use - no RCA there */ +#define GNILND_MAX_NID_TABLE 0xffffffff +/* this is all of the nodes defined in the Baker SimNow "sim_platforms" page */ +static kgn_nid_entry_t kgn_nid_table[] = { + {0x1, 0x100}, {0x2, 0x101}, {0x3, 0x104}, {0x4, 0x105}, + {0x5, 0x108}, {0x6, 0x109}, {0x7, 0x10c}, {0x8, 0x10d}, + {0x9, 0x110}, {0xa, 0x111}, {0xb, 0x114}, {0xc, 0x115}, + {0xd, 0x118}, {0xe, 0x119}, {0xf, 0x11c}, {0x10, 0x11d}, + {0x11, 0x120}, {0x12, 0x121}, {0x13, 0x124}, {0x14, 0x125}, + {0x15, 0x128}, {0x16, 0x129}, {0x17, 0x12c}, {0x18, 0x12d}, + {0x19, 0x130}, {0x1a, 0x131}, {0x1b, 0x134}, {0x1c, 0x135}, + {0x1d, 0x138}, {0x1e, 0x139}, {0x1f, 0x13c}, {0x20, 0x13d}, + {0x21, 0x140}, {0x22, 0x141}, {0x23, 0x144}, {0x24, 0x145}, + {0x25, 0x148}, {0x26, 0x149}, {0x27, 0x14c}, {0x28, 0x14d}, + {0x29, 0x150}, {0x2a, 0x151}, {0x2b, 0x154}, {0x2c, 0x155}, + {0x2d, 0x158}, {0x2e, 0x159}, {0x2f, 0x15c}, {0x30, 0x15d}, + {0x31, 0x160}, {0x32, 0x161}, {0x33, 0x164}, {0x3d, 0x178}, + {0x34, 0x165}, {0x3e, 0x179}, {0x35, 0x168}, {0x3f, 0x17c}, + {0x36, 0x169}, {0x40, 0x17d}, {0x37, 0x16c}, {0x41, 0x180}, + {0x38, 0x16d}, {0x42, 0x181}, {0x39, 0x170}, {0x3a, 0x171}, + {0x3b, 0x174}, {0x3c, 0x175}, {0x43, 0x184}, {0x44, 0x185}, + {0x45, 0x188}, {0x46, 0x189}, {0x47, 0x18c}, {0x48, 0x18d}, + /* entries after this are for 'dead' peer tests */ + {0x63, 0x1ff}, {0x111, 0x209}, + {GNILND_MAX_NID_TABLE, GNILND_MAX_NID_TABLE} +}; +static int +gemini_nid_to_nicaddr(__u32 nid, int numnic, __u32 *nicaddr) +{ + int i; + + /* GNILND_NO_RCA, so use hardcoded table for Gemini SimNow */ + if (numnic > 1) { + CERROR("manual nid2nic translation doesn't support" + "multiple nic addrs (you asked for %d)\n", + numnic); + return -EINVAL; + } + + for (i = 0;;i++) { + if (kgn_nid_table[i].nid == GNILND_MAX_NID_TABLE) { + CERROR("could not translate %u to a NIC " + "address\n", nid); + return -ESRCH; + } + if 
(kgn_nid_table[i].nid == nid) { + *nicaddr = kgn_nid_table[i].nicaddr; + return 1; + } + } +} + +static int +gemini_nicaddr_to_nid(__u32 nicaddr, __u32 *nid) +{ + int i; + + /* GNILND_RCA_NOT_HOME, so use hardcoded table for SimNow */ + for (i = 0;;i++) { + if (kgn_nid_table[i].nicaddr == GNILND_MAX_NID_TABLE) { + CERROR("could not translate NIC address " + "%u\n", + nicaddr); + return -ESRCH; + } + if (kgn_nid_table[i].nicaddr == nicaddr) { + *nid = kgn_nid_table[i].nid; + return 1; + } + } +} + +static inline int +kgnilnd_setup_nic_translation(__u32 device_id) +{ + int rc; + + /* do lookup on first use */ + if (unlikely(kgnilnd_hssops.nid_to_nicaddr == NULL)) { + rc = kgnilnd_lookup_rca_funcs(); + if (rc) + return rc; + } + + /* if we have a real function, return - we'll use those going forward */ + if (likely(kgnilnd_hssops.nid_to_nicaddr != (void *)GNILND_NO_RCA)) + return 0; + + kgnilnd_hssops.nid_to_nicaddr = gemini_nid_to_nicaddr; + kgnilnd_hssops.nicaddr_to_nid = gemini_nicaddr_to_nid; + return 0; +} + +#elif defined(CONFIG_CRAY_ARIES) +/* for libcfs_ipif_query */ +#include + +/* Aries Sim doesn't have hardcoded tables, so we'll hijack the nic_pe + * and decode our address and nic addr from that - the rest are just offsets */ +static __u32 aries_sim_base_nid; +static __u32 aries_sim_nic; + +static int +aries_nid_to_nicaddr(__u32 nid, int numnic, __u32 *nicaddr) +{ + if (numnic > 1) { + CERROR("manual nid2nic translation doesn't support" + "multiple nic addrs (you asked for %d)\n", + numnic); + return -EINVAL; + } + if (nid < aries_sim_base_nid) { + CERROR("Request for invalid nid translation %u, minimum %u\n", + nid, aries_sim_base_nid); + return -ESRCH; + } + + *nicaddr = nid - aries_sim_base_nid; + return 1; +} + +static int +aries_nicaddr_to_nid(__u32 nicaddr, __u32 *nid) +{ + *nid = aries_sim_base_nid + nicaddr; + return 1; +} + +/* XXX Nic: This does not support multiple device!!!! 
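
Both SimNow lookup directions above walk the same sentinel-terminated table, stopping at GNILND_MAX_NID_TABLE. Condensed into one hedged sketch (nid-keyed variant shown):

/* illustration only: the scan shared by gemini_nid_to_nicaddr() and
 * gemini_nicaddr_to_nid() */
static int
example_nid_table_lookup(const kgn_nid_entry_t *tbl, __u32 nid, __u32 *nicaddr)
{
        int i;

        for (i = 0; tbl[i].nid != GNILND_MAX_NID_TABLE; i++) {
                if (tbl[i].nid == nid) {
                        *nicaddr = tbl[i].nicaddr;
                        return 1;       /* same "found one address" convention */
                }
        }

        return -ESRCH;
}
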
*/ +static inline int +kgnilnd_setup_nic_translation(__u32 device_id) +{ + char *if_name = "ipogif0"; + __u32 ipaddr, netmask, my_nid; + int up, rc; + + /* do lookup on first use */ + if (unlikely(kgnilnd_hssops.nid_to_nicaddr == NULL)) { + rc = kgnilnd_lookup_rca_funcs(); + if (rc) + return rc; + } + + /* if we have a real function, return - we'll use those going forward */ + if (likely(kgnilnd_hssops.nid_to_nicaddr != (void *)GNILND_NO_RCA)) + return 0; + + rc = libcfs_ipif_query(if_name, &up, &ipaddr, &netmask); + if (rc != 0) { + CERROR("can't get IP interface for %s: %d\n", if_name, rc); + return rc; + } + if (!up) { + CERROR("IP interface %s is down\n", if_name); + return -ENODEV; + } + + my_nid = ((ipaddr >> 8) & 0xFF) + (ipaddr & 0xFF); + aries_sim_nic = device_id; + aries_sim_base_nid = my_nid - aries_sim_nic; + + kgnilnd_hssops.nid_to_nicaddr = aries_nid_to_nicaddr; + kgnilnd_hssops.nicaddr_to_nid = aries_nicaddr_to_nid; + + return 0; +} +#else +#error "Undefined Network Type" +#endif + +/* we use RCA types here to get the compiler to whine when we have + * mismatched types */ +static inline int +kgnilnd_nid_to_nicaddrs(rca_nid_t nid, int numnic, nic_addr_t *nicaddrs) +{ + /* compile time checks to ensure that the RCA types match + * the LNet idea of NID and NIC */ + typecheck(__u32, nid); + typecheck(__u32, *nicaddrs); + + LASSERTF(kgnilnd_hssops.nid_to_nicaddr != NULL, "missing setup?\n"); + + return kgnilnd_hssops.nid_to_nicaddr(nid, numnic, nicaddrs); +} + +static inline int +kgnilnd_nicaddr_to_nid(nic_addr_t nicaddr, rca_nid_t *nid) +{ + /* compile time checks to ensure that the RCA types match + * the LNet idea of NID and NIC */ + typecheck(__u32, nicaddr); + typecheck(__u32, nid[0]); + + LASSERTF(kgnilnd_hssops.nicaddr_to_nid != NULL, "missing setup ?\n"); + + return kgnilnd_hssops.nicaddr_to_nid(nicaddr, nid); +} + +#endif /* _GNILND_HSS_OPS_H */ diff --git a/lnet/klnds/gnilnd/gnilnd_modparams.c b/lnet/klnds/gnilnd/gnilnd_modparams.c new file mode 100644 index 0000000..17cbfd6 --- /dev/null +++ b/lnet/klnds/gnilnd/gnilnd_modparams.c @@ -0,0 +1,500 @@ +/* + * Copyright (C) 2004 Cluster File Systems, Inc. + * + * Copyright (C) 2009-2012 Cray, Inc. + * + * Derived from work by: Eric Barton + * Author: Nic Henke + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
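
A worked instance of the Aries SimNow NID derivation above, with an assumed (purely illustrative) ipogif0 address:

/* example: ipogif0 = 10.128.3.31 -> host-order ipaddr = 0x0a80031f
 *   (ipaddr >> 8) & 0xFF = 3
 *   (ipaddr     ) & 0xFF = 31
 *   my_nid               = 3 + 31 = 34
 *   aries_sim_base_nid   = my_nid - device_id
 */
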
+ * + */ + +#include "gnilnd.h" + +static int credits = 256; +CFS_MODULE_PARM(credits, "i", int, 0444, + "# concurrent sends"); + +static int peer_credits = 16; +CFS_MODULE_PARM(peer_credits, "i", int, 0444, + "# LNet peer credits"); + +/* NB - we'll not actually limit sends to this, we just size the mailbox buffer + * such that at most we'll have concurrent_sends * max_immediate messages + * in the mailbox */ +static int concurrent_sends = 0; +CFS_MODULE_PARM(concurrent_sends, "i", int, 0444, + "# concurrent HW sends to 1 peer"); + +/* default for 2k nodes @ 16 peer credits */ +static int fma_cq_size = 32768; +CFS_MODULE_PARM(fma_cq_size, "i", int, 0444, + "size of the completion queue"); + +static int timeout = GNILND_BASE_TIMEOUT; +/* can't change @ runtime because LNet gets NI data at startup from + * this value */ +CFS_MODULE_PARM(timeout, "i", int, 0444, + "communications timeout (seconds)"); + +/* time to wait between datagram timeout and sending of next dgram */ +static int min_reconnect_interval = GNILND_MIN_RECONNECT_TO; +CFS_MODULE_PARM(min_reconnect_interval, "i", int, 0644, + "minimum connection retry interval (seconds)"); + +/* if this goes longer than timeout, we'll timeout the TX before + * the dgram */ +static int max_reconnect_interval = GNILND_MAX_RECONNECT_TO; +CFS_MODULE_PARM(max_reconnect_interval, "i", int, 0644, + "maximum connection retry interval (seconds)"); + +static int max_immediate = (2<<10); +CFS_MODULE_PARM(max_immediate, "i", int, 0644, + "immediate/RDMA breakpoint"); + +#ifdef CONFIG_CRAY_GEMINI +static int checksum = GNILND_CHECKSUM_SMSG_BTE; +#else +static int checksum = 0; +#endif +CFS_MODULE_PARM(checksum, "i", int, 0644, + "0: None, 1: headers, 2: short msg, 3: all traffic"); + +static int checksum_dump = 0; +CFS_MODULE_PARM(checksum_dump, "i", int, 0644, + "0: None, 1: dump log on failure, 2: payload data to D_INFO log"); + +static int bte_hash = 1; +CFS_MODULE_PARM(bte_hash, "i", int, 0644, + "enable hashing for BTE (RDMA) transfers"); + +static int bte_adapt = 1; +CFS_MODULE_PARM(bte_adapt, "i", int, 0644, + "enable adaptive request and response for BTE (RDMA) transfers"); + +static int bte_relaxed_ordering = 1; +CFS_MODULE_PARM(bte_relaxed_ordering, "i", int, 0644, + "enable relaxed ordering (PASSPW) for BTE (RDMA) transfers"); + +static int ptag = GNI_PTAG_LND; +CFS_MODULE_PARM(ptag, "i", int, 0444, + "ptag for Gemini CDM"); + +static int max_retransmits = 1024; +CFS_MODULE_PARM(max_retransmits, "i", int, 0644, + "max retransmits for FMA"); + +static int nwildcard = 4; +CFS_MODULE_PARM(nwildcard, "i", int, 0444, + "# wildcard datagrams to post per net (interface)"); + +static int nice = -20; +CFS_MODULE_PARM(nice, "i", int, 0444, + "nice value for kgnilnd threads, default -20"); + +static int rdmaq_intervals = 4; +CFS_MODULE_PARM(rdmaq_intervals, "i", int, 0644, + "# intervals per second for rdmaq throttling, default 4, 0 to disable"); + +static int loops = 100; +CFS_MODULE_PARM(loops, "i", int, 0644, + "# of loops before scheduler is friendly, default 100"); + +static int hash_size = 503; +CFS_MODULE_PARM(hash_size, "i", int, 0444, + "prime number for peer/conn hash sizing, default 503"); + +static int peer_health = 0; +CFS_MODULE_PARM(peer_health, "i", int, 0444, + "Disable peer timeout for LNet peer health, default off, > 0 to enable"); + +static int vmap_cksum = 0; +CFS_MODULE_PARM(vmap_cksum, "i", int, 0644, + "use vmap for all kiov checksumming, default off"); + +static int mbox_per_block = GNILND_FMABLK; +CFS_MODULE_PARM(mbox_per_block, 
"i", int, 0644, + "mailboxes per block"); + +static int nphys_mbox = 0; +CFS_MODULE_PARM(nphys_mbox, "i", int, 0444, + "# mbox to preallocate from physical memory, default 0"); + +static int mbox_credits = GNILND_MBOX_CREDITS; +CFS_MODULE_PARM(mbox_credits, "i", int, 0644, + "number of credits per mailbox"); + +static int sched_threads = GNILND_SCHED_THREADS; +CFS_MODULE_PARM(sched_threads, "i", int, 0444, + "number of threads for moving data"); + +static int net_hash_size = 11; +CFS_MODULE_PARM(net_hash_size, "i", int, 0444, + "prime number for net hash sizing, default 11"); + +static int hardware_timeout = GNILND_HARDWARE_TIMEOUT; +CFS_MODULE_PARM(hardware_timeout, "i", int, 0444, + "maximum time for traffic to get from one node to another"); + +static int mdd_timeout = GNILND_MDD_TIMEOUT; +CFS_MODULE_PARM(mdd_timeout, "i", int, 0644, + "maximum time (in minutes) for mdd to be held"); + +kgn_tunables_t kgnilnd_tunables = { + .kgn_min_reconnect_interval = &min_reconnect_interval, + .kgn_max_reconnect_interval = &max_reconnect_interval, + .kgn_credits = &credits, + .kgn_peer_credits = &peer_credits, + .kgn_concurrent_sends = &concurrent_sends, + .kgn_fma_cq_size = &fma_cq_size, + .kgn_timeout = &timeout, + .kgn_max_immediate = &max_immediate, + .kgn_checksum = &checksum, + .kgn_checksum_dump = &checksum_dump, + .kgn_bte_hash = &bte_hash, + .kgn_bte_adapt = &bte_adapt, + .kgn_bte_relaxed_ordering = &bte_relaxed_ordering, + .kgn_ptag = &ptag, + .kgn_max_retransmits = &max_retransmits, + .kgn_nwildcard = &nwildcard, + .kgn_nice = &nice, + .kgn_rdmaq_intervals = &rdmaq_intervals, + .kgn_loops = &loops, + .kgn_peer_hash_size = &hash_size, + .kgn_peer_health = &peer_health, + .kgn_vmap_cksum = &vmap_cksum, + .kgn_mbox_per_block = &mbox_per_block, + .kgn_nphys_mbox = &nphys_mbox, + .kgn_mbox_credits = &mbox_credits, + .kgn_sched_threads = &sched_threads, + .kgn_net_hash_size = &net_hash_size, + .kgn_hardware_timeout = &hardware_timeout, + .kgn_mdd_timeout = &mdd_timeout +}; + +#if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM +static cfs_sysctl_table_t kgnilnd_ctl_table[] = { + { + INIT_CTL_NAME(2) + .procname = "min_reconnect_interval", + .data = &min_reconnect_interval, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + INIT_CTL_NAME(3) + .procname = "max_reconnect_interval", + .data = &max_reconnect_interval, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + INIT_CTL_NAME(5) + .procname = "credits", + .data = &credits, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = &proc_dointvec + }, + { + INIT_CTL_NAME(6) + .procname = "peer_credits", + .data = &peer_credits, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = &proc_dointvec + }, + { + INIT_CTL_NAME(7) + .procname = "fma_cq_size", + .data = &fma_cq_size, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = &proc_dointvec + }, + { + INIT_CTL_NAME(8) + .procname = "timeout", + .data = &timeout, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = &proc_dointvec + }, + { + INIT_CTL_NAME(9) + .procname = "max_immediate", + .data = &max_immediate, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = &proc_dointvec + }, + { + INIT_CTL_NAME(10) + .procname = "checksum", + .data = &checksum, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + INIT_CTL_NAME(11) + .procname = "bte_hash", + .data = &bte_hash, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + INIT_CTL_NAME(12) + .procname 
= "bte_adapt", + .data = &bte_adapt, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + INIT_CTL_NAME(13) + .procname = "ptag", + .data = &ptag, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = &proc_dointvec + }, + { + INIT_CTL_NAME(14) + .procname = "nwildcard", + .data = &nwildcard, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = &proc_dointvec + }, + { + INIT_CTL_NAME(15) + .procname = "bte_relaxed_ordering", + .data = &bte_relaxed_ordering, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + INIT_CTL_NAME(16) + .procname = "checksum_dump", + .data = &checksum_dump, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + INIT_CTL_NAME(17) + .procname = "nice", + .data = &nice, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = &proc_dointvec + }, + { + INIT_CTL_NAME(18) + .procname = "rdmaq_intervals", + .data = &rdmaq_intervals, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + INIT_CTL_NAME(19) + .procname = "loops", + .data = &loops, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + INIT_CTL_NAME(20) + .procname = "hash_size", + .data = &hash_size, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = &proc_dointvec + }, + { + INIT_CTL_NAME(21) + .procname = "peer_health", + .data = &peer_health, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = &proc_dointvec + }, + { + INIT_CTL_NAME(22) + .procname = "vmap_cksum", + .data = &vmap_cksum, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + INIT_CTL_NAME(23) + .procname = "mbox_per_block", + .data = &mbox_per_block, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + INIT_CTL_NAME(24) + .procname = "mbox_credits" + .data = &mbox_credits, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + INIT_CTL_NAME(25) + .procname = "sched_threads" + .data = &sched_threads, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = &proc_dointvec + }, + { + INIT_CTL_NAME(26) + .procname = "net_hash_size", + .data = &net_hash_size, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = &proc_dointvec + }, + { + INIT_CTL_NAME(27) + .procname = "hardware_timeout", + .data = &hardware_timeout, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = &proc_dointvec + }, + { + INIT_CTL_NAME(28) + .procname = "mdd_timeout", + .data = &mdd_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + INIT_CTL_NAME(29) + .procname = "max_retransmits" + .data = &max_retransmits, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = &proc_dointvec + }, + { + INIT_CTL_NAME(30) + .procname = "concurrent_sends", + .data = &concurrent_sends, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = &proc_dointvec + }, + { + INIT_CTL_NAME(31) + .procname = "nphys_mbox", + .data = &nphys_mbox, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = &proc_dointvec + }, + {0} +}; + +static cfs_sysctl_table_t kgnilnd_top_ctl_table[] = { + { + INIT_CTL_NAME(202) + .procname = "gnilnd", + .data = NULL, + .maxlen = 0, + .mode = 0555, + .child = kgnilnd_ctl_table + }, + { INIT_CTL_NAME(0) } +}; +#endif + +int +kgnilnd_tunables_init() +{ + int rc = 0; + +#if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM + kgnilnd_tunables.kgn_sysctl = + cfs_register_sysctl_table(kgnilnd_top_ctl_table, 0); + + if (kgnilnd_tunables.kgn_sysctl == 
NULL) + CWARN("Can't setup /proc tunables\n"); +#endif + switch (*kgnilnd_tunables.kgn_checksum) { + default: + CERROR("Invalid checksum module parameter: %d\n", + *kgnilnd_tunables.kgn_checksum); + rc = -EINVAL; + GOTO(out, rc); + case GNILND_CHECKSUM_OFF: + /* no checksumming */ + break; + case GNILND_CHECKSUM_SMSG_HEADER: + LCONSOLE_INFO("SMSG header only checksumming enabled\n"); + break; + case GNILND_CHECKSUM_SMSG: + LCONSOLE_INFO("SMSG checksumming enabled\n"); + break; + case GNILND_CHECKSUM_SMSG_BTE: + LCONSOLE_INFO("SMSG + BTE checksumming enabled\n"); + break; + } + + if (*kgnilnd_tunables.kgn_max_immediate > GNILND_MAX_IMMEDIATE) { + LCONSOLE_ERROR("kgnilnd module parameter 'max_immediate' too large %d > %d\n", + *kgnilnd_tunables.kgn_max_immediate, GNILND_MAX_IMMEDIATE); + rc = -EINVAL; + GOTO(out, rc); + } + + if (*kgnilnd_tunables.kgn_mbox_per_block < 1) { + *kgnilnd_tunables.kgn_mbox_per_block = 1; + } + + if (*kgnilnd_tunables.kgn_concurrent_sends == 0) { + *kgnilnd_tunables.kgn_concurrent_sends = *kgnilnd_tunables.kgn_peer_credits; + } else if (*kgnilnd_tunables.kgn_concurrent_sends > *kgnilnd_tunables.kgn_peer_credits) { + LCONSOLE_ERROR("kgnilnd parameter 'concurrent_sends' too large: %d > %d (peer_credits)\n", + *kgnilnd_tunables.kgn_concurrent_sends, *kgnilnd_tunables.kgn_peer_credits); + rc = -EINVAL; + } +out: + return rc; +} + +void +kgnilnd_tunables_fini() +{ +#if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM + if (kgnilnd_tunables.kgn_sysctl != NULL) + cfs_unregister_sysctl_table(kgnilnd_tunables.kgn_sysctl); +#endif +} diff --git a/lnet/klnds/gnilnd/gnilnd_proc.c b/lnet/klnds/gnilnd/gnilnd_proc.c new file mode 100644 index 0000000..f161224 --- /dev/null +++ b/lnet/klnds/gnilnd/gnilnd_proc.c @@ -0,0 +1,1260 @@ +/* + * Copyright (C) 2009-2012 Cray, Inc. + * + * Author: Nic Henke + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +/* this code liberated and modified from lnet/lnet/router_proc.c */ + +#define DEBUG_SUBSYSTEM S_LND +#include "gnilnd.h" +#include <linux/seq_file.h> + +#define GNILND_PROC_STATS "stats" +#define GNILND_PROC_MDD "mdd" +#define GNILND_PROC_SMSG "smsg" +#define GNILND_PROC_CONN "conn" +#define GNILND_PROC_PEER "peer" +#define GNILND_PROC_CKSUM_TEST "cksum_test" + +static int +_kgnilnd_proc_run_cksum_test(int caseno, int nloops, int nob) +{ + lnet_kiov_t *src, *dest; + struct timespec begin, end, diff; + int niov; + int i = 0, j = 0, n; + __u16 cksum, cksum2; + __u64 mbytes; + + LIBCFS_ALLOC(src, LNET_MAX_IOV * sizeof(lnet_kiov_t)); + LIBCFS_ALLOC(dest, LNET_MAX_IOV * sizeof(lnet_kiov_t)); + + if (src == NULL || dest == NULL) { + CERROR("couldn't allocate iovs\n"); + GOTO(unwind, -ENOMEM); + } + + for (i = 0; i < LNET_MAX_IOV; i++) { + src[i].kiov_offset = 0; + src[i].kiov_len = CFS_PAGE_SIZE; + src[i].kiov_page = cfs_alloc_page(CFS_ALLOC_STD|CFS_ALLOC_ZERO); + + if (src[i].kiov_page == NULL) { + CERROR("couldn't allocate page %d\n", i); + GOTO(unwind, -ENOMEM); + } + + dest[i].kiov_offset = 0; + dest[i].kiov_len = CFS_PAGE_SIZE; + dest[i].kiov_page = cfs_alloc_page(CFS_ALLOC_STD|CFS_ALLOC_ZERO); + + if (dest[i].kiov_page == NULL) { + CERROR("couldn't allocate page %d\n", i); + GOTO(unwind, -ENOMEM); + } + } + + /* add extra 2 pages - one for offset of src, 2nd to allow dest offset */ + niov = (nob / PAGE_SIZE) + 2; + if (niov > LNET_MAX_IOV) { + CERROR("bytes %d too large, requires niov %d > %d\n", + nob, niov, LNET_MAX_IOV); + GOTO(unwind, -E2BIG); + } + + /* setup real data */ + src[0].kiov_offset = 317; + dest[0].kiov_offset = 592; + switch (caseno) { + default: + /* odd -> even */ + break; + case 1: + /* odd -> odd */ + dest[0].kiov_offset -= 1; + break; + case 2: + /* even -> even */ + src[0].kiov_offset += 1; + break; + case 3: + /* even -> odd */ + src[0].kiov_offset += 1; + dest[0].kiov_offset -= 1; + } + src[0].kiov_len = PAGE_SIZE - src[0].kiov_offset; + dest[0].kiov_len = PAGE_SIZE - dest[0].kiov_offset; + + for (i = 0; i < niov; i++) { + memset(page_address(src[i].kiov_page) + src[i].kiov_offset, + 0xf0 + i, src[i].kiov_len); + } + + lnet_copy_kiov2kiov(niov, dest, 0, niov, src, 0, nob); + + getnstimeofday(&begin); + + for (n = 0; n < nloops; n++) { + CDEBUG(D_BUFFS, "case %d loop %d src %d dest %d nob %d niov %d\n", + caseno, n, src[0].kiov_offset, dest[0].kiov_offset, nob, niov); + cksum = kgnilnd_cksum_kiov(niov, src, 0, nob - n, 1); + cksum2 = kgnilnd_cksum_kiov(niov, dest, 0, nob - n, 1); + + if (cksum != cksum2) { + CERROR("case %d loop %d different checksums %x expected %x\n", + j, n, cksum2, cksum); + GOTO(unwind, -ENOKEY); + } + } + + getnstimeofday(&end); + + mbytes = (nloops * nob * 2) / (1024*1024); + + diff = kgnilnd_ts_sub(end, begin); + + LCONSOLE_INFO("running "LPD64"MB took %ld.%ld seconds\n", + mbytes, diff.tv_sec, diff.tv_nsec); + +unwind: + CDEBUG(D_NET, "freeing %d pages\n", i); + for (i -= 1; i >= 0; i--) { + if (src[i].kiov_page != NULL) { + cfs_free_page(src[i].kiov_page); + } + if (dest[i].kiov_page != NULL) { + cfs_free_page(dest[i].kiov_page); + } + } + + if (src != NULL) + LIBCFS_FREE(src, LNET_MAX_IOV * sizeof(lnet_kiov_t)); + if (dest != NULL) + LIBCFS_FREE(dest, LNET_MAX_IOV * sizeof(lnet_kiov_t)); + return 0; +} + +static int +kgnilnd_proc_cksum_test_write(struct file *file, const char *ubuffer, + unsigned long count, void *data) +{ + char dummy[256 + 1] = { '\0' }; + int testno, nloops, nbytes; + int rc; + + if (kgnilnd_data.kgn_init < GNILND_INIT_ALL) { + 
CERROR("can't run cksum test, kgnilnd is not initialized yet\n"); + return -ENOSYS; + } + + if (count >= sizeof(dummy) || count == 0) + return -EINVAL; + + if (copy_from_user(dummy, ubuffer, count)) + return -EFAULT; + + if (sscanf(dummy, "%d:%d:%d", &testno, &nloops, &nbytes) == 3) { + rc = _kgnilnd_proc_run_cksum_test(testno, nloops, nbytes); + if (rc < 0) { + RETURN(rc); + } else { + /* spurious, but lets us know the parse was ok */ + RETURN(count); + } + } + RETURN(count); +} + +static int +kgnilnd_proc_stats_read(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + kgn_device_t *dev; + struct timeval now; + int rc; + + if (kgnilnd_data.kgn_init < GNILND_INIT_ALL) { + rc = sprintf(page, + "kgnilnd is not initialized yet\n"); + return rc; + } + + /* only do the first device */ + dev = &kgnilnd_data.kgn_devices[0]; + + /* sampling is racy, but so is reading this file! */ + smp_rmb(); + do_gettimeofday(&now); + + rc = sprintf(page, "time: %lu.%lu\n" + "ntx: %d\n" + "npeers: %d\n" + "nconns: %d\n" + "nEPs: %d\n" + "ndgrams: %d\n" + "nfmablk: %d\n" + "n_mdd: %d\n" + "n_mdd_held: %d\n" + "GART map bytes: %ld\n" + "TX queued maps: %d\n" + "TX phys nmaps: %d\n" + "TX phys bytes: %lu\n" + "TX virt nmaps: %d\n" + "TX virt bytes: "LPU64"\n" + "RDMAQ bytes_auth: %ld\n" + "RDMAQ bytes_left: %ld\n" + "RDMAQ nstalls: %d\n" + "dev mutex delay: %ld\n" + "dev n_yield: %d\n" + "dev n_schedule: %d\n" + "SMSG fast_try: %d\n" + "SMSG fast_ok: %d\n" + "SMSG fast_block: %d\n" + "SMSG ntx: %d\n" + "SMSG tx_bytes: %ld\n" + "SMSG nrx: %d\n" + "SMSG rx_bytes: %ld\n" + "RDMA ntx: %d\n" + "RDMA tx_bytes: %ld\n" + "RDMA nrx: %d\n" + "RDMA rx_bytes: %ld\n" + "VMAP short: %d\n" + "VMAP cksum: %d\n" + "KMAP short: %d\n", + now.tv_sec, now.tv_usec, + atomic_read(&kgnilnd_data.kgn_ntx), + atomic_read(&kgnilnd_data.kgn_npeers), + atomic_read(&kgnilnd_data.kgn_nconns), + atomic_read(&dev->gnd_neps), + atomic_read(&dev->gnd_ndgrams), + atomic_read(&dev->gnd_nfmablk), + atomic_read(&dev->gnd_n_mdd), atomic_read(&dev->gnd_n_mdd_held), + atomic64_read(&dev->gnd_nbytes_map), + atomic_read(&dev->gnd_nq_map), + dev->gnd_map_nphys, dev->gnd_map_physnop * PAGE_SIZE, + dev->gnd_map_nvirt, dev->gnd_map_virtnob, + atomic64_read(&dev->gnd_rdmaq_bytes_out), + atomic64_read(&dev->gnd_rdmaq_bytes_ok), + atomic_read(&dev->gnd_rdmaq_nstalls), + dev->gnd_mutex_delay, + atomic_read(&dev->gnd_n_yield), atomic_read(&dev->gnd_n_schedule), + atomic_read(&dev->gnd_fast_try), atomic_read(&dev->gnd_fast_ok), + atomic_read(&dev->gnd_fast_block), + atomic_read(&dev->gnd_short_ntx), atomic64_read(&dev->gnd_short_txbytes), + atomic_read(&dev->gnd_short_nrx), atomic64_read(&dev->gnd_short_rxbytes), + atomic_read(&dev->gnd_rdma_ntx), atomic64_read(&dev->gnd_rdma_txbytes), + atomic_read(&dev->gnd_rdma_nrx), atomic64_read(&dev->gnd_rdma_rxbytes), + atomic_read(&kgnilnd_data.kgn_nvmap_short), + atomic_read(&kgnilnd_data.kgn_nvmap_cksum), + atomic_read(&kgnilnd_data.kgn_nkmap_short)); + + return rc; +} + +static int +kgnilnd_proc_stats_write(struct file *file, const char *ubuffer, + unsigned long count, void *data) +{ + kgn_device_t *dev; + + if (kgnilnd_data.kgn_init < GNILND_INIT_ALL) { + CERROR("kgnilnd is not initialized for stats write\n"); + return -EINVAL; + } + + /* only do the first device */ + dev = &kgnilnd_data.kgn_devices[0]; + + atomic_set(&dev->gnd_short_ntx, 0); + atomic_set(&dev->gnd_short_nrx, 0); + atomic64_set(&dev->gnd_short_txbytes, 0); + atomic64_set(&dev->gnd_short_rxbytes, 0); + atomic_set(&dev->gnd_rdma_ntx, 
0); + atomic_set(&dev->gnd_rdma_nrx, 0); + atomic_set(&dev->gnd_fast_ok, 0); + atomic_set(&dev->gnd_fast_try, 0); + atomic_set(&dev->gnd_fast_block, 0); + atomic64_set(&dev->gnd_rdma_txbytes, 0); + atomic64_set(&dev->gnd_rdma_rxbytes, 0); + atomic_set(&dev->gnd_rdmaq_nstalls, 0); + set_mb(dev->gnd_mutex_delay, 0); + atomic_set(&dev->gnd_n_yield, 0); + atomic_set(&dev->gnd_n_schedule, 0); + atomic_set(&kgnilnd_data.kgn_nvmap_short, 0); + atomic_set(&kgnilnd_data.kgn_nvmap_cksum, 0); + atomic_set(&kgnilnd_data.kgn_nkmap_short, 0); + /* sampling is racy, but so is writing this file! */ + smp_wmb(); + return count; +} + +typedef struct { + kgn_device_t *gmdd_dev; + kgn_tx_t *gmdd_tx; + loff_t gmdd_off; +} kgn_mdd_seq_iter_t; + +int +kgnilnd_mdd_seq_seek(kgn_mdd_seq_iter_t *gseq, loff_t off) +{ + kgn_tx_t *tx; + struct list_head *r; + loff_t here; + int rc = 0; + + if (off == 0) { + gseq->gmdd_tx = NULL; + gseq->gmdd_off = 0; + return 0; + } + + tx = gseq->gmdd_tx; + + if (tx == NULL || gseq->gmdd_off > off) { + /* search from start */ + r = gseq->gmdd_dev->gnd_map_list.next; + here = 1; + } else { + /* continue current search */ + r = &tx->tx_map_list; + here = gseq->gmdd_off; + } + + gseq->gmdd_off = off; + + while (r != &gseq->gmdd_dev->gnd_map_list) { + kgn_tx_t *t; + + t = list_entry(r, kgn_tx_t, tx_map_list); + + if (here == off) { + gseq->gmdd_tx = t; + rc = 0; + goto out; + } + r = r->next; + here++; + } + + gseq->gmdd_tx = NULL; + rc = -ENOENT; +out: + return rc; +} + +static void * +kgnilnd_mdd_seq_start(struct seq_file *s, loff_t *pos) +{ + + kgn_mdd_seq_iter_t *gseq; + int rc; + + if (kgnilnd_data.kgn_init < GNILND_INIT_ALL) { + return NULL; + } + + LIBCFS_ALLOC(gseq, sizeof(*gseq)); + if (gseq == NULL) { + CERROR("could not allocate mdd sequence iterator\n"); + return NULL; + } + + /* only doing device 0 for now */ + gseq->gmdd_dev = &kgnilnd_data.kgn_devices[0]; + gseq->gmdd_tx = NULL; + + /* need to lock map while we poke - huge disturbance + * but without it, no way to get the data printed */ + spin_lock(&gseq->gmdd_dev->gnd_map_lock); + + /* set private to gseq for stop */ + s->private = gseq; + + rc = kgnilnd_mdd_seq_seek(gseq, *pos); + if (rc == 0) + return gseq; + else + return NULL; +} + +static void +kgnilnd_mdd_seq_stop(struct seq_file *s, void *iter) +{ + kgn_mdd_seq_iter_t *gseq = s->private; + + if (gseq != NULL) { + spin_unlock(&gseq->gmdd_dev->gnd_map_lock); + LIBCFS_FREE(gseq, sizeof(*gseq)); + } +} + +static void * +kgnilnd_mdd_seq_next(struct seq_file *s, void *iter, loff_t *pos) +{ + kgn_mdd_seq_iter_t *gseq = iter; + int rc; + loff_t next = *pos + 1; + + rc = kgnilnd_mdd_seq_seek(gseq, next); + if (rc != 0) { + return NULL; + } + *pos = next; + return gseq; +} + +static int +kgnilnd_mdd_seq_show(struct seq_file *s, void *iter) +{ + kgn_mdd_seq_iter_t *gseq = iter; + kgn_tx_t *tx; + __u64 nob; + __u32 physnop; + int id; + int buftype; + gni_mem_handle_t hndl; + + if (gseq->gmdd_off == 0) { + seq_printf(s, "%s %22s %16s %8s %8s %37s\n", + "tx", "tx_id", "nob", "physnop", + "buftype", "mem handle"); + return 0; + } + + tx = gseq->gmdd_tx; + LASSERT(tx != NULL); + + id = tx->tx_id.txe_smsg_id; + nob = tx->tx_nob; + physnop = tx->tx_phys_npages; + buftype = tx->tx_buftype; + hndl.qword1 = tx->tx_map_key.qword1; + hndl.qword2 = tx->tx_map_key.qword2; + + seq_printf(s, "%p %x %16"LPF64"u %8d %#8x "LPX64"."LPX64"x\n", + tx, id, nob, physnop, buftype, + hndl.qword1, hndl.qword2); + + return 0; +} + +static struct seq_operations kgn_mdd_sops = { + .start = 
kgnilnd_mdd_seq_start, + .stop = kgnilnd_mdd_seq_stop, + .next = kgnilnd_mdd_seq_next, + .show = kgnilnd_mdd_seq_show, + +}; + +static int +kgnilnd_mdd_seq_open(struct inode *inode, struct file *file) +{ + struct seq_file *sf; + int rc; + + rc = seq_open(file, &kgn_mdd_sops); + if (rc == 0) { + sf = file->private_data; + + /* NULL means we've not yet open() */ + sf->private = NULL; + } + return rc; +} + +static struct file_operations kgn_mdd_fops = { + .owner = THIS_MODULE, + .open = kgnilnd_mdd_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +typedef struct { + __u64 gsmsg_version; + kgn_device_t *gsmsg_dev; + kgn_fma_memblock_t *gsmsg_fmablk; + loff_t gsmsg_off; +} kgn_smsg_seq_iter_t; + +int +kgnilnd_smsg_seq_seek(kgn_smsg_seq_iter_t *gseq, loff_t off) +{ + kgn_fma_memblock_t *fmablk; + kgn_device_t *dev; + struct list_head *r; + loff_t here; + int rc = 0; + + /* offset 0 is the header, so we start real entries at + * here == off == 1 */ + if (off == 0) { + gseq->gsmsg_fmablk = NULL; + gseq->gsmsg_off = 0; + return 0; + } + + fmablk = gseq->gsmsg_fmablk; + dev = gseq->gsmsg_dev; + + spin_lock(&dev->gnd_fmablk_lock); + + if (fmablk != NULL && + gseq->gsmsg_version != atomic_read(&dev->gnd_fmablk_vers)) { + /* list changed */ + rc = -ESTALE; + goto out; + } + + if (fmablk == NULL || gseq->gsmsg_off > off) { + /* search from start */ + r = dev->gnd_fma_buffs.next; + here = 1; + } else { + /* continue current search */ + r = &fmablk->gnm_bufflist; + here = gseq->gsmsg_off; + } + + gseq->gsmsg_version = atomic_read(&dev->gnd_fmablk_vers); + gseq->gsmsg_off = off; + + while (r != &dev->gnd_fma_buffs) { + kgn_fma_memblock_t *t; + + t = list_entry(r, kgn_fma_memblock_t, gnm_bufflist); + + if (here == off) { + gseq->gsmsg_fmablk = t; + rc = 0; + goto out; + } + r = r->next; + here++; + } + + gseq->gsmsg_fmablk = NULL; + rc = -ENOENT; +out: + spin_unlock(&dev->gnd_fmablk_lock); + return rc; +} + +static void * +kgnilnd_smsg_seq_start(struct seq_file *s, loff_t *pos) +{ + + kgn_smsg_seq_iter_t *gseq; + int rc; + + if (kgnilnd_data.kgn_init < GNILND_INIT_ALL) { + return NULL; + } + + LIBCFS_ALLOC(gseq, sizeof(*gseq)); + if (gseq == NULL) { + CERROR("could not allocate smsg sequence iterator\n"); + return NULL; + } + + /* only doing device 0 for now */ + gseq->gsmsg_dev = &kgnilnd_data.kgn_devices[0]; + gseq->gsmsg_fmablk = NULL; + rc = kgnilnd_smsg_seq_seek(gseq, *pos); + if (rc == 0) + return gseq; + + LIBCFS_FREE(gseq, sizeof(*gseq)); + return NULL; +} + +static void +kgnilnd_smsg_seq_stop(struct seq_file *s, void *iter) +{ + kgn_smsg_seq_iter_t *gseq = iter; + + if (gseq != NULL) + LIBCFS_FREE(gseq, sizeof(*gseq)); +} + +static void * +kgnilnd_smsg_seq_next(struct seq_file *s, void *iter, loff_t *pos) +{ + kgn_smsg_seq_iter_t *gseq = iter; + int rc; + loff_t next = *pos + 1; + + rc = kgnilnd_smsg_seq_seek(gseq, next); + if (rc != 0) { + LIBCFS_FREE(gseq, sizeof(*gseq)); + return NULL; + } + *pos = next; + return gseq; +} + +static int +kgnilnd_smsg_seq_show(struct seq_file *s, void *iter) +{ + kgn_smsg_seq_iter_t *gseq = iter; + kgn_fma_memblock_t *fmablk; + kgn_device_t *dev; + int avail_mboxs, held_mboxs, num_mboxs; + unsigned int blk_size; + int live; + kgn_fmablk_state_t state; + gni_mem_handle_t hndl; + + if (gseq->gsmsg_off == 0) { + seq_printf(s, "%5s %4s %6s/%5s/%5s %9s %18s %37s\n", + "blk#", "type", "avail", "held", "total", "size", + "fmablk", "mem handle"); + return 0; + } + + fmablk = gseq->gsmsg_fmablk; + dev = gseq->gsmsg_dev; + LASSERT(fmablk 
!= NULL); + + spin_lock(&dev->gnd_fmablk_lock); + + if (gseq->gsmsg_version != atomic_read(&dev->gnd_fmablk_vers)) { + /* list changed */ + spin_unlock(&dev->gnd_fmablk_lock); + return -ESTALE; + } + + live = fmablk->gnm_hold_timeout == 0; + /* none are available if it isn't live... */ + avail_mboxs = live ? fmablk->gnm_avail_mboxs : 0; + held_mboxs = fmablk->gnm_held_mboxs; + num_mboxs = fmablk->gnm_num_mboxs; + blk_size = fmablk->gnm_blk_size; + state = fmablk->gnm_state; + hndl.qword1 = fmablk->gnm_hndl.qword1; + hndl.qword2 = fmablk->gnm_hndl.qword2; + + spin_unlock(&dev->gnd_fmablk_lock); + + if (live) { + seq_printf(s, "%5d %4s %6d/%5d/%5d %9d %18p "LPX64"."LPX64"\n", + (int) gseq->gsmsg_off, kgnilnd_fmablk_state2str(state), + avail_mboxs, held_mboxs, num_mboxs, blk_size, + fmablk, hndl.qword1, hndl.qword2); + } else { + seq_printf(s, "%5d %4s %6d/%5d/%5d %9d %18p %37s\n", + (int) gseq->gsmsg_off, kgnilnd_fmablk_state2str(state), + avail_mboxs, held_mboxs, num_mboxs, blk_size, + fmablk, "PURGATORY.HOLD"); + } + + return 0; +} + +static struct seq_operations kgn_smsg_sops = { + .start = kgnilnd_smsg_seq_start, + .stop = kgnilnd_smsg_seq_stop, + .next = kgnilnd_smsg_seq_next, + .show = kgnilnd_smsg_seq_show, + +}; + +static int +kgnilnd_smsg_seq_open(struct inode *inode, struct file *file) +{ + struct proc_dir_entry *dp = PDE(inode); + struct seq_file *sf; + int rc; + + rc = seq_open(file, &kgn_smsg_sops); + if (rc == 0) { + sf = file->private_data; + sf->private = dp->data; + } + + return rc; +} + +static struct file_operations kgn_smsg_fops = { + .owner = THIS_MODULE, + .open = kgnilnd_smsg_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +typedef struct { + __u64 gconn_version; + struct list_head *gconn_list; + kgn_conn_t *gconn_conn; + loff_t gconn_off; + int gconn_hashidx; +} kgn_conn_seq_iter_t; + +int +kgnilnd_conn_seq_seek(kgn_conn_seq_iter_t *gseq, loff_t off) +{ + struct list_head *list, *tmp; + loff_t here = 0; + int rc = 0; + + if (off == 0) { + gseq->gconn_hashidx = 0; + gseq->gconn_list = NULL; + } + + if (off > atomic_read(&kgnilnd_data.kgn_nconns)) { + gseq->gconn_list = NULL; + rc = -ENOENT; + } + + read_lock(&kgnilnd_data.kgn_peer_conn_lock); + if (gseq->gconn_list != NULL && + gseq->gconn_version != kgnilnd_data.kgn_conn_version) { + /* list changed */ + rc = -ESTALE; + goto out; + } + + if ((gseq->gconn_list == NULL) || + (gseq->gconn_off > off) || + (gseq->gconn_hashidx >= *kgnilnd_tunables.kgn_peer_hash_size)) { + /* search from start */ + gseq->gconn_hashidx = 0; + list = &kgnilnd_data.kgn_conns[gseq->gconn_hashidx]; + here = 0; + } else { + /* continue current search */ + list = gseq->gconn_list; + } + + gseq->gconn_version = kgnilnd_data.kgn_conn_version; + gseq->gconn_off = off; + +start_list: + + list_for_each(tmp, list) { + if (here == off) { + kgn_conn_t *conn; + conn = list_entry(tmp, kgn_conn_t, gnc_hashlist); + gseq->gconn_conn = conn; + rc = 0; + goto out; + } + here++; + } + /* if we got through this hash bucket with 'off' still to go, try next*/ + gseq->gconn_hashidx++; + if ((here <= off) && + (gseq->gconn_hashidx < *kgnilnd_tunables.kgn_peer_hash_size)) { + list = &kgnilnd_data.kgn_conns[gseq->gconn_hashidx]; + goto start_list; + } + + gseq->gconn_list = NULL; + rc = -ENOENT; +out: + read_unlock(&kgnilnd_data.kgn_peer_conn_lock); + return rc; +} + +static void * +kgnilnd_conn_seq_start(struct seq_file *s, loff_t *pos) +{ + + kgn_conn_seq_iter_t *gseq; + int rc; + + if (kgnilnd_data.kgn_init < GNILND_INIT_ALL) { + 
return NULL; + } + + LIBCFS_ALLOC(gseq, sizeof(*gseq)); + if (gseq == NULL) { + CERROR("could not allocate conn sequence iterator\n"); + return NULL; + } + + /* only doing device 0 for now */ + gseq->gconn_list = NULL; + rc = kgnilnd_conn_seq_seek(gseq, *pos); + if (rc == 0) + return gseq; + + LIBCFS_FREE(gseq, sizeof(*gseq)); + return NULL; +} + +static void +kgnilnd_conn_seq_stop(struct seq_file *s, void *iter) +{ + kgn_conn_seq_iter_t *gseq = iter; + + if (gseq != NULL) + LIBCFS_FREE(gseq, sizeof(*gseq)); +} + +static void * +kgnilnd_conn_seq_next(struct seq_file *s, void *iter, loff_t *pos) +{ + kgn_conn_seq_iter_t *gseq = iter; + int rc; + loff_t next = *pos + 1; + + rc = kgnilnd_conn_seq_seek(gseq, next); + if (rc != 0) { + LIBCFS_FREE(gseq, sizeof(*gseq)); + return NULL; + } + *pos = next; + return gseq; +} + +static int +kgnilnd_conn_seq_show(struct seq_file *s, void *iter) +{ + kgn_conn_seq_iter_t *gseq = iter; + kgn_peer_t *peer = NULL; + kgn_conn_t *conn; + + /* there is no header data for conns, so offset 0 is the first + * real entry. */ + + conn = gseq->gconn_conn; + LASSERT(conn != NULL); + + read_lock(&kgnilnd_data.kgn_peer_conn_lock); + if (gseq->gconn_list != NULL && + gseq->gconn_version != kgnilnd_data.kgn_conn_version) { + /* list changed */ + read_unlock(&kgnilnd_data.kgn_peer_conn_lock); + return -ESTALE; + } + + /* instead of saving off the data, just refcount */ + kgnilnd_conn_addref(conn); + if (conn->gnc_peer) { + /* don't use link - after unlock it could get nuked */ + peer = conn->gnc_peer; + kgnilnd_peer_addref(peer); + } + + read_unlock(&kgnilnd_data.kgn_peer_conn_lock); + + seq_printf(s, "%p->%s [%d] q %d/%d/%d " + "tx sq %u %dms/%dms " + "rx sq %u %dms/%dms " + "noop r/s %d/%d w/s/cq %lds/%lds/%lds " + "sched a/d %lds/%lds " + "tx_re "LPD64" TO %ds %s\n", + conn, peer ? 
libcfs_nid2str(peer->gnp_nid) : "", + atomic_read(&conn->gnc_refcount), + kgnilnd_count_list(&conn->gnc_fmaq), + atomic_read(&conn->gnc_nlive_fma), + atomic_read(&conn->gnc_nlive_rdma), + conn->gnc_tx_seq, + jiffies_to_msecs(jiffies - conn->gnc_last_tx), + jiffies_to_msecs(jiffies - conn->gnc_last_tx_cq), + conn->gnc_rx_seq, + jiffies_to_msecs(jiffies - conn->gnc_last_rx), + jiffies_to_msecs(jiffies - conn->gnc_last_rx_cq), + atomic_read(&conn->gnc_reaper_noop), + atomic_read(&conn->gnc_sched_noop), + cfs_duration_sec(jiffies - conn->gnc_last_noop_want), + cfs_duration_sec(jiffies - conn->gnc_last_noop_sent), + cfs_duration_sec(jiffies - conn->gnc_last_noop_cq), + cfs_duration_sec(jiffies - conn->gnc_last_sched_ask), + cfs_duration_sec(jiffies - conn->gnc_last_sched_do), + conn->gnc_tx_retrans, conn->gnc_timeout, + kgnilnd_conn_state2str(conn)); + + if (peer) + kgnilnd_peer_decref(peer); + kgnilnd_conn_decref(conn); + + return 0; +} + +static struct seq_operations kgn_conn_sops = { + .start = kgnilnd_conn_seq_start, + .stop = kgnilnd_conn_seq_stop, + .next = kgnilnd_conn_seq_next, + .show = kgnilnd_conn_seq_show, + +}; + +static int +kgnilnd_conn_seq_open(struct inode *inode, struct file *file) +{ + struct proc_dir_entry *dp = PDE(inode); + struct seq_file *sf; + int rc; + + rc = seq_open(file, &kgn_conn_sops); + if (rc == 0) { + sf = file->private_data; + sf->private = dp->data; + } + + return rc; +} + +static struct file_operations kgn_conn_fops = { + .owner = THIS_MODULE, + .open = kgnilnd_conn_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +typedef struct { + __u64 gpeer_version; + struct list_head *gpeer_list; + kgn_peer_t *gpeer_peer; + loff_t gpeer_off; + int gpeer_hashidx; +} kgn_peer_seq_iter_t; + +int +kgnilnd_peer_seq_seek(kgn_peer_seq_iter_t *gseq, loff_t off) +{ + struct list_head *list, *tmp; + loff_t here = 0; + int rc = 0; + + if (off == 0) { + gseq->gpeer_hashidx = 0; + gseq->gpeer_list = NULL; + } + + if (off > atomic_read(&kgnilnd_data.kgn_npeers)) { + gseq->gpeer_list = NULL; + rc = -ENOENT; + } + + read_lock(&kgnilnd_data.kgn_peer_conn_lock); + if (gseq->gpeer_list != NULL && + gseq->gpeer_version != kgnilnd_data.kgn_peer_version) { + /* list changed */ + rc = -ESTALE; + goto out; + } + + if ((gseq->gpeer_list == NULL) || + (gseq->gpeer_off > off) || + (gseq->gpeer_hashidx >= *kgnilnd_tunables.kgn_peer_hash_size)) { + /* search from start */ + gseq->gpeer_hashidx = 0; + list = &kgnilnd_data.kgn_peers[gseq->gpeer_hashidx]; + here = 0; + } else { + /* continue current search */ + list = gseq->gpeer_list; + } + + gseq->gpeer_version = kgnilnd_data.kgn_peer_version; + gseq->gpeer_off = off; + +start_list: + + list_for_each(tmp, list) { + if (here == off) { + kgn_peer_t *peer; + peer = list_entry(tmp, kgn_peer_t, gnp_list); + gseq->gpeer_peer = peer; + rc = 0; + goto out; + } + here++; + } + /* if we got through this hash bucket with 'off' still to go, try next*/ + gseq->gpeer_hashidx++; + if ((here <= off) && + (gseq->gpeer_hashidx < *kgnilnd_tunables.kgn_peer_hash_size)) { + list = &kgnilnd_data.kgn_peers[gseq->gpeer_hashidx]; + goto start_list; + } + + gseq->gpeer_list = NULL; + rc = -ENOENT; +out: + read_unlock(&kgnilnd_data.kgn_peer_conn_lock); + return rc; +} + +static void * +kgnilnd_peer_seq_start(struct seq_file *s, loff_t *pos) +{ + + kgn_peer_seq_iter_t *gseq; + int rc; + + if (kgnilnd_data.kgn_init < GNILND_INIT_ALL) { + return NULL; + } + + LIBCFS_ALLOC(gseq, sizeof(*gseq)); + if (gseq == NULL) { + CERROR("could not 
allocate peer sequence iterator\n"); + return NULL; + } + + /* only doing device 0 for now */ + gseq->gpeer_list = NULL; + rc = kgnilnd_peer_seq_seek(gseq, *pos); + if (rc == 0) + return gseq; + + LIBCFS_FREE(gseq, sizeof(*gseq)); + return NULL; +} + +static void +kgnilnd_peer_seq_stop(struct seq_file *s, void *iter) +{ + kgn_peer_seq_iter_t *gseq = iter; + + if (gseq != NULL) + LIBCFS_FREE(gseq, sizeof(*gseq)); +} + +static void * +kgnilnd_peer_seq_next(struct seq_file *s, void *iter, loff_t *pos) +{ + kgn_peer_seq_iter_t *gseq = iter; + int rc; + loff_t next = *pos + 1; + + rc = kgnilnd_peer_seq_seek(gseq, next); + if (rc != 0) { + LIBCFS_FREE(gseq, sizeof(*gseq)); + return NULL; + } + *pos = next; + return gseq; +} + +static int +kgnilnd_peer_seq_show(struct seq_file *s, void *iter) +{ + kgn_peer_seq_iter_t *gseq = iter; + kgn_peer_t *peer; + kgn_conn_t *conn; + char conn_str; + int purg_count = 0; + /* there is no header data for peers, so offset 0 is the first + * real entry. */ + + peer = gseq->gpeer_peer; + LASSERT(peer != NULL); + + read_lock(&kgnilnd_data.kgn_peer_conn_lock); + if (gseq->gpeer_list != NULL && + gseq->gpeer_version != kgnilnd_data.kgn_peer_version) { + /* list changed */ + read_unlock(&kgnilnd_data.kgn_peer_conn_lock); + return -ESTALE; + } + + /* instead of saving off the data, just refcount */ + kgnilnd_peer_addref(peer); + conn = kgnilnd_find_conn_locked(peer); + + if (peer->gnp_connecting) { + conn_str = 'S'; + } else if (conn != NULL) { + conn_str = 'C'; + } else { + conn_str = 'D'; + } + + list_for_each_entry(conn, &peer->gnp_conns, gnc_list) { + if (conn->gnc_in_purgatory) { + purg_count++; + } + } + + read_unlock(&kgnilnd_data.kgn_peer_conn_lock); + + seq_printf(s, "%p->%s [%d] NIC 0x%x q %d conn %c purg %d " + "last %d@%dms dgram %d@%dms " + "reconn %dms to %lus \n", + peer, libcfs_nid2str(peer->gnp_nid), + atomic_read(&peer->gnp_refcount), + peer->gnp_host_id, + kgnilnd_count_list(&peer->gnp_tx_queue), + conn_str, + purg_count, + peer->gnp_last_errno, + jiffies_to_msecs(jiffies - peer->gnp_last_alive), + peer->gnp_last_dgram_errno, + jiffies_to_msecs(jiffies - peer->gnp_last_dgram_time), + peer->gnp_reconnect_interval != 0 + ? 
jiffies_to_msecs(jiffies - peer->gnp_reconnect_time) + : 0, + peer->gnp_reconnect_interval); + + kgnilnd_peer_decref(peer); + + return 0; +} + +static struct seq_operations kgn_peer_sops = { + .start = kgnilnd_peer_seq_start, + .stop = kgnilnd_peer_seq_stop, + .next = kgnilnd_peer_seq_next, + .show = kgnilnd_peer_seq_show, +}; + +static int +kgnilnd_peer_seq_open(struct inode *inode, struct file *file) +{ + struct proc_dir_entry *dp = PDE(inode); + struct seq_file *sf; + int rc; + + rc = seq_open(file, &kgn_peer_sops); + if (rc == 0) { + sf = file->private_data; + sf->private = dp->data; + } + + return rc; +} + +static struct file_operations kgn_peer_fops = { + .owner = THIS_MODULE, + .open = kgnilnd_peer_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static struct proc_dir_entry *kgn_proc_root; + +void +kgnilnd_proc_init(void) +{ + struct proc_dir_entry *pde; + int rc = 0; + ENTRY; + + /* setup dir */ + kgn_proc_root = proc_mkdir(libcfs_lnd2modname(GNILND), NULL); + if (kgn_proc_root == NULL) { + CERROR("couldn't create proc dir %s\n", + libcfs_lnd2modname(GNILND)); + return; + } + + /* Initialize CKSUM_TEST */ + pde = create_proc_entry(GNILND_PROC_CKSUM_TEST, 0200, kgn_proc_root); + if (pde == NULL) { + CERROR("couldn't create proc entry %s\n", GNILND_PROC_CKSUM_TEST); + rc = -ENOENT; + GOTO(remove_dir, rc); + } + + pde->data = NULL; + pde->write_proc = kgnilnd_proc_cksum_test_write; + + /* Initialize STATS */ + pde = create_proc_entry(GNILND_PROC_STATS, 0644, kgn_proc_root); + if (pde == NULL) { + CERROR("couldn't create proc entry %s\n", GNILND_PROC_STATS); + rc = -ENOENT; + GOTO(remove_test, rc); + } + + pde->data = NULL; + pde->read_proc = kgnilnd_proc_stats_read; + pde->write_proc = kgnilnd_proc_stats_write; + + /* Initialize MDD */ + pde = create_proc_entry(GNILND_PROC_MDD, 0444, kgn_proc_root); + if (pde == NULL) { + CERROR("couldn't create proc entry %s\n", GNILND_PROC_MDD); + rc = -ENOENT; + GOTO(remove_stats, rc); + } + + pde->data = NULL; + pde->proc_fops = &kgn_mdd_fops; + + /* Initialize SMSG */ + pde = create_proc_entry(GNILND_PROC_SMSG, 0444, kgn_proc_root); + if (pde == NULL) { + CERROR("couldn't create proc entry %s\n", GNILND_PROC_SMSG); + rc = -ENOENT; + GOTO(remove_mdd, rc); + } + + pde->data = NULL; + pde->proc_fops = &kgn_smsg_fops; + + /* Initialize CONN */ + pde = create_proc_entry(GNILND_PROC_CONN, 0444, kgn_proc_root); + if (pde == NULL) { + CERROR("couldn't create proc entry %s\n", GNILND_PROC_CONN); + rc = -ENOENT; + GOTO(remove_smsg, rc); + } + + pde->data = NULL; + pde->proc_fops = &kgn_conn_fops; + + /* Initialize PEER */ + pde = create_proc_entry(GNILND_PROC_PEER, 0444, kgn_proc_root); + if (pde == NULL) { + CERROR("couldn't create proc entry %s\n", GNILND_PROC_PEER); + rc = -ENOENT; + GOTO(remove_conn, rc); + } + + pde->data = NULL; + pde->proc_fops = &kgn_peer_fops; + RETURN_EXIT; + +remove_conn: + remove_proc_entry(GNILND_PROC_CONN, kgn_proc_root); +remove_smsg: + remove_proc_entry(GNILND_PROC_SMSG, kgn_proc_root); +remove_mdd: + remove_proc_entry(GNILND_PROC_MDD, kgn_proc_root); +remove_stats: + remove_proc_entry(GNILND_PROC_STATS, kgn_proc_root); +remove_test: + remove_proc_entry(GNILND_PROC_CKSUM_TEST, kgn_proc_root); +remove_dir: + remove_proc_entry(kgn_proc_root->name, NULL); + + RETURN_EXIT; +} + +void +kgnilnd_proc_fini(void) +{ + remove_proc_entry(GNILND_PROC_PEER, kgn_proc_root); + remove_proc_entry(GNILND_PROC_CONN, kgn_proc_root); + remove_proc_entry(GNILND_PROC_MDD, kgn_proc_root); + 
remove_proc_entry(GNILND_PROC_SMSG, kgn_proc_root); + remove_proc_entry(GNILND_PROC_STATS, kgn_proc_root); + remove_proc_entry(GNILND_PROC_CKSUM_TEST, kgn_proc_root); + remove_proc_entry(kgn_proc_root->name, NULL); +} diff --git a/lnet/klnds/gnilnd/gnilnd_stack.c b/lnet/klnds/gnilnd/gnilnd_stack.c new file mode 100644 index 0000000..10ae493 --- /dev/null +++ b/lnet/klnds/gnilnd/gnilnd_stack.c @@ -0,0 +1,564 @@ +/* + * Copyright (C) 2012 Cray, Inc. + * + * Author: Nic Henke + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ +#include "gnilnd.h" + +/* Advance all timeouts by nap_time seconds. */ +void +kgnilnd_bump_timeouts(__u32 nap_time, char *reason) +{ + int i; + kgn_peer_t *peer; + kgn_conn_t *conn; + kgn_tx_t *tx; + kgn_device_t *dev; + kgn_dgram_t *dgram; + + LCONSOLE_INFO("%s: bumping all timeouts by %ds\n", reason, nap_time); + + LASSERTF(GNILND_IS_QUIESCED, "gnilnd not quiesced %d != %d\n", + atomic_read(&kgnilnd_data.kgn_nquiesce), + atomic_read(&kgnilnd_data.kgn_nthreads)); + + /* requiring that the threads are paused ensures a couple of things: + * - combined code paths for stack reset and quiesce event as stack reset + * runs with the threads paused + * - prevents traffic to the Gemini during a quiesce period + * - reduces the locking requirements + */ + + for (i = 0; i < *kgnilnd_tunables.kgn_peer_hash_size; i++) { + list_for_each_entry(peer, &kgnilnd_data.kgn_peers[i], gnp_list) { + + /* we can reconnect again at any time */ + peer->gnp_reconnect_time = jiffies; + /* reset now that network is healthy */ + peer->gnp_reconnect_interval = 0; + /* tell LNet dude is still alive */ + kgnilnd_peer_alive(peer); + + list_for_each_entry(tx, &peer->gnp_tx_queue, tx_list) { + tx->tx_qtime = jiffies; + } + + list_for_each_entry(conn, &peer->gnp_conns, gnc_list) { + unsigned long timeout; + + timeout = cfs_time_seconds(conn->gnc_timeout); + + /* bump last_rx/last_rx_cq on all conns - including + * closed ones, this will have the effect of + * bumping the purgatory timers for those */ + conn->gnc_last_rx = conn->gnc_last_rx_cq = jiffies; + + /* we don't timeout based on old gnc_last_tx, so + * we'll back it up and schedule the conn to trigger + * a NOOP */ + conn->gnc_last_tx = jiffies - timeout; + kgnilnd_schedule_conn(conn); + } + } + } + + for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) { + dev = &kgnilnd_data.kgn_devices[i]; + for (i = 0; i < (*kgnilnd_tunables.kgn_peer_hash_size - 1); i++) { + list_for_each_entry(dgram, &dev->gnd_dgrams[i], gndg_list) { + dgram->gndg_post_time = jiffies; + } + } + } +} + +/* Quiesce or wake up the stack. The caller must hold the kgn_quiesce_sem semaphore + * on entry, which holds off any pending stack shutdown. */ +void +kgnilnd_quiesce_wait(char *reason) +{ + int i; + + if (kgnilnd_data.kgn_quiesce_trigger) { + unsigned long quiesce_deadline, quiesce_to; + /* FREEZE TAG!!!! 
*/ + + /* morning sunshine */ + spin_lock(&kgnilnd_data.kgn_reaper_lock); + wake_up_all(&kgnilnd_data.kgn_reaper_waitq); + spin_unlock(&kgnilnd_data.kgn_reaper_lock); + + for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) { + kgn_device_t *dev = &kgnilnd_data.kgn_devices[i]; + + wake_up_all(&dev->gnd_waitq); + wake_up_all(&dev->gnd_dgram_waitq); + wake_up_all(&dev->gnd_dgping_waitq); + } + + /* we'll wait for 10x the timeout for the threads to pause */ + quiesce_to = cfs_time_seconds(*kgnilnd_tunables.kgn_timeout * 10); + quiesce_deadline = (long) jiffies + quiesce_to; + + /* wait for everyone to check-in as quiesced */ + i = 1; + while (!GNILND_IS_QUIESCED) { + i++; + LCONSOLE((((i) & (-i)) == i) ? D_WARNING : D_NET, + "%s: Waiting for %d threads to pause\n", + reason, + atomic_read(&kgnilnd_data.kgn_nthreads) - + atomic_read(&kgnilnd_data.kgn_nquiesce)); + CFS_RACE(CFS_FAIL_GNI_QUIESCE_RACE); + cfs_pause(cfs_time_seconds(1 * i)); + + LASSERTF(quiesce_deadline > jiffies, + "couldn't quiesce threads in %lu seconds, falling over now\n", + cfs_duration_sec(quiesce_to)); + } + + LCONSOLE_WARN("%s: All threads paused!\n", reason); + /* XXX Nic: Is there a set of counters we can grab here to + * ensure that there is no traffic until quiesce is over ?*/ + } else { + /* GO! GO! GO! */ + + for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) { + kgn_device_t *dev = &kgnilnd_data.kgn_devices[i]; + kgnilnd_schedule_dgram(dev); + } + + /* wait for everyone to check-in as running - they will be spinning + * and looking, so no need to poke any waitq */ + i = 1; + while (atomic_read(&kgnilnd_data.kgn_nquiesce) > 0) { + i++; + LCONSOLE((((i) & (-i)) == i) ? D_WARNING : D_NET, + "%s: Waiting for %d threads to wake up\n", + reason, + atomic_read(&kgnilnd_data.kgn_nquiesce)); + cfs_pause(cfs_time_seconds(1 * i)); + } + + LCONSOLE_WARN("%s: All threads awake!\n", reason); + } +} + +/* Reset the stack. 
*/ +void +kgnilnd_reset_stack(void) +{ + int i, rc = 0; + kgn_net_t *net; + kgn_peer_t *peer, *peerN; + LIST_HEAD (souls); + char *reason = "critical hardware error"; + __u32 seconds; + unsigned long start, end; + ENTRY; + + /* Race with del_peer and its atomics */ + CFS_RACE(CFS_FAIL_GNI_RACE_RESET); + + if (kgnilnd_data.kgn_init != GNILND_INIT_ALL) { + CERROR("can't reset the stack, gnilnd is not initialized\n"); + RETURN_EXIT; + } + + /* First make sure we are not already quiesced - we panic if so, + * as that could leave software in a bad state */ + LASSERTF(kgnilnd_data.kgn_quiesce_trigger == GNILND_QUIESCE_IDLE, + "can't reset the stack, already doing so: trigger %d\n", + kgnilnd_data.kgn_quiesce_trigger); + + set_mb(kgnilnd_data.kgn_quiesce_trigger, GNILND_QUIESCE_RESET); + + /* wake up the dgram waitq thread - but after trigger set to make sure it + * goes into quiesce */ + CFS_RACE(CFS_FAIL_GNI_WC_DGRAM_FREE); + /* same for scheduler that is dropping state transitiosn */ + CFS_RACE(CFS_FAIL_GNI_DROP_CLOSING); + CFS_RACE(CFS_FAIL_GNI_DROP_DESTROY_EP); + + kgnilnd_quiesce_wait(reason); + + start = jiffies; + + kgnilnd_data.kgn_in_reset = 1; + kgnilnd_data.kgn_nresets++; + LCONSOLE_WARN("%s: resetting all resources (count %d)\n", + reason, kgnilnd_data.kgn_nresets); + + for (i = 0; i < *kgnilnd_tunables.kgn_net_hash_size; i++) { + list_for_each_entry(net, &kgnilnd_data.kgn_nets[i], gnn_list) { + rc = kgnilnd_cancel_net_dgrams(net); + LASSERTF(rc == 0, "couldn't cleanup datagrams: %d\n", rc); + } + } + + /* error -ENOTRECOVERABLE is stack reset */ + kgnilnd_del_conn_or_peer(NULL, LNET_NID_ANY, GNILND_DEL_CONN, -ENOTRECOVERABLE); + + for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) { + kgn_device_t *dev = &kgnilnd_data.kgn_devices[i]; + kgnilnd_cancel_wc_dgrams(dev); + kgnilnd_wait_for_canceled_dgrams(dev); + } + + /* manually do some conn processing ala kgnilnd_process_conns */ + for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) { + kgn_device_t *dev = &kgnilnd_data.kgn_devices[i]; + kgn_conn_t *conn; + int conn_sched; + + /* go find all the closed conns that need to be nuked - the + * scheduler thread isn't running to do this for us */ + + CDEBUG(D_NET, "will try to clear up %d ready_conns\n", + kgnilnd_count_list(&dev->gnd_ready_conns)); + + /* use while/list_first_entry loop to ensure we can handle any + * DESTROY_EP conns added from kgnilnd_complete_closed_conn */ + while (!list_empty(&dev->gnd_ready_conns)) { + conn = list_first_entry(&dev->gnd_ready_conns, + kgn_conn_t, gnc_schedlist); + conn_sched = xchg(&conn->gnc_scheduled, GNILND_CONN_PROCESS); + + LASSERTF(conn_sched != GNILND_CONN_IDLE && + conn_sched != GNILND_CONN_PROCESS, + "conn %p on ready list but in bad state: %d\n", + conn, conn_sched); + + list_del_init(&conn->gnc_schedlist); + + if (conn->gnc_state == GNILND_CONN_CLOSING) { + /* bump to CLOSED to fake out send of CLOSE */ + conn->gnc_state = GNILND_CONN_CLOSED; + conn->gnc_close_sent = 1; + } + + if (conn->gnc_state == GNILND_CONN_DESTROY_EP) { + kgnilnd_destroy_conn_ep(conn); + } else { + kgnilnd_complete_closed_conn(conn); + } + + /* there really shouldn't be any other states here - + * they would have been cleared out in the del_peer_or_conn or the dgram + * aborts above. 
+ * there is an LASSERTF in kgnilnd_complete_closed_conn that will take + * care of catching anything else for us */ + + kgnilnd_schedule_process_conn(conn, -1); + + kgnilnd_conn_decref(conn); + } + } + + /* don't let the little weasily purgatory conns hide from us */ + for (i = 0; i < *kgnilnd_tunables.kgn_peer_hash_size; i++) { + list_for_each_entry_safe(peer, peerN, &kgnilnd_data.kgn_peers[i], gnp_list) { + kgn_conn_t *conn, *connN; + + list_for_each_entry_safe(conn, connN, &peer->gnp_conns, gnc_list) { + kgnilnd_detach_purgatory_locked(conn, &souls); + } + } + } + + CDEBUG(D_NET, "about to release %d purgatory entries\n", + kgnilnd_count_list(&souls)); + + kgnilnd_release_purgatory_list(&souls); + + /* validate we are now clean */ + for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) { + kgn_device_t *dev = &kgnilnd_data.kgn_devices[i]; + + /* now all the cons/mboxes should be cleaned up, including purgatory + * so go through and release the MDDs for our persistent PHYS fma_blks + */ + kgnilnd_unmap_phys_fmablk(dev); + + LASSERTF(atomic_read(&dev->gnd_nfmablk) == 0, + "reset failed: fma blocks still live %d\n", + atomic_read(&dev->gnd_nfmablk)); + + LASSERTF(atomic_read(&dev->gnd_neps) == 0, + "reset failed: EP handles still live %d\n", + atomic_read(&dev->gnd_neps)); + } + + LASSERTF(atomic_read(&kgnilnd_data.kgn_nconns) == 0, + "reset failed: conns left %d\n", + atomic_read(&kgnilnd_data.kgn_nconns)); + + /* fine to have peers left - they are waiting for new conns + * but should not be holding any open HW resources */ + + /* like the last part of kgnilnd_base_shutdown() */ + + CFS_RACE(CFS_FAIL_GNI_SR_DOWN_RACE); + + for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) { + kgnilnd_dev_fini(&kgnilnd_data.kgn_devices[i]); + } + + /* no need to free and recreate the TX descriptors + * we nuked all the ones that could be using HW resources in + * kgnilnd_close_matching_conns and asserted it worked in + * kgnilnd_dev_fini */ + + /* At this point, all HW is torn down, start to reset */ + + /* only reset our known devs */ + for (i = 0; i < kgnilnd_data.kgn_ndevs; i++) { + kgn_device_t *dev = &kgnilnd_data.kgn_devices[i]; + rc = kgnilnd_dev_init(dev); + LASSERTF(rc == 0, "dev_init failed for dev %d\n", i); + kgnilnd_map_phys_fmablk(dev); + LASSERTF(rc == 0, "map_phys_fmablk failed for dev %d\n", i); + rc = kgnilnd_setup_wildcard_dgram(dev); + LASSERTF(rc == 0, "couldnt setup datagrams on dev %d: %d\n", + i, rc); + } + + /* Now the fun restarts... - release the hounds! */ + + end = jiffies; + seconds = cfs_duration_sec((long)end - start); + kgnilnd_bump_timeouts(seconds, reason); + + kgnilnd_data.kgn_in_reset = 0; + set_mb(kgnilnd_data.kgn_quiesce_trigger, GNILND_QUIESCE_IDLE); + kgnilnd_quiesce_wait(reason); + LCONSOLE_WARN("%s reset of all hardware resources\n", + rc ? "failed" : "successful"); + + RETURN_EXIT; +} + +/* A thread that handles quiece and reset hardware events. + * We do the same thing regardless of which device reported the event. */ +int +kgnilnd_ruhroh_thread(void *arg) +{ + int i = 1; + DEFINE_WAIT(wait); + + cfs_daemonize("kgnilnd_rr"); + cfs_block_allsigs(); + set_user_nice(current, *kgnilnd_tunables.kgn_nice); + kgnilnd_data.kgn_ruhroh_running = 1; + + while (1) { + + /* Block until there's a request.. A reset request could come in + * while we're handling a quiesce one, or vice versa. 
+ * Keep processing requests until there are none.*/ + prepare_to_wait(&kgnilnd_data.kgn_ruhroh_waitq, &wait, TASK_INTERRUPTIBLE); + while (!(kgnilnd_data.kgn_ruhroh_shutdown || + kgnilnd_data.kgn_needs_reset || kgnilnd_data.kgn_needs_pause)) + schedule(); + finish_wait(&kgnilnd_data.kgn_ruhroh_waitq, &wait); + + /* Exit if the driver is shutting down. */ + if (kgnilnd_data.kgn_ruhroh_shutdown) + break; + + /* Serialize with driver startup and shutdown. */ + down(&kgnilnd_data.kgn_quiesce_sem); + + CDEBUG(D_NET, "trigger %d reset %d to_bump %d pause %d\n", + kgnilnd_data.kgn_quiesce_trigger, + kgnilnd_data.kgn_needs_reset, + kgnilnd_data.kgn_bump_info_rdy, + kgnilnd_data.kgn_needs_pause); + + /* Do we need to do a pause/quiesce? */ + if (kgnilnd_data.kgn_needs_pause) { + + /* Pause all other kgnilnd threads. */ + set_mb(kgnilnd_data.kgn_quiesce_trigger, GNILND_QUIESCE_HW_QUIESCE); + kgnilnd_quiesce_wait("hardware quiesce flag"); + + /* If the hardware quiesce flag is set, wait for it to clear. + * This should happen relatively quickly, so we wait for it. + * This will hold up the eventd thread, but on everything but + * the simulator, this is ok-- there is one thread per core. + * + * Handle (possibly multiple) quiesce events while we wait. The + * memory barrier ensures that the core doesn't start fetching + * kgn_bump_info_rdy before it fetches kgn_needs_pause, and + * matches the second mb in kgnilnd_quiesce_end_callback(). */ + smp_rmb(); + while (kgnilnd_hw_in_quiesce() || kgnilnd_data.kgn_bump_info_rdy) { + + i++; + LCONSOLE((((i) & (-i)) == i) ? D_WARNING : D_NET, + "Waiting for hardware quiesce flag to clear\n"); + cfs_pause(cfs_time_seconds(1 * i)); + + /* If we got a quiesce event with bump info, DO THE BUMP!. */ + if (kgnilnd_data.kgn_bump_info_rdy) { + /* reset console rate limiting for each event */ + i = 1; + + /* Make sure the core doesn't start fetching + * kgni_quiesce_seconds until after it sees + * kgn_bump_info_rdy set. This is the match to the + * first mb in kgnilnd_quiesce_end_callback(). */ + smp_rmb(); + (void) kgnilnd_bump_timeouts(kgnilnd_data.kgn_quiesce_secs, + "hardware quiesce callback"); + set_mb(kgnilnd_data.kgn_quiesce_secs, 0); + set_mb(kgnilnd_data.kgn_bump_info_rdy, 0); + } + } + + /* Reset the kgn_needs_pause flag before coming out of + * the pause. This ordering avoids a race with the + * setting of this flag in kgnilnd_pause_threads(). */ + set_mb(kgnilnd_data.kgn_needs_pause, 0); + + /* ok, let the kids back into the pool */ + set_mb(kgnilnd_data.kgn_quiesce_trigger, GNILND_QUIESCE_IDLE); + kgnilnd_quiesce_wait("hardware quiesce"); + } + + /* Do a stack reset if needed. */ + if (kgnilnd_data.kgn_needs_reset) { + kgnilnd_reset_stack(); + set_mb(kgnilnd_data.kgn_needs_reset, 0); + } + + up(&kgnilnd_data.kgn_quiesce_sem); + } + + kgnilnd_data.kgn_ruhroh_running = 0; + return 0; +} + +/* Set pause request flag. Any functions that + * call this one are responsible for ensuring that + * variables they set up are visible on other cores before + * this flag setting. This executes in interrupt or kernel + * thread context. */ +void +kgnilnd_pause_threads(void) +{ + /* only device 0 gets the handle, see kgnilnd_dev_init */ + kgn_device_t *dev = &kgnilnd_data.kgn_devices[0]; + LASSERTF(dev != NULL, "dev 0 is NULL\n"); + + /* If we're currently in a pause triggered by the pause flag, + * there's no need to set it again. We clear the kgn_needs_pause + * flag before we reset kgn_quiesce_trigger to avoid a race. 
The + * read memory barrier matches the setmb() on the trigger in + * kgnilnd_ruhroh_task(). */ + smp_rmb(); + if (!(kgnilnd_data.kgn_quiesce_trigger == GNILND_QUIESCE_HW_QUIESCE && + GNILND_IS_QUIESCED)) { + CDEBUG(D_NET, "requesting thread pause\n"); + + kgnilnd_data.kgn_needs_pause = 1; + + wake_up(&kgnilnd_data.kgn_ruhroh_waitq); + } else { + CDEBUG(D_NET, "thread pause already underway\n"); + } +} + +/* Return non-zero if the GNI hardware quiesce flag is set */ +int +kgnilnd_hw_in_quiesce(void) +{ + /* only device 0 gets the handle, see kgnilnd_dev_init */ + kgn_device_t *dev0 = &kgnilnd_data.kgn_devices[0]; + + LASSERTF(dev0 != NULL, "dev 0 is NULL\n"); + + smp_rmb(); + return kgnilnd_get_quiesce_status(dev0->gnd_handle) != 0; +} + + +/* If the GNI hardware quiesce flag is set, initiate our pause and + * return non-zero. Also return non-zero if the stack is shutting down. */ +int +kgnilnd_check_hw_quiesce(void) +{ + if (likely(!kgnilnd_hw_in_quiesce())) + return 0; + + if (!kgnilnd_data.kgn_ruhroh_shutdown) { + CDEBUG(D_NET, "initiating thread pause\n"); + kgnilnd_pause_threads(); + } else { + CDEBUG(D_NET, "thread pause bypassed because of shutdown\n"); + } + + return 1; +} + +/* Callback from kngi with the quiesce duration. This executes + * in interrupt context. */ +void +kgnilnd_quiesce_end_callback(gni_nic_handle_t nic_handle, uint64_t msecs) +{ + /* only device 0 gets the handle, see kgnilnd_dev_init */ + kgn_device_t *dev = &kgnilnd_data.kgn_devices[0]; + LASSERTF(dev != NULL, "dev 0 is NULL\n"); + + if (!kgnilnd_data.kgn_ruhroh_shutdown) { + + CDEBUG(D_NET, "requesting timeout bump by "LPD64" msecs\n", msecs); + + /* Save the bump interval and request the bump. + * The memory barrier ensures that the interval is in place before + * the bump flag can be seen (in case a core is already running the + * ruhroh task), and that the bump request flag in place before + * the pause request can be seen (to ensure a core doesn't miss the bump + * request flag). */ + /* If another callback occurred before the ruhroh task + * finished processing the first bump request, we'd over-write its info. + * Nic says that callbacks occur so slowly that this isn't an issue. */ + set_mb(kgnilnd_data.kgn_quiesce_secs, msecs / MSEC_PER_SEC); + set_mb(kgnilnd_data.kgn_bump_info_rdy, 1); + kgnilnd_pause_threads(); + } else { + CDEBUG(D_NET, "timeout bump bypassed because of shutdown\n"); + } +} + +void +kgnilnd_critical_error(struct gni_err *err_handle) +{ + /* only device 0 gets the handle, see kgnilnd_dev_init */ + kgn_device_t *dev = &kgnilnd_data.kgn_devices[0]; + LASSERTF(dev != NULL, "dev 0 is NULL\n"); + + if (!kgnilnd_data.kgn_ruhroh_shutdown) { + CDEBUG(D_NET, "requesting stack reset\n"); + kgnilnd_data.kgn_needs_reset = 1; + wake_up(&kgnilnd_data.kgn_ruhroh_waitq); + } else { + CDEBUG(D_NET, "stack reset bypassed because of shutdown\n"); + } +} diff --git a/lnet/klnds/gnilnd/gnilnd_sysctl.c b/lnet/klnds/gnilnd/gnilnd_sysctl.c new file mode 100644 index 0000000..cd33d3e --- /dev/null +++ b/lnet/klnds/gnilnd/gnilnd_sysctl.c @@ -0,0 +1,252 @@ +/* + * Copyright (C) 2012 Cray, Inc. + * + * Author: Nic Henke + * Author: James Shimek + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. 
+ * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +/* this code liberated and modified from Lustre */ + +#define DEBUG_SUBSYSTEM S_LND + +#include "gnilnd.h" + +typedef struct kgn_sysctl_data { + int ksd_pause_trigger; + int ksd_quiesce_secs; + int ksd_rdmaq_override; +} kgn_sysctl_data_t; + +static kgn_sysctl_data_t kgnilnd_sysctl; + +#if defined(CONFIG_SYSCTL) + +static cfs_sysctl_table_header_t *kgnilnd_table_header = NULL; +#ifndef HAVE_SYSCTL_UNNUMBERED + +enum { + GNILND_VERSION = 1, + GNILND_THREAD_PAUSE, + GNILND_HW_QUIESCE, + GNILND_STACK_RESET, + GNILND_RDMAQ_OVERRIDE, +}; +#else +#define GNILND_VERSION CTL_UNNUMBERED +#define GNILND_THREAD_PAUSE CTL_UNNUMBERED +#define GNILND_HW_QUIESCE CTL_UNNUMBERED +#define GNILND_STACK_RESET CTL_UNNUMBERED +#define GNILND_RDMAQ_OVERRIDE CTL_UNNUMBERED +#endif + +static int LL_PROC_PROTO(proc_toggle_thread_pause) +{ + int old_val = kgnilnd_sysctl.ksd_pause_trigger; + int rc = 0; + ENTRY; + + rc = ll_proc_dointvec(table, write, filp, buffer, lenp, ppos); + if (!write) { + /* read */ + RETURN(rc); + } + + if (kgnilnd_data.kgn_init != GNILND_INIT_ALL) { + rc = -EINVAL; + RETURN(rc); + } + + if (old_val != kgnilnd_sysctl.ksd_pause_trigger) { + down(&kgnilnd_data.kgn_quiesce_sem); + CDEBUG(D_NET, "setting quiesce_trigger %d\n", old_val); + kgnilnd_data.kgn_quiesce_trigger = kgnilnd_sysctl.ksd_pause_trigger; + kgnilnd_quiesce_wait("admin sysctl"); + up(&kgnilnd_data.kgn_quiesce_sem); + } + + RETURN(rc); +} + +static int LL_PROC_PROTO(proc_hw_quiesce) +{ + int rc = 0; + kgn_device_t *dev; + ENTRY; + + rc = ll_proc_dointvec(table, write, filp, buffer, lenp, ppos); + if (!write) { + /* read */ + RETURN(rc); + } + + if (kgnilnd_data.kgn_init != GNILND_INIT_ALL) { + rc = -EINVAL; + RETURN(rc); + } + + + /* only device 0 gets the handle, see kgnilnd_dev_init */ + dev = &kgnilnd_data.kgn_devices[0]; + + LASSERTF(dev != NULL, "dev 0 is NULL\n"); + + kgnilnd_quiesce_end_callback(dev->gnd_handle, + kgnilnd_sysctl.ksd_quiesce_secs * MSEC_PER_SEC); + + RETURN(rc); +} + +int LL_PROC_PROTO(proc_trigger_stack_reset) +{ + int rc = 0; + int i = 1; + kgn_device_t *dev; + ENTRY; + + if (!write) { + /* read */ + rc = ll_proc_dointvec(table, write, filp, buffer, lenp, ppos); + RETURN(rc); + } + + /* only device 0 gets the handle, see kgnilnd_dev_init */ + dev = &kgnilnd_data.kgn_devices[0]; + + LASSERTF(dev != NULL, "dev 0 is NULL\n"); + + kgnilnd_critical_error(dev->gnd_err_handle); + + /* Wait for the reset to complete. This prevents any races in testing + * where we'd immediately try to send traffic again */ + while (kgnilnd_data.kgn_needs_reset != 0) { + i++; + LCONSOLE((((i) & (-i)) == i) ? 
D_WARNING : D_NET, + "Waiting for stack reset request to clear\n"); + cfs_pause(cfs_time_seconds(1 * i)); + } + + RETURN(rc); +} + +static int LL_PROC_PROTO(proc_toggle_rdmaq_override) +{ + int old_val = kgnilnd_sysctl.ksd_rdmaq_override; + int rc = 0; + ENTRY; + + rc = ll_proc_dointvec(table, write, filp, buffer, lenp, ppos); + if (!write) { + /* read */ + RETURN(rc); + } + + if (kgnilnd_data.kgn_init != GNILND_INIT_ALL) { + rc = -EINVAL; + RETURN(rc); + } + + if (old_val != kgnilnd_sysctl.ksd_rdmaq_override) { + long new_mb = kgnilnd_sysctl.ksd_rdmaq_override * (long)(1024*1024); + LCONSOLE_INFO("changing RDMAQ override to %d mbytes/sec\n", + kgnilnd_sysctl.ksd_rdmaq_override); + /* override proc is mbytes, but we calc in bytes */ + kgnilnd_data.kgn_rdmaq_override = new_mb; + smp_wmb(); + } + + RETURN(rc); +} + +static cfs_sysctl_table_t kgnilnd_table[] = { + /* + * NB No .strategy entries have been provided since sysctl(8) prefers + * to go via /proc for portability. + */ + { + INIT_CTL_NAME(GNILND_VERSION) + .procname = "version", + .data = KGNILND_BUILD_REV, + .maxlen = sizeof(KGNILND_BUILD_REV), + .mode = 0444, + .proc_handler = &proc_dostring + }, + { + INIT_CTL_NAME(GNILND_THREAD_PAUSE) + .procname = "thread_pause", + .data = &kgnilnd_sysctl.ksd_pause_trigger, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_toggle_thread_pause, + }, + { + INIT_CTL_NAME(GNILND_HW_QUIESCE) + .procname = "hw_quiesce", + .data = &kgnilnd_sysctl.ksd_quiesce_secs, + .maxlen = sizeof(__u32), + .mode = 0644, + .proc_handler = &proc_hw_quiesce, + }, + { + INIT_CTL_NAME(GNILND_STACK_RESET) + .procname = "stack_reset", + .data = NULL, + .maxlen = sizeof(int), + .mode = 0600, + .proc_handler = &proc_trigger_stack_reset, + }, + { + INIT_CTL_NAME(GNILND_RDMAQ_OVERRIDE) + .procname = "rdmaq_override", + .data = &kgnilnd_sysctl.ksd_rdmaq_override, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_toggle_rdmaq_override, + }, + { INIT_CTL_NAME(0) } +}; + +static cfs_sysctl_table_t kgnilnd_top_table[2] = { + { + INIT_CTL_NAME(CTL_GNILND) + .procname = "kgnilnd", + .data = NULL, + .maxlen = 0, + .mode = 0555, + .child = kgnilnd_table + }, + { INIT_CTL_NAME(0) } +}; + +void kgnilnd_insert_sysctl(void) +{ + if (kgnilnd_table_header == NULL) + kgnilnd_table_header = cfs_register_sysctl_table(kgnilnd_top_table, 0); +} + +void kgnilnd_remove_sysctl(void) +{ + if (kgnilnd_table_header != NULL) + cfs_unregister_sysctl_table(kgnilnd_table_header); + + kgnilnd_table_header = NULL; +} + +#else +void kgnilnd_insert_sysctl(void) {} +void kgnilnd_remove_sysctl(void) {} +#endif diff --git a/lnet/klnds/gnilnd/gnilnd_version.h b/lnet/klnds/gnilnd/gnilnd_version.h new file mode 100644 index 0000000..10f6278 --- /dev/null +++ b/lnet/klnds/gnilnd/gnilnd_version.h @@ -0,0 +1 @@ +#define KGNILND_BUILD_REV SVN_CODE_REV diff --git a/lnet/utils/debug.c b/lnet/utils/debug.c index 21b5de5..d720f3d 100644 --- a/lnet/utils/debug.c +++ b/lnet/utils/debug.c @@ -856,6 +856,7 @@ static struct mod_paths { { "kmxlnd", "lnet/klnds/mxlnd" }, { "ko2iblnd", "lnet/klnds/o2iblnd" }, { "kptllnd", "lnet/klnds/ptllnd" }, + { "kgnilnd", "lnet/klnds/gnilnd"}, { "kqswlnd", "lnet/klnds/qswlnd" }, { "kralnd", "lnet/klnds/ralnd" }, { "ksocklnd", "lnet/klnds/socklnd" }, diff --git a/lnet/utils/portals.c b/lnet/utils/portals.c index dedb75d..3c09a8d 100644 --- a/lnet/utils/portals.c +++ b/lnet/utils/portals.c @@ -567,7 +567,7 @@ jt_ptl_print_peers (int argc, char **argv) int rc; if (!g_net_is_compatible (argv[0], SOCKLND, RALND, PTLLND, 
MXLND, - O2IBLND, 0)) + O2IBLND, GNILND, 0)) return -1; for (index = 0;;index++) { @@ -620,6 +620,26 @@ jt_ptl_print_peers (int argc, char **argv) ptl_ipaddr_2_str(data.ioc_u32[0], buffer[1], sizeof(buffer[1]), 1), data.ioc_u32[1]); /* peer port */ + } else if (g_net_is_compatible(NULL, GNILND, 0)) { + int disconn = data.ioc_flags >> 16; + char *state; + + if (disconn) + state = "D"; + else + state = data.ioc_flags & 0xffff ? "C" : "U"; + + printf ("%-20s (%d) %s [%d] "LPU64" " + "sq %d/%d tx %d/%d/%d\n", + libcfs_nid2str(data.ioc_nid), /* peer nid */ + data.ioc_net, /* gemini device id */ + state, /* peer is Connecting, Up, or Down */ + data.ioc_count, /* peer refcount */ + data.ioc_u64[0], /* peerstamp */ + data.ioc_u32[2], data.ioc_u32[3], /* tx and rx seq */ + /* fmaq, nfma, nrdma */ + data.ioc_u32[0], data.ioc_u32[1], data.ioc_u32[4] + ); } else { printf ("%-20s [%d]\n", libcfs_nid2str(data.ioc_nid), data.ioc_count); @@ -647,11 +667,12 @@ jt_ptl_add_peer (int argc, char **argv) int port = 0; int rc; - if (!g_net_is_compatible (argv[0], SOCKLND, RALND, 0)) + if (!g_net_is_compatible (argv[0], SOCKLND, RALND, + GNILND, 0)) return -1; if (argc != 4) { - fprintf (stderr, "usage(tcp,ra): %s nid ipaddr port\n", + fprintf (stderr, "usage(tcp,ra,gni): %s nid ipaddr port\n", argv[0]); return 0; } @@ -699,7 +720,7 @@ jt_ptl_del_peer (int argc, char **argv) int rc; if (!g_net_is_compatible (argv[0], SOCKLND, RALND, MXLND, PTLLND, - O2IBLND, 0)) + O2IBLND, GNILND, 0)) return -1; if (g_net_is_compatible(NULL, SOCKLND, 0)) { @@ -768,7 +789,8 @@ jt_ptl_print_connections (int argc, char **argv) int index; int rc; - if (!g_net_is_compatible (argv[0], SOCKLND, RALND, MXLND, O2IBLND, 0)) + if (!g_net_is_compatible (argv[0], SOCKLND, RALND, MXLND, O2IBLND, + GNILND, 0)) return -1; for (index = 0; ; index++) { @@ -808,6 +830,10 @@ jt_ptl_print_connections (int argc, char **argv) printf ("%s mtu %d\n", libcfs_nid2str(data.ioc_nid), data.ioc_u32[0]); /* path MTU */ + } else if (g_net_is_compatible (NULL, GNILND, 0)) { + printf ("%-20s [%d]\n", + libcfs_nid2str(data.ioc_nid), + data.ioc_u32[0] /* device id */); } else { printf ("%s\n", libcfs_nid2str(data.ioc_nid)); } @@ -837,7 +863,8 @@ int jt_ptl_disconnect(int argc, char **argv) return 0; } - if (!g_net_is_compatible (NULL, SOCKLND, RALND, MXLND, O2IBLND, 0)) + if (!g_net_is_compatible (NULL, SOCKLND, RALND, MXLND, O2IBLND, + GNILND, 0)) return 0; if (argc >= 2 && @@ -879,7 +906,7 @@ int jt_ptl_push_connection (int argc, char **argv) return 0; } - if (!g_net_is_compatible (argv[0], SOCKLND, 0)) + if (!g_net_is_compatible (argv[0], SOCKLND, GNILND, 0)) return -1; if (argc > 1 &&