#define IOC_LIBCFS_DEL_PEER _IOWR('e', 74, IOCTL_LIBCFS_TYPE)
#define IOC_LIBCFS_ADD_PEER _IOWR('e', 75, IOCTL_LIBCFS_TYPE)
#define IOC_LIBCFS_GET_PEER _IOWR('e', 76, IOCTL_LIBCFS_TYPE)
-#define IOC_LIBCFS_GET_TXDESC _IOWR('e', 77, IOCTL_LIBCFS_TYPE)
+/* ioctl 77 is free for use */
#define IOC_LIBCFS_ADD_INTERFACE _IOWR('e', 78, IOCTL_LIBCFS_TYPE)
#define IOC_LIBCFS_DEL_INTERFACE _IOWR('e', 79, IOCTL_LIBCFS_TYPE)
#define IOC_LIBCFS_GET_INTERFACE _IOWR('e', 80, IOCTL_LIBCFS_TYPE)
])
#
-# LN_CONFIG_QUADRICS
-#
-# check if quadrics support is in this kernel
-#
-AC_DEFUN([LN_CONFIG_QUADRICS], [
-AC_MSG_CHECKING([for QsNet sources])
-AC_ARG_WITH([qsnet],
- AC_HELP_STRING([--with-qsnet=path],
- [set path to qsnet source (default=$LINUX)]),
- [QSNET=$with_qsnet], [QSNET=$LINUX])
-AC_MSG_RESULT([$QSNET])
-
-QSWLND=""
-QSWCPPFLAGS=""
-AC_MSG_CHECKING([if quadrics kernel headers are present])
-AS_IF([test -d $QSNET/drivers/net/qsnet], [
- AC_MSG_RESULT([yes])
- QSWLND="qswlnd"
- AC_MSG_CHECKING([for multirail EKC])
- AS_IF([test -f $QSNET/include/elan/epcomms.h], [
- AC_MSG_RESULT([supported])
- QSNET=$(readlink --canonicalize $QSNET)
- QSWCPPFLAGS="-I$QSNET/include -DMULTIRAIL_EKC=1"
- ], [
- AC_MSG_RESULT([not supported])
- AC_MSG_ERROR([Need multirail EKC])
- ])
-
- AS_IF([test x$QSNET = x$LINUX], [
- LB_CHECK_CONFIG([QSNET], [], [
- LB_CHECK_CONFIG([QSNET_MODULE], [], [
- AC_MSG_WARN([QSNET is not enabled in this kernel; not building qswlnd.])
- QSWLND=""
- QSWCPPFLAGS=""
- ])
- ])
- ])
-], [
- AC_MSG_RESULT([no])
-])
-AC_SUBST(QSWLND)
-AC_SUBST(QSWCPPFLAGS)
-]) # LN_CONFIG_QUADRICS
-
-#
-# LN_CONFIG_MX
-#
-AC_DEFUN([LN_CONFIG_MX], [
-# set default
-MXPATH="/opt/mx"
-AC_MSG_CHECKING([whether to enable Myrinet MX support])
-AC_ARG_WITH([mx],
- AC_HELP_STRING([--with-mx=path],
- [build mxlnd against path]),
- [
- case $with_mx in
- yes) ENABLEMX=2 ;;
- no) ENABLEMX=0 ;;
- *) ENABLEMX=3; MXPATH=$with_mx ;;
- esac
- ],[
- ENABLEMX=1
- ])
-AS_IF([test $ENABLEMX -eq 0], [
- AC_MSG_RESULT([disabled])
-], [test ! \( -f ${MXPATH}/include/myriexpress.h -a \
- -f ${MXPATH}/include/mx_kernel_api.h -a \
- -f ${MXPATH}/include/mx_pin.h \)], [
- AC_MSG_RESULT([no])
- case $ENABLEMX in
- 1) ;;
- 2) AC_MSG_ERROR([Myrinet MX kernel headers not present]) ;;
- 3) AC_MSG_ERROR([bad --with-mx path]) ;;
- *) AC_MSG_ERROR([internal error]) ;;
- esac
-], [
- AC_MSG_RESULT([check])
- MXPATH=$(readlink --canonicalize $MXPATH)
- MXCPPFLAGS="-I$MXPATH/include"
- MXLIBS="-L$MXPATH/lib"
- EXTRA_KCFLAGS_save="$EXTRA_KCFLAGS"
- EXTRA_KCFLAGS="$EXTRA_KCFLAGS $MXCPPFLAGS"
- LB_CHECK_COMPILE([if have Myrinet MX support],
- myrinet_mx_support, [
- #define MX_KERNEL 1
- #include <mx_extensions.h>
- #include <myriexpress.h>
- ],[
- mx_endpoint_t end;
- mx_status_t status;
- mx_request_t request;
- int result;
- mx_init();
- mx_open_endpoint(MX_ANY_NIC, MX_ANY_ENDPOINT, 0, NULL, 0, &end);
- mx_register_unexp_handler(end, (mx_unexp_handler_t) NULL, NULL);
- mx_wait_any(end, MX_INFINITE, 0LL, 0LL, &status, &result);
- mx_iconnect(end, 0LL, 0, 0, 0, NULL, &request);
- return 0;
- ],[
- MXLND="mxlnd"
- ],[
- case $ENABLEMX in
- 1) ;;
- 2) AC_MSG_ERROR([can't compile with Myrinet MX kernel headers]) ;;
- 3) AC_MSG_ERROR([can't compile with Myrinet MX headers under $MXPATH]) ;;
- *) AC_MSG_ERROR([internal error]) ;;
- esac
- MXCPPFLAGS=""
- MXLIBS=""
- MXLND=""
- ])
- EXTRA_KCFLAGS="$EXTRA_KCFLAGS_save"
-])
-AC_SUBST(MXCPPFLAGS)
-AC_SUBST(MXLIBS)
-AC_SUBST(MXLND)
-]) # LN_CONFIG_MX
-
-#
# LN_CONFIG_O2IB
#
AC_DEFUN([LN_CONFIG_O2IB], [
]) # LN_CONFIG_O2IB
#
-# LN_CONFIG_RALND
-#
-# check whether to use the RapidArray lnd
-#
-AC_DEFUN([LN_CONFIG_RALND], [
-RALND=""
-RACPPFLAGS="-I${LINUX}/drivers/xd1/include"
-EXTRA_KCFLAGS_save="$EXTRA_KCFLAGS"
-EXTRA_KCFLAGS="$EXTRA_KCFLAGS $RACPPFLAGS"
-LB_CHECK_COMPILE([if 'RapidArray' kernel headers are present],
-RapkGetDeviceByIndex, [
- #include <linux/types.h>
- #include <rapl.h>
-],[
- RAP_RETURN rc;
- RAP_PVOID dev_handle;
- rc = RapkGetDeviceByIndex(0, NULL, &dev_handle);
- return rc == RAP_SUCCESS ? 0 : 1;
-],[
- RALND="ralnd"
-],[
- RACPPFLAGS=""
-])
-EXTRA_KCFLAGS="$EXTRA_KCFLAGS_save"
-AC_SUBST(RACPPFLAGS)
-AC_SUBST(RALND)
-]) # LN_CONFIG_RALND
-
-#
# LN_CONFIG_GNILND
#
# check whether to use the Gemini Network Interface lnd
LN_FUNC_DEV_GET_BY_NAME_2ARG
LN_CONFIG_AFFINITY
LN_CONFIG_BACKOFF
-LN_CONFIG_QUADRICS
LN_CONFIG_O2IB
-LN_CONFIG_RALND
LN_CONFIG_GNILND
-LN_CONFIG_MX
# 2.6.36
LN_CONFIG_TCP_SENDPAGE
# 3.15
# AM_CONDITIONAL defines for lnet
#
AC_DEFUN([LN_CONDITIONALS], [
-AM_CONDITIONAL(BUILD_QSWLND, test x$QSWLND = "xqswlnd")
-AM_CONDITIONAL(BUILD_MXLND, test x$MXLND = "xmxlnd")
AM_CONDITIONAL(BUILD_O2IBLND, test x$O2IBLND = "xo2iblnd")
-AM_CONDITIONAL(BUILD_RALND, test x$RALND = "xralnd")
AM_CONDITIONAL(BUILD_GNILND, test x$GNILND = "xgnilnd")
AM_CONDITIONAL(BUILD_GNILND_RCA, test x$GNILNDRCA = "xgnilndrca")
AM_CONDITIONAL(BUILD_DLC, test x$USE_DLC = "xyes")
lnet/include/lnet/Makefile
lnet/klnds/Makefile
lnet/klnds/autoMakefile
-lnet/klnds/mxlnd/autoMakefile
-lnet/klnds/mxlnd/Makefile
lnet/klnds/o2iblnd/Makefile
lnet/klnds/o2iblnd/autoMakefile
-lnet/klnds/qswlnd/Makefile
-lnet/klnds/qswlnd/autoMakefile
-lnet/klnds/ralnd/Makefile
-lnet/klnds/ralnd/autoMakefile
lnet/klnds/gnilnd/Makefile
lnet/klnds/gnilnd/autoMakefile
lnet/klnds/socklnd/Makefile
void lnet_register_lnd(lnd_t *lnd);
void lnet_unregister_lnd(lnd_t *lnd);
-int lnet_set_ip_niaddr (lnet_ni_t *ni);
int lnet_connect(cfs_socket_t **sockp, lnet_nid_t peer_nid,
__u32 local_ip, __u32 peer_ip, int peer_port);
int jt_ptl_print_connections (int argc, char **argv);
int jt_ptl_disconnect(int argc, char **argv);
int jt_ptl_push_connection(int argc, char **argv);
-int jt_ptl_print_active_txs(int argc, char **argv);
int jt_ptl_ping(int argc, char **argv);
int jt_ptl_mynid(int argc, char **argv);
int jt_ptl_add_uuid(int argc, char **argv);
-@BUILD_MXLND_TRUE@subdir-m += mxlnd
-@BUILD_RALND_TRUE@subdir-m += ralnd
@BUILD_GNILND_TRUE@subdir-m += gnilnd
@BUILD_O2IBLND_TRUE@subdir-m += o2iblnd
-@BUILD_QSWLND_TRUE@subdir-m += qswlnd
subdir-m += socklnd
@INCLUDE_RULES@
# Lustre is a trademark of Sun Microsystems, Inc.
#
-SUBDIRS = socklnd qswlnd mxlnd ralnd gnilnd o2iblnd
+SUBDIRS = socklnd gnilnd o2iblnd
+++ /dev/null
-MODULES := kmxlnd
-kmxlnd-objs := mxlnd.o mxlnd_cb.o mxlnd_modparams.o
-
-EXTRA_POST_CFLAGS := @MXCPPFLAGS@
-
-@INCLUDE_RULES@
+++ /dev/null
-*************************************************************************
-* *
-* Myrinet Express Lustre Networking Driver (MXLND) documentation *
-* *
-*************************************************************************
-
-README of MXLND
-
-MXLND provides support for Myricom's Myrinet Express (MX) communication
-layer in Lustre.
-
-MXLND may be used with either MX-10G or MX-2G. See MX's README for
-supported NICs.
-
-Table of Contents:
- I. Installation
- 1. Configuring and compiling
- 2. Module Parameters
- II. MXLND Performance
- III. Caveats
- 1. Systems with different page sizes
- 2. Multi-homing
- 3. MX endpoint collision
- IV. License
- V. Support
-
-================
-I. Installation
-================
-
-MXLND is supported on Linux 2.6. It may be possible to run it on 2.4,
-but it has not been tested. MXLND requires Myricom's MX version 1.2.8
-or higher. See MX's README for the supported list of processors.
-
-MXLND requires the optional MX kernel library interface. MX must be compiled
-with --enable-kernel-lib.
-
-1. Configuring and compiling
-
-MXLND should be already integrated into the Lustre build process. To
-build MXLND, you will need to set the path to your MX installation
-in Lustre's ./configure:
-
- --with-mx=/opt/mx
-
-replacing /opt/mx with the actual path to your MX installation. Configure
-checks that the MX version provides the required functions; if they are
-missing, MXLND is not built (configure fails if --with-mx was given
-explicitly). To check whether MXLND was built, look for:
-
- checking whether to enable Myrinet MX support... yes
-
-in configure's output or the presence of Makefile in
-$LUSTRE/lnet/klnds/mxlnd.
-
-2. Module Parameters
-
-MXLND supports a number of load-time parameters using Linux's module
-parameter system. On our test systems, we created the following file:
-
- /etc/modprobe.d/kmxlnd
-
-On some (older?) systems, you may need to modify /etc/modprobe.conf.
-
-The available options are:
-
- n_waitd # of completion daemons
- cksum set non-zero to enable small message (< 4KB) checksums
- ntx # of total tx message descriptors
- peercredits # concurrent sends to one peer
- board index value of the Myrinet board
- ep_id MX endpoint ID
- ipif_name IPoMX interface name
- polling Use 0 to block (wait). A value > 0 will poll that many times before blocking
-
- credits Unused - was # concurrent sends to all peers
- max_peers Unused - was maximum number of peers that may connect
- hosts Unused - was IP-to-hostname resolution file
-
-You may want to vary the options to obtain the optimal performance for your
-platform.
-
- n_waitd sets the number of threads that process completed MX requests
-(sends and receives). In our testing, the default of 1 performed best.
-
- cksum turns on small message checksums. It can be used to aid in trouble-
-shooting. MX also provides an optional checksumming feature which can check
-all messages (large and small). See the MX README for details.
-
- ntx is the number of total sends in flight from this machine.
-
- peercredits is the number of in-flight messages for a specific peer. This is part
-of the flow-control system in Lustre. Increasing this value may improve performance
-but it requires more memory since each message requires at least one page.
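-For example, a peercredits value of 8 with a 4 KB page size pins at least
-32 KB per peer.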
-
- board is the index of the Myricom NIC. Hosts can have multiple Myricom NICs
-and this identifies which one MXLND should use.
-
- ep_id is the MX endpoint ID. Each process that uses MX is required to have at
-least one MX endpoint to access the MX library and NIC. The ID is a simple index
-starting at 0. When used on a server, the server will attempt to use this end-
-point. When used on a client, it specifies the endpoint to connect to on the
-management server.
-
- ipif_name is the name of the Ethernet interface over MX. Generally, it is
-myriN, where N matches the MX board index.
-
- polling determines whether this host will poll or block for MX request com-
-pletions. A value of 0 blocks and any positive value will poll that many times
-before blocking. Since polling increases CPU usage, we suggest you set this to
-0 on the client and experiment with different values for servers.
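-
-As a concrete illustration (the values here are only examples, not tuned
-recommendations), /etc/modprobe.d/kmxlnd could contain the single line:
-
-    options kmxlnd n_waitd=1 ntx=256 peercredits=8 board=0 ep_id=3 polling=0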
-
-=====================
-II. MXLND Performance
-=====================
-
-On MX-2G systems, MXLND should easily saturate the link and use minimal CPU
-(5-10% for read and write operations). On MX-10G systems, MXLND can saturate
-the link and use moderate CPU resources (20-30% for read and write operations).
-MX-10G relies on PCI-Express which is relatively new and performance varies
-considerably by processor, motherboard and PCI-E chipset. Refer to Myricom's
-website for the latest DMA read/write performance results by motherboard. The
-DMA results will place an upper-bound on MXLND performance.
-
-============
-III. Caveats
-============
-
-1. Systems with different page sizes
-
-MXLND will set the maximum small message size equal to the kernel's page size.
-This means that machines running MXLND that have different page sizes are not
-able to communicate with each other. If you wish to run MXLND in this case,
-send email to help@myri.com.
-
-2. Multi-homing
-
-MXLND currently supports only one interface at a time.
-Thus, a single Lustre router cannot route between two MX-10G, between two
-MX-2G, or between MX-10G and MX-2G fabrics.
-
-3. MX endpoint collision
-
-Each process that uses MX is required to have at least one MX endpoint to
-access the MX library and NIC. Other processes may need to use MX and no two
-processes can use the same endpoint ID. MPICH-MX dynamically chooses one at
-MPI startup and should not interfere with MXLND. Sockets-MX, on the other hand,
-is hard-coded to use 0 for its ID. If anyone may want to run Sockets-MX on
-this system, use a non-zero value for MXLND's endpoint ID.
-
-
-===========
-IV. License
-===========
-
-MXLND is copyright (C) 2006 of Myricom, Inc.
-
-MXLND is part of Lustre, http://www.lustre.org.
-
-MXLND is free software; you can redistribute it and/or modify it under the
-terms of version 2 of the GNU General Public License as published by the Free
-Software Foundation.
-
-MXLND is distributed in the hope that it will be useful, but WITHOUT ANY
-WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
-PARTICULAR PURPOSE. See the GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along with
-Lustre; if not, write to the Free Software Foundation, Inc., 675 Mass Ave,
-Cambridge, MA 02139, USA.
-
-==========
-V. Support
-==========
-
-If you have questions about MXLND, please contact help@myri.com.
+++ /dev/null
-#
-# GPL HEADER START
-#
-# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License version 2 only,
-# as published by the Free Software Foundation.
-#
-# This program is distributed in the hope that it will be useful, but
-# WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-# General Public License version 2 for more details (a copy is included
-# in the LICENSE file that accompanied this code).
-#
-# You should have received a copy of the GNU General Public License
-# version 2 along with this program; If not, see
-# http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
-#
-# Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
-# CA 95054 USA or visit www.sun.com if you need additional information or
-# have any questions.
-#
-# GPL HEADER END
-#
-
-#
-# Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
-# Use is subject to license terms.
-#
-
-#
-# This file is part of Lustre, http://www.lustre.org/
-# Lustre is a trademark of Sun Microsystems, Inc.
-#
-
-if MODULES
-if BUILD_MXLND
-modulenet_DATA = kmxlnd$(KMODEXT)
-endif
-endif
-
-MOSTLYCLEANFILES = @MOSTLYCLEANFILES@
-EXTRA_DIST = $(kmxlnd-objs:%.o=%.c) mxlnd.h
+++ /dev/null
-/*
- * GPL HEADER START
- *
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 only,
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License version 2 for more details (a copy is included
- * in the LICENSE file that accompanied this code).
- *
- * You should have received a copy of the GNU General Public License
- * version 2 along with this program; If not, see
- * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
- *
- * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
- * CA 95054 USA or visit www.sun.com if you need additional information or
- * have any questions.
- *
- * GPL HEADER END
- */
-/*
- * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
- * Use is subject to license terms.
- *
- * Copyright (c) 2012, 2014, Intel Corporation.
- *
- * Copyright (C) 2006 Myricom, Inc.
- */
-/*
- * This file is part of Lustre, http://www.lustre.org/
- * Lustre is a trademark of Sun Microsystems, Inc.
- *
- * lnet/klnds/mxlnd/mxlnd.c
- *
- * Author: Eric Barton <eric@bartonsoftware.com>
- * Author: Scott Atchley <atchley at myri.com>
- */
-
-#include "mxlnd.h"
-
-lnd_t the_kmxlnd = {
- .lnd_type = MXLND,
- .lnd_startup = mxlnd_startup,
- .lnd_shutdown = mxlnd_shutdown,
- .lnd_ctl = mxlnd_ctl,
- .lnd_send = mxlnd_send,
- .lnd_recv = mxlnd_recv,
-};
-
-kmx_data_t kmxlnd_data;
-
-void
-mxlnd_free_pages(kmx_pages_t *p)
-{
- int npages = p->mxg_npages;
- int i;
-
- CDEBUG(D_MALLOC, "freeing %d pages\n", npages);
-
- for (i = 0; i < npages; i++) {
- if (p->mxg_pages[i] != NULL) {
- __free_page(p->mxg_pages[i]);
- spin_lock(&kmxlnd_data.kmx_mem_lock);
- kmxlnd_data.kmx_mem_used -= PAGE_SIZE;
- spin_unlock(&kmxlnd_data.kmx_mem_lock);
- }
- }
-
- MXLND_FREE(p, offsetof(kmx_pages_t, mxg_pages[npages]));
-}
-
-int
-mxlnd_alloc_pages(kmx_pages_t **pp, int npages)
-{
- kmx_pages_t *p = NULL;
- int i = 0;
-
- CDEBUG(D_MALLOC, "allocing %d pages\n", npages);
-
- MXLND_ALLOC(p, offsetof(kmx_pages_t, mxg_pages[npages]));
- if (p == NULL) {
- CERROR("Can't allocate descriptor for %d pages\n", npages);
- return -ENOMEM;
- }
-
- memset(p, 0, offsetof(kmx_pages_t, mxg_pages[npages]));
- p->mxg_npages = npages;
-
- for (i = 0; i < npages; i++) {
- p->mxg_pages[i] = alloc_page(GFP_KERNEL);
- if (p->mxg_pages[i] == NULL) {
- CERROR("Can't allocate page %d of %d\n", i, npages);
- mxlnd_free_pages(p);
- return -ENOMEM;
- }
- spin_lock(&kmxlnd_data.kmx_mem_lock);
- kmxlnd_data.kmx_mem_used += PAGE_SIZE;
- spin_unlock(&kmxlnd_data.kmx_mem_lock);
- }
-
- *pp = p;
- return 0;
-}
-
-/**
- * mxlnd_ctx_init - reset ctx struct to the default values
- * @ctx - a kmx_ctx pointer
- */
-void
-mxlnd_ctx_init(kmx_ctx_t *ctx)
-{
- if (ctx == NULL) return;
-
- /* do not change mxc_type */
- ctx->mxc_incarnation = 0;
- ctx->mxc_deadline = 0;
- ctx->mxc_state = MXLND_CTX_IDLE;
- if (!cfs_list_empty(&ctx->mxc_list))
- cfs_list_del_init(&ctx->mxc_list);
- /* ignore mxc_rx_list */
- if (ctx->mxc_type == MXLND_REQ_TX) {
- ctx->mxc_nid = 0;
- ctx->mxc_peer = NULL;
- ctx->mxc_conn = NULL;
- }
- /* ignore mxc_msg */
- ctx->mxc_lntmsg[0] = NULL;
- ctx->mxc_lntmsg[1] = NULL;
- ctx->mxc_msg_type = 0;
- ctx->mxc_cookie = 0LL;
- ctx->mxc_match = 0LL;
- /* ctx->mxc_seg.segment_ptr points to backing page */
- ctx->mxc_seg.segment_length = 0;
- if (ctx->mxc_seg_list != NULL) {
- LASSERT(ctx->mxc_nseg > 0);
- MXLND_FREE(ctx->mxc_seg_list, ctx->mxc_nseg * sizeof(mx_ksegment_t));
- }
- ctx->mxc_seg_list = NULL;
- ctx->mxc_nseg = 0;
- ctx->mxc_nob = 0;
- memset(&ctx->mxc_mxreq, 0, sizeof(mx_request_t));
- memset(&ctx->mxc_status, 0, sizeof(mx_status_t));
- ctx->mxc_errno = 0;
- /* ctx->mxc_get */
- /* ctx->mxc_put */
-
- ctx->mxc_msg->mxm_type = 0;
- ctx->mxc_msg->mxm_credits = 0;
- ctx->mxc_msg->mxm_nob = 0;
-
- return;
-}
-
-/**
- * mxlnd_free_txs - free kmx_txs and associated pages
- *
- * Called from mxlnd_shutdown()
- */
-void
-mxlnd_free_txs(void)
-{
- int i = 0;
- kmx_ctx_t *tx = NULL;
-
- if (kmxlnd_data.kmx_tx_pages) {
- for (i = 0; i < MXLND_TX_MSGS(); i++) {
- tx = &kmxlnd_data.kmx_txs[i];
- if (tx->mxc_seg_list != NULL) {
- LASSERT(tx->mxc_nseg > 0);
- MXLND_FREE(tx->mxc_seg_list,
- tx->mxc_nseg *
- sizeof(*tx->mxc_seg_list));
- }
- }
- MXLND_FREE(kmxlnd_data.kmx_txs,
- MXLND_TX_MSGS() * sizeof(kmx_ctx_t));
- mxlnd_free_pages(kmxlnd_data.kmx_tx_pages);
- }
-
- return;
-}
-
-/**
- * mxlnd_init_txs - allocate tx descriptors then stash on txs and idle tx lists
- *
- * Called from mxlnd_startup()
- * returns 0 on success, else -ENOMEM
- */
-int
-mxlnd_init_txs(void)
-{
- int ret = 0;
- int i = 0;
- int ipage = 0;
- int offset = 0;
- void *addr = NULL;
- kmx_ctx_t *tx = NULL;
- kmx_pages_t *pages = NULL;
- struct page *page = NULL;
-
- /* pre-mapped messages are not bigger than 1 page */
- CLASSERT(MXLND_MSG_SIZE <= PAGE_SIZE);
-
- /* No fancy arithmetic when we do the buffer calculations */
- CLASSERT (PAGE_SIZE % MXLND_MSG_SIZE == 0);
-
- ret = mxlnd_alloc_pages(&pages, MXLND_TX_MSG_PAGES());
- if (ret != 0) {
- CERROR("Can't allocate tx pages\n");
- return -ENOMEM;
- }
- kmxlnd_data.kmx_tx_pages = pages;
-
- MXLND_ALLOC(kmxlnd_data.kmx_txs, MXLND_TX_MSGS() * sizeof(kmx_ctx_t));
- if (kmxlnd_data.kmx_txs == NULL) {
- CERROR("Can't allocate %d tx descriptors\n", MXLND_TX_MSGS());
- mxlnd_free_pages(pages);
- return -ENOMEM;
- }
-
- memset(kmxlnd_data.kmx_txs, 0, MXLND_TX_MSGS() * sizeof(kmx_ctx_t));
-
- for (i = 0; i < MXLND_TX_MSGS(); i++) {
-
- tx = &kmxlnd_data.kmx_txs[i];
- tx->mxc_type = MXLND_REQ_TX;
-
- CFS_INIT_LIST_HEAD(&tx->mxc_list);
-
- /* map mxc_msg to page */
- page = pages->mxg_pages[ipage];
- addr = page_address(page);
- LASSERT(addr != NULL);
- tx->mxc_msg = (kmx_msg_t *)(addr + offset);
- tx->mxc_seg.segment_ptr = MX_PA_TO_U64(virt_to_phys(tx->mxc_msg));
-
- mxlnd_ctx_init(tx);
-
- offset += MXLND_MSG_SIZE;
- LASSERT (offset <= PAGE_SIZE);
-
- if (offset == PAGE_SIZE) {
- offset = 0;
- ipage++;
- LASSERT (ipage <= MXLND_TX_MSG_PAGES());
- }
-
- /* in startup(), no locks required */
- cfs_list_add_tail(&tx->mxc_list, &kmxlnd_data.kmx_tx_idle);
- }
-
- return 0;
-}
-
-/**
- * mxlnd_free_peers - free peers
- *
- * Called from mxlnd_shutdown()
- */
-void
-mxlnd_free_peers(void)
-{
- int i = 0;
- int count = 0;
- kmx_peer_t *peer = NULL;
- kmx_peer_t *next = NULL;
-
- for (i = 0; i < MXLND_HASH_SIZE; i++) {
- cfs_list_for_each_entry_safe(peer, next,
- &kmxlnd_data.kmx_peers[i],
- mxp_list) {
- cfs_list_del_init(&peer->mxp_list);
- if (peer->mxp_conn) mxlnd_conn_decref(peer->mxp_conn);
- mxlnd_peer_decref(peer);
- count++;
- }
- }
- CDEBUG(D_NET, "%s: freed %d peers\n", __func__, count);
-}
-
-/**
- * mxlnd_init_mx - open the endpoint, set our ID, register the EAGER callback
- * @ni - the network interface
- *
- * Returns 0 on success, -1 on failure
- */
-int
-mxlnd_init_mx(lnet_ni_t *ni)
-{
- int ret = 0;
- mx_return_t mxret;
- u32 board = *kmxlnd_tunables.kmx_board;
- u32 ep_id = *kmxlnd_tunables.kmx_ep_id;
- u64 nic_id = 0LL;
- char *ifname = NULL;
- __u32 ip;
- __u32 netmask;
- int if_up = 0;
-
- mxret = mx_init();
- if (mxret != MX_SUCCESS) {
- CERROR("mx_init() failed with %s (%d)\n", mx_strerror(mxret), mxret);
- return -1;
- }
-
- if (ni->ni_interfaces[0] != NULL) {
- /* Use the IPoMX interface specified in 'networks=' */
-
- CLASSERT (LNET_MAX_INTERFACES > 1);
- if (ni->ni_interfaces[1] != NULL) {
- CERROR("Multiple interfaces not supported\n");
- goto failed_with_init;
- }
-
- ifname = ni->ni_interfaces[0];
- } else {
- ifname = *kmxlnd_tunables.kmx_default_ipif;
- }
-
- ret = libcfs_ipif_query(ifname, &if_up, &ip, &netmask);
- if (ret != 0) {
- CERROR("Can't query IPoMX interface %s: %d\n",
- ifname, ret);
- goto failed_with_init;
- }
-
- if (!if_up) {
- CERROR("Can't query IPoMX interface %s: it's down\n",
- ifname);
- goto failed_with_init;
- }
-
- mxret = mx_open_endpoint(board, ep_id, MXLND_MSG_MAGIC,
- NULL, 0, &kmxlnd_data.kmx_endpt);
- if (mxret != MX_SUCCESS) {
- CERROR("mx_open_endpoint() failed with %d\n", mxret);
- goto failed_with_init;
- }
-
- mx_get_endpoint_addr(kmxlnd_data.kmx_endpt, &kmxlnd_data.kmx_epa);
- mx_decompose_endpoint_addr(kmxlnd_data.kmx_epa, &nic_id, &ep_id);
- mxret = mx_connect(kmxlnd_data.kmx_endpt, nic_id, ep_id,
- MXLND_MSG_MAGIC,
- jiffies_to_msecs(MXLND_CONNECT_TIMEOUT),
- &kmxlnd_data.kmx_epa);
- if (mxret != MX_SUCCESS) {
- CNETERR("unable to connect to myself (%s)\n", mx_strerror(mxret));
- goto failed_with_endpoint;
- }
-
- ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), ip);
- CDEBUG(D_NET, "My NID is 0x%llx\n", ni->ni_nid);
-
- /* this will catch all unexpected receives. */
- mxret = mx_register_unexp_handler(kmxlnd_data.kmx_endpt,
- (mx_unexp_handler_t) mxlnd_unexpected_recv,
- NULL);
- if (mxret != MX_SUCCESS) {
- CERROR("mx_register_unexp_callback() failed with %s\n",
- mx_strerror(mxret));
- goto failed_with_endpoint;
- }
- mxret = mx_set_request_timeout(kmxlnd_data.kmx_endpt, NULL,
- jiffies_to_msecs(MXLND_COMM_TIMEOUT));
- if (mxret != MX_SUCCESS) {
- CERROR("mx_set_request_timeout() failed with %s\n",
- mx_strerror(mxret));
- goto failed_with_endpoint;
- }
- return 0;
-
-failed_with_endpoint:
- mx_close_endpoint(kmxlnd_data.kmx_endpt);
-failed_with_init:
- mx_finalize();
- return -1;
-}
-
-
-/**
- * mxlnd_thread_start - spawn a kernel thread with this function
- * @fn - function pointer
- * @arg - pointer to the parameter data
- * @name - name of new thread
- *
- * Returns 0 on success and a negative value on failure
- */
-int
-mxlnd_thread_start(int (*fn)(void *arg), void *arg, char *name)
-{
- struct task_struct *task;
- int i = (int) ((long) arg);
-
- atomic_inc(&kmxlnd_data.kmx_nthreads);
- init_completion(&kmxlnd_data.kmx_completions[i]);
-
- task = kthread_run(fn, arg, name);
- if (IS_ERR(task)) {
- CERROR("kthread_run() failed with %ld\n", PTR_ERR(task));
- atomic_dec(&kmxlnd_data.kmx_nthreads);
- return PTR_ERR(task);
- }
- return 0;
-}
-
-/**
- * mxlnd_thread_stop - decrement thread counter
- *
- * The thread returns 0 when it detects shutdown.
- * We are simply decrementing the thread counter.
- */
-void
-mxlnd_thread_stop(long id)
-{
- int i = (int) id;
- atomic_dec (&kmxlnd_data.kmx_nthreads);
- complete(&kmxlnd_data.kmx_completions[i]);
-}
-
-/**
- * mxlnd_shutdown - stop IO, clean up state
- * @ni - LNET interface handle
- *
- * No calls to the LND should be made after calling this function.
- */
-void
-mxlnd_shutdown (lnet_ni_t *ni)
-{
- int i = 0;
- int nthreads = MXLND_NDAEMONS + *kmxlnd_tunables.kmx_n_waitd;
-
- LASSERT (ni == kmxlnd_data.kmx_ni);
- LASSERT (ni->ni_data == &kmxlnd_data);
- CDEBUG(D_NET, "in shutdown()\n");
-
- CDEBUG(D_MALLOC, "before MXLND cleanup: libcfs_kmemory %d "
- "kmx_mem_used %ld\n", atomic_read(&libcfs_kmemory),
- kmxlnd_data.kmx_mem_used);
-
-
- CDEBUG(D_NET, "setting shutdown = 1\n");
- atomic_set(&kmxlnd_data.kmx_shutdown, 1);
-
- switch (kmxlnd_data.kmx_init) {
-
- case MXLND_INIT_ALL:
-
- /* calls write_[un]lock(kmx_global_lock) */
- mxlnd_del_peer(LNET_NID_ANY);
-
- /* wakeup request_waitds */
- mx_wakeup(kmxlnd_data.kmx_endpt);
- up(&kmxlnd_data.kmx_tx_queue_sem);
- up(&kmxlnd_data.kmx_conn_sem);
- mxlnd_sleep(msecs_to_jiffies(2 * MSEC_PER_SEC));
-
- /* fall through */
-
- case MXLND_INIT_THREADS:
-
- CDEBUG(D_NET, "waiting on threads\n");
- /* wait for threads to complete */
- for (i = 0; i < nthreads; i++) {
- wait_for_completion(&kmxlnd_data.kmx_completions[i]);
- }
- LASSERT(atomic_read(&kmxlnd_data.kmx_nthreads) == 0);
-
- CDEBUG(D_NET, "freeing completions\n");
- MXLND_FREE(kmxlnd_data.kmx_completions,
- nthreads * sizeof(struct completion));
-
- /* fall through */
-
- case MXLND_INIT_MX:
-
- CDEBUG(D_NET, "stopping mx\n");
-
- /* no peers left, close the endpoint */
- mx_close_endpoint(kmxlnd_data.kmx_endpt);
- mx_finalize();
-
- /* fall through */
-
- case MXLND_INIT_TXS:
-
- CDEBUG(D_NET, "freeing txs\n");
-
- /* free all txs and associated pages */
- mxlnd_free_txs();
-
- /* fall through */
-
- case MXLND_INIT_DATA:
-
- CDEBUG(D_NET, "freeing peers\n");
-
- /* peers should be gone, but check again */
- mxlnd_free_peers();
-
- /* conn zombies should be gone, but check again */
- mxlnd_free_conn_zombies();
-
- /* fall through */
-
- case MXLND_INIT_NOTHING:
- break;
- }
- CDEBUG(D_NET, "shutdown complete\n");
-
- CDEBUG(D_MALLOC, "after MXLND cleanup: libcfs_kmemory %d "
- "kmx_mem_used %ld\n", atomic_read(&libcfs_kmemory),
- kmxlnd_data.kmx_mem_used);
-
- kmxlnd_data.kmx_init = MXLND_INIT_NOTHING;
- module_put(THIS_MODULE);
- return;
-}
-
-/**
- * mxlnd_startup - initialize state, open an endpoint, start IO
- * @ni - LNET interface handle
- *
- * Initialize state, open an endpoint, start monitoring threads.
- * Should only be called once.
- */
-int
-mxlnd_startup (lnet_ni_t *ni)
-{
- int i = 0;
- int ret = 0;
- int nthreads = MXLND_NDAEMONS /* tx_queued, timeoutd, connd */
- + *kmxlnd_tunables.kmx_n_waitd;
- struct timeval tv;
-
- LASSERT (ni->ni_lnd == &the_kmxlnd);
-
- if (kmxlnd_data.kmx_init != MXLND_INIT_NOTHING) {
- CERROR("Only 1 instance supported\n");
- return -EPERM;
- }
- CDEBUG(D_MALLOC, "before MXLND startup: libcfs_kmemory %d "
- "kmx_mem_used %ld\n", atomic_read(&libcfs_kmemory),
- kmxlnd_data.kmx_mem_used);
-
- ni->ni_maxtxcredits = MXLND_TX_MSGS();
- ni->ni_peertxcredits = *kmxlnd_tunables.kmx_peercredits;
- if (ni->ni_maxtxcredits < ni->ni_peertxcredits)
- ni->ni_maxtxcredits = ni->ni_peertxcredits;
-
- try_module_get(THIS_MODULE);
- memset (&kmxlnd_data, 0, sizeof (kmxlnd_data));
-
- kmxlnd_data.kmx_ni = ni;
- ni->ni_data = &kmxlnd_data;
-
- do_gettimeofday(&tv);
- kmxlnd_data.kmx_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
- CDEBUG(D_NET, "my incarnation is %llu\n", kmxlnd_data.kmx_incarnation);
-
- rwlock_init (&kmxlnd_data.kmx_global_lock);
- spin_lock_init (&kmxlnd_data.kmx_mem_lock);
-
- CFS_INIT_LIST_HEAD (&kmxlnd_data.kmx_conn_reqs);
- CFS_INIT_LIST_HEAD (&kmxlnd_data.kmx_conn_zombies);
- CFS_INIT_LIST_HEAD (&kmxlnd_data.kmx_orphan_msgs);
- spin_lock_init (&kmxlnd_data.kmx_conn_lock);
- sema_init(&kmxlnd_data.kmx_conn_sem, 0);
-
- for (i = 0; i < MXLND_HASH_SIZE; i++) {
- CFS_INIT_LIST_HEAD (&kmxlnd_data.kmx_peers[i]);
- }
-
- CFS_INIT_LIST_HEAD (&kmxlnd_data.kmx_tx_idle);
- spin_lock_init (&kmxlnd_data.kmx_tx_idle_lock);
- kmxlnd_data.kmx_tx_next_cookie = 1;
- CFS_INIT_LIST_HEAD (&kmxlnd_data.kmx_tx_queue);
- spin_lock_init (&kmxlnd_data.kmx_tx_queue_lock);
- sema_init(&kmxlnd_data.kmx_tx_queue_sem, 0);
-
- kmxlnd_data.kmx_init = MXLND_INIT_DATA;
- /*****************************************************/
-
- ret = mxlnd_init_txs();
- if (ret != 0) {
- CERROR("Can't alloc tx descs: %d\n", ret);
- goto failed;
- }
- kmxlnd_data.kmx_init = MXLND_INIT_TXS;
- /*****************************************************/
-
- ret = mxlnd_init_mx(ni);
- if (ret != 0) {
- CERROR("Can't init mx\n");
- goto failed;
- }
-
- kmxlnd_data.kmx_init = MXLND_INIT_MX;
- /*****************************************************/
-
- /* start threads */
-
- MXLND_ALLOC(kmxlnd_data.kmx_completions,
- nthreads * sizeof(struct completion));
- if (kmxlnd_data.kmx_completions == NULL) {
- CERROR("failed to alloc kmxlnd_data.kmx_completions\n");
- goto failed;
- }
- memset(kmxlnd_data.kmx_completions, 0,
- nthreads * sizeof(struct completion));
-
- CDEBUG(D_NET, "using %d %s in mx_wait_any()\n",
- *kmxlnd_tunables.kmx_n_waitd,
- *kmxlnd_tunables.kmx_n_waitd == 1 ? "thread" : "threads");
-
- for (i = 0; i < *kmxlnd_tunables.kmx_n_waitd; i++) {
- char name[24];
- memset(name, 0, sizeof(name));
- snprintf(name, sizeof(name), "mxlnd_request_waitd_%02d", i);
- ret = mxlnd_thread_start(mxlnd_request_waitd, (void *)((long)i), name);
- if (ret < 0) {
- CERROR("Starting mxlnd_request_waitd[%d] "
- "failed with %d\n", i, ret);
- atomic_set(&kmxlnd_data.kmx_shutdown, 1);
- mx_wakeup(kmxlnd_data.kmx_endpt);
- for (--i; i >= 0; i--) {
- wait_for_completion(&kmxlnd_data.kmx_completions[i]);
- }
- LASSERT(atomic_read(&kmxlnd_data.kmx_nthreads) == 0);
- MXLND_FREE(kmxlnd_data.kmx_completions,
- nthreads * sizeof(struct completion));
-
- goto failed;
- }
- }
- ret = mxlnd_thread_start(mxlnd_tx_queued, (void *)((long)i++),
- "mxlnd_tx_queued");
- if (ret < 0) {
- CERROR("Starting mxlnd_tx_queued failed with %d\n", ret);
- atomic_set(&kmxlnd_data.kmx_shutdown, 1);
- mx_wakeup(kmxlnd_data.kmx_endpt);
- for (--i; i >= 0; i--) {
- wait_for_completion(&kmxlnd_data.kmx_completions[i]);
- }
- LASSERT(atomic_read(&kmxlnd_data.kmx_nthreads) == 0);
- MXLND_FREE(kmxlnd_data.kmx_completions,
- nthreads * sizeof(struct completion));
- goto failed;
- }
- ret = mxlnd_thread_start(mxlnd_timeoutd, (void *)((long)i++),
- "mxlnd_timeoutd");
- if (ret < 0) {
- CERROR("Starting mxlnd_timeoutd failed with %d\n", ret);
- atomic_set(&kmxlnd_data.kmx_shutdown, 1);
- mx_wakeup(kmxlnd_data.kmx_endpt);
- up(&kmxlnd_data.kmx_tx_queue_sem);
- for (--i; i >= 0; i--) {
- wait_for_completion(&kmxlnd_data.kmx_completions[i]);
- }
- LASSERT(atomic_read(&kmxlnd_data.kmx_nthreads) == 0);
- MXLND_FREE(kmxlnd_data.kmx_completions,
- nthreads * sizeof(struct completion));
- goto failed;
- }
- ret = mxlnd_thread_start(mxlnd_connd, (void *)((long)i++),
- "mxlnd_connd");
- if (ret < 0) {
- CERROR("Starting mxlnd_connd failed with %d\n", ret);
- atomic_set(&kmxlnd_data.kmx_shutdown, 1);
- mx_wakeup(kmxlnd_data.kmx_endpt);
- up(&kmxlnd_data.kmx_tx_queue_sem);
- for (--i; i >= 0; i--) {
- wait_for_completion(&kmxlnd_data.kmx_completions[i]);
- }
- LASSERT(atomic_read(&kmxlnd_data.kmx_nthreads) == 0);
- MXLND_FREE(kmxlnd_data.kmx_completions,
- nthreads * sizeof(struct completion));
- goto failed;
- }
-
- kmxlnd_data.kmx_init = MXLND_INIT_THREADS;
- /*****************************************************/
-
- kmxlnd_data.kmx_init = MXLND_INIT_ALL;
- CDEBUG(D_MALLOC, "startup complete (kmx_mem_used %ld)\n", kmxlnd_data.kmx_mem_used);
-
- return 0;
-failed:
- CERROR("mxlnd_startup failed\n");
- mxlnd_shutdown(ni);
- return (-ENETDOWN);
-}
-
-static int mxlnd_init(void)
-{
- lnet_register_lnd(&the_kmxlnd);
- return 0;
-}
-
-static void mxlnd_exit(void)
-{
- lnet_unregister_lnd(&the_kmxlnd);
- return;
-}
-
-module_init(mxlnd_init);
-module_exit(mxlnd_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Myricom, Inc. - help@myri.com");
-MODULE_DESCRIPTION("Kernel MyrinetExpress LND");
-MODULE_VERSION("0.6.0");
+++ /dev/null
-/*
- * GPL HEADER START
- *
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 only,
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License version 2 for more details (a copy is included
- * in the LICENSE file that accompanied this code).
- *
- * You should have received a copy of the GNU General Public License
- * version 2 along with this program; If not, see
- * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
- *
- * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
- * CA 95054 USA or visit www.sun.com if you need additional information or
- * have any questions.
- *
- * GPL HEADER END
- */
-/*
- * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
- * Use is subject to license terms.
- *
- * Copyright (c) 2012, 2014, Intel Corporation.
- *
- * Copyright (C) 2006 Myricom, Inc.
- */
-/*
- * This file is part of Lustre, http://www.lustre.org/
- * Lustre is a trademark of Sun Microsystems, Inc.
- *
- * lnet/klnds/mxlnd/mxlnd.h
- *
- * Author: Eric Barton <eric@bartonsoftware.com>
- * Author: Scott Atchley <atchley at myri.com>
- */
-
-#include <linux/module.h> /* module */
-#include <linux/kernel.h> /* module */
-#include <linux/mm.h>
-#include <linux/string.h>
-#include <linux/stat.h>
-#include <linux/errno.h>
-#include <linux/unistd.h>
-#include <linux/uio.h>
-#include <linux/fs.h>
-
-#include <asm/uaccess.h>
-#include <asm/io.h>
-
-#include <linux/init.h> /* module */
-#include <linux/fs.h>
-#include <linux/file.h>
-#include <linux/stat.h>
-#include <linux/list.h>
-#include <linux/kmod.h>
-#include <linux/sysctl.h>
-#include <linux/random.h>
-#include <linux/utsname.h>
-#include <linux/jiffies.h> /* msecs_to_jiffies */
-#include <asm/semaphore.h>
-
-#include <net/sock.h>
-#include <linux/in.h>
-
-#include <asm/byteorder.h> /* __LITTLE_ENDIAN */
-#include <net/arp.h> /* arp table */
-#include <linux/netdevice.h> /* get_device_by_name */
-#include <linux/inetdevice.h> /* neigh_lookup, etc. */
-#include <linux/net.h> /* sock_create_kern, kernel_connect, sock_release */
-
-#define DEBUG_SUBSYSTEM S_LND
-
-#include "libcfs/libcfs.h"
-#include "lnet/lnet.h"
-#include "lnet/lib-lnet.h"
-
-#define MX_KERNEL 1
-#include "mx_extensions.h"
-#include "myriexpress.h"
-
-#if LNET_MAX_IOV > MX_MAX_SEGMENTS
- #error LNET_MAX_IOV is greater than MX_MAX_SEGMENTS
-#endif
-
-#define MXLND_MSG_MAGIC 0x4d583130 /* unique magic 'MX10' */
-#define MXLND_MSG_VERSION 0x03
-
-/* Using MX's 64 match bits
- * We are using the match bits to specify message type and the cookie. The
- * highest four bits (60-63) are reserved for message type. Below we specify
- * the types. We reserve the remaining combinations for future use. The next 8
- * bits (52-59) are reserved for returning a status code for failed GET_DATA
- * (payload) messages. The last 52 bits are used for cookies. That should allow
- * unique cookies for 4 KB messages at 10 Gbps line rate without rollover for
- * about 8 years. That should be enough. */
-
-#define MXLND_MSG_OFFSET 60 /* msg type offset */
-#define MXLND_MSG_BITS (64 - MXLND_MSG_OFFSET)
-#define MXLND_MSG_MASK (((1ULL<<MXLND_MSG_BITS) - 1) << MXLND_MSG_OFFSET)
-#define MXLND_MSG_TYPE(x) (((x) & MXLND_MSG_MASK) >> MXLND_MSG_OFFSET)
-
-#define MXLND_ERROR_OFFSET 52 /* error value offset */
-#define MXLND_ERROR_BITS (MXLND_MSG_OFFSET - MXLND_ERROR_OFFSET)
-#define MXLND_ERROR_MASK (((1ULL<<MXLND_ERROR_BITS) - 1) << MXLND_ERROR_OFFSET)
-#define MXLND_ERROR_VAL(x) (((x) & MXLND_ERROR_MASK) >> MXLND_ERROR_OFFSET)
-
-/* message types */
-#define MXLND_MSG_ICON_REQ 0xb /* mx_iconnect() before CONN_REQ */
-#define MXLND_MSG_CONN_REQ 0xc /* connection request */
-#define MXLND_MSG_ICON_ACK 0x9 /* mx_iconnect() before CONN_ACK */
-#define MXLND_MSG_CONN_ACK 0xa /* connection request response */
-#define MXLND_MSG_BYE 0xd /* disconnect msg */
-#define MXLND_MSG_EAGER 0xe /* eager message */
-#define MXLND_MSG_NOOP 0x1 /* no msg, return credits */
-#define MXLND_MSG_PUT_REQ 0x2 /* put request src->sink */
-#define MXLND_MSG_PUT_ACK 0x3 /* put ack src<-sink */
-#define MXLND_MSG_PUT_DATA 0x4 /* put payload src->sink */
-#define MXLND_MSG_GET_REQ 0x5 /* get request sink->src */
-#define MXLND_MSG_GET_DATA 0x6 /* get payload sink<-src */
-
-/* when to roll-over the cookie value */
-#define MXLND_MAX_COOKIE ((1ULL << MXLND_ERROR_OFFSET) - 1)
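-
-/* Worked example (illustrative values): an EAGER message (type 0xe) with no
- * error and cookie 0x1234 packs into match bits
- * (0xeULL << MXLND_MSG_OFFSET) | (0ULL << MXLND_ERROR_OFFSET) | 0x1234
- * == 0xe000000000001234ULL; see mxlnd_create_match()/mxlnd_parse_match(). */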
-
-/* defaults for configurable parameters */
-#define MXLND_N_SCHED 1 /* # schedulers (mx_wait_any() threads) */
-#define MXLND_NDAEMONS 3 /* connd, timeoutd, tx_queued */
-#define MXLND_MX_BOARD 0 /* Use the first MX NIC if more than 1 avail */
-#define MXLND_MX_EP_ID 0 /* MX endpoint ID */
-/* timeout for send/recv (jiffies) */
-#define MXLND_COMM_TIMEOUT msecs_to_jiffies(20 * MSEC_PER_SEC)
-/* timeout for wait (jiffies) */
-#define MXLND_WAIT_TIMEOUT msecs_to_jiffies(MSEC_PER_SEC)
-/* timeout for connections (jiffies) */
-#define MXLND_CONNECT_TIMEOUT msecs_to_jiffies(5 * MSEC_PER_SEC)
-#define MXLND_POLLING 1000 /* poll iterations before blocking */
-#define MXLND_LOOKUP_COUNT 5 /* how many times to try to resolve MAC */
-#define MXLND_MAX_PEERS 1024 /* number of nodes talking to me */
-
-#define MXLND_MSG_SIZE (4<<10) /* pre-posted eager message size */
-#define MXLND_MSG_QUEUE_DEPTH 8 /* default msg queue depth */
-#define MXLND_NTX 256 /* default # of tx msg descriptors */
-
-#define MXLND_HASH_BITS 6 /* the number of bits to hash over */
-#define MXLND_HASH_SIZE (1<<MXLND_HASH_BITS)
- /* number of peer lists for lookup.
- we hash over the last N bits of
- the IP address converted to an int. */
-#define MXLND_HASH_MASK (MXLND_HASH_SIZE - 1)
- /* ensure we use only the last N bits */
-
-/* derived constants... */
-/* TX messages (shared by all connections) */
-#define MXLND_TX_MSGS() (*kmxlnd_tunables.kmx_ntx)
-#define MXLND_TX_MSG_BYTES() (MXLND_TX_MSGS() * MXLND_MSG_SIZE)
-#define MXLND_TX_MSG_PAGES() ((MXLND_TX_MSG_BYTES() + PAGE_SIZE - 1)/PAGE_SIZE)
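-/* e.g. with the default ntx of 256 and 4 KB messages: 1 MB of pre-mapped
- * tx buffers, i.e. 256 pages when PAGE_SIZE is 4 KB */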
-
-/* RX messages (per connection) */
-#define MXLND_RX_MSGS() (*kmxlnd_tunables.kmx_peercredits)
-#define MXLND_RX_MSG_BYTES() (MXLND_RX_MSGS() * MXLND_MSG_SIZE)
-#define MXLND_RX_MSG_PAGES() ((MXLND_RX_MSG_BYTES() + PAGE_SIZE - 1)/PAGE_SIZE)
-#define MXLND_CREDIT_HIGHWATER() (*kmxlnd_tunables.kmx_peercredits - 2)
- /* when to send a noop to return credits */
-
-/* debugging features */
-#define MXLND_CKSUM 0 /* checksum kmx_msg_t */
-#define MXLND_DEBUG 0 /* additional CDEBUG messages */
-
-/* provide wrappers around LIBCFS_ALLOC/FREE to keep MXLND specific
- * memory usage stats that include pages */
-
-#define MXLND_ALLOC(x, size) \
- do { \
- spin_lock(&kmxlnd_data.kmx_mem_lock); \
- kmxlnd_data.kmx_mem_used += size; \
- spin_unlock(&kmxlnd_data.kmx_mem_lock); \
- LIBCFS_ALLOC(x, size); \
- if (unlikely(x == NULL)) { \
- spin_lock(&kmxlnd_data.kmx_mem_lock); \
- kmxlnd_data.kmx_mem_used -= size; \
- spin_unlock(&kmxlnd_data.kmx_mem_lock); \
- } \
- } while (0)
-
-#define MXLND_FREE(x, size) \
- do { \
- spin_lock(&kmxlnd_data.kmx_mem_lock); \
- kmxlnd_data.kmx_mem_used -= size; \
- spin_unlock(&kmxlnd_data.kmx_mem_lock); \
- LIBCFS_FREE(x, size); \
- } while (0)
-
-
-typedef struct kmx_tunables
-{
- int *kmx_n_waitd; /* # completion threads */
- int *kmx_max_peers; /* max # of potential peers */
- int *kmx_cksum; /* checksum small msgs? */
- int *kmx_ntx; /* total # of tx */
- int *kmx_credits; /* concurrent sends to all peers */
- int *kmx_peercredits; /* concurrent sends to 1 peer */
- int *kmx_board; /* MX board (NIC) number */
- int *kmx_ep_id; /* MX endpoint number */
- char **kmx_default_ipif; /* IPoMX interface name */
- int *kmx_polling; /* if 0, block. if > 0, poll this many
- iterations before blocking */
-} kmx_tunables_t;
-
-typedef struct
-{
- int mxg_npages; /* # pages */
- struct page *mxg_pages[0];
-} kmx_pages_t;
-
-/* global interface state */
-typedef struct kmx_data
-{
- int kmx_init; /* initialization state */
- atomic_t kmx_shutdown; /* shutting down? */
- atomic_t kmx_nthreads; /* number of threads */
- struct completion *kmx_completions; /* array of completion struct */
- lnet_ni_t *kmx_ni; /* the LND instance */
- u64 kmx_incarnation; /* my incarnation value */
- long kmx_mem_used; /* memory used */
- mx_endpoint_t kmx_endpt; /* the MX endpoint */
- mx_endpoint_addr_t kmx_epa; /* the MX endpoint address */
-
- rwlock_t kmx_global_lock; /* global lock */
- spinlock_t kmx_mem_lock; /* memory accounting lock */
-
- cfs_list_t kmx_conn_reqs; /* list of connection reqs */
- spinlock_t kmx_conn_lock; /* connection list lock */
- struct semaphore kmx_conn_sem; /* connection request list */
- cfs_list_t kmx_conn_zombies; /* list of zombie connections */
- cfs_list_t kmx_orphan_msgs; /* list of txs to cancel */
-
- /* list of all known peers */
- cfs_list_t kmx_peers[MXLND_HASH_SIZE];
- atomic_t kmx_npeers; /* number of peers */
-
- kmx_pages_t *kmx_tx_pages; /* tx msg pages */
-
- struct kmx_ctx *kmx_txs; /* all tx descriptors */
- cfs_list_t kmx_tx_idle; /* list of idle tx */
- spinlock_t kmx_tx_idle_lock; /* lock for idle tx list */
- s32 kmx_tx_used; /* txs in use */
- u64 kmx_tx_next_cookie; /* unique id for tx */
- cfs_list_t kmx_tx_queue; /* generic send queue */
- spinlock_t kmx_tx_queue_lock; /* lock for generic sends */
- struct semaphore kmx_tx_queue_sem; /* semaphore for tx queue */
-} kmx_data_t;
-
-#define MXLND_INIT_NOTHING 0 /* in the beginning, there was nothing... */
-#define MXLND_INIT_DATA 1 /* main data structures created */
-#define MXLND_INIT_TXS 2 /* tx descriptors created */
-#define MXLND_INIT_MX 3 /* initiate MX library, open endpoint, get NIC id */
-#define MXLND_INIT_THREADS 4 /* waitd, timeoutd, tx_queued threads */
-#define MXLND_INIT_ALL 5 /* startup completed */
-
-/************************************************************************
- * MXLND Wire message format.
- * These are sent in sender's byte order (i.e. receiver flips).
- */
-
-typedef struct kmx_connreq_msg
-{
- u32 mxcrm_queue_depth; /* per peer max messages in flight */
- u32 mxcrm_eager_size; /* size of preposted eager messages */
-} WIRE_ATTR kmx_connreq_msg_t;
-
-typedef struct kmx_eager_msg
-{
- lnet_hdr_t mxem_hdr; /* lnet header */
- char mxem_payload[0]; /* piggy-backed payload */
-} WIRE_ATTR kmx_eager_msg_t;
-
-typedef struct kmx_putreq_msg
-{
- lnet_hdr_t mxprm_hdr; /* lnet header */
- u64 mxprm_cookie; /* opaque completion cookie */
-} WIRE_ATTR kmx_putreq_msg_t;
-
-typedef struct kmx_putack_msg
-{
- u64 mxpam_src_cookie; /* reflected completion cookie */
- u64 mxpam_dst_cookie; /* opaque completion cookie */
-} WIRE_ATTR kmx_putack_msg_t;
-
-typedef struct kmx_getreq_msg
-{
- lnet_hdr_t mxgrm_hdr; /* lnet header */
- u64 mxgrm_cookie; /* opaque completion cookie */
-} WIRE_ATTR kmx_getreq_msg_t;
-
-typedef struct kmx_msg
-{
- /* First two fields fixed for all time */
- u32 mxm_magic; /* MXLND message */
- u16 mxm_version; /* version number */
-
- u8 mxm_type; /* message type */
- u8 mxm_credits; /* returned credits */
- u32 mxm_nob; /* # of bytes in whole message */
- u32 mxm_cksum; /* checksum (0 == no checksum) */
- u64 mxm_srcnid; /* sender's NID */
- u64 mxm_srcstamp; /* sender's incarnation */
- u64 mxm_dstnid; /* destination's NID */
- u64 mxm_dststamp; /* destination's incarnation */
-
- union {
- kmx_connreq_msg_t conn_req;
- kmx_eager_msg_t eager;
- kmx_putreq_msg_t put_req;
- kmx_putack_msg_t put_ack;
- kmx_getreq_msg_t get_req;
- } WIRE_ATTR mxm_u;
-} WIRE_ATTR kmx_msg_t;
-
-/***********************************************************************/
-
-enum kmx_req_type {
- MXLND_REQ_TX = 0,
- MXLND_REQ_RX = 1,
-};
-
-/* The life cycle of a request */
-enum kmx_req_state {
- MXLND_CTX_INIT = 0, /* just created */
- MXLND_CTX_IDLE = 1, /* available for use */
- MXLND_CTX_PREP = 2, /* getting ready for send/recv */
- MXLND_CTX_PENDING = 3, /* mx_isend() or mx_irecv() called */
- MXLND_CTX_COMPLETED = 4, /* cleaning up after completion or timeout */
- MXLND_CTX_CANCELED = 5, /* timed out but still in ctx list */
-};
-
-/* Context Structure - generic tx/rx descriptor
- * It represents the context (or state) of each send or receive request.
- * In other LNDs, they have separate TX and RX descriptors and this replaces both.
- *
- * The txs live on the global kmx_txs array for cleanup. The rxs are managed
- * per struct kmx_conn. We will move them between the rx/tx idle lists and the
- * pending list which is monitored by mxlnd_timeoutd().
- */
-typedef struct kmx_ctx
-{
- enum kmx_req_type mxc_type; /* TX or RX */
- u64 mxc_incarnation; /* store the peer's incarnation here
- to verify before changing flow
- control credits after completion */
- unsigned long mxc_deadline; /* request time out in absolute jiffies */
- enum kmx_req_state mxc_state; /* what is the state of the request? */
- cfs_list_t mxc_list; /* place on rx/tx idle list, tx q, peer tx */
- cfs_list_t mxc_rx_list; /* place on mxp_rx_posted list */
-
- lnet_nid_t mxc_nid; /* dst's NID if peer is not known */
- struct kmx_peer *mxc_peer; /* owning peer */
- struct kmx_conn *mxc_conn; /* owning conn */
- kmx_msg_t *mxc_msg; /* msg hdr mapped to mxc_page */
- lnet_msg_t *mxc_lntmsg[2]; /* lnet msgs to finalize */
-
- u8 mxc_msg_type; /* what type of message is this? */
- u64 mxc_cookie; /* completion cookie */
- u64 mxc_match; /* MX match info */
- mx_ksegment_t mxc_seg; /* local MX ksegment for non-DATA */
- mx_ksegment_t *mxc_seg_list; /* MX ksegment array for DATA */
- int mxc_nseg; /* number of segments */
- unsigned long mxc_pin_type; /* MX_PIN_PHYSICAL [| MX_PIN_FULLPAGES] */
- u32 mxc_nob; /* number of bytes sent/received */
- mx_request_t mxc_mxreq; /* MX request */
- mx_status_t mxc_status; /* MX status */
- u32 mxc_errno; /* errno for LNET */
- u64 mxc_get; /* # of times returned from idle list */
- u64 mxc_put; /* # of times returned from idle list */
-} kmx_ctx_t;
-
-#define MXLND_CONN_DISCONNECT -2 /* conn is being destroyed - do not add txs */
-#define MXLND_CONN_FAIL -1 /* connect failed (bad handshake, unavail, etc.) */
-#define MXLND_CONN_INIT 0 /* in the beginning, there was nothing... */
-#define MXLND_CONN_REQ 1 /* a connection request message is needed */
-#define MXLND_CONN_ACK 2 /* a connection ack is needed */
-#define MXLND_CONN_WAIT 3 /* waiting for req or ack to complete */
-#define MXLND_CONN_READY 4 /* ready to send */
-
-/* store all data from an unexpected CONN_[REQ|ACK] receive */
-typedef struct kmx_connparams
-{
- cfs_list_t mxr_list; /* list to hang on kmx_conn_reqs */
- void *mxr_context; /* context - unused - will hold net */
- mx_endpoint_addr_t mxr_epa; /* the peer's epa */
- u64 mxr_match; /* the CONN_REQ's match bits */
- u32 mxr_nob; /* length of CONN_REQ message */
- struct kmx_peer *mxr_peer; /* peer if known */
- struct kmx_conn *mxr_conn; /* conn if known */
- kmx_msg_t mxr_msg; /* the msg header & connreq_msg_t */
-} kmx_connparams_t;
-
-/* connection state - queues for queued and pending msgs */
-typedef struct kmx_conn
-{
- struct kmx_peer *mxk_peer; /* owning peer */
- cfs_list_t mxk_list; /* for placing on mxp_conns */
- cfs_list_t mxk_zombie; /* for placing on zombies list */
- u64 mxk_incarnation; /* connection's incarnation value */
- u32 mxk_sid; /* peer's MX session id */
- atomic_t mxk_refcount; /* reference counting */
- int mxk_status; /* can we send messages? MXLND_CONN_* */
-
- mx_endpoint_addr_t mxk_epa; /* peer's endpoint address */
-
- spinlock_t mxk_lock; /* lock */
- unsigned long mxk_timeout; /* expiration of oldest pending tx/rx */
- unsigned long mxk_last_tx; /* when last tx completed with success */
- unsigned long mxk_last_rx; /* when last rx completed */
-
- kmx_pages_t *mxk_rx_pages; /* rx msg pages */
- kmx_ctx_t *mxk_rxs; /* the rx descriptors */
- cfs_list_t mxk_rx_idle; /* list of idle rx */
-
- int mxk_credits; /* # of my credits for sending to peer */
- int mxk_outstanding; /* # of credits to return */
-
- cfs_list_t mxk_tx_credit_queue; /* send queue for peer */
- cfs_list_t mxk_tx_free_queue; /* send queue for peer */
- int mxk_ntx_msgs; /* # of msgs on tx queues */
- int mxk_ntx_data; /* # of DATA on tx queues */
- int mxk_ntx_posted; /* # of tx msgs in flight */
- int mxk_data_posted; /* # of tx data payloads in flight */
-
- cfs_list_t mxk_pending; /* in flight rxs and txs */
-} kmx_conn_t;
-
-/* peer state */
-typedef struct kmx_peer
-{
- cfs_list_t mxp_list; /* for placing on kmx_peers */
- lnet_nid_t mxp_nid; /* peer's LNET NID */
- lnet_ni_t *mxp_ni; /* LNET interface */
- atomic_t mxp_refcount; /* reference counts */
-
- cfs_list_t mxp_conns; /* list of connections */
- kmx_conn_t *mxp_conn; /* current connection */
- cfs_list_t mxp_tx_queue; /* msgs waiting for a conn */
-
- u32 mxp_board; /* peer's board rank */
- u32 mxp_ep_id; /* peer's MX endpoint ID */
- u64 mxp_nic_id; /* remote's MX nic_id for mx_connect() */
-
- unsigned long mxp_reconnect_time; /* when to retry connect */
- int mxp_incompatible; /* incorrect conn_req values */
-} kmx_peer_t;
-
-extern kmx_data_t kmxlnd_data;
-extern kmx_tunables_t kmxlnd_tunables;
-
-/* required for the LNET API */
-int mxlnd_startup(lnet_ni_t *ni);
-void mxlnd_shutdown(lnet_ni_t *ni);
-int mxlnd_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg);
-int mxlnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg);
-int mxlnd_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, int delayed,
- unsigned int niov, struct iovec *iov, lnet_kiov_t *kiov,
- unsigned int offset, unsigned int mlen, unsigned int rlen);
-
-/* in mxlnd.c */
-extern void mxlnd_thread_stop(long id);
-extern void mxlnd_ctx_init(kmx_ctx_t *ctx);
-extern int mxlnd_peer_alloc(kmx_peer_t **peerp, lnet_nid_t nid,
- u32 board, u32 ep_id, u64 nic_id);
-extern int mxlnd_alloc_pages(kmx_pages_t **pp, int npages);
-extern void mxlnd_free_pages(kmx_pages_t *p);
-
-/* in mxlnd_cb.c */
-void mxlnd_eager_recv(void *context, uint64_t match_value, uint32_t length);
-extern mx_unexp_handler_action_t mxlnd_unexpected_recv(void *context,
- mx_endpoint_addr_t source, uint64_t match_value, uint32_t length,
- void *data_if_available);
-extern void mxlnd_peer_free(kmx_peer_t *peer);
-extern void mxlnd_conn_free_locked(kmx_conn_t *conn);
-extern void mxlnd_conn_disconnect(kmx_conn_t *conn, int mx_dis, int send_bye);
-extern int mxlnd_close_matching_conns(lnet_nid_t nid);
-extern void mxlnd_sleep(unsigned long timeout);
-extern int mxlnd_tx_queued(void *arg);
-extern void mxlnd_handle_rx_completion(kmx_ctx_t *rx);
-extern int mxlnd_check_sends(kmx_peer_t *peer);
-extern int mxlnd_tx_peer_queued(void *arg);
-extern int mxlnd_request_waitd(void *arg);
-extern int mxlnd_unex_recvd(void *arg);
-extern int mxlnd_timeoutd(void *arg);
-extern int mxlnd_free_conn_zombies(void);
-extern int mxlnd_connd(void *arg);
-extern int mxlnd_del_peer(lnet_nid_t nid);
-
-
-/**
- * mxlnd_nid_to_hash - hash the nid
- * @nid - LNET ID
- *
- * Takes the u64 nid and XORs the lowest N bits by the next lowest N bits.
- */
-static inline int
-mxlnd_nid_to_hash(lnet_nid_t nid)
-{
- return (nid & MXLND_HASH_MASK) ^
- ((nid & (MXLND_HASH_MASK << MXLND_HASH_BITS)) >> MXLND_HASH_BITS);
-}
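-/* e.g. with MXLND_HASH_BITS = 6: a nid whose low 12 bits are 0xabc has
- * low six bits 0x3c and next six bits 0x2a, so it hashes to
- * 0x3c ^ 0x2a = 0x16 */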
-
-
-#define mxlnd_peer_addref(peer) \
-do { \
- LASSERT(peer != NULL); \
- LASSERT(atomic_read(&(peer)->mxp_refcount) > 0); \
- atomic_inc(&(peer)->mxp_refcount); \
-} while (0)
-
-
-#define mxlnd_peer_decref(peer) \
-do { \
- LASSERT(atomic_read(&(peer)->mxp_refcount) > 0); \
- if (atomic_dec_and_test(&(peer)->mxp_refcount)) \
- mxlnd_peer_free(peer); \
-} while (0)
-
-#define mxlnd_conn_addref(conn) \
-do { \
- LASSERT(conn != NULL); \
- LASSERT(atomic_read(&(conn)->mxk_refcount) > 0); \
- atomic_inc(&(conn)->mxk_refcount); \
-} while (0)
-
-
-#define mxlnd_conn_decref(conn) \
-do { \
- LASSERT(conn != NULL); \
- LASSERT(atomic_read(&(conn)->mxk_refcount) > 0); \
- if (atomic_dec_and_test(&(conn)->mxk_refcount)) { \
- spin_lock(&kmxlnd_data.kmx_conn_lock); \
- LASSERT((conn)->mxk_status == MXLND_CONN_DISCONNECT); \
- CDEBUG(D_NET, "adding conn %p to zombies\n", (conn)); \
- cfs_list_add_tail(&(conn)->mxk_zombie, \
- &kmxlnd_data.kmx_conn_zombies); \
- spin_unlock(&kmxlnd_data.kmx_conn_lock); \
- up(&kmxlnd_data.kmx_conn_sem); \
- } \
-} while (0)
-
-#define mxlnd_valid_msg_type(type) \
-do { \
- LASSERT((type) == MXLND_MSG_EAGER || \
- (type) == MXLND_MSG_ICON_REQ || \
- (type) == MXLND_MSG_CONN_REQ || \
- (type) == MXLND_MSG_ICON_ACK || \
- (type) == MXLND_MSG_CONN_ACK || \
- (type) == MXLND_MSG_BYE || \
- (type) == MXLND_MSG_NOOP || \
- (type) == MXLND_MSG_PUT_REQ || \
- (type) == MXLND_MSG_PUT_ACK || \
- (type) == MXLND_MSG_PUT_DATA || \
- (type) == MXLND_MSG_GET_REQ || \
- (type) == MXLND_MSG_GET_DATA); \
-} while (0)
+++ /dev/null
-/*
- * GPL HEADER START
- *
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 only,
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License version 2 for more details (a copy is included
- * in the LICENSE file that accompanied this code).
- *
- * You should have received a copy of the GNU General Public License
- * version 2 along with this program; If not, see
- * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
- *
- * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
- * CA 95054 USA or visit www.sun.com if you need additional information or
- * have any questions.
- *
- * GPL HEADER END
- */
-/*
- * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
- * Use is subject to license terms.
- *
- * Copyright (c) 2012, 2014, Intel Corporation.
- *
- * Copyright (C) 2006 Myricom, Inc.
- */
-/*
- * This file is part of Lustre, http://www.lustre.org/
- * Lustre is a trademark of Sun Microsystems, Inc.
- *
- * lnet/klnds/mxlnd/mxlnd.c
- *
- * Author: Eric Barton <eric@bartonsoftware.com>
- * Author: Scott Atchley <atchley at myri.com>
- */
-
-#include <asm/page.h>
-#include "mxlnd.h"
-
-mx_endpoint_addr_t MX_EPA_NULL; /* used to determine if an endpoint address is NULL */
-
-inline int
-mxlnd_endpoint_addr_null(mx_endpoint_addr_t epa)
-{
- /* if memcmp() == 0, it is NULL */
- return !(memcmp(&epa, &MX_EPA_NULL, sizeof(epa)));
-}
-
-char *
-mxlnd_ctxstate_to_str(int mxc_state)
-{
- switch (mxc_state) {
- case MXLND_CTX_INIT:
- return "MXLND_CTX_INIT";
- case MXLND_CTX_IDLE:
- return "MXLND_CTX_IDLE";
- case MXLND_CTX_PREP:
- return "MXLND_CTX_PREP";
- case MXLND_CTX_PENDING:
- return "MXLND_CTX_PENDING";
- case MXLND_CTX_COMPLETED:
- return "MXLND_CTX_COMPLETED";
- case MXLND_CTX_CANCELED:
- return "MXLND_CTX_CANCELED";
- default:
- return "*unknown*";
- }
-}
-
-char *
-mxlnd_connstatus_to_str(int mxk_status)
-{
- switch (mxk_status) {
- case MXLND_CONN_READY:
- return "MXLND_CONN_READY";
- case MXLND_CONN_INIT:
- return "MXLND_CONN_INIT";
- case MXLND_CONN_WAIT:
- return "MXLND_CONN_WAIT";
- case MXLND_CONN_DISCONNECT:
- return "MXLND_CONN_DISCONNECT";
- case MXLND_CONN_FAIL:
- return "MXLND_CONN_FAIL";
- default:
- return "unknown";
- }
-}
-
-char *
-mxlnd_msgtype_to_str(int type) {
- switch (type) {
- case MXLND_MSG_EAGER:
- return "MXLND_MSG_EAGER";
- case MXLND_MSG_CONN_REQ:
- return "MXLND_MSG_CONN_REQ";
- case MXLND_MSG_CONN_ACK:
- return "MXLND_MSG_CONN_ACK";
- case MXLND_MSG_BYE:
- return "MXLND_MSG_BYE";
- case MXLND_MSG_NOOP:
- return "MXLND_MSG_NOOP";
- case MXLND_MSG_PUT_REQ:
- return "MXLND_MSG_PUT_REQ";
- case MXLND_MSG_PUT_ACK:
- return "MXLND_MSG_PUT_ACK";
- case MXLND_MSG_PUT_DATA:
- return "MXLND_MSG_PUT_DATA";
- case MXLND_MSG_GET_REQ:
- return "MXLND_MSG_GET_REQ";
- case MXLND_MSG_GET_DATA:
- return "MXLND_MSG_GET_DATA";
- default:
- return "unknown";
- }
-}
-
-char *
-mxlnd_lnetmsg_to_str(int type)
-{
- switch (type) {
- case LNET_MSG_ACK:
- return "LNET_MSG_ACK";
- case LNET_MSG_PUT:
- return "LNET_MSG_PUT";
- case LNET_MSG_GET:
- return "LNET_MSG_GET";
- case LNET_MSG_REPLY:
- return "LNET_MSG_REPLY";
- case LNET_MSG_HELLO:
- return "LNET_MSG_HELLO";
- default:
- LBUG();
- return "*unknown*";
- }
-}
-
-static inline u64
-mxlnd_create_match(kmx_ctx_t *ctx, u8 error)
-{
- u64 type = (u64) ctx->mxc_msg_type;
- u64 err = (u64) error;
- u64 match = 0ULL;
-
- mxlnd_valid_msg_type(ctx->mxc_msg_type);
- LASSERT(ctx->mxc_cookie >> MXLND_ERROR_OFFSET == 0);
- match = (type << MXLND_MSG_OFFSET) | (err << MXLND_ERROR_OFFSET) | ctx->mxc_cookie;
- return match;
-}
-
-static inline void
-mxlnd_parse_match(u64 match, u8 *msg_type, u8 *error, u64 *cookie)
-{
- *msg_type = (u8) MXLND_MSG_TYPE(match);
- *error = (u8) MXLND_ERROR_VAL(match);
- *cookie = match & MXLND_MAX_COOKIE;
- mxlnd_valid_msg_type(*msg_type);
- return;
-}
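
create_match()/parse_match() pack the message type, an error byte and the completion cookie into the 64 MX match bits; receive paths can then mask with ~MXLND_ERROR_MASK so a completion matches whatever error status it carries. A standalone sketch of the layout, assuming the mxlnd.h constants this patch does not show (type in the top 4 bits at offset 60, error in the next 8 bits at offset 52, a 52-bit cookie below, consistent with the "top 12 bits are 0" assertion later in this file):

#include <stdint.h>
#include <stdio.h>

/* assumed values mirroring mxlnd.h, which this patch does not show */
#define MSG_OFFSET   60                 /* top 4 bits: message type */
#define ERROR_OFFSET 52                 /* next 8 bits: error code  */
#define MAX_COOKIE   ((1ULL << 52) - 1) /* low 52 bits: cookie      */

static uint64_t create_match(uint8_t type, uint8_t error, uint64_t cookie)
{
        return ((uint64_t)type << MSG_OFFSET) |
               ((uint64_t)error << ERROR_OFFSET) |
               (cookie & MAX_COOKIE);
}

int main(void)
{
        uint64_t match = create_match(0xb, 0, 12345ULL);

        printf("type=%u error=%u cookie=%llu\n",
               (unsigned)(match >> MSG_OFFSET),
               (unsigned)((match >> ERROR_OFFSET) & 0xffu),
               (unsigned long long)(match & MAX_COOKIE));
        return 0;
}
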
-
-kmx_ctx_t *
-mxlnd_get_idle_rx(kmx_conn_t *conn)
-{
- cfs_list_t *rxs = NULL;
- kmx_ctx_t *rx = NULL;
-
- LASSERT(conn != NULL);
-
- rxs = &conn->mxk_rx_idle;
-
- spin_lock(&conn->mxk_lock);
-
- if (cfs_list_empty (rxs)) {
- spin_unlock(&conn->mxk_lock);
- return NULL;
- }
-
- rx = cfs_list_entry (rxs->next, kmx_ctx_t, mxc_list);
- cfs_list_del_init(&rx->mxc_list);
- spin_unlock(&conn->mxk_lock);
-
-#if MXLND_DEBUG
- if (rx->mxc_get != rx->mxc_put) {
- CNETERR("*** RX get (%llu) != put (%llu) ***\n", rx->mxc_get, rx->mxc_put);
- CNETERR("*** incarnation= %lld ***\n", rx->mxc_incarnation);
- CNETERR("*** deadline= %ld ***\n", rx->mxc_deadline);
- CNETERR("*** state= %s ***\n", mxlnd_ctxstate_to_str(rx->mxc_state));
- CNETERR("*** listed?= %d ***\n", !cfs_list_empty(&rx->mxc_list));
- CNETERR("*** nid= 0x%llx ***\n", rx->mxc_nid);
- CNETERR("*** peer= 0x%p ***\n", rx->mxc_peer);
- CNETERR("*** msg_type= %s ***\n", mxlnd_msgtype_to_str(rx->mxc_msg_type));
- CNETERR("*** cookie= 0x%llx ***\n", rx->mxc_cookie);
- CNETERR("*** nob= %d ***\n", rx->mxc_nob);
- }
-#endif
- LASSERT (rx->mxc_get == rx->mxc_put);
-
- rx->mxc_get++;
-
- LASSERT (rx->mxc_state == MXLND_CTX_IDLE);
- rx->mxc_state = MXLND_CTX_PREP;
- rx->mxc_deadline = jiffies + MXLND_COMM_TIMEOUT;
-
- return rx;
-}
-
-int
-mxlnd_put_idle_rx(kmx_ctx_t *rx)
-{
- kmx_conn_t *conn = rx->mxc_conn;
- cfs_list_t *rxs = &conn->mxk_rx_idle;
-
- LASSERT(rx->mxc_type == MXLND_REQ_RX);
-
- mxlnd_ctx_init(rx);
-
- rx->mxc_put++;
- LASSERT(rx->mxc_get == rx->mxc_put);
-
- spin_lock(&conn->mxk_lock);
- cfs_list_add(&rx->mxc_list, rxs);
- spin_unlock(&conn->mxk_lock);
- return 0;
-}
-
-kmx_ctx_t *
-mxlnd_get_idle_tx(void)
-{
- cfs_list_t *tmp = &kmxlnd_data.kmx_tx_idle;
- kmx_ctx_t *tx = NULL;
-
- spin_lock(&kmxlnd_data.kmx_tx_idle_lock);
-
- if (cfs_list_empty (&kmxlnd_data.kmx_tx_idle)) {
- CNETERR("%d txs in use\n", kmxlnd_data.kmx_tx_used);
- spin_unlock(&kmxlnd_data.kmx_tx_idle_lock);
- return NULL;
- }
-
- tmp = &kmxlnd_data.kmx_tx_idle;
- tx = cfs_list_entry (tmp->next, kmx_ctx_t, mxc_list);
- cfs_list_del_init(&tx->mxc_list);
-
- /* Allocate a new completion cookie. It might not be needed,
- * but we've got a lock right now and we're unlikely to
- * wrap... */
- tx->mxc_cookie = kmxlnd_data.kmx_tx_next_cookie++;
- if (kmxlnd_data.kmx_tx_next_cookie > MXLND_MAX_COOKIE) {
- kmxlnd_data.kmx_tx_next_cookie = 1;
- }
- kmxlnd_data.kmx_tx_used++;
- spin_unlock(&kmxlnd_data.kmx_tx_idle_lock);
-
- LASSERT (tx->mxc_get == tx->mxc_put);
-
- tx->mxc_get++;
-
- LASSERT (tx->mxc_state == MXLND_CTX_IDLE);
- LASSERT (tx->mxc_lntmsg[0] == NULL);
- LASSERT (tx->mxc_lntmsg[1] == NULL);
-
- tx->mxc_state = MXLND_CTX_PREP;
- tx->mxc_deadline = jiffies + MXLND_COMM_TIMEOUT;
-
- return tx;
-}
-
-void
-mxlnd_conn_disconnect(kmx_conn_t *conn, int mx_dis, int send_bye);
-
-int
-mxlnd_put_idle_tx(kmx_ctx_t *tx)
-{
- int result = 0;
- lnet_msg_t *lntmsg[2];
-
- LASSERT(tx->mxc_type == MXLND_REQ_TX);
-
- if (tx->mxc_status.code != MX_STATUS_SUCCESS || tx->mxc_errno != 0) {
- kmx_conn_t *conn = tx->mxc_conn;
-
- result = -EIO;
- if (tx->mxc_errno != 0) result = tx->mxc_errno;
- /* FIXME should we set mx_dis? */
- mxlnd_conn_disconnect(conn, 0, 1);
- }
-
- lntmsg[0] = tx->mxc_lntmsg[0];
- lntmsg[1] = tx->mxc_lntmsg[1];
-
- mxlnd_ctx_init(tx);
-
- tx->mxc_put++;
- LASSERT(tx->mxc_get == tx->mxc_put);
-
- spin_lock(&kmxlnd_data.kmx_tx_idle_lock);
- cfs_list_add_tail(&tx->mxc_list, &kmxlnd_data.kmx_tx_idle);
- kmxlnd_data.kmx_tx_used--;
- spin_unlock(&kmxlnd_data.kmx_tx_idle_lock);
-
- if (lntmsg[0] != NULL)
- lnet_finalize(kmxlnd_data.kmx_ni, lntmsg[0], result);
- if (lntmsg[1] != NULL)
- lnet_finalize(kmxlnd_data.kmx_ni, lntmsg[1], result);
- return 0;
-}
-
-
-void
-mxlnd_connparams_free(kmx_connparams_t *cp)
-{
- LASSERT(cfs_list_empty(&cp->mxr_list));
- MXLND_FREE(cp, sizeof(*cp));
- return;
-}
-
-int
-mxlnd_connparams_alloc(kmx_connparams_t **cp, void *context,
- mx_endpoint_addr_t epa, u64 match, u32 length,
- kmx_conn_t *conn, kmx_peer_t *peer, void *data)
-{
- kmx_connparams_t *c = NULL;
-
- MXLND_ALLOC(c, sizeof(*c));
- if (!c) return -ENOMEM;
-
- CFS_INIT_LIST_HEAD(&c->mxr_list);
- c->mxr_context = context;
- c->mxr_epa = epa;
- c->mxr_match = match;
- c->mxr_nob = length;
- c->mxr_conn = conn;
- c->mxr_peer = peer;
- c->mxr_msg = *((kmx_msg_t *) data);
-
- *cp = c;
- return 0;
-}
-
-static inline void
-mxlnd_set_conn_status(kmx_conn_t *conn, int status)
-{
- conn->mxk_status = status;
- smp_mb();
-}
-
-/**
- * mxlnd_conn_free_locked - free the conn
- * @conn - a kmx_conn pointer
- *
- * The calling function should remove the conn from the conns list first,
- * then destroy it. Caller should have write-locked kmx_global_lock.
- */
-void
-mxlnd_conn_free_locked(kmx_conn_t *conn)
-{
- int valid = !mxlnd_endpoint_addr_null(conn->mxk_epa);
- kmx_peer_t *peer = conn->mxk_peer;
-
- CDEBUG(D_NET, "freeing conn 0x%p *****\n", conn);
- LASSERT (cfs_list_empty (&conn->mxk_tx_credit_queue) &&
- cfs_list_empty (&conn->mxk_tx_free_queue) &&
- cfs_list_empty (&conn->mxk_pending));
- if (!cfs_list_empty(&conn->mxk_list)) {
- cfs_list_del_init(&conn->mxk_list);
- if (peer->mxp_conn == conn) {
- peer->mxp_conn = NULL;
- if (valid) {
- kmx_conn_t *temp = NULL;
-
- mx_get_endpoint_addr_context(conn->mxk_epa,
- (void **) &temp);
- if (conn == temp) {
- mx_set_endpoint_addr_context(conn->mxk_epa,
- (void *) NULL);
- }
- }
- /* unlink from global list and drop its ref */
- cfs_list_del_init(&peer->mxp_list);
- mxlnd_peer_decref(peer);
- }
- }
- mxlnd_peer_decref(peer); /* drop conn's ref to peer */
- if (conn->mxk_rx_pages) {
- LASSERT (conn->mxk_rxs != NULL);
- mxlnd_free_pages(conn->mxk_rx_pages);
- }
- if (conn->mxk_rxs) {
- int i = 0;
- kmx_ctx_t *rx = NULL;
-
- for (i = 0; i < MXLND_RX_MSGS(); i++) {
- rx = &conn->mxk_rxs[i];
- if (rx->mxc_seg_list != NULL) {
- LASSERT(rx->mxc_nseg > 0);
- MXLND_FREE(rx->mxc_seg_list,
- rx->mxc_nseg *
- sizeof(*rx->mxc_seg_list));
- }
- }
- MXLND_FREE(conn->mxk_rxs, MXLND_RX_MSGS() * sizeof(kmx_ctx_t));
- }
-
- MXLND_FREE(conn, sizeof (*conn));
- return;
-}
-
-
-int
-mxlnd_conn_cancel_pending_rxs(kmx_conn_t *conn)
-{
- int found = 0;
- int count = 0;
- kmx_ctx_t *ctx = NULL;
- kmx_ctx_t *next = NULL;
- mx_return_t mxret = MX_SUCCESS;
- u32 result = 0;
-
- do {
- found = 0;
- spin_lock(&conn->mxk_lock);
- cfs_list_for_each_entry_safe(ctx, next, &conn->mxk_pending,
- mxc_list) {
- cfs_list_del_init(&ctx->mxc_list);
- if (ctx->mxc_type == MXLND_REQ_RX) {
- found = 1;
- mxret = mx_cancel(kmxlnd_data.kmx_endpt,
- &ctx->mxc_mxreq,
- &result);
- if (mxret != MX_SUCCESS) {
- CNETERR("mx_cancel() returned %s (%d)\n", mx_strerror(mxret), mxret);
- }
- if (result == 1) {
- ctx->mxc_errno = -ECONNABORTED;
- ctx->mxc_state = MXLND_CTX_CANCELED;
- spin_unlock(&conn->mxk_lock);
- spin_lock(&kmxlnd_data.kmx_conn_lock);
-					/* we may be holding the global lock,
-					 * so move it to the orphan list and
-					 * let the connd thread free it */
- cfs_list_add_tail(&ctx->mxc_list,
- &kmxlnd_data.kmx_orphan_msgs);
- count++;
- spin_unlock(&kmxlnd_data.kmx_conn_lock);
- spin_lock(&conn->mxk_lock);
- }
- break;
- }
- }
- spin_unlock(&conn->mxk_lock);
- } while (found);
-
- return count;
-}
-
-int
-mxlnd_cancel_queued_txs(kmx_conn_t *conn)
-{
- int count = 0;
- cfs_list_t *tmp = NULL;
-
- spin_lock(&conn->mxk_lock);
- while (!cfs_list_empty(&conn->mxk_tx_free_queue) ||
- !cfs_list_empty(&conn->mxk_tx_credit_queue)) {
-
- kmx_ctx_t *tx = NULL;
-
- if (!cfs_list_empty(&conn->mxk_tx_free_queue)) {
- tmp = &conn->mxk_tx_free_queue;
- } else {
- tmp = &conn->mxk_tx_credit_queue;
- }
-
- tx = cfs_list_entry(tmp->next, kmx_ctx_t, mxc_list);
- cfs_list_del_init(&tx->mxc_list);
- spin_unlock(&conn->mxk_lock);
- tx->mxc_errno = -ECONNABORTED;
- tx->mxc_state = MXLND_CTX_CANCELED;
- /* move to orphan list and then abort */
- spin_lock(&kmxlnd_data.kmx_conn_lock);
- cfs_list_add_tail(&tx->mxc_list, &kmxlnd_data.kmx_orphan_msgs);
- spin_unlock(&kmxlnd_data.kmx_conn_lock);
- count++;
- spin_lock(&conn->mxk_lock);
- }
- spin_unlock(&conn->mxk_lock);
-
- return count;
-}
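
Both cancel routines above pop one entry under the conn's mxk_lock, drop that lock, and only then take the global kmx_conn_lock to file the entry on the orphan list, so the two spinlocks are never held simultaneously. A hedged user-space sketch of that drain pattern, with pthread mutexes standing in for the spinlocks and a toy singly linked list for cfs_list_t:

#include <pthread.h>
#include <stdio.h>

struct node { struct node *next; };

static pthread_mutex_t src_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t dst_lock = PTHREAD_MUTEX_INITIALIZER;
static struct node *src, *dst;

/* pop under src_lock, insert under dst_lock; never hold both at once */
static int drain(void)
{
        int count = 0;

        for (;;) {
                struct node *n;

                pthread_mutex_lock(&src_lock);
                n = src;
                if (n)
                        src = n->next;
                pthread_mutex_unlock(&src_lock);
                if (n == NULL)
                        break;

                pthread_mutex_lock(&dst_lock);
                n->next = dst;
                dst = n;
                pthread_mutex_unlock(&dst_lock);
                count++;
        }
        return count;
}

int main(void)
{
        struct node a = { NULL }, b = { &a };

        src = &b;
        printf("moved %d nodes\n", drain());
        return 0;
}
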
-
-void
-mxlnd_send_message(mx_endpoint_addr_t epa, u8 msg_type, int error, u64 cookie)
-{
- u64 match = (((u64) msg_type) << MXLND_MSG_OFFSET) |
- (((u64) error) << MXLND_ERROR_OFFSET) | cookie;
-
- mx_kisend(kmxlnd_data.kmx_endpt, NULL, 0, MX_PIN_PHYSICAL,
- epa, match, NULL, NULL);
- return;
-}
-
-/**
- * mxlnd_conn_disconnect - shutdown a connection
- * @conn - a kmx_conn pointer
- * @mx_dis - call mx_disconnect()
- * @send_bye - send peer a BYE msg
- *
- * This function sets the status to DISCONNECT, completes queued
- * txs with failure, calls mx_disconnect, which will complete
- * pending txs and matched rxs with failure.
- */
-void
-mxlnd_conn_disconnect(kmx_conn_t *conn, int mx_dis, int send_bye)
-{
- mx_endpoint_addr_t epa = conn->mxk_epa;
- int valid = !mxlnd_endpoint_addr_null(epa);
- int count = 0;
-
- spin_lock(&conn->mxk_lock);
- if (conn->mxk_status == MXLND_CONN_DISCONNECT) {
- spin_unlock(&conn->mxk_lock);
- return;
- }
- mxlnd_set_conn_status(conn, MXLND_CONN_DISCONNECT);
- conn->mxk_timeout = 0;
- spin_unlock(&conn->mxk_lock);
-
- count = mxlnd_cancel_queued_txs(conn);
- count += mxlnd_conn_cancel_pending_rxs(conn);
-
- if (count) /* let connd call kmxlnd_abort_msgs() */
- up(&kmxlnd_data.kmx_conn_sem);
-
- if (send_bye && valid &&
- conn->mxk_peer->mxp_nid != kmxlnd_data.kmx_ni->ni_nid) {
- /* send a BYE to the peer */
- CDEBUG(D_NET, "%s: sending a BYE msg to %s\n", __func__,
- libcfs_nid2str(conn->mxk_peer->mxp_nid));
- mxlnd_send_message(epa, MXLND_MSG_BYE, 0, 0);
- /* wait to allow the peer to ack our message */
- mxlnd_sleep(msecs_to_jiffies(20));
- }
-
- if (atomic_read(&kmxlnd_data.kmx_shutdown) != 1) {
- unsigned long last_msg = 0;
-
- /* notify LNET that we are giving up on this peer */
- if (cfs_time_after(conn->mxk_last_rx, conn->mxk_last_tx))
- last_msg = conn->mxk_last_rx;
- else
- last_msg = conn->mxk_last_tx;
-
- lnet_notify(kmxlnd_data.kmx_ni, conn->mxk_peer->mxp_nid, 0, last_msg);
-
-		if (mx_dis && valid &&
-		    memcmp(&epa, &kmxlnd_data.kmx_epa, sizeof(epa)) != 0)
- mx_disconnect(kmxlnd_data.kmx_endpt, epa);
- }
- mxlnd_conn_decref(conn); /* drop the owning peer's reference */
-
- return;
-}
-
-/**
- * mxlnd_conn_alloc - allocate and initialize a new conn struct
- * @connp - address of a kmx_conn pointer
- * @peer - owning kmx_peer
- *
- * Returns 0 on success and -ENOMEM on failure
- */
-int
-mxlnd_conn_alloc_locked(kmx_conn_t **connp, kmx_peer_t *peer)
-{
- int i = 0;
- int ret = 0;
- int ipage = 0;
- int offset = 0;
- void *addr = NULL;
- kmx_conn_t *conn = NULL;
- kmx_pages_t *pages = NULL;
- struct page *page = NULL;
- kmx_ctx_t *rx = NULL;
-
- LASSERT(peer != NULL);
-
- MXLND_ALLOC(conn, sizeof (*conn));
- if (conn == NULL) {
- CNETERR("Cannot allocate conn\n");
- return -ENOMEM;
- }
- CDEBUG(D_NET, "allocated conn 0x%p for peer 0x%p\n", conn, peer);
-
- memset(conn, 0, sizeof(*conn));
-
- ret = mxlnd_alloc_pages(&pages, MXLND_RX_MSG_PAGES());
- if (ret != 0) {
- CERROR("Can't allocate rx pages\n");
- MXLND_FREE(conn, sizeof(*conn));
- return -ENOMEM;
- }
- conn->mxk_rx_pages = pages;
-
- MXLND_ALLOC(conn->mxk_rxs, MXLND_RX_MSGS() * sizeof(kmx_ctx_t));
- if (conn->mxk_rxs == NULL) {
- CERROR("Can't allocate %d rx descriptors\n", MXLND_RX_MSGS());
- mxlnd_free_pages(pages);
- MXLND_FREE(conn, sizeof(*conn));
- return -ENOMEM;
- }
-
- memset(conn->mxk_rxs, 0, MXLND_RX_MSGS() * sizeof(kmx_ctx_t));
-
- conn->mxk_peer = peer;
- CFS_INIT_LIST_HEAD(&conn->mxk_list);
- CFS_INIT_LIST_HEAD(&conn->mxk_zombie);
- atomic_set(&conn->mxk_refcount, 2); /* ref for owning peer
- and one for the caller */
- if (peer->mxp_nid == kmxlnd_data.kmx_ni->ni_nid) {
- u64 nic_id = 0ULL;
- u32 ep_id = 0;
-
- /* this is localhost, set the epa and status as up */
- mxlnd_set_conn_status(conn, MXLND_CONN_READY);
- conn->mxk_epa = kmxlnd_data.kmx_epa;
- mx_set_endpoint_addr_context(conn->mxk_epa, (void *) conn);
- peer->mxp_reconnect_time = 0;
- mx_decompose_endpoint_addr(kmxlnd_data.kmx_epa, &nic_id, &ep_id);
- peer->mxp_nic_id = nic_id;
- peer->mxp_ep_id = ep_id;
- conn->mxk_incarnation = kmxlnd_data.kmx_incarnation;
- conn->mxk_timeout = 0;
- } else {
- /* conn->mxk_incarnation = 0 - will be set by peer */
- /* conn->mxk_sid = 0 - will be set by peer */
- mxlnd_set_conn_status(conn, MXLND_CONN_INIT);
- /* mxk_epa - to be set after mx_iconnect() */
- }
- spin_lock_init(&conn->mxk_lock);
- /* conn->mxk_timeout = 0 */
- /* conn->mxk_last_tx = 0 */
- /* conn->mxk_last_rx = 0 */
- CFS_INIT_LIST_HEAD(&conn->mxk_rx_idle);
-
- conn->mxk_credits = *kmxlnd_tunables.kmx_peercredits;
- /* mxk_outstanding = 0 */
-
- CFS_INIT_LIST_HEAD(&conn->mxk_tx_credit_queue);
- CFS_INIT_LIST_HEAD(&conn->mxk_tx_free_queue);
- /* conn->mxk_ntx_msgs = 0 */
- /* conn->mxk_ntx_data = 0 */
- /* conn->mxk_ntx_posted = 0 */
- /* conn->mxk_data_posted = 0 */
- CFS_INIT_LIST_HEAD(&conn->mxk_pending);
-
- for (i = 0; i < MXLND_RX_MSGS(); i++) {
-
- rx = &conn->mxk_rxs[i];
- rx->mxc_type = MXLND_REQ_RX;
- CFS_INIT_LIST_HEAD(&rx->mxc_list);
-
- /* map mxc_msg to page */
- page = pages->mxg_pages[ipage];
- addr = page_address(page);
- LASSERT(addr != NULL);
- rx->mxc_msg = (kmx_msg_t *)(addr + offset);
- rx->mxc_seg.segment_ptr = MX_PA_TO_U64(virt_to_phys(rx->mxc_msg));
-
- rx->mxc_conn = conn;
- rx->mxc_peer = peer;
- rx->mxc_nid = peer->mxp_nid;
-
- mxlnd_ctx_init(rx);
-
- offset += MXLND_MSG_SIZE;
- LASSERT (offset <= PAGE_SIZE);
-
- if (offset == PAGE_SIZE) {
- offset = 0;
- ipage++;
-			LASSERT (ipage <= MXLND_RX_MSG_PAGES());
- }
-
- cfs_list_add_tail(&rx->mxc_list, &conn->mxk_rx_idle);
- }
-
- *connp = conn;
-
- mxlnd_peer_addref(peer); /* add a ref for this conn */
-
- /* add to front of peer's conns list */
- cfs_list_add(&conn->mxk_list, &peer->mxp_conns);
- peer->mxp_conn = conn;
- return 0;
-}
-
-int
-mxlnd_conn_alloc(kmx_conn_t **connp, kmx_peer_t *peer)
-{
- int ret = 0;
- rwlock_t *g_lock = &kmxlnd_data.kmx_global_lock;
-
- write_lock(g_lock);
- ret = mxlnd_conn_alloc_locked(connp, peer);
- write_unlock(g_lock);
- return ret;
-}
-
-int
-mxlnd_q_pending_ctx(kmx_ctx_t *ctx)
-{
- int ret = 0;
- kmx_conn_t *conn = ctx->mxc_conn;
-
- ctx->mxc_state = MXLND_CTX_PENDING;
- if (conn != NULL) {
- spin_lock(&conn->mxk_lock);
- if (conn->mxk_status >= MXLND_CONN_INIT) {
- cfs_list_add_tail(&ctx->mxc_list, &conn->mxk_pending);
- if (conn->mxk_timeout == 0 || ctx->mxc_deadline < conn->mxk_timeout) {
- conn->mxk_timeout = ctx->mxc_deadline;
- }
- } else {
- ctx->mxc_state = MXLND_CTX_COMPLETED;
- ret = -1;
- }
- spin_unlock(&conn->mxk_lock);
- }
- return ret;
-}
-
-int
-mxlnd_deq_pending_ctx(kmx_ctx_t *ctx)
-{
- LASSERT(ctx->mxc_state == MXLND_CTX_PENDING ||
- ctx->mxc_state == MXLND_CTX_COMPLETED);
- if (ctx->mxc_state != MXLND_CTX_PENDING &&
- ctx->mxc_state != MXLND_CTX_COMPLETED) {
- CNETERR("deq ctx->mxc_state = %s\n",
- mxlnd_ctxstate_to_str(ctx->mxc_state));
- }
- ctx->mxc_state = MXLND_CTX_COMPLETED;
- if (!cfs_list_empty(&ctx->mxc_list)) {
- kmx_conn_t *conn = ctx->mxc_conn;
- kmx_ctx_t *next = NULL;
-
- LASSERT(conn != NULL);
- spin_lock(&conn->mxk_lock);
- cfs_list_del_init(&ctx->mxc_list);
- conn->mxk_timeout = 0;
- if (!cfs_list_empty(&conn->mxk_pending)) {
- next = cfs_list_entry(conn->mxk_pending.next,
- kmx_ctx_t, mxc_list);
- conn->mxk_timeout = next->mxc_deadline;
- }
- spin_unlock(&conn->mxk_lock);
- }
- return 0;
-}
-
-/**
- * mxlnd_peer_free - free the peer
- * @peer - a kmx_peer pointer
- *
- * The calling function should decrement the rxs, drain the tx queues and
- * remove the peer from the peers list first, then destroy it.
- */
-void
-mxlnd_peer_free(kmx_peer_t *peer)
-{
- CDEBUG(D_NET, "freeing peer 0x%p %s\n", peer, libcfs_nid2str(peer->mxp_nid));
-
- LASSERT (atomic_read(&peer->mxp_refcount) == 0);
-
- if (!cfs_list_empty(&peer->mxp_list)) {
- /* assume we are locked */
- cfs_list_del_init(&peer->mxp_list);
- }
-
- MXLND_FREE(peer, sizeof (*peer));
- atomic_dec(&kmxlnd_data.kmx_npeers);
- return;
-}
-
-static int
-mxlnd_lookup_mac(u32 ip, u64 *tmp_id)
-{
- int ret = -EHOSTUNREACH;
- unsigned char *haddr = NULL;
- struct net_device *dev = NULL;
- struct neighbour *n = NULL;
- __be32 dst_ip = htonl(ip);
-
- dev = dev_get_by_name(*kmxlnd_tunables.kmx_default_ipif);
- if (dev == NULL)
- return -ENODEV;
-
- haddr = (unsigned char *) tmp_id + 2; /* MAC is only 6 bytes */
-
- n = neigh_lookup(&arp_tbl, &dst_ip, dev);
- if (n) {
- n->used = jiffies;
- if (n->nud_state & NUD_VALID) {
- memcpy(haddr, n->ha, dev->addr_len);
- neigh_release(n);
- ret = 0;
- }
- }
-
- dev_put(dev);
-
- return ret;
-}
-
-
-/* We only want the MAC address of the peer's Myricom NIC. We
- * require that each node has the IPoMX interface (myriN) up.
- * We will not pass any traffic over IPoMX, but it allows us
- * to get the MAC address. */
-static int
-mxlnd_ip2nic_id(u32 ip, u64 *nic_id, int tries)
-{
- int ret = 0;
- int try = 1;
- int fatal = 0;
- u64 tmp_id = 0ULL;
- cfs_socket_t *sock = NULL;
-
- do {
- CDEBUG(D_NET, "try %d of %d tries\n", try, tries);
- ret = mxlnd_lookup_mac(ip, &tmp_id);
- if (ret == 0) {
- break;
- } else {
- /* not found, try to connect (force an arp) */
- ret = libcfs_sock_connect(&sock, &fatal, 0, 0, ip, 987);
- if (ret == -ECONNREFUSED) {
- /* peer is there, get the MAC address */
- mxlnd_lookup_mac(ip, &tmp_id);
- if (tmp_id != 0ULL)
- ret = 0;
- break;
- } else if (ret == -EHOSTUNREACH && try < tries) {
- /* add a little backoff */
- CDEBUG(D_NET, "sleeping for %lu jiffies\n",
- msecs_to_jiffies(MSEC_PER_SEC / 4));
- mxlnd_sleep(msecs_to_jiffies(MSEC_PER_SEC / 4));
- }
- }
- } while (try++ < tries);
- CDEBUG(D_NET, "done trying. ret = %d\n", ret);
-
- if (tmp_id == 0ULL)
- ret = -EHOSTUNREACH;
-#if __BYTE_ORDER == __LITTLE_ENDIAN
- *nic_id = ___arch__swab64(tmp_id);
-#else
- *nic_id = tmp_id;
-#endif
- return ret;
-}
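
lookup_mac() writes the 6-byte MAC into bytes 2..7 of the u64, which already reads as the 48-bit Myricom NIC id on a big-endian host; a little-endian host byte-swaps the result, as above. A user-space sketch of the same packing (the swab is open-coded here because ___arch__swab64() is a kernel helper, and the MAC is a made-up example):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* open-coded stand-in for the kernel's ___arch__swab64() */
static uint64_t swab64(uint64_t x)
{
        uint64_t r = 0;

        for (int i = 0; i < 8; i++)
                r = (r << 8) | ((x >> (8 * i)) & 0xff);
        return r;
}

int main(void)
{
        const unsigned char mac[6] = { 0x00, 0x60, 0xdd, 0x47, 0x12, 0x34 };
        uint64_t tmp_id = 0;

        /* MAC is only 6 bytes: land it in bytes 2..7, as lookup_mac() does */
        memcpy((unsigned char *)&tmp_id + 2, mac, 6);

        /* on a little-endian host the bytes must be reversed */
        printf("nic_id = 0x%012llx\n", (unsigned long long)swab64(tmp_id));
        return 0;
}
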
-
-/**
- * mxlnd_peer_alloc - allocate and initialize a new peer struct
- * @peerp - address of a kmx_peer pointer
- * @nid - LNET node id
- *
- * Returns 0 on success and -ENOMEM on failure
- */
-int
-mxlnd_peer_alloc(kmx_peer_t **peerp, lnet_nid_t nid, u32 board, u32 ep_id, u64 nic_id)
-{
- int ret = 0;
- u32 ip = LNET_NIDADDR(nid);
- kmx_peer_t *peer = NULL;
-
- LASSERT (nid != LNET_NID_ANY && nid != 0LL);
-
- MXLND_ALLOC(peer, sizeof (*peer));
- if (peer == NULL) {
- CNETERR("Cannot allocate peer for NID 0x%llx\n",
- nid);
- return -ENOMEM;
- }
- CDEBUG(D_NET, "allocated peer 0x%p for NID 0x%llx\n", peer, nid);
-
- memset(peer, 0, sizeof(*peer));
-
- CFS_INIT_LIST_HEAD(&peer->mxp_list);
- peer->mxp_nid = nid;
- /* peer->mxp_ni unused - may be used for multi-rail */
- atomic_set(&peer->mxp_refcount, 1); /* ref for kmx_peers list */
-
- peer->mxp_board = board;
- peer->mxp_ep_id = ep_id;
- peer->mxp_nic_id = nic_id;
-
- CFS_INIT_LIST_HEAD(&peer->mxp_conns);
- ret = mxlnd_conn_alloc(&peer->mxp_conn, peer); /* adds 2nd conn ref here... */
- if (ret != 0) {
- mxlnd_peer_decref(peer);
- return ret;
- }
- CFS_INIT_LIST_HEAD(&peer->mxp_tx_queue);
-
- if (peer->mxp_nic_id != 0ULL)
- nic_id = peer->mxp_nic_id;
-
- if (nic_id == 0ULL) {
- ret = mxlnd_ip2nic_id(ip, &nic_id, 1);
- if (ret == 0) {
- peer->mxp_nic_id = nic_id;
- mx_nic_id_to_board_number(nic_id, &peer->mxp_board);
- }
- }
-
- peer->mxp_nic_id = nic_id; /* may be 0ULL if ip2nic_id() failed */
-
- /* peer->mxp_reconnect_time = 0 */
- /* peer->mxp_incompatible = 0 */
-
- *peerp = peer;
- return 0;
-}
-
-static inline kmx_peer_t *
-mxlnd_find_peer_by_nid_locked(lnet_nid_t nid)
-{
- int found = 0;
- int hash = 0;
- kmx_peer_t *peer = NULL;
-
- hash = mxlnd_nid_to_hash(nid);
-
- cfs_list_for_each_entry(peer, &kmxlnd_data.kmx_peers[hash], mxp_list) {
- if (peer->mxp_nid == nid) {
- found = 1;
- mxlnd_peer_addref(peer);
- break;
- }
- }
- return (found ? peer : NULL);
-}
-
-static kmx_peer_t *
-mxlnd_find_peer_by_nid(lnet_nid_t nid, int create)
-{
- int ret = 0;
- int hash = 0;
- kmx_peer_t *peer = NULL;
- kmx_peer_t *old = NULL;
- rwlock_t *g_lock = &kmxlnd_data.kmx_global_lock;
-
- read_lock(g_lock);
- peer = mxlnd_find_peer_by_nid_locked(nid); /* adds peer ref */
-
- if ((peer && peer->mxp_conn) || /* found peer with conn or */
- (!peer && !create)) { /* did not find peer and do not create one */
- read_unlock(g_lock);
- return peer;
- }
-
- read_unlock(g_lock);
-
- /* if peer but _not_ conn */
- if (peer && !peer->mxp_conn) {
- if (create) {
- write_lock(g_lock);
- if (!peer->mxp_conn) { /* check again */
- /* create the conn */
- ret = mxlnd_conn_alloc_locked(&peer->mxp_conn, peer);
- if (ret != 0) {
- /* we tried, return the peer only.
- * the caller needs to see if the conn exists */
- CNETERR("%s: %s could not alloc conn\n",
- __func__, libcfs_nid2str(peer->mxp_nid));
- } else {
- /* drop extra conn ref */
- mxlnd_conn_decref(peer->mxp_conn);
- }
- }
- write_unlock(g_lock);
- }
- return peer;
- }
-
- /* peer not found and we need to create one */
- hash = mxlnd_nid_to_hash(nid);
-
- /* create peer (and conn) */
- /* adds conn ref for peer and one for this function */
- ret = mxlnd_peer_alloc(&peer, nid, *kmxlnd_tunables.kmx_board,
- *kmxlnd_tunables.kmx_ep_id, 0ULL);
- if (ret != 0) /* no memory, peer is NULL */
- return NULL;
-
- write_lock(g_lock);
-
- /* look again */
- old = mxlnd_find_peer_by_nid_locked(nid);
- if (old) {
- /* someone already created one */
- mxlnd_conn_decref(peer->mxp_conn); /* drop ref taken above.. */
- mxlnd_conn_decref(peer->mxp_conn); /* drop peer's ref */
- mxlnd_peer_decref(peer);
- peer = old;
- } else {
- /* no other peer, use this one */
- cfs_list_add_tail(&peer->mxp_list,
- &kmxlnd_data.kmx_peers[hash]);
- atomic_inc(&kmxlnd_data.kmx_npeers);
- mxlnd_peer_addref(peer);
- mxlnd_conn_decref(peer->mxp_conn); /* drop ref from peer_alloc */
- }
-
- write_unlock(g_lock);
-
- return peer;
-}
-
-static inline int
-mxlnd_tx_requires_credit(kmx_ctx_t *tx)
-{
- return (tx->mxc_msg_type == MXLND_MSG_EAGER ||
- tx->mxc_msg_type == MXLND_MSG_GET_REQ ||
- tx->mxc_msg_type == MXLND_MSG_PUT_REQ ||
- tx->mxc_msg_type == MXLND_MSG_NOOP);
-}
-
-/**
- * mxlnd_init_msg - set type and number of bytes
- * @msg - msg pointer
- * @type - of message
- * @body_nob - bytes in msg body
- */
-static inline void
-mxlnd_init_msg(kmx_msg_t *msg, u8 type, int body_nob)
-{
- msg->mxm_type = type;
- msg->mxm_nob = offsetof(kmx_msg_t, mxm_u) + body_nob;
-}
-
-static inline void
-mxlnd_init_tx_msg (kmx_ctx_t *tx, u8 type, int body_nob, lnet_nid_t nid)
-{
- int nob = offsetof (kmx_msg_t, mxm_u) + body_nob;
- kmx_msg_t *msg = NULL;
-
- LASSERT (tx != NULL);
- LASSERT (nob <= MXLND_MSG_SIZE);
-
- tx->mxc_nid = nid;
- /* tx->mxc_peer should have already been set if we know it */
- tx->mxc_msg_type = type;
- tx->mxc_nseg = 1;
- /* tx->mxc_seg.segment_ptr is already pointing to mxc_page */
- tx->mxc_seg.segment_length = nob;
- tx->mxc_pin_type = MX_PIN_PHYSICAL;
-
- msg = tx->mxc_msg;
- msg->mxm_type = type;
- msg->mxm_nob = nob;
-
- return;
-}
-
-static inline __u32
-mxlnd_cksum (void *ptr, int nob)
-{
- char *c = ptr;
- __u32 sum = 0;
-
- while (nob-- > 0)
- sum = ((sum << 1) | (sum >> 31)) + *c++;
-
- /* ensure I don't return 0 (== no checksum) */
- return (sum == 0) ? 1 : sum;
-}
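
The checksum is a rotate-left-by-one-and-add over the raw message bytes, with 0 reserved to mean "no checksum" (pack_msg_locked() and unpack_msg() below both zero mxm_cksum while summing). A self-contained sketch of that compute/verify dance over a hypothetical header:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct hdr {                    /* hypothetical stand-in for kmx_msg_t */
        uint32_t magic;
        uint32_t cksum;
        char     payload[16];
};

static uint32_t cksum(const void *ptr, int nob)
{
        const char *c = ptr;
        uint32_t sum = 0;

        while (nob-- > 0)
                sum = ((sum << 1) | (sum >> 31)) + *c++;
        return sum == 0 ? 1 : sum;      /* 0 is reserved for "none" */
}

int main(void)
{
        struct hdr h;
        uint32_t wire;

        memset(&h, 0, sizeof(h));
        strcpy(h.payload, "hello");

        /* the field must be zero while the sum is computed... */
        h.cksum = 0;
        h.cksum = cksum(&h, sizeof(h));

        /* ...and the verifier repeats the same dance */
        wire = h.cksum;
        h.cksum = 0;
        printf("checksum %s\n", wire == cksum(&h, sizeof(h)) ? "ok" : "BAD");
        return 0;
}
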
-
-/**
- * mxlnd_pack_msg_locked - complete msg info
- * @tx - msg to send
- */
-static inline void
-mxlnd_pack_msg_locked(kmx_ctx_t *tx)
-{
- kmx_msg_t *msg = tx->mxc_msg;
-
- /* type and nob should already be set in init_msg() */
- msg->mxm_magic = MXLND_MSG_MAGIC;
- msg->mxm_version = MXLND_MSG_VERSION;
- /* mxm_type */
- /* don't use mxlnd_tx_requires_credit() since we want PUT_ACK to
- * return credits as well */
- if (tx->mxc_msg_type != MXLND_MSG_CONN_REQ &&
- tx->mxc_msg_type != MXLND_MSG_CONN_ACK) {
- msg->mxm_credits = tx->mxc_conn->mxk_outstanding;
- tx->mxc_conn->mxk_outstanding = 0;
- } else {
- msg->mxm_credits = 0;
- }
- /* mxm_nob */
- msg->mxm_cksum = 0;
- msg->mxm_srcnid = kmxlnd_data.kmx_ni->ni_nid;
- msg->mxm_srcstamp = kmxlnd_data.kmx_incarnation;
- msg->mxm_dstnid = tx->mxc_nid;
- /* if it is a new peer, the dststamp will be 0 */
- msg->mxm_dststamp = tx->mxc_conn->mxk_incarnation;
-
- if (*kmxlnd_tunables.kmx_cksum) {
- msg->mxm_cksum = mxlnd_cksum(msg, msg->mxm_nob);
- }
-}
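
Every message except CONN_REQ/CONN_ACK piggybacks mxk_outstanding (the receives we have consumed since our last send) back to the peer as returned credits, so the credit loop needs no dedicated ack traffic while data flows; NOOPs exist for when there is nothing else to carry them. A toy sketch of piggybacked credit return between two endpoints (field names are illustrative):

#include <stdio.h>

struct ep {
        int credits;            /* sends I may still post to the peer */
        int outstanding;        /* receives I owe the peer credit for */
};

/* attach owed credits to an outgoing message, as pack_msg_locked() does */
static int send_msg(struct ep *me)
{
        int piggyback = me->outstanding;

        me->outstanding = 0;
        me->credits--;          /* this send consumes one of my credits */
        return piggyback;
}

static void recv_msg(struct ep *me, int piggyback)
{
        me->credits += piggyback;       /* peer returned credits */
        me->outstanding++;              /* I now owe one back    */
}

int main(void)
{
        struct ep a = { 8, 0 }, b = { 8, 0 };

        recv_msg(&b, send_msg(&a));     /* a -> b */
        recv_msg(&a, send_msg(&b));     /* b -> a, returning b's owed credit */
        printf("a: credits=%d owed=%d\n", a.credits, a.outstanding);
        printf("b: credits=%d owed=%d\n", b.credits, b.outstanding);
        return 0;
}
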
-
-int
-mxlnd_unpack_msg(kmx_msg_t *msg, int nob)
-{
- const int hdr_size = offsetof(kmx_msg_t, mxm_u);
- __u32 msg_cksum = 0;
- int flip = 0;
- int msg_nob = 0;
-
- /* 6 bytes are enough to have received magic + version */
- if (nob < 6) {
- CNETERR("not enough bytes for magic + hdr: %d\n", nob);
- return -EPROTO;
- }
-
- if (msg->mxm_magic == MXLND_MSG_MAGIC) {
- flip = 0;
- } else if (msg->mxm_magic == __swab32(MXLND_MSG_MAGIC)) {
- flip = 1;
- } else {
- CNETERR("Bad magic: %08x\n", msg->mxm_magic);
- return -EPROTO;
- }
-
- if (msg->mxm_version !=
- (flip ? __swab16(MXLND_MSG_VERSION) : MXLND_MSG_VERSION)) {
- CNETERR("Bad version: %d\n", msg->mxm_version);
- return -EPROTO;
- }
-
- if (nob < hdr_size) {
- CNETERR("not enough for a header: %d\n", nob);
- return -EPROTO;
- }
-
- msg_nob = flip ? __swab32(msg->mxm_nob) : msg->mxm_nob;
- if (msg_nob > nob) {
- CNETERR("Short message: got %d, wanted %d\n", nob, msg_nob);
- return -EPROTO;
- }
-
- /* checksum must be computed with mxm_cksum zero and BEFORE anything
- * gets flipped */
- msg_cksum = flip ? __swab32(msg->mxm_cksum) : msg->mxm_cksum;
- msg->mxm_cksum = 0;
- if (msg_cksum != 0 && msg_cksum != mxlnd_cksum(msg, msg_nob)) {
- CNETERR("Bad checksum\n");
- return -EPROTO;
- }
- msg->mxm_cksum = msg_cksum;
-
- if (flip) {
- /* leave magic unflipped as a clue to peer endianness */
- __swab16s(&msg->mxm_version);
- CLASSERT (sizeof(msg->mxm_type) == 1);
- CLASSERT (sizeof(msg->mxm_credits) == 1);
- msg->mxm_nob = msg_nob;
- __swab64s(&msg->mxm_srcnid);
- __swab64s(&msg->mxm_srcstamp);
- __swab64s(&msg->mxm_dstnid);
- __swab64s(&msg->mxm_dststamp);
- }
-
- if (msg->mxm_srcnid == LNET_NID_ANY) {
- CNETERR("Bad src nid: %s\n", libcfs_nid2str(msg->mxm_srcnid));
- return -EPROTO;
- }
-
- switch (msg->mxm_type) {
- default:
- CNETERR("Unknown message type %x\n", msg->mxm_type);
- return -EPROTO;
-
- case MXLND_MSG_NOOP:
- break;
-
- case MXLND_MSG_EAGER:
- if (msg_nob < offsetof(kmx_msg_t, mxm_u.eager.mxem_payload[0])) {
- CNETERR("Short EAGER: %d(%d)\n", msg_nob,
- (int)offsetof(kmx_msg_t, mxm_u.eager.mxem_payload[0]));
- return -EPROTO;
- }
- break;
-
- case MXLND_MSG_PUT_REQ:
- if (msg_nob < hdr_size + sizeof(msg->mxm_u.put_req)) {
- CNETERR("Short PUT_REQ: %d(%d)\n", msg_nob,
- (int)(hdr_size + sizeof(msg->mxm_u.put_req)));
- return -EPROTO;
- }
- if (flip)
- __swab64s(&msg->mxm_u.put_req.mxprm_cookie);
- break;
-
- case MXLND_MSG_PUT_ACK:
- if (msg_nob < hdr_size + sizeof(msg->mxm_u.put_ack)) {
- CNETERR("Short PUT_ACK: %d(%d)\n", msg_nob,
- (int)(hdr_size + sizeof(msg->mxm_u.put_ack)));
- return -EPROTO;
- }
- if (flip) {
- __swab64s(&msg->mxm_u.put_ack.mxpam_src_cookie);
- __swab64s(&msg->mxm_u.put_ack.mxpam_dst_cookie);
- }
- break;
-
- case MXLND_MSG_GET_REQ:
- if (msg_nob < hdr_size + sizeof(msg->mxm_u.get_req)) {
- CNETERR("Short GET_REQ: %d(%d)\n", msg_nob,
- (int)(hdr_size + sizeof(msg->mxm_u.get_req)));
- return -EPROTO;
- }
- if (flip) {
- __swab64s(&msg->mxm_u.get_req.mxgrm_cookie);
- }
- break;
-
- case MXLND_MSG_CONN_REQ:
- case MXLND_MSG_CONN_ACK:
- if (msg_nob < hdr_size + sizeof(msg->mxm_u.conn_req)) {
- CNETERR("Short connreq/ack: %d(%d)\n", msg_nob,
- (int)(hdr_size + sizeof(msg->mxm_u.conn_req)));
- return -EPROTO;
- }
- if (flip) {
- __swab32s(&msg->mxm_u.conn_req.mxcrm_queue_depth);
- __swab32s(&msg->mxm_u.conn_req.mxcrm_eager_size);
- }
- break;
- }
- return 0;
-}
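
unpack_msg() detects a peer of opposite endianness by testing whether the magic arrives byte-swapped, then flips every multi-byte field while deliberately leaving the magic itself unflipped as a clue. A compact sketch of just the detection step, with a made-up magic value (the real MXLND_MSG_MAGIC is defined in mxlnd.h, not in this patch):

#include <stdint.h>
#include <stdio.h>

#define MAGIC 0x4d584c4eu       /* made-up demo value, not the real magic */

static uint32_t swab32(uint32_t x)
{
        return (x >> 24) | ((x >> 8) & 0xff00u) |
               ((x << 8) & 0xff0000u) | (x << 24);
}

/* returns 0 = same endianness, 1 = flip all fields, -1 = not our protocol */
static int check_magic(uint32_t wire_magic)
{
        if (wire_magic == MAGIC)
                return 0;
        if (wire_magic == swab32(MAGIC))
                return 1;
        return -1;
}

int main(void)
{
        printf("%d %d %d\n", check_magic(MAGIC),
               check_magic(swab32(MAGIC)), check_magic(0xdeadbeefu));
        return 0;
}
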
-
-
-/**
- * mxlnd_recv_msg
- * @lntmsg - the LNET msg that this is continuing. If EAGER, then NULL.
- * @rx
- * @msg_type
- * @cookie
- * @length - length of incoming message
- *
- * The caller gets the rx and sets nid, peer and conn if known.
- *
- * Returns 0 on success and -1 on failure
- */
-int
-mxlnd_recv_msg(lnet_msg_t *lntmsg, kmx_ctx_t *rx, u8 msg_type, u64 cookie, u32 length)
-{
- int ret = 0;
- mx_return_t mxret = MX_SUCCESS;
- uint64_t mask = ~(MXLND_ERROR_MASK);
-
- rx->mxc_msg_type = msg_type;
- rx->mxc_lntmsg[0] = lntmsg; /* may be NULL if EAGER */
- rx->mxc_cookie = cookie;
- /* rx->mxc_match may already be set */
- /* rx->mxc_seg.segment_ptr is already set */
- rx->mxc_seg.segment_length = length;
- ret = mxlnd_q_pending_ctx(rx);
- if (ret == -1) {
- /* the caller is responsible for calling conn_decref() if needed */
- return -1;
- }
- mxret = mx_kirecv(kmxlnd_data.kmx_endpt, &rx->mxc_seg, 1, MX_PIN_PHYSICAL,
- cookie, mask, (void *) rx, &rx->mxc_mxreq);
- if (mxret != MX_SUCCESS) {
- mxlnd_deq_pending_ctx(rx);
- CNETERR("mx_kirecv() failed with %s (%d)\n",
- mx_strerror(mxret), (int) mxret);
- return -1;
- }
- return 0;
-}
-
-
-/**
- * mxlnd_unexpected_recv - this is the callback function that will handle
- * unexpected receives
- * @context - NULL, ignore
- * @source - the peer's mx_endpoint_addr_t
- * @match_value - the msg's bits, should be MXLND_MSG_EAGER
- * @length - length of incoming message
- * @data_if_available - used for CONN_[REQ|ACK]
- *
- * If it is an eager-sized msg, we will call recv_msg() with the actual
- * length. If it is a large message, we will call recv_msg() with a
- * length of 0 bytes to drop it because we should never have a large,
- * unexpected message.
- *
- * NOTE - The MX library blocks until this function completes. Make it as fast as
- * possible. DO NOT allocate memory which can block!
- *
- * If we cannot get a rx or the conn is closed, drop the message on the floor
- * (i.e. recv 0 bytes and ignore).
- */
-mx_unexp_handler_action_t
-mxlnd_unexpected_recv(void *context, mx_endpoint_addr_t source,
- uint64_t match_value, uint32_t length, void *data_if_available)
-{
- int ret = 0;
- kmx_ctx_t *rx = NULL;
- mx_ksegment_t seg;
- u8 msg_type = 0;
- u8 error = 0;
- u64 cookie = 0ULL;
- kmx_conn_t *conn = NULL;
- kmx_peer_t *peer = NULL;
- u64 nic_id = 0ULL;
- u32 ep_id = 0;
- u32 sid = 0;
-
- /* TODO this will change to the net struct */
- if (context != NULL) {
- CNETERR("non-NULL context\n");
- }
-
-#if MXLND_DEBUG
- CDEBUG(D_NET, "bits=0x%llx length=%d\n", match_value, length);
-#endif
-
- mx_decompose_endpoint_addr2(source, &nic_id, &ep_id, &sid);
- mxlnd_parse_match(match_value, &msg_type, &error, &cookie);
- read_lock(&kmxlnd_data.kmx_global_lock);
- mx_get_endpoint_addr_context(source, (void **) &conn);
- if (conn) {
- mxlnd_conn_addref(conn); /* add ref for this function */
- peer = conn->mxk_peer;
- }
- read_unlock(&kmxlnd_data.kmx_global_lock);
-
- if (msg_type == MXLND_MSG_BYE) {
- if (conn) {
- CDEBUG(D_NET, "peer %s sent BYE msg\n",
- libcfs_nid2str(peer->mxp_nid));
- mxlnd_conn_disconnect(conn, 1, 0);
- mxlnd_conn_decref(conn); /* drop ref taken above */
- }
- return MX_RECV_FINISHED;
- }
-
- if (msg_type == MXLND_MSG_CONN_REQ) {
- kmx_connparams_t *cp = NULL;
- const int expected = offsetof(kmx_msg_t, mxm_u) +
- sizeof(kmx_connreq_msg_t);
-
- if (conn) mxlnd_conn_decref(conn); /* drop ref taken above */
- if (unlikely(length != expected || !data_if_available)) {
- CNETERR("received invalid CONN_REQ from %llx "
- "length=%d (expected %d)\n", nic_id, length, expected);
- mxlnd_send_message(source, MXLND_MSG_CONN_ACK, EPROTO, 0);
- return MX_RECV_FINISHED;
- }
-
- ret = mxlnd_connparams_alloc(&cp, context, source, match_value, length,
- conn, peer, data_if_available);
- if (unlikely(ret != 0)) {
- CNETERR("unable to alloc CONN_REQ from %llx:%d\n",
- nic_id, ep_id);
- mxlnd_send_message(source, MXLND_MSG_CONN_ACK, ENOMEM, 0);
- return MX_RECV_FINISHED;
- }
- spin_lock(&kmxlnd_data.kmx_conn_lock);
- cfs_list_add_tail(&cp->mxr_list, &kmxlnd_data.kmx_conn_reqs);
- spin_unlock(&kmxlnd_data.kmx_conn_lock);
- up(&kmxlnd_data.kmx_conn_sem);
- return MX_RECV_FINISHED;
- }
- if (msg_type == MXLND_MSG_CONN_ACK) {
- kmx_connparams_t *cp = NULL;
- const int expected = offsetof(kmx_msg_t, mxm_u) +
- sizeof(kmx_connreq_msg_t);
-
- LASSERT(conn);
- if (unlikely(error != 0)) {
- CNETERR("received CONN_ACK from %s with error -%d\n",
- libcfs_nid2str(peer->mxp_nid), (int) error);
- mxlnd_conn_disconnect(conn, 1, 0);
- } else if (unlikely(length != expected || !data_if_available)) {
- CNETERR("received %s CONN_ACK from %s "
- "length=%d (expected %d)\n",
- data_if_available ? "short" : "missing",
- libcfs_nid2str(peer->mxp_nid), length, expected);
- mxlnd_conn_disconnect(conn, 1, 1);
- } else {
- /* peer is ready for messages */
- ret = mxlnd_connparams_alloc(&cp, context, source, match_value, length,
- conn, peer, data_if_available);
- if (unlikely(ret != 0)) {
- CNETERR("unable to alloc kmx_connparams_t"
- " from %llx:%d\n", nic_id, ep_id);
- mxlnd_conn_disconnect(conn, 1, 1);
- } else {
- spin_lock(&kmxlnd_data.kmx_conn_lock);
- cfs_list_add_tail(&cp->mxr_list,
- &kmxlnd_data.kmx_conn_reqs);
- spin_unlock(&kmxlnd_data.kmx_conn_lock);
- up(&kmxlnd_data.kmx_conn_sem);
- }
- }
- mxlnd_conn_decref(conn); /* drop ref taken above */
-
- return MX_RECV_FINISHED;
- }
-
- /* Handle unexpected messages (PUT_REQ and GET_REQ) */
-
- LASSERT(peer != NULL && conn != NULL);
-
- rx = mxlnd_get_idle_rx(conn);
- if (rx != NULL) {
- if (length <= MXLND_MSG_SIZE) {
- ret = mxlnd_recv_msg(NULL, rx, msg_type, match_value, length);
- } else {
- CNETERR("unexpected large receive with "
- "match_value=0x%llx length=%d\n",
- match_value, length);
- ret = mxlnd_recv_msg(NULL, rx, msg_type, match_value, 0);
- }
-
- if (ret == 0) {
- /* hold conn ref until rx completes */
- rx->mxc_conn = conn;
- rx->mxc_peer = peer;
- rx->mxc_nid = peer->mxp_nid;
- } else {
- CNETERR("could not post receive\n");
- mxlnd_put_idle_rx(rx);
- }
- }
-
- /* Encountered error, drop incoming message on the floor */
- /* We could use MX_RECV_FINISHED but posting the receive of 0 bytes
- * uses the standard code path and acks the sender normally */
-
- if (rx == NULL || ret != 0) {
- mxlnd_conn_decref(conn); /* drop ref taken above */
- if (rx == NULL) {
- CNETERR("no idle rxs available - dropping rx"
- " 0x%llx from %s\n", match_value,
- libcfs_nid2str(peer->mxp_nid));
- } else {
- /* ret != 0 */
- CNETERR("disconnected peer - dropping rx\n");
- }
- seg.segment_ptr = 0ULL;
- seg.segment_length = 0;
- mx_kirecv(kmxlnd_data.kmx_endpt, &seg, 1, MX_PIN_PHYSICAL,
- match_value, ~0ULL, NULL, NULL);
- }
-
- return MX_RECV_CONTINUE;
-}
-
-
-int
-mxlnd_get_peer_info(int index, lnet_nid_t *nidp, int *count)
-{
- int i = 0;
- int ret = -ENOENT;
- kmx_peer_t *peer = NULL;
-
- read_lock(&kmxlnd_data.kmx_global_lock);
- for (i = 0; i < MXLND_HASH_SIZE; i++) {
- cfs_list_for_each_entry(peer, &kmxlnd_data.kmx_peers[i],
- mxp_list) {
- if (index-- == 0) {
- *nidp = peer->mxp_nid;
- *count = atomic_read(&peer->mxp_refcount);
- ret = 0;
- break;
- }
- }
- }
- read_unlock(&kmxlnd_data.kmx_global_lock);
-
- return ret;
-}
-
-void
-mxlnd_del_peer_locked(kmx_peer_t *peer)
-{
- if (peer->mxp_conn) {
- mxlnd_conn_disconnect(peer->mxp_conn, 1, 1);
- } else {
- cfs_list_del_init(&peer->mxp_list); /* remove from the global list */
- mxlnd_peer_decref(peer); /* drop global list ref */
- }
- return;
-}
-
-int
-mxlnd_del_peer(lnet_nid_t nid)
-{
- int i = 0;
- int ret = 0;
- kmx_peer_t *peer = NULL;
- kmx_peer_t *next = NULL;
-
- if (nid != LNET_NID_ANY) {
- peer = mxlnd_find_peer_by_nid(nid, 0); /* adds peer ref */
- }
- write_lock(&kmxlnd_data.kmx_global_lock);
- if (nid != LNET_NID_ANY) {
- if (peer == NULL) {
- ret = -ENOENT;
- } else {
- mxlnd_peer_decref(peer); /* and drops it */
- mxlnd_del_peer_locked(peer);
- }
- } else { /* LNET_NID_ANY */
- for (i = 0; i < MXLND_HASH_SIZE; i++) {
- cfs_list_for_each_entry_safe(peer, next,
- &kmxlnd_data.kmx_peers[i],
- mxp_list) {
- mxlnd_del_peer_locked(peer);
- }
- }
- }
- write_unlock(&kmxlnd_data.kmx_global_lock);
-
- return ret;
-}
-
-kmx_conn_t *
-mxlnd_get_conn_by_idx(int index)
-{
- int i = 0;
- kmx_peer_t *peer = NULL;
- kmx_conn_t *conn = NULL;
-
- read_lock(&kmxlnd_data.kmx_global_lock);
- for (i = 0; i < MXLND_HASH_SIZE; i++) {
- cfs_list_for_each_entry(peer, &kmxlnd_data.kmx_peers[i],
- mxp_list) {
- cfs_list_for_each_entry(conn, &peer->mxp_conns,
- mxk_list) {
- if (index-- > 0) {
- continue;
- }
-
- mxlnd_conn_addref(conn); /* add ref here, dec in ctl() */
- read_unlock(&kmxlnd_data.kmx_global_lock);
- return conn;
- }
- }
- }
- read_unlock(&kmxlnd_data.kmx_global_lock);
-
- return NULL;
-}
-
-void
-mxlnd_close_matching_conns_locked(kmx_peer_t *peer)
-{
- kmx_conn_t *conn = NULL;
- kmx_conn_t *next = NULL;
-
- cfs_list_for_each_entry_safe(conn, next, &peer->mxp_conns, mxk_list)
- mxlnd_conn_disconnect(conn, 0, 1);
-
- return;
-}
-
-int
-mxlnd_close_matching_conns(lnet_nid_t nid)
-{
- int i = 0;
- int ret = 0;
- kmx_peer_t *peer = NULL;
-
- write_lock(&kmxlnd_data.kmx_global_lock);
- if (nid != LNET_NID_ANY) {
- peer = mxlnd_find_peer_by_nid_locked(nid); /* adds peer ref */
- if (peer == NULL) {
- ret = -ENOENT;
- } else {
- mxlnd_close_matching_conns_locked(peer);
- mxlnd_peer_decref(peer); /* and drops it here */
- }
- } else { /* LNET_NID_ANY */
- for (i = 0; i < MXLND_HASH_SIZE; i++) {
- cfs_list_for_each_entry(peer, &kmxlnd_data.kmx_peers[i], mxp_list)
- mxlnd_close_matching_conns_locked(peer);
- }
- }
- write_unlock(&kmxlnd_data.kmx_global_lock);
-
- return ret;
-}
-
-/**
- * mxlnd_ctl - modify MXLND parameters
- * @ni - LNET interface handle
- * @cmd - command to change
- * @arg - the ioctl data
- */
-int
-mxlnd_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg)
-{
- struct libcfs_ioctl_data *data = arg;
- int ret = -EINVAL;
-
- LASSERT (ni == kmxlnd_data.kmx_ni);
-
- switch (cmd) {
- case IOC_LIBCFS_GET_PEER: {
- lnet_nid_t nid = 0;
- int count = 0;
-
- ret = mxlnd_get_peer_info(data->ioc_count, &nid, &count);
- data->ioc_nid = nid;
- data->ioc_count = count;
- break;
- }
- case IOC_LIBCFS_DEL_PEER: {
- ret = mxlnd_del_peer(data->ioc_nid);
- break;
- }
- case IOC_LIBCFS_GET_CONN: {
- kmx_conn_t *conn = NULL;
-
- conn = mxlnd_get_conn_by_idx(data->ioc_count);
- if (conn == NULL) {
- ret = -ENOENT;
- } else {
- ret = 0;
- data->ioc_nid = conn->mxk_peer->mxp_nid;
- mxlnd_conn_decref(conn); /* dec ref taken in get_conn_by_idx() */
- }
- break;
- }
- case IOC_LIBCFS_CLOSE_CONNECTION: {
- ret = mxlnd_close_matching_conns(data->ioc_nid);
- break;
- }
- default:
- CNETERR("unknown ctl(%d)\n", cmd);
- break;
- }
-
- return ret;
-}
-
-/**
- * mxlnd_peer_queue_tx_locked - add the tx to the peer's tx queue
- * @tx
- *
- * Add the tx to the peer's msg or data queue. The caller has locked the peer.
- */
-void
-mxlnd_peer_queue_tx_locked(kmx_ctx_t *tx)
-{
- u8 msg_type = tx->mxc_msg_type;
- kmx_conn_t *conn = tx->mxc_conn;
-
- LASSERT (msg_type != 0);
- LASSERT (tx->mxc_nid != 0);
- LASSERT (tx->mxc_peer != NULL);
- LASSERT (tx->mxc_conn != NULL);
-
- tx->mxc_incarnation = conn->mxk_incarnation;
-
- if (msg_type != MXLND_MSG_PUT_DATA &&
- msg_type != MXLND_MSG_GET_DATA) {
- /* msg style tx */
- if (mxlnd_tx_requires_credit(tx)) {
- cfs_list_add_tail(&tx->mxc_list,
- &conn->mxk_tx_credit_queue);
- conn->mxk_ntx_msgs++;
- } else if (msg_type == MXLND_MSG_CONN_REQ ||
- msg_type == MXLND_MSG_CONN_ACK) {
- /* put conn msgs at the front of the queue */
- cfs_list_add(&tx->mxc_list, &conn->mxk_tx_free_queue);
- } else {
- /* PUT_ACK, PUT_NAK */
- cfs_list_add_tail(&tx->mxc_list,
- &conn->mxk_tx_free_queue);
- conn->mxk_ntx_msgs++;
- }
- } else {
- /* data style tx */
- cfs_list_add_tail(&tx->mxc_list, &conn->mxk_tx_free_queue);
- conn->mxk_ntx_data++;
- }
-
- return;
-}
-
-/**
- * mxlnd_peer_queue_tx - add the tx to the global tx queue
- * @tx
- *
- * Add the tx to the peer's msg or data queue
- */
-static inline void
-mxlnd_peer_queue_tx(kmx_ctx_t *tx)
-{
- LASSERT(tx->mxc_peer != NULL);
- LASSERT(tx->mxc_conn != NULL);
- spin_lock(&tx->mxc_conn->mxk_lock);
- mxlnd_peer_queue_tx_locked(tx);
- spin_unlock(&tx->mxc_conn->mxk_lock);
-
- return;
-}
-
-/**
- * mxlnd_queue_tx - add the tx to the global tx queue
- * @tx
- *
- * Add the tx to the global queue and up the tx_queue_sem
- */
-void
-mxlnd_queue_tx(kmx_ctx_t *tx)
-{
- kmx_peer_t *peer = tx->mxc_peer;
- LASSERT (tx->mxc_nid != 0);
-
- if (peer != NULL) {
- if (peer->mxp_incompatible &&
- tx->mxc_msg_type != MXLND_MSG_CONN_ACK) {
- /* let this fail now */
- tx->mxc_errno = -ECONNABORTED;
- mxlnd_conn_decref(peer->mxp_conn);
- mxlnd_put_idle_tx(tx);
- return;
- }
- if (tx->mxc_conn == NULL) {
- int ret = 0;
- kmx_conn_t *conn = NULL;
-
- ret = mxlnd_conn_alloc(&conn, peer); /* adds 2nd ref for tx... */
- if (ret != 0) {
- tx->mxc_errno = ret;
- mxlnd_put_idle_tx(tx);
- goto done;
- }
- tx->mxc_conn = conn;
- mxlnd_peer_decref(peer); /* and takes it from peer */
- }
- LASSERT(tx->mxc_conn != NULL);
- mxlnd_peer_queue_tx(tx);
- mxlnd_check_sends(peer);
- } else {
- spin_lock(&kmxlnd_data.kmx_tx_queue_lock);
- cfs_list_add_tail(&tx->mxc_list, &kmxlnd_data.kmx_tx_queue);
- spin_unlock(&kmxlnd_data.kmx_tx_queue_lock);
- up(&kmxlnd_data.kmx_tx_queue_sem);
- }
-done:
- return;
-}
-
-int
-mxlnd_setup_iov(kmx_ctx_t *ctx, u32 niov, struct iovec *iov, u32 offset, u32 nob)
-{
- int i = 0;
- int sum = 0;
- int old_sum = 0;
- int nseg = 0;
- int first_iov = -1;
- int first_iov_offset = 0;
- int first_found = 0;
- int last_iov = -1;
- int last_iov_length = 0;
- mx_ksegment_t *seg = NULL;
-
- if (niov == 0) return 0;
- LASSERT(iov != NULL);
-
- for (i = 0; i < niov; i++) {
- sum = old_sum + (u32) iov[i].iov_len;
- if (!first_found && (sum > offset)) {
- first_iov = i;
- first_iov_offset = offset - old_sum;
- first_found = 1;
- sum = (u32) iov[i].iov_len - first_iov_offset;
- old_sum = 0;
- }
-		if (first_found && sum >= nob) {
- last_iov = i;
- last_iov_length = (u32) iov[i].iov_len - (sum - nob);
- if (first_iov == last_iov) last_iov_length -= first_iov_offset;
- break;
- }
- old_sum = sum;
- }
- LASSERT(first_iov >= 0 && last_iov >= first_iov);
- nseg = last_iov - first_iov + 1;
- LASSERT(nseg > 0);
-
- MXLND_ALLOC(seg, nseg * sizeof(*seg));
- if (seg == NULL) {
- CNETERR("MXLND_ALLOC() failed\n");
- return -1;
- }
- memset(seg, 0, nseg * sizeof(*seg));
- ctx->mxc_nseg = nseg;
- sum = 0;
- for (i = 0; i < nseg; i++) {
- seg[i].segment_ptr = MX_PA_TO_U64(virt_to_phys(iov[first_iov + i].iov_base));
- seg[i].segment_length = (u32) iov[first_iov + i].iov_len;
- if (i == 0) {
- seg[i].segment_ptr += (u64) first_iov_offset;
- seg[i].segment_length -= (u32) first_iov_offset;
- }
- if (i == (nseg - 1)) {
- seg[i].segment_length = (u32) last_iov_length;
- }
- sum += seg[i].segment_length;
- }
- ctx->mxc_seg_list = seg;
- ctx->mxc_pin_type = MX_PIN_PHYSICAL;
-#ifdef MX_PIN_FULLPAGES
- ctx->mxc_pin_type |= MX_PIN_FULLPAGES;
-#endif
- LASSERT(nob == sum);
- return 0;
-}
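
setup_iov() walks the fragment list once with a running sum to find the first and last fragments touched by [offset, offset+nob), then emits one MX segment per fragment with the first and last trimmed. The index arithmetic is the subtle part; a user-space sketch of just that walk, over made-up fragment sizes:

#include <stdio.h>

int main(void)
{
        /* three 100-byte fragments; map bytes [150, 250) of the sequence */
        const unsigned len[3] = { 100, 100, 100 };
        const unsigned offset = 150, nob = 100;
        unsigned sum = 0, old_sum = 0;
        unsigned first_off = 0, last_len = 0;
        int first = -1, last = -1, found = 0;

        for (int i = 0; i < 3; i++) {
                sum = old_sum + len[i];
                if (!found && sum > offset) {
                        first = i;
                        first_off = offset - old_sum;
                        found = 1;
                        sum = len[i] - first_off;   /* count from offset now */
                        old_sum = 0;
                }
                if (found && sum >= nob) {
                        last = i;
                        last_len = len[i] - (sum - nob);
                        if (first == last)
                                last_len -= first_off;
                        break;
                }
                old_sum = sum;
        }
        /* expect: first=1 (skip 50), last=2 (take 50) */
        printf("first=%d+%u last=%d len=%u\n", first, first_off, last, last_len);
        return 0;
}
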
-
-int
-mxlnd_setup_kiov(kmx_ctx_t *ctx, u32 niov, lnet_kiov_t *kiov, u32 offset, u32 nob)
-{
- int i = 0;
- int sum = 0;
- int old_sum = 0;
- int nseg = 0;
- int first_kiov = -1;
- int first_kiov_offset = 0;
- int first_found = 0;
- int last_kiov = -1;
- int last_kiov_length = 0;
- mx_ksegment_t *seg = NULL;
-
- if (niov == 0) return 0;
- LASSERT(kiov != NULL);
-
- for (i = 0; i < niov; i++) {
- sum = old_sum + kiov[i].kiov_len;
- if (i == 0) sum -= kiov[i].kiov_offset;
- if (!first_found && (sum > offset)) {
- first_kiov = i;
- first_kiov_offset = offset - old_sum;
- if (i == 0) first_kiov_offset = kiov[i].kiov_offset;
- first_found = 1;
- sum = kiov[i].kiov_len - first_kiov_offset;
- old_sum = 0;
- }
-		if (first_found && sum >= nob) {
- last_kiov = i;
- last_kiov_length = kiov[i].kiov_len - (sum - nob);
- if (first_kiov == last_kiov) last_kiov_length -= first_kiov_offset;
- break;
- }
- old_sum = sum;
- }
- LASSERT(first_kiov >= 0 && last_kiov >= first_kiov);
- nseg = last_kiov - first_kiov + 1;
- LASSERT(nseg > 0);
-
- MXLND_ALLOC(seg, nseg * sizeof(*seg));
- if (seg == NULL) {
- CNETERR("MXLND_ALLOC() failed\n");
- return -1;
- }
-	memset(seg, 0, nseg * sizeof(*seg));
-	ctx->mxc_nseg = nseg;
-	sum = 0;
-	for (i = 0; i < nseg; i++) {
- seg[i].segment_ptr =
- page_to_phys(kiov[first_kiov + i].kiov_page);
- seg[i].segment_length = kiov[first_kiov + i].kiov_len;
- if (i == 0) {
- seg[i].segment_ptr += (u64) first_kiov_offset;
- /* we have to add back the original kiov_offset */
- seg[i].segment_length -= first_kiov_offset +
- kiov[first_kiov].kiov_offset;
- }
- if (i == (nseg - 1)) {
- seg[i].segment_length = last_kiov_length;
- }
- sum += seg[i].segment_length;
- }
- ctx->mxc_seg_list = seg;
- ctx->mxc_pin_type = MX_PIN_PHYSICAL;
-#ifdef MX_PIN_FULLPAGES
- ctx->mxc_pin_type |= MX_PIN_FULLPAGES;
-#endif
- LASSERT(nob == sum);
- return 0;
-}
-
-void
-mxlnd_send_nak(kmx_ctx_t *tx, lnet_nid_t nid, int type, int status, __u64 cookie)
-{
- LASSERT(type == MXLND_MSG_PUT_ACK);
- mxlnd_init_tx_msg(tx, type, sizeof(kmx_putack_msg_t), tx->mxc_nid);
- tx->mxc_cookie = cookie;
- tx->mxc_msg->mxm_u.put_ack.mxpam_src_cookie = cookie;
- tx->mxc_msg->mxm_u.put_ack.mxpam_dst_cookie = ((u64) status << MXLND_ERROR_OFFSET); /* error code */
- tx->mxc_match = mxlnd_create_match(tx, status);
-
- mxlnd_queue_tx(tx);
-}
-
-
-/**
- * mxlnd_send_data - get tx, map [k]iov, queue tx
- * @ni
- * @lntmsg
- * @peer
- * @msg_type
- * @cookie
- *
- * This sets up the DATA send for a PUT or GET.
- *
- * On success it queues the tx; on failure it calls lnet_finalize().
- */
-void
-mxlnd_send_data(lnet_ni_t *ni, lnet_msg_t *lntmsg, kmx_peer_t *peer, u8 msg_type, u64 cookie)
-{
- int ret = 0;
- lnet_process_id_t target = lntmsg->msg_target;
- unsigned int niov = lntmsg->msg_niov;
- struct iovec *iov = lntmsg->msg_iov;
- lnet_kiov_t *kiov = lntmsg->msg_kiov;
- unsigned int offset = lntmsg->msg_offset;
- unsigned int nob = lntmsg->msg_len;
- kmx_ctx_t *tx = NULL;
-
- LASSERT(lntmsg != NULL);
- LASSERT(peer != NULL);
- LASSERT(msg_type == MXLND_MSG_PUT_DATA || msg_type == MXLND_MSG_GET_DATA);
- LASSERT((cookie>>MXLND_ERROR_OFFSET) == 0);
-
- tx = mxlnd_get_idle_tx();
- if (tx == NULL) {
- CNETERR("Can't allocate %s tx for %s\n",
- msg_type == MXLND_MSG_PUT_DATA ? "PUT_DATA" : "GET_DATA",
- libcfs_nid2str(target.nid));
- goto failed_0;
- }
- tx->mxc_nid = target.nid;
- /* NOTE called when we have a ref on the conn, get one for this tx */
- mxlnd_conn_addref(peer->mxp_conn);
- tx->mxc_peer = peer;
- tx->mxc_conn = peer->mxp_conn;
- tx->mxc_msg_type = msg_type;
- tx->mxc_lntmsg[0] = lntmsg;
- tx->mxc_cookie = cookie;
- tx->mxc_match = mxlnd_create_match(tx, 0);
-
-	/* This sets up the mx_ksegment_t to send the DATA payload */
- if (nob == 0) {
- /* do not setup the segments */
- CNETERR("nob = 0; why didn't we use an EAGER reply "
- "to %s?\n", libcfs_nid2str(target.nid));
- ret = 0;
- } else if (kiov == NULL) {
- ret = mxlnd_setup_iov(tx, niov, iov, offset, nob);
- } else {
- ret = mxlnd_setup_kiov(tx, niov, kiov, offset, nob);
- }
- if (ret != 0) {
- CNETERR("Can't setup send DATA for %s\n",
- libcfs_nid2str(target.nid));
- tx->mxc_errno = -EIO;
- goto failed_1;
- }
- mxlnd_queue_tx(tx);
- return;
-
-failed_1:
- mxlnd_conn_decref(peer->mxp_conn);
- mxlnd_put_idle_tx(tx);
- return;
-
-failed_0:
- CNETERR("no tx avail\n");
- lnet_finalize(ni, lntmsg, -EIO);
- return;
-}
-
-/**
- * mxlnd_recv_data - map [k]iov, post rx
- * @ni
- * @lntmsg
- * @rx
- * @msg_type
- * @cookie
- *
- * This sets up the DATA receive for a PUT or GET.
- *
- * Returns 0 on success and -1 on failure.
- */
-int
-mxlnd_recv_data(lnet_ni_t *ni, lnet_msg_t *lntmsg, kmx_ctx_t *rx, u8 msg_type, u64 cookie)
-{
- int ret = 0;
- lnet_process_id_t target = lntmsg->msg_target;
- unsigned int niov = lntmsg->msg_niov;
- struct iovec *iov = lntmsg->msg_iov;
- lnet_kiov_t *kiov = lntmsg->msg_kiov;
- unsigned int offset = lntmsg->msg_offset;
- unsigned int nob = lntmsg->msg_len;
- mx_return_t mxret = MX_SUCCESS;
- u64 mask = ~(MXLND_ERROR_MASK);
-
- /* above assumes MXLND_MSG_PUT_DATA */
- if (msg_type == MXLND_MSG_GET_DATA) {
- niov = lntmsg->msg_md->md_niov;
- iov = lntmsg->msg_md->md_iov.iov;
- kiov = lntmsg->msg_md->md_iov.kiov;
- offset = 0;
- nob = lntmsg->msg_md->md_length;
- }
-
- LASSERT(lntmsg != NULL);
- LASSERT(rx != NULL);
- LASSERT(msg_type == MXLND_MSG_PUT_DATA || msg_type == MXLND_MSG_GET_DATA);
- LASSERT((cookie>>MXLND_ERROR_OFFSET) == 0); /* ensure top 12 bits are 0 */
-
- rx->mxc_msg_type = msg_type;
- rx->mxc_state = MXLND_CTX_PENDING;
- rx->mxc_nid = target.nid;
- /* if posting a GET_DATA, we may not yet know the peer */
- if (rx->mxc_peer != NULL) {
- rx->mxc_conn = rx->mxc_peer->mxp_conn;
- }
- rx->mxc_lntmsg[0] = lntmsg;
- rx->mxc_cookie = cookie;
- rx->mxc_match = mxlnd_create_match(rx, 0);
-	/* This sets up the mx_ksegment_t to receive the DATA payload */
- if (kiov == NULL) {
- ret = mxlnd_setup_iov(rx, niov, iov, offset, nob);
- } else {
- ret = mxlnd_setup_kiov(rx, niov, kiov, offset, nob);
- }
- if (msg_type == MXLND_MSG_GET_DATA) {
- rx->mxc_lntmsg[1] = lnet_create_reply_msg(kmxlnd_data.kmx_ni, lntmsg);
- if (rx->mxc_lntmsg[1] == NULL) {
- CNETERR("Can't create reply for GET -> %s\n",
- libcfs_nid2str(target.nid));
- ret = -1;
- }
- }
- if (ret != 0) {
- CNETERR("Can't setup %s rx for %s\n",
- msg_type == MXLND_MSG_PUT_DATA ? "PUT_DATA" : "GET_DATA",
- libcfs_nid2str(target.nid));
- return -1;
- }
- ret = mxlnd_q_pending_ctx(rx);
- if (ret == -1) {
- return -1;
- }
- CDEBUG(D_NET, "receiving %s 0x%llx\n", mxlnd_msgtype_to_str(msg_type), rx->mxc_cookie);
- mxret = mx_kirecv(kmxlnd_data.kmx_endpt,
- rx->mxc_seg_list, rx->mxc_nseg,
- rx->mxc_pin_type, rx->mxc_match,
- mask, (void *) rx,
- &rx->mxc_mxreq);
- if (mxret != MX_SUCCESS) {
- if (rx->mxc_conn != NULL) {
- mxlnd_deq_pending_ctx(rx);
- }
- CNETERR("mx_kirecv() failed with %d for %s\n",
- (int) mxret, libcfs_nid2str(target.nid));
- return -1;
- }
-
- return 0;
-}
-
-/**
- * mxlnd_send - the LND required send function
- * @ni
- * @private
- * @lntmsg
- *
- * This must not block. If we do not yet have a peer struct for the receiver,
- * the send is appended to a global tx list and the tx queue semaphore is
- * upped to notify the tx_queued thread of the new send.
- */
-int
-mxlnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg)
-{
- int ret = 0;
- int type = lntmsg->msg_type;
- lnet_hdr_t *hdr = &lntmsg->msg_hdr;
- lnet_process_id_t target = lntmsg->msg_target;
- lnet_nid_t nid = target.nid;
- int target_is_router = lntmsg->msg_target_is_router;
- int routing = lntmsg->msg_routing;
- unsigned int payload_niov = lntmsg->msg_niov;
- struct iovec *payload_iov = lntmsg->msg_iov;
- lnet_kiov_t *payload_kiov = lntmsg->msg_kiov;
- unsigned int payload_offset = lntmsg->msg_offset;
- unsigned int payload_nob = lntmsg->msg_len;
- kmx_ctx_t *tx = NULL;
- kmx_msg_t *txmsg = NULL;
- kmx_ctx_t *rx = (kmx_ctx_t *) private; /* for REPLY */
- kmx_ctx_t *rx_data = NULL;
- kmx_conn_t *conn = NULL;
- int nob = 0;
- uint32_t length = 0;
- kmx_peer_t *peer = NULL;
-	rwlock_t *g_lock = &kmxlnd_data.kmx_global_lock;
-
- CDEBUG(D_NET, "sending %d bytes in %d frags to %s\n",
- payload_nob, payload_niov, libcfs_id2str(target));
-
- LASSERT (payload_nob == 0 || payload_niov > 0);
- LASSERT (payload_niov <= LNET_MAX_IOV);
- /* payload is either all vaddrs or all pages */
- LASSERT (!(payload_kiov != NULL && payload_iov != NULL));
-
- /* private is used on LNET_GET_REPLY only, NULL for all other cases */
-
- /* NOTE we may not know the peer if it is the very first PUT_REQ or GET_REQ
- * to a new peer, so create one if not found */
- peer = mxlnd_find_peer_by_nid(nid, 1); /* adds peer ref */
- if (peer == NULL || peer->mxp_conn == NULL) {
- /* we could not find it nor could we create one or
- * one exists but we cannot create a conn,
- * fail this message */
- if (peer) {
- /* found peer without conn, drop ref taken above */
- LASSERT(peer->mxp_conn == NULL);
- mxlnd_peer_decref(peer);
- }
- return -ENOMEM;
- }
-
- /* we have a peer with a conn */
-
- if (unlikely(peer->mxp_incompatible)) {
- mxlnd_peer_decref(peer); /* drop ref taken above */
- } else {
- read_lock(g_lock);
- conn = peer->mxp_conn;
- if (conn && conn->mxk_status != MXLND_CONN_DISCONNECT)
- mxlnd_conn_addref(conn);
- else
- conn = NULL;
- read_unlock(g_lock);
- mxlnd_peer_decref(peer); /* drop peer ref taken above */
- if (!conn)
- return -ENOTCONN;
- }
-
- LASSERT(peer && conn);
-
- CDEBUG(D_NET, "%s: peer 0x%llx is 0x%p\n", __func__, nid, peer);
-
- switch (type) {
- case LNET_MSG_ACK:
- LASSERT (payload_nob == 0);
- break;
-
- case LNET_MSG_REPLY:
- case LNET_MSG_PUT:
- /* Is the payload small enough not to need DATA? */
- nob = offsetof(kmx_msg_t, mxm_u.eager.mxem_payload[payload_nob]);
- if (nob <= MXLND_MSG_SIZE)
- break; /* send EAGER */
-
- tx = mxlnd_get_idle_tx();
- if (unlikely(tx == NULL)) {
- CNETERR("Can't allocate %s tx for %s\n",
- type == LNET_MSG_PUT ? "PUT" : "REPLY",
- libcfs_nid2str(nid));
- if (conn) mxlnd_conn_decref(conn);
- return -ENOMEM;
- }
-
- tx->mxc_peer = peer;
- tx->mxc_conn = conn;
- /* we added a conn ref above */
- mxlnd_init_tx_msg (tx, MXLND_MSG_PUT_REQ, sizeof(kmx_putreq_msg_t), nid);
- txmsg = tx->mxc_msg;
- txmsg->mxm_u.put_req.mxprm_hdr = *hdr;
- txmsg->mxm_u.put_req.mxprm_cookie = tx->mxc_cookie;
- tx->mxc_match = mxlnd_create_match(tx, 0);
-
-		/* we must post a receive _before_ sending the request.
-		 * we need to determine how much to receive; it will be
-		 * either a put_ack or a put_nak, and the put_ack is
-		 * larger, so post for that. */
-
- rx = mxlnd_get_idle_rx(conn);
- if (unlikely(rx == NULL)) {
- CNETERR("Can't allocate rx for PUT_ACK for %s\n",
- libcfs_nid2str(nid));
- mxlnd_put_idle_tx(tx);
- if (conn) mxlnd_conn_decref(conn); /* for the ref taken above */
- return -ENOMEM;
- }
- rx->mxc_nid = nid;
- rx->mxc_peer = peer;
- mxlnd_conn_addref(conn); /* for this rx */
- rx->mxc_conn = conn;
- rx->mxc_msg_type = MXLND_MSG_PUT_ACK;
- rx->mxc_cookie = tx->mxc_cookie;
- rx->mxc_match = mxlnd_create_match(rx, 0);
-
- length = offsetof(kmx_msg_t, mxm_u) + sizeof(kmx_putack_msg_t);
- ret = mxlnd_recv_msg(lntmsg, rx, MXLND_MSG_PUT_ACK, rx->mxc_match, length);
- if (unlikely(ret != 0)) {
- CNETERR("recv_msg() failed for PUT_ACK for %s\n",
- libcfs_nid2str(nid));
- rx->mxc_lntmsg[0] = NULL;
- mxlnd_put_idle_rx(rx);
- mxlnd_put_idle_tx(tx);
- mxlnd_conn_decref(conn); /* for the rx... */
- mxlnd_conn_decref(conn); /* and for the tx */
- return -EHOSTUNREACH;
- }
-
- mxlnd_queue_tx(tx);
- return 0;
-
- case LNET_MSG_GET:
- if (routing || target_is_router)
- break; /* send EAGER */
-
- /* is the REPLY message too small for DATA? */
- nob = offsetof(kmx_msg_t, mxm_u.eager.mxem_payload[lntmsg->msg_md->md_length]);
- if (nob <= MXLND_MSG_SIZE)
- break; /* send EAGER */
-
- /* get tx (we need the cookie) , post rx for incoming DATA,
- * then post GET_REQ tx */
- tx = mxlnd_get_idle_tx();
- if (unlikely(tx == NULL)) {
- CNETERR("Can't allocate GET tx for %s\n",
- libcfs_nid2str(nid));
- mxlnd_conn_decref(conn); /* for the ref taken above */
- return -ENOMEM;
- }
- rx_data = mxlnd_get_idle_rx(conn);
- if (unlikely(rx_data == NULL)) {
- CNETERR("Can't allocate DATA rx for %s\n",
- libcfs_nid2str(nid));
- mxlnd_put_idle_tx(tx);
- mxlnd_conn_decref(conn); /* for the ref taken above */
- return -ENOMEM;
- }
- rx_data->mxc_peer = peer;
-		/* NOTE no need to lock peer before adding conn ref since we took
-		 * a conn ref for the tx (it cannot be freed between there and here) */
- mxlnd_conn_addref(conn); /* for the rx_data */
- rx_data->mxc_conn = conn;
-
- ret = mxlnd_recv_data(ni, lntmsg, rx_data, MXLND_MSG_GET_DATA, tx->mxc_cookie);
- if (unlikely(ret != 0)) {
- CNETERR("Can't setup GET sink for %s\n",
- libcfs_nid2str(nid));
- mxlnd_put_idle_rx(rx_data);
- mxlnd_put_idle_tx(tx);
- mxlnd_conn_decref(conn); /* for the rx_data... */
- mxlnd_conn_decref(conn); /* and for the tx */
- return -EIO;
- }
-
- tx->mxc_peer = peer;
- tx->mxc_conn = conn;
- /* conn ref taken above */
- mxlnd_init_tx_msg(tx, MXLND_MSG_GET_REQ, sizeof(kmx_getreq_msg_t), nid);
- txmsg = tx->mxc_msg;
- txmsg->mxm_u.get_req.mxgrm_hdr = *hdr;
- txmsg->mxm_u.get_req.mxgrm_cookie = tx->mxc_cookie;
- tx->mxc_match = mxlnd_create_match(tx, 0);
-
- mxlnd_queue_tx(tx);
- return 0;
-
- default:
- LBUG();
- mxlnd_conn_decref(conn); /* drop ref taken above */
- return -EIO;
- }
-
- /* send EAGER */
-
- LASSERT (offsetof(kmx_msg_t, mxm_u.eager.mxem_payload[payload_nob])
- <= MXLND_MSG_SIZE);
-
- tx = mxlnd_get_idle_tx();
- if (unlikely(tx == NULL)) {
- CNETERR("Can't send %s to %s: tx descs exhausted\n",
- mxlnd_lnetmsg_to_str(type), libcfs_nid2str(nid));
- mxlnd_conn_decref(conn); /* drop ref taken above */
- return -ENOMEM;
- }
-
- tx->mxc_peer = peer;
- tx->mxc_conn = conn;
- /* conn ref taken above */
- nob = offsetof(kmx_eager_msg_t, mxem_payload[payload_nob]);
- mxlnd_init_tx_msg (tx, MXLND_MSG_EAGER, nob, nid);
- tx->mxc_match = mxlnd_create_match(tx, 0);
-
- txmsg = tx->mxc_msg;
- txmsg->mxm_u.eager.mxem_hdr = *hdr;
-
- if (payload_kiov != NULL)
- lnet_copy_kiov2flat(MXLND_MSG_SIZE, txmsg,
- offsetof(kmx_msg_t, mxm_u.eager.mxem_payload),
- payload_niov, payload_kiov, payload_offset, payload_nob);
- else
- lnet_copy_iov2flat(MXLND_MSG_SIZE, txmsg,
- offsetof(kmx_msg_t, mxm_u.eager.mxem_payload),
- payload_niov, payload_iov, payload_offset, payload_nob);
-
-        tx->mxc_lntmsg[0] = lntmsg; /* finalize lntmsg on completion */
- mxlnd_queue_tx(tx);
- return 0;
-}
-
-/**
- * mxlnd_recv - the LND required recv function
- * @ni - the LND instance
- * @private - the rx context passed up via lnet_parse()
- * @lntmsg - the LNet msg to complete
- * @delayed - non-zero if the receive had to be delayed
- * @niov - # of entries in the iov or kiov
- * @iov - the virtual address fragments (NULL if kiov is used)
- * @kiov - the page fragments (NULL if iov is used)
- * @offset - offset into the fragments
- * @mlen - # of bytes to receive into the buffers
- * @rlen - total length of the incoming message
- *
- * This must not block.
- */
-int
-mxlnd_recv (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, int delayed,
- unsigned int niov, struct iovec *iov, lnet_kiov_t *kiov,
- unsigned int offset, unsigned int mlen, unsigned int rlen)
-{
- int ret = 0;
- int nob = 0;
- int len = 0;
- kmx_ctx_t *rx = private;
- kmx_msg_t *rxmsg = rx->mxc_msg;
- lnet_nid_t nid = rx->mxc_nid;
- kmx_ctx_t *tx = NULL;
- kmx_msg_t *txmsg = NULL;
- kmx_peer_t *peer = rx->mxc_peer;
- kmx_conn_t *conn = peer->mxp_conn;
- u64 cookie = 0ULL;
- int msg_type = rxmsg->mxm_type;
- int repost = 1;
- int credit = 0;
- int finalize = 0;
-
- LASSERT (mlen <= rlen);
- /* Either all pages or all vaddrs */
- LASSERT (!(kiov != NULL && iov != NULL));
- LASSERT (peer && conn);
-
- /* conn_addref(conn) already taken for the primary rx */
-
- switch (msg_type) {
- case MXLND_MSG_EAGER:
- nob = offsetof(kmx_msg_t, mxm_u.eager.mxem_payload[rlen]);
- len = rx->mxc_status.xfer_length;
- if (unlikely(nob > len)) {
- CNETERR("Eager message from %s too big: %d(%d)\n",
- libcfs_nid2str(nid), nob, len);
- ret = -EPROTO;
- break;
- }
-
- if (kiov != NULL)
- lnet_copy_flat2kiov(niov, kiov, offset,
- MXLND_MSG_SIZE, rxmsg,
- offsetof(kmx_msg_t, mxm_u.eager.mxem_payload),
- mlen);
- else
- lnet_copy_flat2iov(niov, iov, offset,
- MXLND_MSG_SIZE, rxmsg,
- offsetof(kmx_msg_t, mxm_u.eager.mxem_payload),
- mlen);
- finalize = 1;
- credit = 1;
- break;
-
- case MXLND_MSG_PUT_REQ:
- /* we are going to reuse the rx, store the needed info */
- cookie = rxmsg->mxm_u.put_req.mxprm_cookie;
-
- /* get tx, post rx, send PUT_ACK */
-
- tx = mxlnd_get_idle_tx();
- if (unlikely(tx == NULL)) {
- CNETERR("Can't allocate tx for %s\n", libcfs_nid2str(nid));
- /* Not replying will break the connection */
- ret = -ENOMEM;
- break;
- }
- if (unlikely(mlen == 0)) {
- finalize = 1;
- tx->mxc_peer = peer;
- tx->mxc_conn = conn;
- mxlnd_send_nak(tx, nid, MXLND_MSG_PUT_ACK, 0, cookie);
- /* repost = 1 */
- break;
- }
-
- mxlnd_init_tx_msg(tx, MXLND_MSG_PUT_ACK, sizeof(kmx_putack_msg_t), nid);
- tx->mxc_peer = peer;
- tx->mxc_conn = conn;
- /* no need to lock peer first since we already have a ref */
- mxlnd_conn_addref(conn); /* for the tx */
- txmsg = tx->mxc_msg;
- txmsg->mxm_u.put_ack.mxpam_src_cookie = cookie;
- txmsg->mxm_u.put_ack.mxpam_dst_cookie = tx->mxc_cookie;
- tx->mxc_cookie = cookie;
- tx->mxc_match = mxlnd_create_match(tx, 0);
-
- /* we must post a receive _before_ sending the PUT_ACK */
- mxlnd_ctx_init(rx);
- rx->mxc_state = MXLND_CTX_PREP;
- rx->mxc_peer = peer;
- rx->mxc_conn = conn;
- /* do not take another ref for this rx, it is already taken */
- rx->mxc_nid = peer->mxp_nid;
- ret = mxlnd_recv_data(ni, lntmsg, rx, MXLND_MSG_PUT_DATA,
- txmsg->mxm_u.put_ack.mxpam_dst_cookie);
-
- if (unlikely(ret != 0)) {
- /* Notify peer that it's over */
- CNETERR("Can't setup PUT_DATA rx for %s: %d\n",
- libcfs_nid2str(nid), ret);
- mxlnd_ctx_init(tx);
- tx->mxc_state = MXLND_CTX_PREP;
- tx->mxc_peer = peer;
- tx->mxc_conn = conn;
- /* finalize = 0, let the PUT_ACK tx finalize this */
- tx->mxc_lntmsg[0] = rx->mxc_lntmsg[0];
- tx->mxc_lntmsg[1] = rx->mxc_lntmsg[1];
- /* conn ref already taken above */
- mxlnd_send_nak(tx, nid, MXLND_MSG_PUT_ACK, ret, cookie);
- /* repost = 1 */
- break;
- }
-
- mxlnd_queue_tx(tx);
- /* do not return a credit until after PUT_DATA returns */
- repost = 0;
- break;
-
- case MXLND_MSG_GET_REQ:
- cookie = rxmsg->mxm_u.get_req.mxgrm_cookie;
-
- if (likely(lntmsg != NULL)) {
- mxlnd_send_data(ni, lntmsg, rx->mxc_peer, MXLND_MSG_GET_DATA,
- cookie);
- } else {
- /* GET didn't match anything */
- /* The initiator has a rx mapped to [k]iov. We cannot send a nak.
- * We have to embed the error code in the match bits.
- * Send the error in bits 52-59 and the cookie in bits 0-51 */
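-                        /* Sketch of the assumed match-bit layout (the
-                         * authoritative masks live in mxlnd.h; see
-                         * mxlnd_create_match()):
-                         *   bits 60-63: message type
-                         *   bits 52-59: errno (ENODATA here)
-                         *   bits  0-51: tx cookie
-                         * The initiator recovers the errno with
-                         * MXLND_ERROR_VAL(). */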
- tx = mxlnd_get_idle_tx();
- if (unlikely(tx == NULL)) {
- CNETERR("Can't get tx for GET NAK for %s\n",
- libcfs_nid2str(nid));
- /* we can't get a tx, notify the peer that the GET failed */
- mxlnd_send_message(conn->mxk_epa, MXLND_MSG_GET_DATA,
- ENODATA, cookie);
- ret = -ENOMEM;
- break;
- }
- tx->mxc_msg_type = MXLND_MSG_GET_DATA;
- tx->mxc_state = MXLND_CTX_PENDING;
- tx->mxc_nid = nid;
- tx->mxc_peer = peer;
- tx->mxc_conn = conn;
- /* no need to lock peer first since we already have a ref */
- mxlnd_conn_addref(conn); /* for this tx */
- tx->mxc_cookie = cookie;
- tx->mxc_match = mxlnd_create_match(tx, ENODATA);
- tx->mxc_pin_type = MX_PIN_PHYSICAL;
- mxlnd_queue_tx(tx);
- }
- /* finalize lntmsg after tx completes */
- break;
-
- default:
- LBUG();
- }
-
- if (repost) {
- /* we received a message, increment peer's outstanding credits */
- if (credit == 1) {
- spin_lock(&conn->mxk_lock);
- conn->mxk_outstanding++;
- spin_unlock(&conn->mxk_lock);
- }
- /* we are done with the rx */
- mxlnd_put_idle_rx(rx);
- mxlnd_conn_decref(conn);
- }
-
- if (finalize == 1) lnet_finalize(kmxlnd_data.kmx_ni, lntmsg, 0);
-
- /* we received a credit, see if we can use it to send a msg */
- if (credit) mxlnd_check_sends(peer);
-
- return ret;
-}
-
-void
-mxlnd_sleep(unsigned long timeout)
-{
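-        /* interruptible sleep for 'timeout' jiffies; may return early
-         * if the thread is signalled */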
- set_current_state(TASK_INTERRUPTIBLE);
- schedule_timeout(timeout);
- return;
-}
-
-/**
- * mxlnd_tx_queued - the generic send queue thread
- * @arg - thread id (as a void *)
- *
- * This thread moves send messages from the global tx_queue to the owning
- * peer's tx_[msg|data]_queue. If the peer does not exist, it creates one and adds
- * it to the global peer list.
- */
-int
-mxlnd_tx_queued(void *arg)
-{
- long id = (long) arg;
- int ret = 0;
- int found = 0;
- kmx_ctx_t *tx = NULL;
- kmx_peer_t *peer = NULL;
- cfs_list_t *queue = &kmxlnd_data.kmx_tx_queue;
- spinlock_t *tx_q_lock = &kmxlnd_data.kmx_tx_queue_lock;
- rwlock_t *g_lock = &kmxlnd_data.kmx_global_lock;
-
- while (!(atomic_read(&kmxlnd_data.kmx_shutdown))) {
- ret = down_interruptible(&kmxlnd_data.kmx_tx_queue_sem);
- if (atomic_read(&kmxlnd_data.kmx_shutdown))
- break;
- if (ret != 0) /* Should we check for -EINTR? */
- continue;
- spin_lock(tx_q_lock);
- if (cfs_list_empty(&kmxlnd_data.kmx_tx_queue)) {
- spin_unlock(tx_q_lock);
- continue;
- }
- tx = cfs_list_entry(queue->next, kmx_ctx_t, mxc_list);
- cfs_list_del_init(&tx->mxc_list);
- spin_unlock(tx_q_lock);
-
- found = 0;
-                peer = mxlnd_find_peer_by_nid(tx->mxc_nid, 0); /* adds ref */
- if (peer != NULL) {
- tx->mxc_peer = peer;
- write_lock(g_lock);
- if (peer->mxp_conn == NULL) {
- ret = mxlnd_conn_alloc_locked(&peer->mxp_conn,
- peer);
- if (ret != 0) {
- /* out of memory: give up, fail tx */
- tx->mxc_errno = -ENOMEM;
- mxlnd_peer_decref(peer);
- write_unlock(g_lock);
- mxlnd_put_idle_tx(tx);
- continue;
- }
- }
- tx->mxc_conn = peer->mxp_conn;
- mxlnd_conn_addref(tx->mxc_conn); /* for this tx */
- mxlnd_peer_decref(peer); /* drop peer ref taken above */
- write_unlock(g_lock);
- mxlnd_queue_tx(tx);
- found = 1;
- }
- if (found == 0) {
- int hash = 0;
- kmx_peer_t *peer = NULL;
- kmx_peer_t *old = NULL;
-
- hash = mxlnd_nid_to_hash(tx->mxc_nid);
-
- LASSERT(tx->mxc_msg_type != MXLND_MSG_PUT_DATA &&
- tx->mxc_msg_type != MXLND_MSG_GET_DATA);
- /* create peer */
- /* adds conn ref for this function */
- ret = mxlnd_peer_alloc(&peer, tx->mxc_nid,
- *kmxlnd_tunables.kmx_board,
- *kmxlnd_tunables.kmx_ep_id, 0ULL);
- if (ret != 0) {
- /* finalize message */
- tx->mxc_errno = ret;
- mxlnd_put_idle_tx(tx);
- continue;
- }
- tx->mxc_peer = peer;
- tx->mxc_conn = peer->mxp_conn;
- /* this tx will keep the conn ref taken in peer_alloc() */
-
- /* add peer to global peer list, but look to see
- * if someone already created it after we released
- * the read lock */
- write_lock(g_lock);
- old = mxlnd_find_peer_by_nid_locked(peer->mxp_nid);
- if (old) {
- /* we have a peer ref on old */
- if (old->mxp_conn) {
- found = 1;
- } else {
- /* no conn */
- /* drop our ref taken above... */
- mxlnd_peer_decref(old);
- /* and delete it */
- mxlnd_del_peer_locked(old);
- }
- }
-
- if (found == 0) {
- cfs_list_add_tail(&peer->mxp_list,
- &kmxlnd_data.kmx_peers[hash]);
- atomic_inc(&kmxlnd_data.kmx_npeers);
- } else {
- tx->mxc_peer = old;
- tx->mxc_conn = old->mxp_conn;
- LASSERT(old->mxp_conn != NULL);
- mxlnd_conn_addref(old->mxp_conn);
- mxlnd_conn_decref(peer->mxp_conn); /* drop ref taken above.. */
- mxlnd_conn_decref(peer->mxp_conn); /* drop peer's ref */
- mxlnd_peer_decref(peer);
- }
- write_unlock(g_lock);
-
- mxlnd_queue_tx(tx);
- }
- }
- mxlnd_thread_stop(id);
- return 0;
-}
-
-/* When calling this, we must not have the peer lock. */
-void
-mxlnd_iconnect(kmx_peer_t *peer, u8 msg_type)
-{
- mx_return_t mxret = MX_SUCCESS;
- mx_request_t request;
- kmx_conn_t *conn = peer->mxp_conn;
- u64 match = ((u64) msg_type) << MXLND_MSG_OFFSET;
-
-        /* NOTE we are holding a conn ref every time we call this function,
-         * so we do not need to lock the peer before taking another ref */
- mxlnd_conn_addref(conn); /* hold until CONN_REQ or CONN_ACK completes */
-
- LASSERT(msg_type == MXLND_MSG_ICON_REQ || msg_type == MXLND_MSG_ICON_ACK);
-
- if (peer->mxp_reconnect_time == 0) {
- peer->mxp_reconnect_time = jiffies;
- }
-
- if (peer->mxp_nic_id == 0ULL) {
- int ret = 0;
-
- ret = mxlnd_ip2nic_id(LNET_NIDADDR(peer->mxp_nid),
- &peer->mxp_nic_id, MXLND_LOOKUP_COUNT);
- if (ret == 0) {
- mx_nic_id_to_board_number(peer->mxp_nic_id, &peer->mxp_board);
- }
- if (peer->mxp_nic_id == 0ULL && conn->mxk_status == MXLND_CONN_WAIT) {
- /* not mapped yet, return */
- spin_lock(&conn->mxk_lock);
- mxlnd_set_conn_status(conn, MXLND_CONN_INIT);
- spin_unlock(&conn->mxk_lock);
- }
- }
-
- if (cfs_time_after(jiffies,
- peer->mxp_reconnect_time + MXLND_CONNECT_TIMEOUT) &&
- conn->mxk_status != MXLND_CONN_DISCONNECT) {
- /* give up and notify LNET */
- CDEBUG(D_NET, "timeout trying to connect to %s\n",
- libcfs_nid2str(peer->mxp_nid));
- mxlnd_conn_disconnect(conn, 0, 0);
- mxlnd_conn_decref(conn);
- return;
- }
-
- mxret = mx_iconnect(kmxlnd_data.kmx_endpt, peer->mxp_nic_id,
- peer->mxp_ep_id, MXLND_MSG_MAGIC, match,
- (void *) peer, &request);
-        if (unlikely(mxret != MX_SUCCESS)) {
-                spin_lock(&conn->mxk_lock);
-                mxlnd_set_conn_status(conn, MXLND_CONN_FAIL);
-                spin_unlock(&conn->mxk_lock);
-                CNETERR("mx_iconnect() failed with %s (%d) to %s\n",
-                        mx_strerror(mxret), mxret, libcfs_nid2str(peer->mxp_nid));
-                mxlnd_conn_decref(conn);
-                /* the request was never posted; do not set a timeout on it */
-                return;
-        }
- mx_set_request_timeout(kmxlnd_data.kmx_endpt, request,
- jiffies_to_msecs(MXLND_CONNECT_TIMEOUT));
- return;
-}
-
-#define MXLND_STATS 0
-
-int
-mxlnd_check_sends(kmx_peer_t *peer)
-{
- int ret = 0;
- int found = 0;
- mx_return_t mxret = MX_SUCCESS;
- kmx_ctx_t *tx = NULL;
- kmx_conn_t *conn = NULL;
- u8 msg_type = 0;
- int credit = 0;
- int status = 0;
- int ntx_posted = 0;
- int credits = 0;
-#if MXLND_STATS
- static unsigned long last = 0;
-#endif
-
- if (unlikely(peer == NULL)) {
- LASSERT(peer != NULL);
- return -1;
- }
- write_lock(&kmxlnd_data.kmx_global_lock);
- conn = peer->mxp_conn;
- /* NOTE take a ref for the duration of this function since it is
- * called when there might not be any queued txs for this peer */
- if (conn) {
- if (conn->mxk_status == MXLND_CONN_DISCONNECT) {
- write_unlock(&kmxlnd_data.kmx_global_lock);
- return -1;
- }
- mxlnd_conn_addref(conn); /* for duration of this function */
- }
- write_unlock(&kmxlnd_data.kmx_global_lock);
-
- /* do not add another ref for this tx */
-
- if (conn == NULL) {
- /* we do not have any conns */
- CNETERR("peer %s has no conn\n", libcfs_nid2str(peer->mxp_nid));
- return -1;
- }
-
-#if MXLND_STATS
- if (cfs_time_after(jiffies, last)) {
- last = jiffies + msecs_to_jiffies(MSEC_PER_SEC);
- CDEBUG(D_NET, "status= %s credits= %d outstanding= %d ntx_msgs= %d "
- "ntx_posted= %d ntx_data= %d data_posted= %d\n",
- mxlnd_connstatus_to_str(conn->mxk_status), conn->mxk_credits,
- conn->mxk_outstanding, conn->mxk_ntx_msgs, conn->mxk_ntx_posted,
- conn->mxk_ntx_data, conn->mxk_data_posted);
- }
-#endif
-
- spin_lock(&conn->mxk_lock);
- ntx_posted = conn->mxk_ntx_posted;
- credits = conn->mxk_credits;
-
- LASSERT(ntx_posted <= *kmxlnd_tunables.kmx_peercredits);
- LASSERT(ntx_posted >= 0);
-
- LASSERT(credits <= *kmxlnd_tunables.kmx_peercredits);
- LASSERT(credits >= 0);
-
- /* check number of queued msgs, ignore data */
- if (conn->mxk_outstanding >= MXLND_CREDIT_HIGHWATER()) {
- /* check if any txs queued that could return credits... */
- if (cfs_list_empty(&conn->mxk_tx_credit_queue) ||
- conn->mxk_ntx_msgs == 0) {
- /* if not, send a NOOP */
- tx = mxlnd_get_idle_tx();
- if (likely(tx != NULL)) {
- tx->mxc_peer = peer;
- tx->mxc_conn = peer->mxp_conn;
- mxlnd_conn_addref(conn); /* for this tx */
- mxlnd_init_tx_msg (tx, MXLND_MSG_NOOP, 0, peer->mxp_nid);
- tx->mxc_match = mxlnd_create_match(tx, 0);
- mxlnd_peer_queue_tx_locked(tx);
- found = 1;
- goto done_locked;
- }
- }
- }
-
- /* if the peer is not ready, try to connect */
- if (unlikely(conn->mxk_status == MXLND_CONN_INIT ||
- conn->mxk_status == MXLND_CONN_FAIL)) {
- CDEBUG(D_NET, "status=%s\n", mxlnd_connstatus_to_str(conn->mxk_status));
- mxlnd_set_conn_status(conn, MXLND_CONN_WAIT);
- spin_unlock(&conn->mxk_lock);
- mxlnd_iconnect(peer, (u8) MXLND_MSG_ICON_REQ);
- goto done;
- }
-
- while (!cfs_list_empty(&conn->mxk_tx_free_queue) ||
- !cfs_list_empty(&conn->mxk_tx_credit_queue)) {
-                /* We have something to send. If we have a queued tx that does
-                 * not require a credit (free), choose it first: its completion
-                 * will either return a credit (here or at the peer) or
-                 * complete a DATA, CONN_REQ or CONN_ACK. */
- cfs_list_t *tmp_tx = NULL;
- if (!cfs_list_empty(&conn->mxk_tx_free_queue)) {
- tmp_tx = &conn->mxk_tx_free_queue;
- } else {
- tmp_tx = &conn->mxk_tx_credit_queue;
- }
- tx = cfs_list_entry(tmp_tx->next, kmx_ctx_t, mxc_list);
-
- msg_type = tx->mxc_msg_type;
-
- /* don't try to send a rx */
- LASSERT(tx->mxc_type == MXLND_REQ_TX);
-
- /* ensure that it is a valid msg type */
- LASSERT(msg_type == MXLND_MSG_CONN_REQ ||
- msg_type == MXLND_MSG_CONN_ACK ||
- msg_type == MXLND_MSG_NOOP ||
- msg_type == MXLND_MSG_EAGER ||
- msg_type == MXLND_MSG_PUT_REQ ||
- msg_type == MXLND_MSG_PUT_ACK ||
- msg_type == MXLND_MSG_PUT_DATA ||
- msg_type == MXLND_MSG_GET_REQ ||
- msg_type == MXLND_MSG_GET_DATA);
- LASSERT(tx->mxc_peer == peer);
- LASSERT(tx->mxc_nid == peer->mxp_nid);
-
- credit = mxlnd_tx_requires_credit(tx);
- if (credit) {
-
- if (conn->mxk_ntx_posted == *kmxlnd_tunables.kmx_peercredits) {
- CDEBUG(D_NET, "%s: posted enough\n",
- libcfs_nid2str(peer->mxp_nid));
- goto done_locked;
- }
-
- if (conn->mxk_credits == 0) {
- CDEBUG(D_NET, "%s: no credits\n",
- libcfs_nid2str(peer->mxp_nid));
- goto done_locked;
- }
-
- if (conn->mxk_credits == 1 && /* last credit reserved for */
- conn->mxk_outstanding == 0) { /* giving back credits */
- CDEBUG(D_NET, "%s: not using last credit\n",
- libcfs_nid2str(peer->mxp_nid));
- goto done_locked;
- }
- }
-
- if (unlikely(conn->mxk_status != MXLND_CONN_READY)) {
-                        if (!(msg_type == MXLND_MSG_CONN_REQ ||
-                              msg_type == MXLND_MSG_CONN_ACK)) {
- CDEBUG(D_NET, "peer status is %s for tx 0x%llx (%s)\n",
- mxlnd_connstatus_to_str(conn->mxk_status),
- tx->mxc_cookie,
- mxlnd_msgtype_to_str(tx->mxc_msg_type));
- if (conn->mxk_status == MXLND_CONN_DISCONNECT ||
- cfs_time_aftereq(jiffies, tx->mxc_deadline)) {
- cfs_list_del_init(&tx->mxc_list);
- tx->mxc_errno = -ECONNABORTED;
- spin_unlock(&conn->mxk_lock);
- mxlnd_put_idle_tx(tx);
- mxlnd_conn_decref(conn);
- goto done;
- }
- goto done_locked;
- }
- }
-
- cfs_list_del_init(&tx->mxc_list);
-
- /* handle credits, etc now while we have the lock to avoid races */
- if (credit) {
- conn->mxk_credits--;
- conn->mxk_ntx_posted++;
- }
- if (msg_type != MXLND_MSG_PUT_DATA &&
- msg_type != MXLND_MSG_GET_DATA) {
- if (msg_type != MXLND_MSG_CONN_REQ &&
- msg_type != MXLND_MSG_CONN_ACK) {
- conn->mxk_ntx_msgs--;
- }
- }
- if (tx->mxc_incarnation == 0 &&
- conn->mxk_incarnation != 0) {
- tx->mxc_incarnation = conn->mxk_incarnation;
- }
-
- /* if this is a NOOP and (1) mxp_conn->mxk_outstanding < CREDIT_HIGHWATER
- * or (2) there is a non-DATA msg that can return credits in the
- * queue, then drop this duplicate NOOP */
- if (unlikely(msg_type == MXLND_MSG_NOOP)) {
- if ((conn->mxk_outstanding < MXLND_CREDIT_HIGHWATER()) ||
- (conn->mxk_ntx_msgs >= 1)) {
- conn->mxk_credits++;
- conn->mxk_ntx_posted--;
- spin_unlock(&conn->mxk_lock);
- /* redundant NOOP */
- mxlnd_put_idle_tx(tx);
- mxlnd_conn_decref(conn);
- CDEBUG(D_NET, "%s: redundant noop\n",
- libcfs_nid2str(peer->mxp_nid));
- found = 1;
- goto done;
- }
- }
-
- found = 1;
- if (likely((msg_type != MXLND_MSG_PUT_DATA) &&
- (msg_type != MXLND_MSG_GET_DATA))) {
- mxlnd_pack_msg_locked(tx);
- }
-
- mxret = MX_SUCCESS;
-
- status = conn->mxk_status;
- spin_unlock(&conn->mxk_lock);
-
- if (likely((status == MXLND_CONN_READY) ||
- (msg_type == MXLND_MSG_CONN_REQ) ||
- (msg_type == MXLND_MSG_CONN_ACK))) {
- ret = 0;
- if (msg_type != MXLND_MSG_CONN_REQ &&
- msg_type != MXLND_MSG_CONN_ACK) {
- /* add to the pending list */
- ret = mxlnd_q_pending_ctx(tx);
- } else {
- /* CONN_REQ/ACK */
- tx->mxc_state = MXLND_CTX_PENDING;
- }
-
- if (ret == 0) {
- if (likely(msg_type != MXLND_MSG_PUT_DATA &&
- msg_type != MXLND_MSG_GET_DATA)) {
- /* send a msg style tx */
- LASSERT(tx->mxc_nseg == 1);
- LASSERT(tx->mxc_pin_type == MX_PIN_PHYSICAL);
- CDEBUG(D_NET, "sending %s 0x%llx\n",
- mxlnd_msgtype_to_str(msg_type),
- tx->mxc_cookie);
- mxret = mx_kisend(kmxlnd_data.kmx_endpt,
- &tx->mxc_seg,
- tx->mxc_nseg,
- tx->mxc_pin_type,
- conn->mxk_epa,
- tx->mxc_match,
- (void *) tx,
- &tx->mxc_mxreq);
- } else {
- /* send a DATA tx */
- spin_lock(&conn->mxk_lock);
- conn->mxk_ntx_data--;
- conn->mxk_data_posted++;
- spin_unlock(&conn->mxk_lock);
- CDEBUG(D_NET, "sending %s 0x%llx\n",
- mxlnd_msgtype_to_str(msg_type),
- tx->mxc_cookie);
- mxret = mx_kisend(kmxlnd_data.kmx_endpt,
- tx->mxc_seg_list,
- tx->mxc_nseg,
- tx->mxc_pin_type,
- conn->mxk_epa,
- tx->mxc_match,
- (void *) tx,
- &tx->mxc_mxreq);
- }
- } else {
- /* ret != 0 */
- mxret = MX_CONNECTION_FAILED;
- }
- if (likely(mxret == MX_SUCCESS)) {
- ret = 0;
- } else {
- CNETERR("mx_kisend() failed with %s (%d) "
- "sending to %s\n", mx_strerror(mxret), (int) mxret,
- libcfs_nid2str(peer->mxp_nid));
- /* NOTE mx_kisend() only fails if there are not enough
- * resources. Do not change the connection status. */
- if (mxret == MX_NO_RESOURCES) {
- tx->mxc_errno = -ENOMEM;
- } else {
- tx->mxc_errno = -ECONNABORTED;
- }
- if (credit) {
- spin_lock(&conn->mxk_lock);
- conn->mxk_ntx_posted--;
- conn->mxk_credits++;
- spin_unlock(&conn->mxk_lock);
- } else if (msg_type == MXLND_MSG_PUT_DATA ||
- msg_type == MXLND_MSG_GET_DATA) {
- spin_lock(&conn->mxk_lock);
- conn->mxk_data_posted--;
- spin_unlock(&conn->mxk_lock);
- }
- if (msg_type != MXLND_MSG_PUT_DATA &&
- msg_type != MXLND_MSG_GET_DATA &&
- msg_type != MXLND_MSG_CONN_REQ &&
- msg_type != MXLND_MSG_CONN_ACK) {
- spin_lock(&conn->mxk_lock);
- conn->mxk_outstanding +=
- tx->mxc_msg->mxm_credits;
- spin_unlock(&conn->mxk_lock);
- }
- if (msg_type != MXLND_MSG_CONN_REQ &&
- msg_type != MXLND_MSG_CONN_ACK) {
- /* remove from the pending list */
- mxlnd_deq_pending_ctx(tx);
- }
- mxlnd_put_idle_tx(tx);
- mxlnd_conn_decref(conn);
- }
- }
- spin_lock(&conn->mxk_lock);
- }
-done_locked:
- spin_unlock(&conn->mxk_lock);
-done:
- mxlnd_conn_decref(conn); /* drop ref taken at start of function */
- return found;
-}
-
-
-/**
- * mxlnd_handle_tx_completion - a tx completed, progress or complete the msg
- * @ctx - the tx descriptor
- *
- * Determine which type of send request this was and start the next step
- * if needed, or, if done, signal completion to LNET. Afterwards, return
- * the tx to the idle list.
- */
-void
-mxlnd_handle_tx_completion(kmx_ctx_t *tx)
-{
- int code = tx->mxc_status.code;
- int failed = (code != MX_STATUS_SUCCESS || tx->mxc_errno != 0);
- kmx_msg_t *msg = tx->mxc_msg;
- kmx_peer_t *peer = tx->mxc_peer;
- kmx_conn_t *conn = tx->mxc_conn;
- u8 type = tx->mxc_msg_type;
- int credit = mxlnd_tx_requires_credit(tx);
- u64 cookie = tx->mxc_cookie;
-
- CDEBUG(D_NET, "entering %s (0x%llx):\n",
- mxlnd_msgtype_to_str(tx->mxc_msg_type), cookie);
-
- LASSERT (peer != NULL);
- LASSERT (conn != NULL);
-
- if (type != MXLND_MSG_PUT_DATA && type != MXLND_MSG_GET_DATA) {
- LASSERT (type == msg->mxm_type);
- }
-
- if (failed) {
- if (tx->mxc_errno == 0) tx->mxc_errno = -EIO;
- } else {
- spin_lock(&conn->mxk_lock);
- conn->mxk_last_tx = cfs_time_current(); /* jiffies */
- spin_unlock(&conn->mxk_lock);
- }
-
- switch (type) {
-
- case MXLND_MSG_GET_DATA:
- spin_lock(&conn->mxk_lock);
- if (conn->mxk_incarnation == tx->mxc_incarnation) {
- conn->mxk_outstanding++;
- conn->mxk_data_posted--;
- }
- spin_unlock(&conn->mxk_lock);
- break;
-
- case MXLND_MSG_PUT_DATA:
- spin_lock(&conn->mxk_lock);
- if (conn->mxk_incarnation == tx->mxc_incarnation) {
- conn->mxk_data_posted--;
- }
- spin_unlock(&conn->mxk_lock);
- break;
-
- case MXLND_MSG_NOOP:
- case MXLND_MSG_PUT_REQ:
- case MXLND_MSG_PUT_ACK:
- case MXLND_MSG_GET_REQ:
- case MXLND_MSG_EAGER:
- break;
-
- case MXLND_MSG_CONN_ACK:
- if (peer->mxp_incompatible) {
- /* we sent our params, now close this conn */
- mxlnd_conn_disconnect(conn, 0, 1);
- }
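-                /* fall through: a failed CONN_ACK shares the CONN_REQ
-                 * error handling below */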
- case MXLND_MSG_CONN_REQ:
- if (failed) {
- CNETERR("%s failed with %s (%d) (errno = %d) to %s\n",
- type == MXLND_MSG_CONN_REQ ? "CONN_REQ" : "CONN_ACK",
- mx_strstatus(code), code, tx->mxc_errno,
- libcfs_nid2str(tx->mxc_nid));
- if (!peer->mxp_incompatible) {
- spin_lock(&conn->mxk_lock);
- if (code == MX_STATUS_BAD_SESSION)
- mxlnd_set_conn_status(conn,
- MXLND_CONN_INIT);
- else
- mxlnd_set_conn_status(conn,
- MXLND_CONN_FAIL);
- spin_unlock(&conn->mxk_lock);
- }
- }
- break;
-
- default:
- CNETERR("Unknown msg type of %d\n", type);
- LBUG();
- }
-
- if (credit) {
- spin_lock(&conn->mxk_lock);
- if (conn->mxk_incarnation == tx->mxc_incarnation) {
- conn->mxk_ntx_posted--;
- }
- spin_unlock(&conn->mxk_lock);
- }
-
- mxlnd_put_idle_tx(tx);
- mxlnd_conn_decref(conn);
-
- mxlnd_check_sends(peer);
-
- CDEBUG(D_NET, "leaving\n");
- return;
-}
-
-/* Handle completion of MSG or DATA rx.
- * CONN_REQ and CONN_ACK are handled elsewhere. */
-void
-mxlnd_handle_rx_completion(kmx_ctx_t *rx)
-{
- int ret = 0;
- int repost = 1;
- int credit = 1;
- u32 nob = rx->mxc_status.xfer_length;
- u64 bits = rx->mxc_status.match_info;
- kmx_msg_t *msg = rx->mxc_msg;
- kmx_peer_t *peer = rx->mxc_peer;
- kmx_conn_t *conn = rx->mxc_conn;
- u8 type = rx->mxc_msg_type;
- u64 seq = bits;
- lnet_msg_t *lntmsg[2];
- int result = 0;
- int peer_ref = 0;
- int conn_ref = 0;
-
-        /* NOTE We may only know the peer's nid if it is a PUT_REQ, GET_REQ,
-         * or a failed GET reply */
-
- /* NOTE peer may still be NULL if it is a new peer and
- * conn may be NULL if this is a re-connect */
- if (likely(peer != NULL && conn != NULL)) {
- /* we have a reference on the conn */
- conn_ref = 1;
- } else if (peer != NULL && conn == NULL) {
- /* we have a reference on the peer */
- peer_ref = 1;
- } else if (peer == NULL && conn != NULL) {
- /* fatal error */
- CERROR("rx 0x%llx from %s has conn but no peer\n",
- bits, libcfs_nid2str(rx->mxc_nid));
- LBUG();
- } /* else peer and conn == NULL */
-
- if (conn == NULL && peer != NULL) {
- write_lock(&kmxlnd_data.kmx_global_lock);
- conn = peer->mxp_conn;
- if (conn) {
- mxlnd_conn_addref(conn); /* conn takes ref... */
- mxlnd_peer_decref(peer); /* from peer */
- conn_ref = 1;
- peer_ref = 0;
- }
- write_unlock(&kmxlnd_data.kmx_global_lock);
- rx->mxc_conn = conn;
- }
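-        /* NOTE conn may still be NULL here (e.g. a first CONN_REQ from a
-         * new peer); the cleanup path below checks conn before using it */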
-
-#if MXLND_DEBUG
- CDEBUG(D_NET, "receiving msg bits=0x%llx nob=%d peer=0x%p\n", bits, nob, peer);
-#endif
-
- lntmsg[0] = NULL;
- lntmsg[1] = NULL;
-
- if (rx->mxc_status.code != MX_STATUS_SUCCESS &&
- rx->mxc_status.code != MX_STATUS_TRUNCATED) {
- CNETERR("rx from %s failed with %s (%d)\n",
- libcfs_nid2str(rx->mxc_nid),
- mx_strstatus(rx->mxc_status.code),
- rx->mxc_status.code);
- credit = 0;
- goto cleanup;
- }
-
- if (nob == 0) {
- /* this may be a failed GET reply */
- if (type == MXLND_MSG_GET_DATA) {
- /* get the error (52-59) bits from the match bits */
- ret = (u32) MXLND_ERROR_VAL(rx->mxc_status.match_info);
- lntmsg[0] = rx->mxc_lntmsg[0];
- result = -ret;
- goto cleanup;
- } else {
- /* we had a rx complete with 0 bytes (no hdr, nothing) */
- CNETERR("rx from %s returned with 0 bytes\n",
- libcfs_nid2str(rx->mxc_nid));
- goto cleanup;
- }
- }
-
- /* NOTE PUT_DATA and GET_DATA do not have mxc_msg, do not call unpack() */
- if (type == MXLND_MSG_PUT_DATA) {
- /* result = 0; */
- lntmsg[0] = rx->mxc_lntmsg[0];
- goto cleanup;
- } else if (type == MXLND_MSG_GET_DATA) {
- /* result = 0; */
- lntmsg[0] = rx->mxc_lntmsg[0];
- lntmsg[1] = rx->mxc_lntmsg[1];
- goto cleanup;
- }
-
- ret = mxlnd_unpack_msg(msg, nob);
- if (ret != 0) {
- CNETERR("Error %d unpacking rx from %s\n",
- ret, libcfs_nid2str(rx->mxc_nid));
- goto cleanup;
- }
- rx->mxc_nob = nob;
- type = msg->mxm_type;
-
- if (rx->mxc_nid != msg->mxm_srcnid ||
- kmxlnd_data.kmx_ni->ni_nid != msg->mxm_dstnid) {
- CNETERR("rx with mismatched NID (type %s) (my nid is "
- "0x%llx and rx msg dst is 0x%llx)\n",
- mxlnd_msgtype_to_str(type), kmxlnd_data.kmx_ni->ni_nid,
- msg->mxm_dstnid);
- goto cleanup;
- }
-
- if ((conn != NULL && msg->mxm_srcstamp != conn->mxk_incarnation) ||
- msg->mxm_dststamp != kmxlnd_data.kmx_incarnation) {
- CNETERR("Stale rx from %s with type %s "
- "(mxm_srcstamp (%lld) != mxk_incarnation (%lld) "
- "|| mxm_dststamp (%lld) != kmx_incarnation (%lld))\n",
- libcfs_nid2str(rx->mxc_nid), mxlnd_msgtype_to_str(type),
- msg->mxm_srcstamp, conn->mxk_incarnation,
- msg->mxm_dststamp, kmxlnd_data.kmx_incarnation);
- credit = 0;
- goto cleanup;
- }
-
- CDEBUG(D_NET, "Received %s with %d credits\n",
- mxlnd_msgtype_to_str(type), msg->mxm_credits);
-
- LASSERT(peer != NULL && conn != NULL);
- if (msg->mxm_credits != 0) {
- spin_lock(&conn->mxk_lock);
- if (msg->mxm_srcstamp == conn->mxk_incarnation) {
- if ((conn->mxk_credits + msg->mxm_credits) >
- *kmxlnd_tunables.kmx_peercredits) {
- CNETERR("mxk_credits %d mxm_credits %d\n",
- conn->mxk_credits, msg->mxm_credits);
- }
- conn->mxk_credits += msg->mxm_credits;
- LASSERT(conn->mxk_credits >= 0);
- LASSERT(conn->mxk_credits <= *kmxlnd_tunables.kmx_peercredits);
- }
- spin_unlock(&conn->mxk_lock);
- }
-
- CDEBUG(D_NET, "switch %s for rx (0x%llx)\n", mxlnd_msgtype_to_str(type), seq);
- switch (type) {
- case MXLND_MSG_NOOP:
- break;
-
- case MXLND_MSG_EAGER:
- ret = lnet_parse(kmxlnd_data.kmx_ni, &msg->mxm_u.eager.mxem_hdr,
- msg->mxm_srcnid, rx, 0);
- repost = ret < 0;
- break;
-
- case MXLND_MSG_PUT_REQ:
- ret = lnet_parse(kmxlnd_data.kmx_ni, &msg->mxm_u.put_req.mxprm_hdr,
- msg->mxm_srcnid, rx, 1);
- repost = ret < 0;
- break;
-
- case MXLND_MSG_PUT_ACK: {
- u64 cookie = (u64) msg->mxm_u.put_ack.mxpam_dst_cookie;
- if (cookie > MXLND_MAX_COOKIE) {
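-                /* a value above MXLND_MAX_COOKIE cannot be a real cookie;
-                 * this is a NAK, with an errno encoded in the upper bits
-                 * (cf. mxlnd_send_nak() and MXLND_ERROR_VAL()) */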
- CNETERR("NAK for msg_type %d from %s\n", rx->mxc_msg_type,
- libcfs_nid2str(rx->mxc_nid));
- result = -((u32) MXLND_ERROR_VAL(cookie));
- lntmsg[0] = rx->mxc_lntmsg[0];
- } else {
- mxlnd_send_data(kmxlnd_data.kmx_ni, rx->mxc_lntmsg[0],
- rx->mxc_peer, MXLND_MSG_PUT_DATA,
- rx->mxc_msg->mxm_u.put_ack.mxpam_dst_cookie);
- }
- /* repost == 1 */
- break;
- }
- case MXLND_MSG_GET_REQ:
- ret = lnet_parse(kmxlnd_data.kmx_ni, &msg->mxm_u.get_req.mxgrm_hdr,
- msg->mxm_srcnid, rx, 1);
- repost = ret < 0;
- break;
-
- default:
- CNETERR("Bad MXLND message type %x from %s\n", msg->mxm_type,
- libcfs_nid2str(rx->mxc_nid));
- ret = -EPROTO;
- break;
- }
-
- if (ret < 0) {
- CDEBUG(D_NET, "setting PEER_CONN_FAILED\n");
- spin_lock(&conn->mxk_lock);
- mxlnd_set_conn_status(conn, MXLND_CONN_FAIL);
- spin_unlock(&conn->mxk_lock);
- }
-
-cleanup:
- if (conn != NULL) {
- spin_lock(&conn->mxk_lock);
- conn->mxk_last_rx = cfs_time_current(); /* jiffies */
- spin_unlock(&conn->mxk_lock);
- }
-
- if (repost) {
- /* lnet_parse() failed, etc., repost now */
- mxlnd_put_idle_rx(rx);
- if (conn != NULL && credit == 1) {
- if (type == MXLND_MSG_PUT_DATA ||
- type == MXLND_MSG_EAGER ||
- type == MXLND_MSG_PUT_REQ ||
- type == MXLND_MSG_NOOP) {
- spin_lock(&conn->mxk_lock);
- conn->mxk_outstanding++;
- spin_unlock(&conn->mxk_lock);
- }
- }
- if (conn_ref) mxlnd_conn_decref(conn);
- LASSERT(peer_ref == 0);
- }
-
- if (type == MXLND_MSG_PUT_DATA || type == MXLND_MSG_GET_DATA) {
- CDEBUG(D_NET, "leaving for rx (0x%llx)\n", bits);
- } else {
- CDEBUG(D_NET, "leaving for rx (0x%llx)\n", seq);
- }
-
- if (lntmsg[0] != NULL) lnet_finalize(kmxlnd_data.kmx_ni, lntmsg[0], result);
- if (lntmsg[1] != NULL) lnet_finalize(kmxlnd_data.kmx_ni, lntmsg[1], result);
-
- if (conn != NULL && credit == 1) mxlnd_check_sends(peer);
-
- return;
-}
-
-void
-mxlnd_handle_connect_msg(kmx_peer_t *peer, u8 msg_type, mx_status_t status)
-{
- kmx_ctx_t *tx = NULL;
- kmx_msg_t *txmsg = NULL;
- kmx_conn_t *conn = peer->mxp_conn;
- u64 nic_id = 0ULL;
- u32 ep_id = 0;
- u32 sid = 0;
- u8 type = (msg_type == MXLND_MSG_ICON_REQ ?
- MXLND_MSG_CONN_REQ : MXLND_MSG_CONN_ACK);
-
- /* a conn ref was taken when calling mx_iconnect(),
- * hold it until CONN_REQ or CONN_ACK completes */
-
- CDEBUG(D_NET, "entering\n");
- if (status.code != MX_STATUS_SUCCESS) {
- int send_bye = (msg_type == MXLND_MSG_ICON_REQ ? 0 : 1);
-
- CNETERR("mx_iconnect() failed for %s with %s (%d) "
- "to %s mxp_nid = 0x%llx mxp_nic_id = 0x%0llx mxp_ep_id = %d\n",
- mxlnd_msgtype_to_str(msg_type),
- mx_strstatus(status.code), status.code,
- libcfs_nid2str(peer->mxp_nid),
- peer->mxp_nid,
- peer->mxp_nic_id,
- peer->mxp_ep_id);
- spin_lock(&conn->mxk_lock);
- mxlnd_set_conn_status(conn, MXLND_CONN_FAIL);
- spin_unlock(&conn->mxk_lock);
-
- if (cfs_time_after(jiffies, peer->mxp_reconnect_time +
- MXLND_CONNECT_TIMEOUT)) {
- CNETERR("timeout, calling conn_disconnect()\n");
- mxlnd_conn_disconnect(conn, 0, send_bye);
- }
-
- mxlnd_conn_decref(conn);
- return;
- }
- mx_decompose_endpoint_addr2(status.source, &nic_id, &ep_id, &sid);
- write_lock(&kmxlnd_data.kmx_global_lock);
- spin_lock(&conn->mxk_lock);
- conn->mxk_epa = status.source;
- mx_set_endpoint_addr_context(conn->mxk_epa, (void *) conn);
- if (msg_type == MXLND_MSG_ICON_ACK && likely(!peer->mxp_incompatible)) {
- mxlnd_set_conn_status(conn, MXLND_CONN_READY);
- }
- spin_unlock(&conn->mxk_lock);
- write_unlock(&kmxlnd_data.kmx_global_lock);
-
- /* mx_iconnect() succeeded, reset delay to 0 */
- write_lock(&kmxlnd_data.kmx_global_lock);
- peer->mxp_reconnect_time = 0;
- peer->mxp_conn->mxk_sid = sid;
- write_unlock(&kmxlnd_data.kmx_global_lock);
-
- /* marshal CONN_REQ or CONN_ACK msg */
- /* we are still using the conn ref from iconnect() - do not take another */
- tx = mxlnd_get_idle_tx();
- if (tx == NULL) {
- CNETERR("Can't obtain %s tx for %s\n",
- mxlnd_msgtype_to_str(type),
- libcfs_nid2str(peer->mxp_nid));
- spin_lock(&conn->mxk_lock);
- mxlnd_set_conn_status(conn, MXLND_CONN_FAIL);
- spin_unlock(&conn->mxk_lock);
- mxlnd_conn_decref(conn);
- return;
- }
-
- tx->mxc_peer = peer;
- tx->mxc_conn = conn;
- tx->mxc_deadline = jiffies + MXLND_CONNECT_TIMEOUT;
- CDEBUG(D_NET, "sending %s\n", mxlnd_msgtype_to_str(type));
- mxlnd_init_tx_msg (tx, type, sizeof(kmx_connreq_msg_t), peer->mxp_nid);
- txmsg = tx->mxc_msg;
- txmsg->mxm_u.conn_req.mxcrm_queue_depth = *kmxlnd_tunables.kmx_peercredits;
- txmsg->mxm_u.conn_req.mxcrm_eager_size = MXLND_MSG_SIZE;
- tx->mxc_match = mxlnd_create_match(tx, 0);
-
- mxlnd_queue_tx(tx);
- return;
-}
-
-/**
- * mxlnd_request_waitd - the MX request completion thread(s)
- * @arg - thread id (as a void *)
- *
- * This thread waits for an MX completion and then completes the request.
- * We will create one thread per CPU.
- */
-int
-mxlnd_request_waitd(void *arg)
-{
- long id = (long) arg;
- __u32 result = 0;
- mx_return_t mxret = MX_SUCCESS;
- mx_status_t status;
- kmx_ctx_t *ctx = NULL;
- enum kmx_req_state req_type = MXLND_REQ_TX;
- kmx_peer_t *peer = NULL;
- kmx_conn_t *conn = NULL;
-#if MXLND_POLLING
- int count = 0;
-#endif
-
- memset(&status, 0, sizeof(status));
-
-        CDEBUG(D_NET, "%s starting\n", __func__);
-
- while (!(atomic_read(&kmxlnd_data.kmx_shutdown))) {
- u8 msg_type = 0;
-
- mxret = MX_SUCCESS;
- result = 0;
-#if MXLND_POLLING
- if (id == 0 && count++ < *kmxlnd_tunables.kmx_polling) {
- mxret = mx_test_any(kmxlnd_data.kmx_endpt, 0ULL, 0ULL,
- &status, &result);
- } else {
- count = 0;
- mxret = mx_wait_any(kmxlnd_data.kmx_endpt, MXLND_WAIT_TIMEOUT,
- 0ULL, 0ULL, &status, &result);
- }
-#else
- mxret = mx_wait_any(kmxlnd_data.kmx_endpt, MXLND_WAIT_TIMEOUT,
- 0ULL, 0ULL, &status, &result);
-#endif
- if (unlikely(atomic_read(&kmxlnd_data.kmx_shutdown)))
- break;
-
- if (result != 1) {
- /* nothing completed... */
- continue;
- }
-
- CDEBUG(D_NET, "wait_any() returned with %s (%d) with "
- "match_info 0x%llx and length %d\n",
- mx_strstatus(status.code), status.code,
- (u64) status.match_info, status.msg_length);
-
- if (status.code != MX_STATUS_SUCCESS) {
- CNETERR("wait_any() failed with %s (%d) with "
- "match_info 0x%llx and length %d\n",
- mx_strstatus(status.code), status.code,
- (u64) status.match_info, status.msg_length);
- }
-
- msg_type = MXLND_MSG_TYPE(status.match_info);
-
- /* This may be a mx_iconnect() request completing,
- * check the bit mask for CONN_REQ and CONN_ACK */
- if (msg_type == MXLND_MSG_ICON_REQ ||
- msg_type == MXLND_MSG_ICON_ACK) {
- peer = (kmx_peer_t*) status.context;
- mxlnd_handle_connect_msg(peer, msg_type, status);
- continue;
- }
-
- /* This must be a tx or rx */
-
- /* NOTE: if this is a RX from the unexpected callback, it may
- * have very little info. If we dropped it in unexpected_recv(),
- * it will not have a context. If so, ignore it. */
- ctx = (kmx_ctx_t *) status.context;
- if (ctx != NULL) {
-
- req_type = ctx->mxc_type;
- conn = ctx->mxc_conn; /* this may be NULL */
- mxlnd_deq_pending_ctx(ctx);
-
- /* copy status to ctx->mxc_status */
- ctx->mxc_status = status;
-
- switch (req_type) {
- case MXLND_REQ_TX:
- mxlnd_handle_tx_completion(ctx);
- break;
- case MXLND_REQ_RX:
- mxlnd_handle_rx_completion(ctx);
- break;
- default:
- CNETERR("Unknown ctx type %d\n", req_type);
- LBUG();
- break;
- }
-
- /* conn is always set except for the first CONN_REQ rx
- * from a new peer */
- if (status.code != MX_STATUS_SUCCESS && conn != NULL) {
- mxlnd_conn_disconnect(conn, 1, 1);
- }
- }
- CDEBUG(D_NET, "waitd() completed task\n");
- }
-        CDEBUG(D_NET, "%s stopping\n", __func__);
- mxlnd_thread_stop(id);
- return 0;
-}
-
-
-unsigned long
-mxlnd_check_timeouts(unsigned long now)
-{
- int i = 0;
- int disconnect = 0;
- unsigned long next = 0; /* jiffies */
- kmx_peer_t *peer = NULL;
- kmx_conn_t *conn = NULL;
- rwlock_t *g_lock = &kmxlnd_data.kmx_global_lock;
-
- read_lock(g_lock);
- for (i = 0; i < MXLND_HASH_SIZE; i++) {
- cfs_list_for_each_entry(peer, &kmxlnd_data.kmx_peers[i],
- mxp_list) {
-
- if (unlikely(atomic_read(&kmxlnd_data.kmx_shutdown))) {
- read_unlock(g_lock);
- return next;
- }
-
- conn = peer->mxp_conn;
- if (conn) {
- mxlnd_conn_addref(conn);
- } else {
- continue;
- }
-
- spin_lock(&conn->mxk_lock);
-
- /* if nothing pending (timeout == 0) or
- * if conn is already disconnected,
- * skip this conn */
- if (conn->mxk_timeout == 0 ||
- conn->mxk_status == MXLND_CONN_DISCONNECT) {
- spin_unlock(&conn->mxk_lock);
- mxlnd_conn_decref(conn);
- continue;
- }
-
- /* we want to find the timeout that will occur first.
- * if it is in the future, we will sleep until then.
- * if it is in the past, then we will sleep one
- * second and repeat the process. */
- if ((next == 0) ||
- (cfs_time_before(conn->mxk_timeout, next))) {
- next = conn->mxk_timeout;
- }
-
- disconnect = 0;
-
- if (cfs_time_aftereq(now, conn->mxk_timeout))
- disconnect = 1;
- spin_unlock(&conn->mxk_lock);
-
- if (disconnect)
- mxlnd_conn_disconnect(conn, 1, 1);
- mxlnd_conn_decref(conn);
- }
- }
- read_unlock(g_lock);
- if (next == 0)
- next = now + MXLND_COMM_TIMEOUT;
-
- return next;
-}
-
-void
-mxlnd_passive_connect(kmx_connparams_t *cp)
-{
- int ret = 0;
- int incompatible = 0;
- u64 nic_id = 0ULL;
- u32 ep_id = 0;
- u32 sid = 0;
- int conn_ref = 0;
- kmx_msg_t *msg = &cp->mxr_msg;
- kmx_peer_t *peer = cp->mxr_peer;
- kmx_conn_t *conn = NULL;
- rwlock_t *g_lock = &kmxlnd_data.kmx_global_lock;
-
- mx_decompose_endpoint_addr2(cp->mxr_epa, &nic_id, &ep_id, &sid);
-
- ret = mxlnd_unpack_msg(msg, cp->mxr_nob);
- if (ret != 0) {
- if (peer) {
- CNETERR("Error %d unpacking CONN_REQ from %s\n",
- ret, libcfs_nid2str(peer->mxp_nid));
- } else {
- CNETERR("Error %d unpacking CONN_REQ from "
- "unknown host with nic_id 0x%llx\n", ret, nic_id);
- }
- goto cleanup;
- }
- if (kmxlnd_data.kmx_ni->ni_nid != msg->mxm_dstnid) {
- CNETERR("Can't accept %s: bad dst nid %s\n",
- libcfs_nid2str(msg->mxm_srcnid),
- libcfs_nid2str(msg->mxm_dstnid));
- goto cleanup;
- }
- if (msg->mxm_u.conn_req.mxcrm_queue_depth != *kmxlnd_tunables.kmx_peercredits) {
- CNETERR("Can't accept %s: incompatible queue depth "
- "%d (%d wanted)\n",
- libcfs_nid2str(msg->mxm_srcnid),
- msg->mxm_u.conn_req.mxcrm_queue_depth,
- *kmxlnd_tunables.kmx_peercredits);
- incompatible = 1;
- }
- if (msg->mxm_u.conn_req.mxcrm_eager_size != MXLND_MSG_SIZE) {
- CNETERR("Can't accept %s: incompatible EAGER size "
- "%d (%d wanted)\n",
- libcfs_nid2str(msg->mxm_srcnid),
- msg->mxm_u.conn_req.mxcrm_eager_size,
- (int) MXLND_MSG_SIZE);
- incompatible = 1;
- }
-
- if (peer == NULL) {
- peer = mxlnd_find_peer_by_nid(msg->mxm_srcnid, 0); /* adds peer ref */
- if (peer == NULL) {
- int hash = 0;
- u32 board = 0;
- kmx_peer_t *existing_peer = NULL;
-
- hash = mxlnd_nid_to_hash(msg->mxm_srcnid);
-
- mx_nic_id_to_board_number(nic_id, &board);
-
- /* adds conn ref for peer and one for this function */
- ret = mxlnd_peer_alloc(&peer, msg->mxm_srcnid,
- board, ep_id, 0ULL);
- if (ret != 0) {
- goto cleanup;
- }
- peer->mxp_conn->mxk_sid = sid;
- LASSERT(peer->mxp_ep_id == ep_id);
- write_lock(g_lock);
- existing_peer = mxlnd_find_peer_by_nid_locked(msg->mxm_srcnid);
- if (existing_peer) {
- mxlnd_conn_decref(peer->mxp_conn);
- mxlnd_peer_decref(peer);
- peer = existing_peer;
- mxlnd_conn_addref(peer->mxp_conn);
- conn = peer->mxp_conn;
- } else {
- cfs_list_add_tail(&peer->mxp_list,
- &kmxlnd_data.kmx_peers[hash]);
- atomic_inc(&kmxlnd_data.kmx_npeers);
- }
- write_unlock(g_lock);
- } else {
- ret = mxlnd_conn_alloc(&conn, peer); /* adds 2nd ref */
- write_lock(g_lock);
- mxlnd_peer_decref(peer); /* drop ref taken above */
- write_unlock(g_lock);
- if (ret != 0) {
- CNETERR("Cannot allocate mxp_conn\n");
- goto cleanup;
- }
- }
- conn_ref = 1; /* peer/conn_alloc() added ref for this function */
- conn = peer->mxp_conn;
- } else { /* unexpected handler found peer */
- kmx_conn_t *old_conn = peer->mxp_conn;
-
- if (sid != peer->mxp_conn->mxk_sid) {
- /* do not call mx_disconnect() or send a BYE */
- mxlnd_conn_disconnect(old_conn, 0, 0);
-
- /* This allocs a conn, points peer->mxp_conn to this one.
- * The old conn is still on the peer->mxp_conns list.
- * As the pending requests complete, they will call
- * conn_decref() which will eventually free it. */
- ret = mxlnd_conn_alloc(&conn, peer);
- if (ret != 0) {
- CNETERR("Cannot allocate peer->mxp_conn\n");
- goto cleanup;
- }
- /* conn_alloc() adds one ref for the peer and one
- * for this function */
- conn_ref = 1;
-
- peer->mxp_conn->mxk_sid = sid;
- } else {
- /* same sid */
- conn = peer->mxp_conn;
- }
- }
- write_lock(g_lock);
- peer->mxp_incompatible = incompatible;
- write_unlock(g_lock);
- spin_lock(&conn->mxk_lock);
- conn->mxk_incarnation = msg->mxm_srcstamp;
- mxlnd_set_conn_status(conn, MXLND_CONN_WAIT);
- spin_unlock(&conn->mxk_lock);
-
-        /* mxlnd_handle_connect_msg() will create the CONN_ACK msg */
- mxlnd_iconnect(peer, (u8) MXLND_MSG_ICON_ACK);
-
-cleanup:
- if (conn_ref) mxlnd_conn_decref(conn);
-
- mxlnd_connparams_free(cp);
- return;
-}
-
-void
-mxlnd_check_conn_ack(kmx_connparams_t *cp)
-{
- int ret = 0;
- int incompatible = 0;
- u64 nic_id = 0ULL;
- u32 ep_id = 0;
- u32 sid = 0;
- kmx_msg_t *msg = &cp->mxr_msg;
- kmx_peer_t *peer = cp->mxr_peer;
- kmx_conn_t *conn = cp->mxr_conn;
-
- mx_decompose_endpoint_addr2(cp->mxr_epa, &nic_id, &ep_id, &sid);
-
- ret = mxlnd_unpack_msg(msg, cp->mxr_nob);
- if (ret != 0) {
- if (peer) {
- CNETERR("Error %d unpacking CONN_ACK from %s\n",
- ret, libcfs_nid2str(peer->mxp_nid));
- } else {
- CNETERR("Error %d unpacking CONN_ACK from "
- "unknown host with nic_id 0x%llx\n", ret, nic_id);
- }
- ret = -1;
- incompatible = 1;
- goto failed;
- }
- if (kmxlnd_data.kmx_ni->ni_nid != msg->mxm_dstnid) {
- CNETERR("Can't accept CONN_ACK from %s: "
- "bad dst nid %s\n", libcfs_nid2str(msg->mxm_srcnid),
- libcfs_nid2str(msg->mxm_dstnid));
- ret = -1;
- goto failed;
- }
- if (msg->mxm_u.conn_req.mxcrm_queue_depth != *kmxlnd_tunables.kmx_peercredits) {
- CNETERR("Can't accept CONN_ACK from %s: "
- "incompatible queue depth %d (%d wanted)\n",
- libcfs_nid2str(msg->mxm_srcnid),
- msg->mxm_u.conn_req.mxcrm_queue_depth,
- *kmxlnd_tunables.kmx_peercredits);
- incompatible = 1;
- ret = -1;
- goto failed;
- }
- if (msg->mxm_u.conn_req.mxcrm_eager_size != MXLND_MSG_SIZE) {
- CNETERR("Can't accept CONN_ACK from %s: "
- "incompatible EAGER size %d (%d wanted)\n",
- libcfs_nid2str(msg->mxm_srcnid),
- msg->mxm_u.conn_req.mxcrm_eager_size,
- (int) MXLND_MSG_SIZE);
- incompatible = 1;
- ret = -1;
- goto failed;
- }
- write_lock(&kmxlnd_data.kmx_global_lock);
- peer->mxp_incompatible = incompatible;
- write_unlock(&kmxlnd_data.kmx_global_lock);
- spin_lock(&conn->mxk_lock);
- conn->mxk_credits = *kmxlnd_tunables.kmx_peercredits;
- conn->mxk_outstanding = 0;
- conn->mxk_incarnation = msg->mxm_srcstamp;
- conn->mxk_timeout = 0;
- if (!incompatible) {
- CDEBUG(D_NET, "setting peer %s CONN_READY\n",
- libcfs_nid2str(msg->mxm_srcnid));
- mxlnd_set_conn_status(conn, MXLND_CONN_READY);
- }
- spin_unlock(&conn->mxk_lock);
-
- if (!incompatible)
- mxlnd_check_sends(peer);
-
-failed:
- if (ret < 0) {
- spin_lock(&conn->mxk_lock);
- mxlnd_set_conn_status(conn, MXLND_CONN_FAIL);
- spin_unlock(&conn->mxk_lock);
- }
-
- if (incompatible) mxlnd_conn_disconnect(conn, 0, 0);
-
- mxlnd_connparams_free(cp);
- return;
-}
-
-int
-mxlnd_abort_msgs(void)
-{
- int count = 0;
- cfs_list_t *orphans = &kmxlnd_data.kmx_orphan_msgs;
- spinlock_t *g_conn_lock = &kmxlnd_data.kmx_conn_lock;
-
- /* abort orphans */
- spin_lock(g_conn_lock);
- while (!cfs_list_empty(orphans)) {
- kmx_ctx_t *ctx = NULL;
- kmx_conn_t *conn = NULL;
-
- ctx = cfs_list_entry(orphans->next, kmx_ctx_t, mxc_list);
- cfs_list_del_init(&ctx->mxc_list);
- spin_unlock(g_conn_lock);
-
- ctx->mxc_errno = -ECONNABORTED;
- conn = ctx->mxc_conn;
- CDEBUG(D_NET, "aborting %s %s %s\n",
- mxlnd_msgtype_to_str(ctx->mxc_msg_type),
- ctx->mxc_type == MXLND_REQ_TX ? "(TX) to" : "(RX) from",
- libcfs_nid2str(ctx->mxc_nid));
- if (ctx->mxc_type == MXLND_REQ_TX) {
- mxlnd_put_idle_tx(ctx); /* do not hold any locks */
- if (conn) mxlnd_conn_decref(conn); /* for this tx */
- } else {
- ctx->mxc_state = MXLND_CTX_CANCELED;
- mxlnd_handle_rx_completion(ctx);
- }
-
- count++;
- spin_lock(g_conn_lock);
- }
- spin_unlock(g_conn_lock);
-
- return count;
-}
-
-int
-mxlnd_free_conn_zombies(void)
-{
- int count = 0;
- cfs_list_t *zombies = &kmxlnd_data.kmx_conn_zombies;
- spinlock_t *g_conn_lock = &kmxlnd_data.kmx_conn_lock;
- rwlock_t *g_lock = &kmxlnd_data.kmx_global_lock;
-
- /* cleanup any zombies */
- spin_lock(g_conn_lock);
- while (!cfs_list_empty(zombies)) {
- kmx_conn_t *conn = NULL;
-
- conn = cfs_list_entry(zombies->next, kmx_conn_t, mxk_zombie);
- cfs_list_del_init(&conn->mxk_zombie);
- spin_unlock(g_conn_lock);
-
- write_lock(g_lock);
- mxlnd_conn_free_locked(conn);
- write_unlock(g_lock);
-
- count++;
- spin_lock(g_conn_lock);
- }
- spin_unlock(g_conn_lock);
- CDEBUG(D_NET, "%s: freed %d zombies\n", __func__, count);
- return count;
-}
-
-/**
- * mxlnd_connd - handles incoming connection requests
- * @arg - thread id (as a void *)
- *
- * This thread handles incoming CONN_REQ and CONN_ACK messages, aborts
- * orphaned msgs, and frees zombie connections.
- */
-int
-mxlnd_connd(void *arg)
-{
- long id = (long) arg;
-
- CDEBUG(D_NET, "connd starting\n");
-
- while (!(atomic_read(&kmxlnd_data.kmx_shutdown))) {
- int ret = 0;
- kmx_connparams_t *cp = NULL;
- spinlock_t *g_conn_lock = &kmxlnd_data.kmx_conn_lock;
- cfs_list_t *conn_reqs = &kmxlnd_data.kmx_conn_reqs;
-
- ret = down_interruptible(&kmxlnd_data.kmx_conn_sem);
-
- if (atomic_read(&kmxlnd_data.kmx_shutdown))
- break;
-
- if (ret != 0)
- continue;
-
- ret = mxlnd_abort_msgs();
- ret += mxlnd_free_conn_zombies();
-
- spin_lock(g_conn_lock);
- if (cfs_list_empty(conn_reqs)) {
- if (ret == 0)
- CNETERR("connd woke up but did not find a "
- "kmx_connparams_t or zombie conn\n");
- spin_unlock(g_conn_lock);
- continue;
- }
- cp = cfs_list_entry(conn_reqs->next, kmx_connparams_t,
- mxr_list);
- cfs_list_del_init(&cp->mxr_list);
- spin_unlock(g_conn_lock);
-
- switch (MXLND_MSG_TYPE(cp->mxr_match)) {
- case MXLND_MSG_CONN_REQ:
- /* We have a connection request. Handle it. */
- mxlnd_passive_connect(cp);
- break;
- case MXLND_MSG_CONN_ACK:
- /* The peer is ready for messages */
- mxlnd_check_conn_ack(cp);
- break;
- }
- }
-
- mxlnd_free_conn_zombies();
-
- CDEBUG(D_NET, "connd stopping\n");
- mxlnd_thread_stop(id);
- return 0;
-}
-
-/**
- * mxlnd_timeoutd - enforces timeouts on messages
- * @arg - thread id (as a void *)
- *
- * This thread queries each peer for its earliest timeout. If a peer has timed out,
- * it calls mxlnd_conn_disconnect().
- *
- * After checking for timeouts, try progressing sends (call check_sends()).
- */
-int
-mxlnd_timeoutd(void *arg)
-{
- int i = 0;
- long id = (long) arg;
- unsigned long now = 0;
- unsigned long next = 0;
- unsigned long delay = msecs_to_jiffies(MSEC_PER_SEC);
- kmx_peer_t *peer = NULL;
- kmx_peer_t *temp = NULL;
- kmx_conn_t *conn = NULL;
- rwlock_t *g_lock = &kmxlnd_data.kmx_global_lock;
-
- CDEBUG(D_NET, "timeoutd starting\n");
-
- while (!(atomic_read(&kmxlnd_data.kmx_shutdown))) {
-
- now = jiffies;
-                /* only rescan for timeouts once the earliest known deadline
-                 * has passed */
- if (cfs_time_after(now, next)) {
- next = mxlnd_check_timeouts(now);
- }
-
- /* try to progress peers' txs */
- write_lock(g_lock);
- for (i = 0; i < MXLND_HASH_SIZE; i++) {
- cfs_list_t *peers = &kmxlnd_data.kmx_peers[i];
-
- /* NOTE we are safe against the removal of peer, but
- * not against the removal of temp */
- cfs_list_for_each_entry_safe(peer, temp, peers,
- mxp_list) {
- if (atomic_read(&kmxlnd_data.kmx_shutdown))
- break;
- mxlnd_peer_addref(peer); /* add ref... */
- conn = peer->mxp_conn;
- if (conn && conn->mxk_status != MXLND_CONN_DISCONNECT) {
- mxlnd_conn_addref(conn); /* take ref... */
- } else {
- CDEBUG(D_NET, "ignoring %s\n",
- libcfs_nid2str(peer->mxp_nid));
- mxlnd_peer_decref(peer); /* ...to here */
- continue;
- }
-
- if ((conn->mxk_status == MXLND_CONN_READY ||
- conn->mxk_status == MXLND_CONN_FAIL) &&
- cfs_time_after(now,
- conn->mxk_last_tx +
- msecs_to_jiffies(MSEC_PER_SEC))) {
- write_unlock(g_lock);
- mxlnd_check_sends(peer);
- write_lock(g_lock);
- }
- mxlnd_conn_decref(conn); /* until here */
- mxlnd_peer_decref(peer); /* ...to here */
- }
- }
- write_unlock(g_lock);
-
- mxlnd_sleep(delay);
- }
- CDEBUG(D_NET, "timeoutd stopping\n");
- mxlnd_thread_stop(id);
- return 0;
-}
+++ /dev/null
-/*
- * GPL HEADER START
- *
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 only,
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License version 2 for more details (a copy is included
- * in the LICENSE file that accompanied this code).
- *
- * You should have received a copy of the GNU General Public License
- * version 2 along with this program; If not, see
- * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
- *
- * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
- * CA 95054 USA or visit www.sun.com if you need additional information or
- * have any questions.
- *
- * GPL HEADER END
- */
-/*
- * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
- * Use is subject to license terms.
- *
- * Copyright (C) 2006 Myricom, Inc.
- */
-/*
- * This file is part of Lustre, http://www.lustre.org/
- * Lustre is a trademark of Sun Microsystems, Inc.
- *
- * lnet/klnds/mxlnd/mxlnd.c
- *
- * Author: Eric Barton <eric@bartonsoftware.com>
- * Author: Scott Atchley <atchley at myri.com>
- */
-
-#include "mxlnd.h"
-
-static int n_waitd = MXLND_N_SCHED;
-CFS_MODULE_PARM(n_waitd, "i", int, 0444,
- "# of completion daemons");
-
-/* this was used to allocate global rxs, which are no longer used */
-static int max_peers = MXLND_MAX_PEERS;
-CFS_MODULE_PARM(max_peers, "i", int, 0444,
- "Unused - was maximum number of peers that may connect");
-
-static int cksum = MXLND_CKSUM;
-CFS_MODULE_PARM(cksum, "i", int, 0644,
- "set non-zero to enable message (not data payload) checksums");
-
-static int ntx = MXLND_NTX;
-CFS_MODULE_PARM(ntx, "i", int, 0444,
- "# of total tx message descriptors");
-
-/* this duplicates ntx */
-static int credits = MXLND_NTX;
-CFS_MODULE_PARM(credits, "i", int, 0444,
- "Unused - was # concurrent sends to all peers");
-
-static int peercredits = MXLND_MSG_QUEUE_DEPTH;
-CFS_MODULE_PARM(peercredits, "i", int, 0444,
- "# concurrent sends to one peer");
-
-static int board = MXLND_MX_BOARD;
-CFS_MODULE_PARM(board, "i", int, 0444,
- "index value of the Myrinet board (NIC)");
-
-static int ep_id = MXLND_MX_EP_ID;
-CFS_MODULE_PARM(ep_id, "i", int, 0444, "MX endpoint ID");
-
-static char *ipif_name = "myri0";
-CFS_MODULE_PARM(ipif_name, "s", charp, 0444,
- "IPoMX interface name");
-
-static int polling = MXLND_POLLING;
-CFS_MODULE_PARM(polling, "i", int, 0444,
- "Use 0 to block (wait). A value > 0 will poll that many times before blocking");
-
-static char *hosts = NULL;
-CFS_MODULE_PARM(hosts, "s", charp, 0444,
- "Unused - was IP-to-hostname resolution file");
-
-kmx_tunables_t kmxlnd_tunables = {
- .kmx_n_waitd = &n_waitd,
- .kmx_max_peers = &max_peers,
- .kmx_cksum = &cksum,
- .kmx_ntx = &ntx,
- .kmx_credits = &credits,
- .kmx_peercredits = &peercredits,
- .kmx_board = &board,
- .kmx_ep_id = &ep_id,
- .kmx_default_ipif = &ipif_name,
- .kmx_polling = &polling
-};
-
-#if defined(CONFIG_SYSCTL) && !CFS_SYSFS_MODULE_PARM
-
-static char ipif_basename_space[32];
-
-static struct ctl_table kmxlnd_ctl_table[] = {
- {
- INIT_CTL_NAME
- .procname = "n_waitd",
- .data = &n_waitd,
- .maxlen = sizeof(int),
- .mode = 0444,
- .proc_handler = &proc_dointvec
- },
- {
- INIT_CTL_NAME
- .procname = "max_peers",
- .data = &max_peers,
- .maxlen = sizeof(int),
- .mode = 0444,
- .proc_handler = &proc_dointvec
- },
- {
- INIT_CTL_NAME
- .procname = "cksum",
- .data = &cksum,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec
- },
- {
- INIT_CTL_NAME
- .procname = "ntx",
- .data = &ntx,
- .maxlen = sizeof(int),
- .mode = 0444,
- .proc_handler = &proc_dointvec
- },
- {
- INIT_CTL_NAME
- .procname = "credits",
- .data = &credits,
- .maxlen = sizeof(int),
- .mode = 0444,
- .proc_handler = &proc_dointvec
- },
- {
- INIT_CTL_NAME
- .procname = "peercredits",
- .data = &peercredits,
- .maxlen = sizeof(int),
- .mode = 0444,
- .proc_handler = &proc_dointvec
- },
- {
- INIT_CTL_NAME
- .procname = "board",
- .data = &board,
- .maxlen = sizeof(int),
- .mode = 0444,
- .proc_handler = &proc_dointvec
- },
- {
- INIT_CTL_NAME
- .procname = "ep_id",
- .data = &ep_id,
- .maxlen = sizeof(int),
- .mode = 0444,
- .proc_handler = &proc_dointvec
- },
- {
- .procname = "ipif_name",
- .data = ipif_basename_space,
- .maxlen = sizeof(ipif_basename_space),
- .mode = 0444,
- .proc_handler = &proc_dostring
- },
- {
- INIT_CTL_NAME
- .procname = "polling",
- .data = &polling,
- .maxlen = sizeof(int),
- .mode = 0444,
- .proc_handler = &proc_dointvec
- },
- { 0 }
-};
-
-static struct ctl_table kmxlnd_top_ctl_table[] = {
- {
- INIT_CTL_NAME
- .procname = "mxlnd",
- .data = NULL,
- .maxlen = 0,
- .mode = 0555,
- .child = kmxlnd_ctl_table
- },
- { 0 }
-};
-
-void
-kmxlnd_initstrtunable(char *space, char *str, int size)
-{
- strncpy(space, str, size);
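-        /* strncpy() does not NUL-terminate on truncation; force it */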
- space[size-1] = 0;
-}
-
-void
-kmxlnd_sysctl_init (void)
-{
- kmxlnd_initstrtunable(ipif_basename_space, ipif_name,
- sizeof(ipif_basename_space));
-
- kmxlnd_tunables.kib_sysctl =
- register_sysctl_table(kmxlnd_top_ctl_table);
-
- if (kmxlnd_tunables.kib_sysctl == NULL)
- CWARN("Can't setup /proc tunables\n");
-}
-
-void
-kmxlnd_sysctl_fini (void)
-{
- if (kmxlnd_tunables.kib_sysctl != NULL)
- unregister_sysctl_table(kmxlnd_tunables.kib_sysctl);
-}
-
-#else
-
-void
-kmxlnd_sysctl_init (void)
-{
-}
-
-void
-kmxlnd_sysctl_fini (void)
-{
-}
-
-#endif
-
-int
-kmxlnd_tunables_init (void)
-{
- kmxlnd_sysctl_init();
- return 0;
-}
-
-void
-kmxlnd_tunables_fini (void)
-{
- kmxlnd_sysctl_fini();
-}
+++ /dev/null
-MODULES := kqswlnd
-kqswlnd-objs := qswlnd.o qswlnd_cb.o qswlnd_modparams.o
-
-EXTRA_POST_CFLAGS := @QSWCPPFLAGS@ -I/usr/include
-
-@INCLUDE_RULES@
+++ /dev/null
-#
-# GPL HEADER START
-#
-# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License version 2 only,
-# as published by the Free Software Foundation.
-#
-# This program is distributed in the hope that it will be useful, but
-# WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-# General Public License version 2 for more details (a copy is included
-# in the LICENSE file that accompanied this code).
-#
-# You should have received a copy of the GNU General Public License
-# version 2 along with this program; If not, see
-# http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
-#
-# Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
-# CA 95054 USA or visit www.sun.com if you need additional information or
-# have any questions.
-#
-# GPL HEADER END
-#
-
-#
-# Copyright (c) 2001, 2010, Oracle and/or its affiliates. All rights reserved.
-# Use is subject to license terms.
-#
-
-#
-# This file is part of Lustre, http://www.lustre.org/
-# Lustre is a trademark of Sun Microsystems, Inc.
-#
-
-if MODULES
-if BUILD_QSWLND
-modulenet_DATA = kqswlnd$(KMODEXT)
-endif
-endif
-
-MOSTLYCLEANFILES = @MOSTLYCLEANFILES@
-EXTRA_DIST = $(kqswlnd-objs:%.o=%.c) qswlnd.h
+++ /dev/null
-/*
- * GPL HEADER START
- *
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 only,
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License version 2 for more details (a copy is included
- * in the LICENSE file that accompanied this code).
- *
- * You should have received a copy of the GNU General Public License
- * version 2 along with this program; If not, see
- * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
- *
- * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
- * CA 95054 USA or visit www.sun.com if you need additional information or
- * have any questions.
- *
- * GPL HEADER END
- */
-/*
- * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
- * Use is subject to license terms.
- *
- * Copyright (c) 2012, Intel Corporation.
- */
-/*
- * This file is part of Lustre, http://www.lustre.org/
- * Lustre is a trademark of Sun Microsystems, Inc.
- *
- * lnet/klnds/qswlnd/qswlnd.c
- *
- * Author: Eric Barton <eric@bartonsoftware.com>
- */
-
-#include "qswlnd.h"
-
-
-lnd_t the_kqswlnd =
-{
- .lnd_type = QSWLND,
- .lnd_startup = kqswnal_startup,
- .lnd_shutdown = kqswnal_shutdown,
- .lnd_ctl = kqswnal_ctl,
- .lnd_send = kqswnal_send,
- .lnd_recv = kqswnal_recv,
-};
-
-kqswnal_data_t kqswnal_data;
-
-int
-kqswnal_get_tx_desc (struct libcfs_ioctl_data *data)
-{
- unsigned long flags;
- cfs_list_t *tmp;
- kqswnal_tx_t *ktx;
- lnet_hdr_t *hdr;
- int index = data->ioc_count;
- int rc = -ENOENT;
-
- spin_lock_irqsave(&kqswnal_data.kqn_idletxd_lock, flags);
-
- cfs_list_for_each (tmp, &kqswnal_data.kqn_activetxds) {
- if (index-- != 0)
- continue;
-
- ktx = cfs_list_entry (tmp, kqswnal_tx_t, ktx_list);
- hdr = (lnet_hdr_t *)ktx->ktx_buffer;
-
- data->ioc_count = le32_to_cpu(hdr->payload_length);
- data->ioc_nid = le64_to_cpu(hdr->dest_nid);
- data->ioc_u64[0] = ktx->ktx_nid;
- data->ioc_u32[0] = le32_to_cpu(hdr->type);
- data->ioc_u32[1] = ktx->ktx_launcher;
- data->ioc_flags =
- (cfs_list_empty (&ktx->ktx_schedlist) ? 0 : 1) |
- (ktx->ktx_state << 2);
- rc = 0;
- break;
- }
-
- spin_unlock_irqrestore(&kqswnal_data.kqn_idletxd_lock, flags);
- return (rc);
-}
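Review note: ioc_flags above packs two facts into one word: bit 0 records whether the tx is still on the scheduler list, and ktx_state is shifted up by 2. A hypothetical userspace decode (names assumed, not from the patch):

    #include <stdio.h>

    static void decode_txdesc_flags(unsigned int flags)
    {
            int queued = flags & 1;   /* still on ktx_schedlist? */
            int state  = flags >> 2;  /* KTX_* state value */

            printf("queued=%d state=%d\n", queued, state);
    }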
-
-int
-kqswnal_ctl (lnet_ni_t *ni, unsigned int cmd, void *arg)
-{
- struct libcfs_ioctl_data *data = arg;
-
- LASSERT (ni == kqswnal_data.kqn_ni);
-
- switch (cmd) {
- case IOC_LIBCFS_GET_TXDESC:
- return (kqswnal_get_tx_desc (data));
-
- case IOC_LIBCFS_REGISTER_MYNID:
- if (data->ioc_nid == ni->ni_nid)
- return 0;
-
- LASSERT (LNET_NIDNET(data->ioc_nid) == LNET_NIDNET(ni->ni_nid));
-
- CERROR("obsolete IOC_LIBCFS_REGISTER_MYNID for %s(%s)\n",
- libcfs_nid2str(data->ioc_nid),
- libcfs_nid2str(ni->ni_nid));
- return 0;
-
- default:
- return (-EINVAL);
- }
-}
-
-void
-kqswnal_shutdown(lnet_ni_t *ni)
-{
- unsigned long flags;
- kqswnal_tx_t *ktx;
- kqswnal_rx_t *krx;
-
- CDEBUG (D_NET, "shutdown\n");
- LASSERT (ni->ni_data == &kqswnal_data);
- LASSERT (ni == kqswnal_data.kqn_ni);
-
- switch (kqswnal_data.kqn_init)
- {
- default:
- LASSERT (0);
-
- case KQN_INIT_ALL:
- case KQN_INIT_DATA:
- break;
- }
-
- /**********************************************************************/
- /* Signal the start of shutdown... */
- spin_lock_irqsave(&kqswnal_data.kqn_idletxd_lock, flags);
- kqswnal_data.kqn_shuttingdown = 1;
- spin_unlock_irqrestore(&kqswnal_data.kqn_idletxd_lock, flags);
-
- /**********************************************************************/
- /* wait for sends that have allocated a tx desc to launch or give up */
- while (atomic_read (&kqswnal_data.kqn_pending_txs) != 0) {
- CDEBUG(D_NET, "waiting for %d pending sends\n",
- atomic_read (&kqswnal_data.kqn_pending_txs));
- cfs_pause(cfs_time_seconds(1));
- }
-
- /**********************************************************************/
- /* close elan comms */
- /* Shut down receivers first; rx callbacks might try sending... */
- if (kqswnal_data.kqn_eprx_small != NULL)
- ep_free_rcvr (kqswnal_data.kqn_eprx_small);
-
- if (kqswnal_data.kqn_eprx_large != NULL)
- ep_free_rcvr (kqswnal_data.kqn_eprx_large);
-
- /* NB ep_free_rcvr() returns only after we've freed off all receive
- * buffers (see shutdown handling in kqswnal_requeue_rx()). This
- * means we must have completed any messages we passed to
- * lnet_parse() */
-
- if (kqswnal_data.kqn_eptx != NULL)
- ep_free_xmtr (kqswnal_data.kqn_eptx);
-
- /* NB ep_free_xmtr() returns only after all outstanding transmits
- * have called their callback... */
- LASSERT(cfs_list_empty(&kqswnal_data.kqn_activetxds));
-
- /**********************************************************************/
- /* flag threads to terminate, wake them and wait for them to die */
- kqswnal_data.kqn_shuttingdown = 2;
- wake_up_all (&kqswnal_data.kqn_sched_waitq);
-
- while (atomic_read (&kqswnal_data.kqn_nthreads) != 0) {
- CDEBUG(D_NET, "waiting for %d threads to terminate\n",
- atomic_read (&kqswnal_data.kqn_nthreads));
- cfs_pause(cfs_time_seconds(1));
- }
-
- /**********************************************************************/
- /* No more threads. No more portals, router or comms callbacks!
- * I control the horizontals and the verticals...
- */
-
- LASSERT (cfs_list_empty (&kqswnal_data.kqn_readyrxds));
- LASSERT (cfs_list_empty (&kqswnal_data.kqn_donetxds));
- LASSERT (cfs_list_empty (&kqswnal_data.kqn_delayedtxds));
-
- /**********************************************************************/
- /* Unmap message buffers and free all descriptors and buffers
- */
-
- /* FTTB, we need to unmap any remaining mapped memory. When
- * ep_dvma_release() get fixed (and releases any mappings in the
- * region), we can delete all the code from here --------> */
-
- for (ktx = kqswnal_data.kqn_txds; ktx != NULL; ktx = ktx->ktx_alloclist) {
- /* If ktx has a buffer, it got mapped; unmap now. NB only
- * the pre-mapped stuff is still mapped since all tx descs
- * must be idle */
-
- if (ktx->ktx_buffer != NULL)
- ep_dvma_unload(kqswnal_data.kqn_ep,
- kqswnal_data.kqn_ep_tx_nmh,
- &ktx->ktx_ebuffer);
- }
-
- for (krx = kqswnal_data.kqn_rxds; krx != NULL; krx = krx->krx_alloclist) {
- /* If krx_kiov[0].kiov_page got allocated, it got mapped.
- * NB subsequent pages get merged */
-
- if (krx->krx_kiov[0].kiov_page != NULL)
- ep_dvma_unload(kqswnal_data.kqn_ep,
- kqswnal_data.kqn_ep_rx_nmh,
- &krx->krx_elanbuffer);
- }
- /* <----------- to here */
-
- if (kqswnal_data.kqn_ep_rx_nmh != NULL)
- ep_dvma_release(kqswnal_data.kqn_ep, kqswnal_data.kqn_ep_rx_nmh);
-
- if (kqswnal_data.kqn_ep_tx_nmh != NULL)
- ep_dvma_release(kqswnal_data.kqn_ep, kqswnal_data.kqn_ep_tx_nmh);
-
- while (kqswnal_data.kqn_txds != NULL) {
- ktx = kqswnal_data.kqn_txds;
-
- if (ktx->ktx_buffer != NULL)
- LIBCFS_FREE(ktx->ktx_buffer, KQSW_TX_BUFFER_SIZE);
-
- kqswnal_data.kqn_txds = ktx->ktx_alloclist;
- LIBCFS_FREE(ktx, sizeof(*ktx));
- }
-
- while (kqswnal_data.kqn_rxds != NULL) {
- int i;
-
- krx = kqswnal_data.kqn_rxds;
- for (i = 0; i < krx->krx_npages; i++)
- if (krx->krx_kiov[i].kiov_page != NULL)
- __free_page (krx->krx_kiov[i].kiov_page);
-
- kqswnal_data.kqn_rxds = krx->krx_alloclist;
- LIBCFS_FREE(krx, sizeof (*krx));
- }
-
- /* resets flags, pointers to NULL etc */
- memset(&kqswnal_data, 0, sizeof (kqswnal_data));
-
- CDEBUG (D_MALLOC, "done kmem %d\n", atomic_read(&libcfs_kmemory));
-
- module_put(THIS_MODULE);
-}
-
-int
-kqswnal_startup (lnet_ni_t *ni)
-{
- EP_RAILMASK all_rails = EP_RAILMASK_ALL;
- int rc;
- int i;
- kqswnal_rx_t *krx;
- kqswnal_tx_t *ktx;
- int elan_page_idx;
-
- LASSERT (ni->ni_lnd == &the_kqswlnd);
-
- /* Only 1 instance supported */
- if (kqswnal_data.kqn_init != KQN_INIT_NOTHING) {
- CERROR ("Only 1 instance supported\n");
- return -EPERM;
- }
-
- if (ni->ni_interfaces[0] != NULL) {
- CERROR("Explicit interface config not supported\n");
- return -EPERM;
- }
-
- if (*kqswnal_tunables.kqn_credits >=
- *kqswnal_tunables.kqn_ntxmsgs) {
- LCONSOLE_ERROR_MSG(0x12e, "Configuration error: please set "
- "ntxmsgs(%d) > credits(%d)\n",
- *kqswnal_tunables.kqn_ntxmsgs,
- *kqswnal_tunables.kqn_credits);
- }
-
- CDEBUG (D_MALLOC, "start kmem %d\n", atomic_read(&libcfs_kmemory));
-
- /* ensure all pointers NULL etc */
- memset (&kqswnal_data, 0, sizeof (kqswnal_data));
-
- kqswnal_data.kqn_ni = ni;
- ni->ni_data = &kqswnal_data;
- ni->ni_peertxcredits = *kqswnal_tunables.kqn_peercredits;
- ni->ni_maxtxcredits = *kqswnal_tunables.kqn_credits;
-
- CFS_INIT_LIST_HEAD (&kqswnal_data.kqn_idletxds);
- CFS_INIT_LIST_HEAD (&kqswnal_data.kqn_activetxds);
- spin_lock_init(&kqswnal_data.kqn_idletxd_lock);
-
- CFS_INIT_LIST_HEAD (&kqswnal_data.kqn_delayedtxds);
- CFS_INIT_LIST_HEAD (&kqswnal_data.kqn_donetxds);
- CFS_INIT_LIST_HEAD (&kqswnal_data.kqn_readyrxds);
-
- spin_lock_init(&kqswnal_data.kqn_sched_lock);
- init_waitqueue_head (&kqswnal_data.kqn_sched_waitq);
-
- /* pointers/lists/locks initialised */
- kqswnal_data.kqn_init = KQN_INIT_DATA;
- try_module_get(THIS_MODULE);
-
- kqswnal_data.kqn_ep = ep_system();
- if (kqswnal_data.kqn_ep == NULL) {
- CERROR("Can't initialise EKC\n");
- kqswnal_shutdown(ni);
- return (-ENODEV);
- }
-
- if (ep_waitfor_nodeid(kqswnal_data.kqn_ep) == ELAN_INVALID_NODE) {
- CERROR("Can't get elan ID\n");
- kqswnal_shutdown(ni);
- return (-ENODEV);
- }
-
- kqswnal_data.kqn_nnodes = ep_numnodes (kqswnal_data.kqn_ep);
- kqswnal_data.kqn_elanid = ep_nodeid (kqswnal_data.kqn_ep);
-
- ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), kqswnal_data.kqn_elanid);
-
- /**********************************************************************/
- /* Get the transmitter */
-
- kqswnal_data.kqn_eptx = ep_alloc_xmtr (kqswnal_data.kqn_ep);
- if (kqswnal_data.kqn_eptx == NULL)
- {
- CERROR ("Can't allocate transmitter\n");
- kqswnal_shutdown (ni);
- return (-ENOMEM);
- }
-
- /**********************************************************************/
- /* Get the receivers */
-
- kqswnal_data.kqn_eprx_small =
- ep_alloc_rcvr (kqswnal_data.kqn_ep,
- EP_MSG_SVC_PORTALS_SMALL,
- *kqswnal_tunables.kqn_ep_envelopes_small);
- if (kqswnal_data.kqn_eprx_small == NULL)
- {
- CERROR ("Can't install small msg receiver\n");
- kqswnal_shutdown (ni);
- return (-ENOMEM);
- }
-
- kqswnal_data.kqn_eprx_large =
- ep_alloc_rcvr (kqswnal_data.kqn_ep,
- EP_MSG_SVC_PORTALS_LARGE,
- *kqswnal_tunables.kqn_ep_envelopes_large);
- if (kqswnal_data.kqn_eprx_large == NULL)
- {
- CERROR ("Can't install large msg receiver\n");
- kqswnal_shutdown (ni);
- return (-ENOMEM);
- }
-
- /**********************************************************************/
- /* Reserve Elan address space for transmit descriptors NB we may
- * either send the contents of associated buffers immediately, or
- * map them for the peer to suck/blow... */
- kqswnal_data.kqn_ep_tx_nmh =
- ep_dvma_reserve(kqswnal_data.kqn_ep,
- KQSW_NTXMSGPAGES*(*kqswnal_tunables.kqn_ntxmsgs),
- EP_PERM_WRITE);
- if (kqswnal_data.kqn_ep_tx_nmh == NULL) {
- CERROR("Can't reserve tx dma space\n");
- kqswnal_shutdown(ni);
- return (-ENOMEM);
- }
-
- /**********************************************************************/
- /* Reserve Elan address space for receive buffers */
- kqswnal_data.kqn_ep_rx_nmh =
- ep_dvma_reserve(kqswnal_data.kqn_ep,
- KQSW_NRXMSGPAGES_SMALL *
- (*kqswnal_tunables.kqn_nrxmsgs_small) +
- KQSW_NRXMSGPAGES_LARGE *
- (*kqswnal_tunables.kqn_nrxmsgs_large),
- EP_PERM_WRITE);
- if (kqswnal_data.kqn_ep_rx_nmh == NULL) {
- CERROR("Can't reserve rx dma space\n");
- kqswnal_shutdown(ni);
- return (-ENOMEM);
- }
-
- /**********************************************************************/
- /* Allocate/Initialise transmit descriptors */
-
- kqswnal_data.kqn_txds = NULL;
- for (i = 0; i < (*kqswnal_tunables.kqn_ntxmsgs); i++)
- {
- int premapped_pages;
- int basepage = i * KQSW_NTXMSGPAGES;
-
- LIBCFS_ALLOC (ktx, sizeof(*ktx));
- if (ktx == NULL) {
- kqswnal_shutdown (ni);
- return (-ENOMEM);
- }
-
- memset(ktx, 0, sizeof(*ktx)); /* NULL pointers; zero flags */
- ktx->ktx_alloclist = kqswnal_data.kqn_txds;
- kqswnal_data.kqn_txds = ktx;
-
- LIBCFS_ALLOC (ktx->ktx_buffer, KQSW_TX_BUFFER_SIZE);
- if (ktx->ktx_buffer == NULL)
- {
- kqswnal_shutdown (ni);
- return (-ENOMEM);
- }
-
- /* Map pre-allocated buffer NOW, to save latency on transmit */
- premapped_pages = kqswnal_pages_spanned(ktx->ktx_buffer,
- KQSW_TX_BUFFER_SIZE);
- ep_dvma_load(kqswnal_data.kqn_ep, NULL,
- ktx->ktx_buffer, KQSW_TX_BUFFER_SIZE,
- kqswnal_data.kqn_ep_tx_nmh, basepage,
- &all_rails, &ktx->ktx_ebuffer);
-
- ktx->ktx_basepage = basepage + premapped_pages; /* message mapping starts here */
- ktx->ktx_npages = KQSW_NTXMSGPAGES - premapped_pages; /* for this many pages */
-
- CFS_INIT_LIST_HEAD (&ktx->ktx_schedlist);
-
- ktx->ktx_state = KTX_IDLE;
- ktx->ktx_rail = -1; /* unset rail */
-
- cfs_list_add_tail (&ktx->ktx_list, &kqswnal_data.kqn_idletxds);
- }
-
- /**********************************************************************/
- /* Allocate/Initialise receive descriptors */
- kqswnal_data.kqn_rxds = NULL;
- elan_page_idx = 0;
- for (i = 0; i < *kqswnal_tunables.kqn_nrxmsgs_small + *kqswnal_tunables.kqn_nrxmsgs_large; i++)
- {
- EP_NMD elanbuffer;
- int j;
-
- LIBCFS_ALLOC(krx, sizeof(*krx));
- if (krx == NULL) {
- kqswnal_shutdown(ni);
- return (-ENOMEM);
- }
-
- memset(krx, 0, sizeof(*krx)); /* clear flags, null pointers etc */
- krx->krx_alloclist = kqswnal_data.kqn_rxds;
- kqswnal_data.kqn_rxds = krx;
-
- if (i < *kqswnal_tunables.kqn_nrxmsgs_small)
- {
- krx->krx_npages = KQSW_NRXMSGPAGES_SMALL;
- krx->krx_eprx = kqswnal_data.kqn_eprx_small;
- }
- else
- {
- krx->krx_npages = KQSW_NRXMSGPAGES_LARGE;
- krx->krx_eprx = kqswnal_data.kqn_eprx_large;
- }
-
- LASSERT (krx->krx_npages > 0);
- for (j = 0; j < krx->krx_npages; j++)
- {
- struct page *page = alloc_page(GFP_KERNEL);
-
- if (page == NULL) {
- kqswnal_shutdown (ni);
- return (-ENOMEM);
- }
-
- krx->krx_kiov[j] = (lnet_kiov_t) {.kiov_page = page,
- .kiov_offset = 0,
- .kiov_len = PAGE_SIZE};
- LASSERT(page_address(page) != NULL);
-
- ep_dvma_load(kqswnal_data.kqn_ep, NULL,
- page_address(page),
- PAGE_SIZE, kqswnal_data.kqn_ep_rx_nmh,
- elan_page_idx, &all_rails, &elanbuffer);
-
- if (j == 0) {
- krx->krx_elanbuffer = elanbuffer;
- } else {
- rc = ep_nmd_merge(&krx->krx_elanbuffer,
- &krx->krx_elanbuffer,
- &elanbuffer);
- /* NB contiguous mapping */
- LASSERT(rc);
- }
- elan_page_idx++;
-
- }
- }
- LASSERT (elan_page_idx ==
- (*kqswnal_tunables.kqn_nrxmsgs_small * KQSW_NRXMSGPAGES_SMALL) +
- (*kqswnal_tunables.kqn_nrxmsgs_large * KQSW_NRXMSGPAGES_LARGE));
-
- /**********************************************************************/
- /* Queue receives, now that it's OK to run their completion callbacks */
-
- for (krx = kqswnal_data.kqn_rxds; krx != NULL; krx = krx->krx_alloclist) {
- /* NB this enqueue can allocate/sleep (attr == 0) */
- krx->krx_state = KRX_POSTED;
- rc = ep_queue_receive(krx->krx_eprx, kqswnal_rxhandler, krx,
- &krx->krx_elanbuffer, 0);
- if (rc != EP_SUCCESS) {
- CERROR ("failed ep_queue_receive %d\n", rc);
- kqswnal_shutdown (ni);
- return (-EIO);
- }
- }
-
- /**********************************************************************/
- /* Spawn scheduling threads */
- for (i = 0; i < num_online_cpus(); i++) {
- rc = kqswnal_thread_start(kqswnal_scheduler, NULL,
- "kqswnal_sched");
- if (rc != 0)
- {
- CERROR ("failed to spawn scheduling thread: %d\n", rc);
- kqswnal_shutdown (ni);
- return (-ESRCH);
- }
- }
-
- kqswnal_data.kqn_init = KQN_INIT_ALL;
- return (0);
-}
-
-void __exit
-kqswnal_finalise (void)
-{
- lnet_unregister_lnd(&the_kqswlnd);
- kqswnal_tunables_fini();
-}
-
-static int __init
-kqswnal_initialise (void)
-{
- int rc = kqswnal_tunables_init();
-
- if (rc != 0)
- return rc;
-
- lnet_register_lnd(&the_kqswlnd);
- return (0);
-}
-
-MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
-MODULE_DESCRIPTION("Kernel Quadrics/Elan LND v1.01");
-MODULE_LICENSE("GPL");
-
-module_init (kqswnal_initialise);
-module_exit (kqswnal_finalise);
--- a/lnet/klnds/qswlnd/qswlnd.h
+++ /dev/null
-/*
- * GPL HEADER START
- *
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 only,
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License version 2 for more details (a copy is included
- * in the LICENSE file that accompanied this code).
- *
- * You should have received a copy of the GNU General Public License
- * version 2 along with this program; If not, see
- * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
- *
- * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
- * CA 95054 USA or visit www.sun.com if you need additional information or
- * have any questions.
- *
- * GPL HEADER END
- */
-/*
- * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
- * Use is subject to license terms.
- *
- * Copyright (c) 2012, Intel Corporation.
- */
-/*
- * This file is part of Lustre, http://www.lustre.org/
- * Lustre is a trademark of Sun Microsystems, Inc.
- *
- * lnet/klnds/qswlnd/qswlnd.h
- *
- * Basic library routines.
- */
-
-#ifndef _QSWNAL_H
-#define _QSWNAL_H
-
-#include <qsnet/kernel.h>
-#undef printf /* nasty QSW #define */
-#include <linux/module.h>
-
-#include <elan/epcomms.h>
-
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/string.h>
-#include <linux/stat.h>
-#include <linux/errno.h>
-#include <linux/buffer_head.h> /* wait_on_buffer */
-#include <linux/unistd.h>
-#include <net/sock.h>
-#include <linux/uio.h>
-
-#include <asm/uaccess.h>
-
-#include <linux/fs.h>
-#include <linux/file.h>
-#include <linux/stat.h>
-#include <linux/list.h>
-#include <linux/sysctl.h>
-
-#define DEBUG_SUBSYSTEM S_LND
-
-#include <libcfs/libcfs.h>
-#include <lnet/lnet.h>
-#include <lnet/lib-lnet.h>
-
-/* fixed constants */
-#define KQSW_SMALLMSG (4<<10) /* small/large ep receiver breakpoint */
-#define KQSW_RESCHED 100 /* # busy loops that forces scheduler to yield */
-
-#define KQSW_CKSUM 0 /* enable checksumming (protocol incompatible) */
-
-/*
- * derived constants
- */
-
-#define KQSW_TX_BUFFER_SIZE (offsetof(kqswnal_msg_t, \
- kqm_u.immediate.kqim_payload[*kqswnal_tunables.kqn_tx_maxcontig]))
-/* The pre-allocated tx buffer (hdr + small payload) */
-
-#define KQSW_NTXMSGPAGES (btopr(KQSW_TX_BUFFER_SIZE) + 1 + btopr(LNET_MAX_PAYLOAD) + 1)
-/* Reserve elan address space for pre-allocated and pre-mapped transmit
- * buffer and a full payload too. Extra pages allow for page alignment */
-
-#define KQSW_NRXMSGPAGES_SMALL (btopr(KQSW_SMALLMSG))
-/* receive hdr/payload always contiguous and page aligned */
-#define KQSW_NRXMSGBYTES_SMALL (KQSW_NRXMSGPAGES_SMALL * PAGE_SIZE)
-
-#define KQSW_NRXMSGPAGES_LARGE (btopr(sizeof(lnet_msg_t) + LNET_MAX_PAYLOAD))
-/* receive hdr/payload always contiguous and page aligned */
-#define KQSW_NRXMSGBYTES_LARGE (KQSW_NRXMSGPAGES_LARGE * PAGE_SIZE)
-/* biggest complete packet we can receive (or transmit) */
-
-/* Wire messages */
-/* Remote memory descriptor */
-typedef struct
-{
- __u32 kqrmd_nfrag; /* # frags */
- EP_NMD kqrmd_frag[0]; /* actual frags */
-} kqswnal_remotemd_t;
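Review note: kqrmd_frag[0] is the old zero-length-array idiom for a trailing flexible array, so the descriptor's wire size depends on kqrmd_nfrag. A hypothetical sizing helper mirroring the offsetof() arithmetic kqswnal_send() performs on this struct (assumption: no such helper exists in the original):

    #include <stddef.h>  /* offsetof; linux/stddef.h in-kernel */

    /* assumes the kqswnal_remotemd_t definition from qswlnd.h above */
    static inline size_t kqswnal_rmd_size(__u32 nfrag)
    {
            /* fixed header plus nfrag EP_NMD entries */
            return offsetof(kqswnal_remotemd_t, kqrmd_frag[nfrag]);
    }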
-
-/* Immediate data */
-typedef struct
-{
- lnet_hdr_t kqim_hdr; /* LNET header */
- char kqim_payload[0]; /* piggy-backed payload */
-} WIRE_ATTR kqswnal_immediate_msg_t;
-
-/* RDMA request */
-typedef struct
-{
- lnet_hdr_t kqrm_hdr; /* LNET header */
- kqswnal_remotemd_t kqrm_rmd; /* peer's buffer */
-} WIRE_ATTR kqswnal_rdma_msg_t;
-
-typedef struct
-{
- __u32 kqm_magic; /* I'm a qswlnd message */
- __u16 kqm_version; /* this is my version number */
- __u16 kqm_type; /* msg type */
-#if KQSW_CKSUM
- __u32 kqm_cksum; /* crc32 checksum */
- __u32 kqm_nob; /* original msg length */
-#endif
- union {
- kqswnal_immediate_msg_t immediate;
- kqswnal_rdma_msg_t rdma;
- } WIRE_ATTR kqm_u;
-} WIRE_ATTR kqswnal_msg_t;
-
-#if KQSW_CKSUM /* enable checksums ? */
-# include <linux/crc32.h>
-static inline __u32 kqswnal_csum(__u32 crc, unsigned char const *p, size_t len)
-{
-#if 1
- return crc32_le(crc, p, len);
-#else
- while (len-- > 0)
- crc = ((crc + 0x100) & ~0xff) | ((crc + *p++) & 0xff) ;
- return crc;
-#endif
-}
-# define QSWLND_PROTO_VERSION 0xbeef
-#else
-# define QSWLND_PROTO_VERSION 1
-#endif
-
-#define QSWLND_MSG_IMMEDIATE 0
-#define QSWLND_MSG_RDMA 1
-
-typedef union {
- EP_STATUSBLK ep_statusblk;
- struct {
- __u32 status;
- __u32 magic;
- __u32 version;
- union {
- struct {
- __u32 len;
- __u32 cksum;
- } WIRE_ATTR get;
- } WIRE_ATTR u;
- } WIRE_ATTR msg;
-} kqswnal_rpc_reply_t;
-
-typedef struct kqswnal_rx
-{
- cfs_list_t krx_list; /* enqueue -> thread */
- struct kqswnal_rx *krx_alloclist;/* stack in kqn_rxds */
- EP_RCVR *krx_eprx; /* port to post receives to */
- EP_RXD *krx_rxd; /* receive descriptor (for repost) */
- EP_NMD krx_elanbuffer;/* contiguous Elan buffer */
- int krx_npages; /* # pages in receive buffer */
- int krx_nob; /* Number Of Bytes received into buffer */
- int krx_rpc_reply_needed:1; /* peer waiting for EKC RPC reply */
- int krx_state; /* what this RX is doing */
- atomic_t krx_refcount; /* how to tell when rpc is done */
-#if KQSW_CKSUM
- __u32 krx_cksum; /* checksum */
-#endif
- kqswnal_rpc_reply_t krx_rpc_reply; /* rpc reply status block */
- lnet_kiov_t krx_kiov[KQSW_NRXMSGPAGES_LARGE];/* buffer frags */
-} kqswnal_rx_t;
-
-#define KRX_POSTED 1 /* receiving */
-#define KRX_PARSE 2 /* ready to be parsed */
-#define KRX_COMPLETING 3 /* waiting to be completed */
-
-
-typedef struct kqswnal_tx
-{
- cfs_list_t ktx_list; /* enqueue idle/active */
- cfs_list_t ktx_schedlist; /* enqueue on scheduler */
- struct kqswnal_tx *ktx_alloclist; /* stack in kqn_txds */
- unsigned int ktx_state:7; /* What I'm doing */
- unsigned int ktx_firsttmpfrag:1; /* ktx_frags[0] is in my ebuffer ? 0 : 1 */
- __u32 ktx_basepage; /* page offset in reserved elan tx vaddrs for mapping pages */
- int ktx_npages; /* pages reserved for mapping messages */
- int ktx_nmappedpages; /* # pages mapped for current message */
- int ktx_port; /* destination ep port */
- lnet_nid_t ktx_nid; /* destination node */
- void *ktx_args[3]; /* completion passthru */
- char *ktx_buffer; /* pre-allocated contiguous buffer for hdr + small payloads */
- cfs_time_t ktx_launchtime; /* when (in jiffies) the
- * transmit was launched */
- int ktx_status; /* completion status */
-#if KQSW_CKSUM
- __u32 ktx_cksum; /* optimized GET payload checksum */
-#endif
- /* debug/info fields */
- pid_t ktx_launcher; /* pid of launching process */
-
- int ktx_nfrag; /* # message frags */
- int ktx_rail; /* preferred rail */
- EP_NMD ktx_ebuffer; /* elan mapping of ktx_buffer */
- EP_NMD ktx_frags[EP_MAXFRAG];/* elan mapping of msg frags */
-} kqswnal_tx_t;
-
-#define KTX_IDLE 0 /* on kqn_idletxds */
-#define KTX_SENDING 1 /* normal send */
-#define KTX_GETTING 2 /* sending optimised get */
-#define KTX_PUTTING 3 /* sending optimised put */
-#define KTX_RDMA_FETCH 4 /* handling optimised put */
-#define KTX_RDMA_STORE 5 /* handling optimised get */
-
-typedef struct
-{
- int *kqn_tx_maxcontig; /* maximum payload to defrag */
- int *kqn_ntxmsgs; /* # normal tx msgs */
- int *kqn_credits; /* # concurrent sends */
- int *kqn_peercredits; /* # concurrent sends to 1 peer */
- int *kqn_nrxmsgs_large; /* # 'large' rx msgs */
- int *kqn_ep_envelopes_large; /* # 'large' rx ep envelopes */
- int *kqn_nrxmsgs_small; /* # 'small' rx msgs */
- int *kqn_ep_envelopes_small; /* # 'small' rx ep envelopes */
- int *kqn_optimized_puts; /* optimized PUTs? */
- int *kqn_optimized_gets; /* optimized GETs? */
-#if KQSW_CKSUM
- int *kqn_inject_csum_error; /* # csum errors to inject */
-#endif
-
-#if defined(CONFIG_SYSCTL) && !CFS_SYSFS_MODULE_PARM
- struct ctl_table_header *kqn_sysctl; /* sysctl interface */
-#endif
-} kqswnal_tunables_t;
-
-typedef struct
-{
- char kqn_init; /* what's been initialised */
- char kqn_shuttingdown;/* I'm trying to shut down */
- atomic_t kqn_nthreads; /* # threads running */
- lnet_ni_t *kqn_ni; /* _the_ instance of me */
-
- kqswnal_rx_t *kqn_rxds; /* stack of all the receive descriptors */
- kqswnal_tx_t *kqn_txds; /* stack of all the transmit descriptors */
-
- cfs_list_t kqn_idletxds; /* transmit descriptors free to use */
- cfs_list_t kqn_activetxds; /* transmit descriptors being used */
- spinlock_t kqn_idletxd_lock; /* serialise idle txd access */
- atomic_t kqn_pending_txs; /* # transmits being prepped */
-
- spinlock_t kqn_sched_lock; /* serialise packet schedulers */
- wait_queue_head_t kqn_sched_waitq;/* scheduler blocks here */
-
- cfs_list_t kqn_readyrxds; /* rxds full of data */
- cfs_list_t kqn_donetxds; /* completed transmits */
- cfs_list_t kqn_delayedtxds;/* delayed transmits */
-
- EP_SYS *kqn_ep; /* elan system */
- EP_NMH *kqn_ep_tx_nmh; /* elan reserved tx vaddrs */
- EP_NMH *kqn_ep_rx_nmh; /* elan reserved rx vaddrs */
- EP_XMTR *kqn_eptx; /* elan transmitter */
- EP_RCVR *kqn_eprx_small; /* elan receiver (small messages) */
- EP_RCVR *kqn_eprx_large; /* elan receiver (large messages) */
-
- int kqn_nnodes; /* this cluster's size */
- int kqn_elanid; /* this node's elan ID */
-
- EP_STATUSBLK kqn_rpc_success;/* preset RPC reply status blocks */
- EP_STATUSBLK kqn_rpc_failed;
- EP_STATUSBLK kqn_rpc_version;/* reply to future version query */
- EP_STATUSBLK kqn_rpc_magic; /* reply to future version query */
-} kqswnal_data_t;
-
-/* kqn_init state */
-#define KQN_INIT_NOTHING 0 /* MUST BE ZERO so zeroed state is initialised OK */
-#define KQN_INIT_DATA 1
-#define KQN_INIT_ALL 2
-
-extern kqswnal_tunables_t kqswnal_tunables;
-extern kqswnal_data_t kqswnal_data;
-
-extern int kqswnal_thread_start (int (*fn)(void *arg), void *arg);
-extern void kqswnal_rxhandler(EP_RXD *rxd);
-extern int kqswnal_scheduler (void *);
-extern void kqswnal_rx_done (kqswnal_rx_t *krx);
-
-static inline lnet_nid_t
-kqswnal_elanid2nid (int elanid)
-{
- return LNET_MKNID(LNET_NIDNET(kqswnal_data.kqn_ni->ni_nid), elanid);
-}
-
-static inline int
-kqswnal_nid2elanid (lnet_nid_t nid)
-{
- __u32 elanid = LNET_NIDADDR(nid);
-
- /* not in this cluster? */
- return (elanid >= kqswnal_data.kqn_nnodes) ? -1 : elanid;
-}
-
-static inline lnet_nid_t
-kqswnal_rx_nid(kqswnal_rx_t *krx)
-{
- return (kqswnal_elanid2nid(ep_rxd_node(krx->krx_rxd)));
-}
-
-static inline int
-kqswnal_pages_spanned (void *base, int nob)
-{
- unsigned long first_page = ((unsigned long)base) >> PAGE_SHIFT;
- unsigned long last_page = (((unsigned long)base) + (nob - 1)) >> PAGE_SHIFT;
-
- LASSERT (last_page >= first_page); /* can't wrap address space */
- return (last_page - first_page + 1);
-}
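Review note: a quick worked example of the span arithmetic above, assuming 4 KiB pages (PAGE_SHIFT == 12):

    /*
     * base = 0x1ffc (4 bytes below the 8 KiB boundary), nob = 16:
     *   first_page = 0x1ffc >> 12            = 1
     *   last_page  = (0x1ffc + 16 - 1) >> 12 = 2
     *   spanned    = 2 - 1 + 1               = 2 pages
     */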
-
-static inline void kqswnal_rx_decref (kqswnal_rx_t *krx)
-{
- LASSERT (atomic_read (&krx->krx_refcount) > 0);
- if (atomic_dec_and_test (&krx->krx_refcount))
- kqswnal_rx_done(krx);
-}
-
-int kqswnal_startup (lnet_ni_t *ni);
-void kqswnal_shutdown (lnet_ni_t *ni);
-int kqswnal_ctl (lnet_ni_t *ni, unsigned int cmd, void *arg);
-int kqswnal_send (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg);
-int kqswnal_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg,
- int delayed, unsigned int niov,
- struct iovec *iov, lnet_kiov_t *kiov,
- unsigned int offset, unsigned int mlen, unsigned int rlen);
-
-int kqswnal_tunables_init(void);
-void kqswnal_tunables_fini(void);
-
-#endif /* _QSWNAL_H */
--- a/lnet/klnds/qswlnd/qswlnd_cb.c
+++ /dev/null
-/*
- * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
- *
- * Copyright (c) 2012, Intel Corporation.
- *
- * Author: Eric Barton <eric@bartonsoftware.com>
- *
- * This file is part of Portals, http://www.lustre.org
- *
- * Portals is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * Portals is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with Portals; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-#include "qswlnd.h"
-
-void
-kqswnal_notify_peer_down(kqswnal_tx_t *ktx)
-{
- time_t then;
-
- then = cfs_time_current_sec() -
- cfs_duration_sec(cfs_time_current() -
- ktx->ktx_launchtime);
-
- lnet_notify(kqswnal_data.kqn_ni, ktx->ktx_nid, 0, then);
-}
-
-void
-kqswnal_unmap_tx (kqswnal_tx_t *ktx)
-{
- int i;
-
- ktx->ktx_rail = -1; /* unset rail */
-
- if (ktx->ktx_nmappedpages == 0)
- return;
-
- CDEBUG(D_NET, "%p unloading %d frags starting at %d\n",
- ktx, ktx->ktx_nfrag, ktx->ktx_firsttmpfrag);
-
- for (i = ktx->ktx_firsttmpfrag; i < ktx->ktx_nfrag; i++)
- ep_dvma_unload(kqswnal_data.kqn_ep,
- kqswnal_data.kqn_ep_tx_nmh,
- &ktx->ktx_frags[i]);
-
- ktx->ktx_nmappedpages = 0;
-}
-
-int
-kqswnal_map_tx_kiov (kqswnal_tx_t *ktx, int offset, int nob,
- unsigned int niov, lnet_kiov_t *kiov)
-{
- int nfrags = ktx->ktx_nfrag;
- int nmapped = ktx->ktx_nmappedpages;
- int maxmapped = ktx->ktx_npages;
- __u32 basepage = ktx->ktx_basepage + nmapped;
- char *ptr;
-
- EP_RAILMASK railmask;
- int rail;
-
- if (ktx->ktx_rail < 0)
- ktx->ktx_rail = ep_xmtr_prefrail(kqswnal_data.kqn_eptx,
- EP_RAILMASK_ALL,
- kqswnal_nid2elanid(ktx->ktx_nid));
- rail = ktx->ktx_rail;
- if (rail < 0) {
- CERROR("No rails available for %s\n", libcfs_nid2str(ktx->ktx_nid));
- return (-ENETDOWN);
- }
- railmask = 1 << rail;
-
- LASSERT (nmapped <= maxmapped);
- LASSERT (nfrags >= ktx->ktx_firsttmpfrag);
- LASSERT (nfrags <= EP_MAXFRAG);
- LASSERT (niov > 0);
- LASSERT (nob > 0);
-
- /* skip complete frags before 'offset' */
- while (offset >= kiov->kiov_len) {
- offset -= kiov->kiov_len;
- kiov++;
- niov--;
- LASSERT (niov > 0);
- }
-
- do {
- int fraglen = kiov->kiov_len - offset;
-
- /* each page frag is contained in one page */
- LASSERT (kiov->kiov_offset + kiov->kiov_len <= PAGE_SIZE);
-
- if (fraglen > nob)
- fraglen = nob;
-
- nmapped++;
- if (nmapped > maxmapped) {
- CERROR("Can't map message in %d pages (max %d)\n",
- nmapped, maxmapped);
- return (-EMSGSIZE);
- }
-
- if (nfrags == EP_MAXFRAG) {
- CERROR("Message too fragmented in Elan VM (max %d frags)\n",
- EP_MAXFRAG);
- return (-EMSGSIZE);
- }
-
- /* XXX this is really crap, but we'll have to kmap until
- * EKC has a page (rather than vaddr) mapping interface */
-
- ptr = ((char *)kmap (kiov->kiov_page)) + kiov->kiov_offset + offset;
-
- CDEBUG(D_NET,
- "%p[%d] loading %p for %d, page %d, %d total\n",
- ktx, nfrags, ptr, fraglen, basepage, nmapped);
-
- ep_dvma_load(kqswnal_data.kqn_ep, NULL,
- ptr, fraglen,
- kqswnal_data.kqn_ep_tx_nmh, basepage,
- &railmask, &ktx->ktx_frags[nfrags]);
-
- if (nfrags == ktx->ktx_firsttmpfrag ||
- !ep_nmd_merge(&ktx->ktx_frags[nfrags - 1],
- &ktx->ktx_frags[nfrags - 1],
- &ktx->ktx_frags[nfrags])) {
- /* new frag if this is the first or can't merge */
- nfrags++;
- }
-
- kunmap (kiov->kiov_page);
-
- /* keep in loop for failure case */
- ktx->ktx_nmappedpages = nmapped;
-
- basepage++;
- kiov++;
- niov--;
- nob -= fraglen;
- offset = 0;
-
- /* iov must not run out before end of data */
- LASSERT (nob == 0 || niov > 0);
-
- } while (nob > 0);
-
- ktx->ktx_nfrag = nfrags;
- CDEBUG (D_NET, "%p got %d frags over %d pages\n",
- ktx, ktx->ktx_nfrag, ktx->ktx_nmappedpages);
-
- return (0);
-}
-
-#if KQSW_CKSUM
-__u32
-kqswnal_csum_kiov (__u32 csum, int offset, int nob,
- unsigned int niov, lnet_kiov_t *kiov)
-{
- char *ptr;
-
- if (nob == 0)
- return csum;
-
- LASSERT (niov > 0);
- LASSERT (nob > 0);
-
- /* skip complete frags before 'offset' */
- while (offset >= kiov->kiov_len) {
- offset -= kiov->kiov_len;
- kiov++;
- niov--;
- LASSERT (niov > 0);
- }
-
- do {
- int fraglen = kiov->kiov_len - offset;
-
- /* each page frag is contained in one page */
- LASSERT (kiov->kiov_offset + kiov->kiov_len <= PAGE_SIZE);
-
- if (fraglen > nob)
- fraglen = nob;
-
- ptr = ((char *)kmap (kiov->kiov_page)) + kiov->kiov_offset + offset;
-
- csum = kqswnal_csum(csum, ptr, fraglen);
-
- kunmap (kiov->kiov_page);
-
- kiov++;
- niov--;
- nob -= fraglen;
- offset = 0;
-
- /* iov must not run out before end of data */
- LASSERT (nob == 0 || niov > 0);
-
- } while (nob > 0);
-
- return csum;
-}
-#endif
-
-int
-kqswnal_map_tx_iov (kqswnal_tx_t *ktx, int offset, int nob,
- unsigned int niov, struct iovec *iov)
-{
- int nfrags = ktx->ktx_nfrag;
- int nmapped = ktx->ktx_nmappedpages;
- int maxmapped = ktx->ktx_npages;
- __u32 basepage = ktx->ktx_basepage + nmapped;
-
- EP_RAILMASK railmask;
- int rail;
-
- if (ktx->ktx_rail < 0)
- ktx->ktx_rail = ep_xmtr_prefrail(kqswnal_data.kqn_eptx,
- EP_RAILMASK_ALL,
- kqswnal_nid2elanid(ktx->ktx_nid));
- rail = ktx->ktx_rail;
- if (rail < 0) {
- CERROR("No rails available for %s\n", libcfs_nid2str(ktx->ktx_nid));
- return (-ENETDOWN);
- }
- railmask = 1 << rail;
-
- LASSERT (nmapped <= maxmapped);
- LASSERT (nfrags >= ktx->ktx_firsttmpfrag);
- LASSERT (nfrags <= EP_MAXFRAG);
- LASSERT (niov > 0);
- LASSERT (nob > 0);
-
- /* skip complete frags before offset */
- while (offset >= iov->iov_len) {
- offset -= iov->iov_len;
- iov++;
- niov--;
- LASSERT (niov > 0);
- }
-
- do {
- int fraglen = iov->iov_len - offset;
- long npages;
-
- if (fraglen > nob)
- fraglen = nob;
- npages = kqswnal_pages_spanned (iov->iov_base, fraglen);
-
- nmapped += npages;
- if (nmapped > maxmapped) {
- CERROR("Can't map message in %d pages (max %d)\n",
- nmapped, maxmapped);
- return (-EMSGSIZE);
- }
-
- if (nfrags == EP_MAXFRAG) {
- CERROR("Message too fragmented in Elan VM (max %d frags)\n",
- EP_MAXFRAG);
- return (-EMSGSIZE);
- }
-
- CDEBUG(D_NET,
- "%p[%d] loading %p for %d, pages %d for %ld, %d total\n",
- ktx, nfrags, iov->iov_base + offset, fraglen,
- basepage, npages, nmapped);
-
- ep_dvma_load(kqswnal_data.kqn_ep, NULL,
- iov->iov_base + offset, fraglen,
- kqswnal_data.kqn_ep_tx_nmh, basepage,
- &railmask, &ktx->ktx_frags[nfrags]);
-
- if (nfrags == ktx->ktx_firsttmpfrag ||
- !ep_nmd_merge(&ktx->ktx_frags[nfrags - 1],
- &ktx->ktx_frags[nfrags - 1],
- &ktx->ktx_frags[nfrags])) {
- /* new frag if this is the first or can't merge */
- nfrags++;
- }
-
- /* keep in loop for failure case */
- ktx->ktx_nmappedpages = nmapped;
-
- basepage += npages;
- iov++;
- niov--;
- nob -= fraglen;
- offset = 0;
-
- /* iov must not run out before end of data */
- LASSERT (nob == 0 || niov > 0);
-
- } while (nob > 0);
-
- ktx->ktx_nfrag = nfrags;
- CDEBUG (D_NET, "%p got %d frags over %d pages\n",
- ktx, ktx->ktx_nfrag, ktx->ktx_nmappedpages);
-
- return (0);
-}
-
-#if KQSW_CKSUM
-__u32
-kqswnal_csum_iov (__u32 csum, int offset, int nob,
- unsigned int niov, struct iovec *iov)
-{
- if (nob == 0)
- return csum;
-
- LASSERT (niov > 0);
- LASSERT (nob > 0);
-
- /* skip complete frags before offset */
- while (offset >= iov->iov_len) {
- offset -= iov->iov_len;
- iov++;
- niov--;
- LASSERT (niov > 0);
- }
-
- do {
- int fraglen = iov->iov_len - offset;
-
- if (fraglen > nob)
- fraglen = nob;
-
- csum = kqswnal_csum(csum, iov->iov_base + offset, fraglen);
-
- iov++;
- niov--;
- nob -= fraglen;
- offset = 0;
-
- /* iov must not run out before end of data */
- LASSERT (nob == 0 || niov > 0);
-
- } while (nob > 0);
-
- return csum;
-}
-#endif
-
-void
-kqswnal_put_idle_tx (kqswnal_tx_t *ktx)
-{
- unsigned long flags;
-
- kqswnal_unmap_tx(ktx); /* release temporary mappings */
- ktx->ktx_state = KTX_IDLE;
-
- spin_lock_irqsave(&kqswnal_data.kqn_idletxd_lock, flags);
-
- cfs_list_del(&ktx->ktx_list); /* take off active list */
- cfs_list_add(&ktx->ktx_list, &kqswnal_data.kqn_idletxds);
-
- spin_unlock_irqrestore(&kqswnal_data.kqn_idletxd_lock, flags);
-}
-
-kqswnal_tx_t *
-kqswnal_get_idle_tx (void)
-{
- unsigned long flags;
- kqswnal_tx_t *ktx;
-
- spin_lock_irqsave(&kqswnal_data.kqn_idletxd_lock, flags);
-
- if (kqswnal_data.kqn_shuttingdown ||
- cfs_list_empty(&kqswnal_data.kqn_idletxds)) {
- spin_unlock_irqrestore(&kqswnal_data.kqn_idletxd_lock, flags);
-
- return NULL;
- }
-
- ktx = cfs_list_entry (kqswnal_data.kqn_idletxds.next, kqswnal_tx_t,
- ktx_list);
- cfs_list_del (&ktx->ktx_list);
-
- cfs_list_add (&ktx->ktx_list, &kqswnal_data.kqn_activetxds);
- ktx->ktx_launcher = current->pid;
- atomic_inc(&kqswnal_data.kqn_pending_txs);
-
- spin_unlock_irqrestore(&kqswnal_data.kqn_idletxd_lock, flags);
-
- /* Idle descs can't have any mapped (as opposed to pre-mapped) pages */
- LASSERT (ktx->ktx_nmappedpages == 0);
- return (ktx);
-}
-
-void
-kqswnal_tx_done_in_thread_context (kqswnal_tx_t *ktx)
-{
- lnet_msg_t *lnetmsg0 = NULL;
- lnet_msg_t *lnetmsg1 = NULL;
- int status0 = 0;
- int status1 = 0;
- kqswnal_rx_t *krx;
-
- LASSERT (!in_interrupt());
-
- if (ktx->ktx_status == -EHOSTDOWN)
- kqswnal_notify_peer_down(ktx);
-
- switch (ktx->ktx_state) {
- case KTX_RDMA_FETCH: /* optimized PUT/REPLY handled */
- krx = (kqswnal_rx_t *)ktx->ktx_args[0];
- lnetmsg0 = (lnet_msg_t *)ktx->ktx_args[1];
- status0 = ktx->ktx_status;
-#if KQSW_CKSUM
- if (status0 == 0) { /* RDMA succeeded */
- kqswnal_msg_t *msg;
- __u32 csum;
-
- msg = (kqswnal_msg_t *)
- page_address(krx->krx_kiov[0].kiov_page);
-
- csum = (lnetmsg0->msg_kiov != NULL) ?
- kqswnal_csum_kiov(krx->krx_cksum,
- lnetmsg0->msg_offset,
- lnetmsg0->msg_wanted,
- lnetmsg0->msg_niov,
- lnetmsg0->msg_kiov) :
- kqswnal_csum_iov(krx->krx_cksum,
- lnetmsg0->msg_offset,
- lnetmsg0->msg_wanted,
- lnetmsg0->msg_niov,
- lnetmsg0->msg_iov);
-
- /* Can only check csum if I got it all */
- if (lnetmsg0->msg_wanted == lnetmsg0->msg_len &&
- csum != msg->kqm_cksum) {
- ktx->ktx_status = -EIO;
- krx->krx_rpc_reply.msg.status = -EIO;
- CERROR("RDMA checksum failed %u(%u) from %s\n",
- csum, msg->kqm_cksum,
- libcfs_nid2str(kqswnal_rx_nid(krx)));
- }
- }
-#endif
- LASSERT (krx->krx_state == KRX_COMPLETING);
- kqswnal_rx_decref (krx);
- break;
-
- case KTX_RDMA_STORE: /* optimized GET handled */
- case KTX_PUTTING: /* optimized PUT sent */
- case KTX_SENDING: /* normal send */
- lnetmsg0 = (lnet_msg_t *)ktx->ktx_args[1];
- status0 = ktx->ktx_status;
- break;
-
- case KTX_GETTING: /* optimized GET sent & payload received */
- /* Complete the GET with success since we can't avoid
- * delivering a REPLY event; we committed to it when we
- * launched the GET */
- lnetmsg0 = (lnet_msg_t *)ktx->ktx_args[1];
- status0 = 0;
- lnetmsg1 = (lnet_msg_t *)ktx->ktx_args[2];
- status1 = ktx->ktx_status;
-#if KQSW_CKSUM
- if (status1 == 0) { /* RDMA succeeded */
- lnet_msg_t *lnetmsg0 = (lnet_msg_t *)ktx->ktx_args[1];
- lnet_libmd_t *md = lnetmsg0->msg_md;
- __u32 csum;
-
- csum = ((md->md_options & LNET_MD_KIOV) != 0) ?
- kqswnal_csum_kiov(~0, 0,
- md->md_length,
- md->md_niov,
- md->md_iov.kiov) :
- kqswnal_csum_iov(~0, 0,
- md->md_length,
- md->md_niov,
- md->md_iov.iov);
-
- if (csum != ktx->ktx_cksum) {
- CERROR("RDMA checksum failed %u(%u) from %s\n",
- csum, ktx->ktx_cksum,
- libcfs_nid2str(ktx->ktx_nid));
- status1 = -EIO;
- }
- }
-#endif
- break;
-
- default:
- LASSERT (0);
- }
-
- kqswnal_put_idle_tx (ktx);
-
- lnet_finalize (kqswnal_data.kqn_ni, lnetmsg0, status0);
- if (lnetmsg1 != NULL)
- lnet_finalize (kqswnal_data.kqn_ni, lnetmsg1, status1);
-}
-
-void
-kqswnal_tx_done (kqswnal_tx_t *ktx, int status)
-{
- unsigned long flags;
-
- ktx->ktx_status = status;
-
- if (!in_interrupt()) {
- kqswnal_tx_done_in_thread_context(ktx);
- return;
- }
-
- /* Complete the send in thread context */
- spin_lock_irqsave(&kqswnal_data.kqn_sched_lock, flags);
-
- cfs_list_add_tail(&ktx->ktx_schedlist,
- &kqswnal_data.kqn_donetxds);
- wake_up(&kqswnal_data.kqn_sched_waitq);
-
- spin_unlock_irqrestore(&kqswnal_data.kqn_sched_lock, flags);
-}
-
-static void
-kqswnal_txhandler(EP_TXD *txd, void *arg, int status)
-{
- kqswnal_tx_t *ktx = (kqswnal_tx_t *)arg;
- kqswnal_rpc_reply_t *reply;
-
- LASSERT (txd != NULL);
- LASSERT (ktx != NULL);
-
- CDEBUG(D_NET, "txd %p, arg %p status %d\n", txd, arg, status);
-
- if (status != EP_SUCCESS) {
-
- CNETERR("Tx completion to %s failed: %d\n",
- libcfs_nid2str(ktx->ktx_nid), status);
-
- status = -EHOSTDOWN;
-
- } else switch (ktx->ktx_state) {
-
- case KTX_GETTING:
- case KTX_PUTTING:
- /* RPC complete! */
- reply = (kqswnal_rpc_reply_t *)ep_txd_statusblk(txd);
- if (reply->msg.magic == 0) { /* "old" peer */
- status = reply->msg.status;
- break;
- }
-
- if (reply->msg.magic != LNET_PROTO_QSW_MAGIC) {
- if (reply->msg.magic != swab32(LNET_PROTO_QSW_MAGIC)) {
- CERROR("%s unexpected rpc reply magic %08x\n",
- libcfs_nid2str(ktx->ktx_nid),
- reply->msg.magic);
- status = -EPROTO;
- break;
- }
-
- __swab32s(&reply->msg.status);
- __swab32s(&reply->msg.version);
-
- if (ktx->ktx_state == KTX_GETTING) {
- __swab32s(&reply->msg.u.get.len);
- __swab32s(&reply->msg.u.get.cksum);
- }
- }
-
- status = reply->msg.status;
- if (status != 0) {
- CERROR("%s RPC status %08x\n",
- libcfs_nid2str(ktx->ktx_nid), status);
- break;
- }
-
- if (ktx->ktx_state == KTX_GETTING) {
- lnet_set_reply_msg_len(kqswnal_data.kqn_ni,
- (lnet_msg_t *)ktx->ktx_args[2],
- reply->msg.u.get.len);
-#if KQSW_CKSUM
- ktx->ktx_cksum = reply->msg.u.get.cksum;
-#endif
- }
- break;
-
- case KTX_SENDING:
- status = 0;
- break;
-
- default:
- LBUG();
- break;
- }
-
- kqswnal_tx_done(ktx, status);
-}
-
-int
-kqswnal_launch (kqswnal_tx_t *ktx)
-{
- /* Don't block for transmit descriptor if we're in interrupt context */
- int attr = in_interrupt() ? (EP_NO_SLEEP | EP_NO_ALLOC) : 0;
- int dest = kqswnal_nid2elanid (ktx->ktx_nid);
- unsigned long flags;
- int rc;
-
- ktx->ktx_launchtime = cfs_time_current();
-
- if (kqswnal_data.kqn_shuttingdown)
- return (-ESHUTDOWN);
-
- LASSERT (dest >= 0); /* must be a peer */
-
- if (ktx->ktx_nmappedpages != 0)
- attr = EP_SET_PREFRAIL(attr, ktx->ktx_rail);
-
- switch (ktx->ktx_state) {
- case KTX_GETTING:
- case KTX_PUTTING:
- if (the_lnet.ln_testprotocompat != 0) {
- kqswnal_msg_t *msg = (kqswnal_msg_t *)ktx->ktx_buffer;
-
- /* single-shot proto test:
- * Future version queries will use an RPC, so I'll
- * co-opt one of the existing ones */
- LNET_LOCK();
- if ((the_lnet.ln_testprotocompat & 1) != 0) {
- msg->kqm_version++;
- the_lnet.ln_testprotocompat &= ~1;
- }
- if ((the_lnet.ln_testprotocompat & 2) != 0) {
- msg->kqm_magic = LNET_PROTO_MAGIC;
- the_lnet.ln_testprotocompat &= ~2;
- }
- LNET_UNLOCK();
- }
-
- /* NB ktx_frag[0] is the GET/PUT hdr + kqswnal_remotemd_t.
- * The other frags are the payload, awaiting RDMA */
- rc = ep_transmit_rpc(kqswnal_data.kqn_eptx, dest,
- ktx->ktx_port, attr,
- kqswnal_txhandler, ktx,
- NULL, ktx->ktx_frags, 1);
- break;
-
- case KTX_SENDING:
- rc = ep_transmit_message(kqswnal_data.kqn_eptx, dest,
- ktx->ktx_port, attr,
- kqswnal_txhandler, ktx,
- NULL, ktx->ktx_frags, ktx->ktx_nfrag);
- break;
-
- default:
- LBUG();
- rc = -EINVAL; /* no compiler warning please */
- break;
- }
-
- switch (rc) {
- case EP_SUCCESS: /* success */
- return (0);
-
- case EP_ENOMEM: /* can't allocate ep txd => queue for later */
- spin_lock_irqsave(&kqswnal_data.kqn_sched_lock, flags);
-
- cfs_list_add_tail(&ktx->ktx_schedlist,
- &kqswnal_data.kqn_delayedtxds);
- wake_up(&kqswnal_data.kqn_sched_waitq);
-
- spin_unlock_irqrestore(&kqswnal_data.kqn_sched_lock,
- flags);
- return (0);
-
- default: /* fatal error */
- CNETERR ("Tx to %s failed: %d\n",
- libcfs_nid2str(ktx->ktx_nid), rc);
- kqswnal_notify_peer_down(ktx);
- return (-EHOSTUNREACH);
- }
-}
-
-#if 0
-static char *
-hdr_type_string (lnet_hdr_t *hdr)
-{
- switch (hdr->type) {
- case LNET_MSG_ACK:
- return ("ACK");
- case LNET_MSG_PUT:
- return ("PUT");
- case LNET_MSG_GET:
- return ("GET");
- case LNET_MSG_REPLY:
- return ("REPLY");
- default:
- return ("<UNKNOWN>");
- }
-}
-
-static void
-kqswnal_cerror_hdr(lnet_hdr_t * hdr)
-{
- char *type_str = hdr_type_string (hdr);
-
- CERROR("P3 Header at %p of type %s length %d\n", hdr, type_str,
- le32_to_cpu(hdr->payload_length));
- CERROR(" From nid/pid "LPU64"/%u\n", le64_to_cpu(hdr->src_nid),
- le32_to_cpu(hdr->src_pid));
- CERROR(" To nid/pid "LPU64"/%u\n", le64_to_cpu(hdr->dest_nid),
- le32_to_cpu(hdr->dest_pid));
-
- switch (le32_to_cpu(hdr->type)) {
- case LNET_MSG_PUT:
- CERROR(" Ptl index %d, ack md "LPX64"."LPX64", "
- "match bits "LPX64"\n",
- le32_to_cpu(hdr->msg.put.ptl_index),
- hdr->msg.put.ack_wmd.wh_interface_cookie,
- hdr->msg.put.ack_wmd.wh_object_cookie,
- le64_to_cpu(hdr->msg.put.match_bits));
- CERROR(" offset %d, hdr data "LPX64"\n",
- le32_to_cpu(hdr->msg.put.offset),
- hdr->msg.put.hdr_data);
- break;
-
- case LNET_MSG_GET:
- CERROR(" Ptl index %d, return md "LPX64"."LPX64", "
- "match bits "LPX64"\n",
- le32_to_cpu(hdr->msg.get.ptl_index),
- hdr->msg.get.return_wmd.wh_interface_cookie,
- hdr->msg.get.return_wmd.wh_object_cookie,
- hdr->msg.get.match_bits);
- CERROR(" Length %d, src offset %d\n",
- le32_to_cpu(hdr->msg.get.sink_length),
- le32_to_cpu(hdr->msg.get.src_offset));
- break;
-
- case LNET_MSG_ACK:
- CERROR(" dst md "LPX64"."LPX64", manipulated length %d\n",
- hdr->msg.ack.dst_wmd.wh_interface_cookie,
- hdr->msg.ack.dst_wmd.wh_object_cookie,
- le32_to_cpu(hdr->msg.ack.mlength));
- break;
-
- case LNET_MSG_REPLY:
- CERROR(" dst md "LPX64"."LPX64"\n",
- hdr->msg.reply.dst_wmd.wh_interface_cookie,
- hdr->msg.reply.dst_wmd.wh_object_cookie);
- }
-
-} /* end of kqswnal_cerror_hdr() */
-#endif
-
-int
-kqswnal_check_rdma (int nlfrag, EP_NMD *lfrag,
- int nrfrag, EP_NMD *rfrag)
-{
- int i;
-
- if (nlfrag != nrfrag) {
- CERROR("Can't cope with unequal # frags: %d local %d remote\n",
- nlfrag, nrfrag);
- return (-EINVAL);
- }
-
- for (i = 0; i < nlfrag; i++)
- if (lfrag[i].nmd_len != rfrag[i].nmd_len) {
- CERROR("Can't cope with unequal frags %d(%d):"
- " %d local %d remote\n",
- i, nlfrag, lfrag[i].nmd_len, rfrag[i].nmd_len);
- return (-EINVAL);
- }
-
- return (0);
-}
-
-kqswnal_remotemd_t *
-kqswnal_get_portalscompat_rmd (kqswnal_rx_t *krx)
-{
- /* Check that the RMD sent after the "raw" LNET header in a
- * portals-compatible QSWLND message is OK */
- char *buffer = (char *)page_address(krx->krx_kiov[0].kiov_page);
- kqswnal_remotemd_t *rmd = (kqswnal_remotemd_t *)(buffer + sizeof(lnet_hdr_t));
-
- /* Note RDMA addresses are sent in native endian-ness in the "old"
- * portals protocol so no swabbing... */
-
- if (buffer + krx->krx_nob < (char *)(rmd + 1)) {
- /* msg too small to discover rmd size */
- CERROR ("Incoming message [%d] too small for RMD (%d needed)\n",
- krx->krx_nob, (int)(((char *)(rmd + 1)) - buffer));
- return (NULL);
- }
-
- if (buffer + krx->krx_nob < (char *)&rmd->kqrmd_frag[rmd->kqrmd_nfrag]) {
- /* rmd doesn't fit in the incoming message */
- CERROR ("Incoming message [%d] too small for RMD[%d] (%d needed)\n",
- krx->krx_nob, rmd->kqrmd_nfrag,
- (int)(((char *)&rmd->kqrmd_frag[rmd->kqrmd_nfrag]) - buffer));
- return (NULL);
- }
-
- return (rmd);
-}
-
-void
-kqswnal_rdma_store_complete (EP_RXD *rxd)
-{
- int status = ep_rxd_status(rxd);
- kqswnal_tx_t *ktx = (kqswnal_tx_t *)ep_rxd_arg(rxd);
- kqswnal_rx_t *krx = (kqswnal_rx_t *)ktx->ktx_args[0];
-
- CDEBUG((status == EP_SUCCESS) ? D_NET : D_ERROR,
- "rxd %p, ktx %p, status %d\n", rxd, ktx, status);
-
- LASSERT (ktx->ktx_state == KTX_RDMA_STORE);
- LASSERT (krx->krx_rxd == rxd);
- LASSERT (krx->krx_rpc_reply_needed);
-
- krx->krx_rpc_reply_needed = 0;
- kqswnal_rx_decref (krx);
-
- /* free ktx & finalize() its lnet_msg_t */
- kqswnal_tx_done(ktx, (status == EP_SUCCESS) ? 0 : -ECONNABORTED);
-}
-
-void
-kqswnal_rdma_fetch_complete (EP_RXD *rxd)
-{
- /* Completed fetching the PUT/REPLY data */
- int status = ep_rxd_status(rxd);
- kqswnal_tx_t *ktx = (kqswnal_tx_t *)ep_rxd_arg(rxd);
- kqswnal_rx_t *krx = (kqswnal_rx_t *)ktx->ktx_args[0];
-
- CDEBUG((status == EP_SUCCESS) ? D_NET : D_ERROR,
- "rxd %p, ktx %p, status %d\n", rxd, ktx, status);
-
- LASSERT (ktx->ktx_state == KTX_RDMA_FETCH);
- LASSERT (krx->krx_rxd == rxd);
- /* RPC completes with failure by default */
- LASSERT (krx->krx_rpc_reply_needed);
- LASSERT (krx->krx_rpc_reply.msg.status != 0);
-
- if (status == EP_SUCCESS) {
- krx->krx_rpc_reply.msg.status = 0;
- status = 0;
- } else {
- /* Abandon RPC since get failed */
- krx->krx_rpc_reply_needed = 0;
- status = -ECONNABORTED;
- }
-
- /* krx gets decref'd in kqswnal_tx_done_in_thread_context() */
- LASSERT (krx->krx_state == KRX_PARSE);
- krx->krx_state = KRX_COMPLETING;
-
- /* free ktx & finalize() its lnet_msg_t */
- kqswnal_tx_done(ktx, status);
-}
-
-int
-kqswnal_rdma (kqswnal_rx_t *krx, lnet_msg_t *lntmsg,
- int type, kqswnal_remotemd_t *rmd,
- unsigned int niov, struct iovec *iov, lnet_kiov_t *kiov,
- unsigned int offset, unsigned int len)
-{
- kqswnal_tx_t *ktx;
- int eprc;
- int rc;
-
- /* Not both mapped and paged payload */
- LASSERT (iov == NULL || kiov == NULL);
- /* RPC completes with failure by default */
- LASSERT (krx->krx_rpc_reply_needed);
- LASSERT (krx->krx_rpc_reply.msg.status != 0);
-
- if (len == 0) {
- /* data got truncated to nothing. */
- lnet_finalize(kqswnal_data.kqn_ni, lntmsg, 0);
- /* Let kqswnal_rx_done() complete the RPC with success */
- krx->krx_rpc_reply.msg.status = 0;
- return (0);
- }
-
- /* NB I'm using 'ktx' just to map the local RDMA buffers; I'm not
- actually sending a portals message with it */
- ktx = kqswnal_get_idle_tx();
- if (ktx == NULL) {
- CERROR ("Can't get txd for RDMA with %s\n",
- libcfs_nid2str(kqswnal_rx_nid(krx)));
- return (-ENOMEM);
- }
-
- ktx->ktx_state = type;
- ktx->ktx_nid = kqswnal_rx_nid(krx);
- ktx->ktx_args[0] = krx;
- ktx->ktx_args[1] = lntmsg;
-
- LASSERT (atomic_read(&krx->krx_refcount) > 0);
- /* Take an extra ref for the completion callback */
- atomic_inc(&krx->krx_refcount);
-
- /* Map on the rail the RPC prefers */
- ktx->ktx_rail = ep_rcvr_prefrail(krx->krx_eprx,
- ep_rxd_railmask(krx->krx_rxd));
-
- /* Start mapping at offset 0 (we're not mapping any headers) */
- ktx->ktx_nfrag = ktx->ktx_firsttmpfrag = 0;
-
- if (kiov != NULL)
- rc = kqswnal_map_tx_kiov(ktx, offset, len, niov, kiov);
- else
- rc = kqswnal_map_tx_iov(ktx, offset, len, niov, iov);
-
- if (rc != 0) {
- CERROR ("Can't map local RDMA data: %d\n", rc);
- goto out;
- }
-
- rc = kqswnal_check_rdma (ktx->ktx_nfrag, ktx->ktx_frags,
- rmd->kqrmd_nfrag, rmd->kqrmd_frag);
- if (rc != 0) {
- CERROR ("Incompatible RDMA descriptors\n");
- goto out;
- }
-
- switch (type) {
- default:
- LBUG();
-
- case KTX_RDMA_STORE:
- krx->krx_rpc_reply.msg.status = 0;
- krx->krx_rpc_reply.msg.magic = LNET_PROTO_QSW_MAGIC;
- krx->krx_rpc_reply.msg.version = QSWLND_PROTO_VERSION;
- krx->krx_rpc_reply.msg.u.get.len = len;
-#if KQSW_CKSUM
- krx->krx_rpc_reply.msg.u.get.cksum = (kiov != NULL) ?
- kqswnal_csum_kiov(~0, offset, len, niov, kiov) :
- kqswnal_csum_iov(~0, offset, len, niov, iov);
- if (*kqswnal_tunables.kqn_inject_csum_error == 4) {
- krx->krx_rpc_reply.msg.u.get.cksum++;
- *kqswnal_tunables.kqn_inject_csum_error = 0;
- }
-#endif
- eprc = ep_complete_rpc(krx->krx_rxd,
- kqswnal_rdma_store_complete, ktx,
- &krx->krx_rpc_reply.ep_statusblk,
- ktx->ktx_frags, rmd->kqrmd_frag,
- rmd->kqrmd_nfrag);
- if (eprc != EP_SUCCESS) {
- CERROR("can't complete RPC: %d\n", eprc);
- /* don't re-attempt RPC completion */
- krx->krx_rpc_reply_needed = 0;
- rc = -ECONNABORTED;
- }
- break;
-
- case KTX_RDMA_FETCH:
- eprc = ep_rpc_get (krx->krx_rxd,
- kqswnal_rdma_fetch_complete, ktx,
- rmd->kqrmd_frag, ktx->ktx_frags, ktx->ktx_nfrag);
- if (eprc != EP_SUCCESS) {
- CERROR("ep_rpc_get failed: %d\n", eprc);
- /* Don't attempt RPC completion:
- * EKC nuked it when the get failed */
- krx->krx_rpc_reply_needed = 0;
- rc = -ECONNABORTED;
- }
- break;
- }
-
- out:
- if (rc != 0) {
- kqswnal_rx_decref(krx); /* drop callback's ref */
- kqswnal_put_idle_tx (ktx);
- }
-
- atomic_dec(&kqswnal_data.kqn_pending_txs);
- return (rc);
-}
-
-int
-kqswnal_send (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg)
-{
- lnet_hdr_t *hdr = &lntmsg->msg_hdr;
- int type = lntmsg->msg_type;
- lnet_process_id_t target = lntmsg->msg_target;
- int target_is_router = lntmsg->msg_target_is_router;
- int routing = lntmsg->msg_routing;
- unsigned int payload_niov = lntmsg->msg_niov;
- struct iovec *payload_iov = lntmsg->msg_iov;
- lnet_kiov_t *payload_kiov = lntmsg->msg_kiov;
- unsigned int payload_offset = lntmsg->msg_offset;
- unsigned int payload_nob = lntmsg->msg_len;
- int nob;
- kqswnal_tx_t *ktx;
- int rc;
-
- /* NB 1. hdr is in network byte order */
- /* 2. 'private' depends on the message type */
-
- CDEBUG(D_NET, "sending %u bytes in %d frags to %s\n",
- payload_nob, payload_niov, libcfs_id2str(target));
-
- LASSERT (payload_nob == 0 || payload_niov > 0);
- LASSERT (payload_niov <= LNET_MAX_IOV);
-
- /* It must be OK to kmap() if required */
- LASSERT (payload_kiov == NULL || !in_interrupt ());
- /* payload is either all vaddrs or all pages */
- LASSERT (!(payload_kiov != NULL && payload_iov != NULL));
-
- if (kqswnal_nid2elanid (target.nid) < 0) {
- CERROR("%s not in my cluster\n", libcfs_nid2str(target.nid));
- return -EIO;
- }
-
- /* I may not block for a transmit descriptor if I might block the
- * router, receiver, or an interrupt handler. */
- ktx = kqswnal_get_idle_tx();
- if (ktx == NULL) {
- CERROR ("Can't get txd for msg type %d for %s\n",
- type, libcfs_nid2str(target.nid));
- return (-ENOMEM);
- }
-
- ktx->ktx_state = KTX_SENDING;
- ktx->ktx_nid = target.nid;
- ktx->ktx_args[0] = private;
- ktx->ktx_args[1] = lntmsg;
- ktx->ktx_args[2] = NULL; /* set when a GET commits to REPLY */
-
- /* The first frag will be the pre-mapped buffer. */
- ktx->ktx_nfrag = ktx->ktx_firsttmpfrag = 1;
-
- if ((!target_is_router && /* target.nid is final dest */
- !routing && /* I'm the source */
- type == LNET_MSG_GET && /* optimize GET? */
- *kqswnal_tunables.kqn_optimized_gets != 0 &&
- lntmsg->msg_md->md_length >=
- *kqswnal_tunables.kqn_optimized_gets) ||
- ((type == LNET_MSG_PUT || /* optimize PUT? */
- type == LNET_MSG_REPLY) && /* optimize REPLY? */
- *kqswnal_tunables.kqn_optimized_puts != 0 &&
- payload_nob >= *kqswnal_tunables.kqn_optimized_puts)) {
- lnet_libmd_t *md = lntmsg->msg_md;
- kqswnal_msg_t *msg = (kqswnal_msg_t *)ktx->ktx_buffer;
- lnet_hdr_t *mhdr;
- kqswnal_remotemd_t *rmd;
-
- /* Optimised path: I send over the Elan vaddrs of the local
- * buffers, and my peer DMAs directly to/from them.
- *
- * First I set up ktx as if it was going to send this
- * payload, (it needs to map it anyway). This fills
- * ktx_frags[1] and onward with the network addresses
- * of the buffer frags. */
-
- /* Send an RDMA message */
- msg->kqm_magic = LNET_PROTO_QSW_MAGIC;
- msg->kqm_version = QSWLND_PROTO_VERSION;
- msg->kqm_type = QSWLND_MSG_RDMA;
-
- mhdr = &msg->kqm_u.rdma.kqrm_hdr;
- rmd = &msg->kqm_u.rdma.kqrm_rmd;
-
- *mhdr = *hdr;
- nob = (((char *)rmd) - ktx->ktx_buffer);
-
- if (type == LNET_MSG_GET) {
- if ((md->md_options & LNET_MD_KIOV) != 0)
- rc = kqswnal_map_tx_kiov (ktx, 0, md->md_length,
- md->md_niov, md->md_iov.kiov);
- else
- rc = kqswnal_map_tx_iov (ktx, 0, md->md_length,
- md->md_niov, md->md_iov.iov);
- ktx->ktx_state = KTX_GETTING;
- } else {
- if (payload_kiov != NULL)
- rc = kqswnal_map_tx_kiov(ktx, 0, payload_nob,
- payload_niov, payload_kiov);
- else
- rc = kqswnal_map_tx_iov(ktx, 0, payload_nob,
- payload_niov, payload_iov);
- ktx->ktx_state = KTX_PUTTING;
- }
-
- if (rc != 0)
- goto out;
-
- rmd->kqrmd_nfrag = ktx->ktx_nfrag - 1;
- nob += offsetof(kqswnal_remotemd_t,
- kqrmd_frag[rmd->kqrmd_nfrag]);
- LASSERT (nob <= KQSW_TX_BUFFER_SIZE);
-
- memcpy(&rmd->kqrmd_frag[0], &ktx->ktx_frags[1],
- rmd->kqrmd_nfrag * sizeof(EP_NMD));
-
- ep_nmd_subset(&ktx->ktx_frags[0], &ktx->ktx_ebuffer, 0, nob);
-#if KQSW_CKSUM
- msg->kqm_nob = nob + payload_nob;
- msg->kqm_cksum = 0;
- msg->kqm_cksum = kqswnal_csum(~0, (char *)msg, nob);
-#endif
- if (type == LNET_MSG_GET) {
- /* Allocate reply message now while I'm in thread context */
- ktx->ktx_args[2] = lnet_create_reply_msg (
- kqswnal_data.kqn_ni, lntmsg);
- if (ktx->ktx_args[2] == NULL)
- goto out;
-
- /* NB finalizing the REPLY message is my
- * responsibility now, whatever happens. */
-#if KQSW_CKSUM
- if (*kqswnal_tunables.kqn_inject_csum_error == 3) {
- msg->kqm_cksum++;
- *kqswnal_tunables.kqn_inject_csum_error = 0;
- }
-
- } else if (payload_kiov != NULL) {
- /* must checksum payload after header so receiver can
- * compute partial header cksum before swab. Sadly
- * this causes 2 rounds of kmap */
- msg->kqm_cksum =
- kqswnal_csum_kiov(msg->kqm_cksum, 0, payload_nob,
- payload_niov, payload_kiov);
- if (*kqswnal_tunables.kqn_inject_csum_error == 2) {
- msg->kqm_cksum++;
- *kqswnal_tunables.kqn_inject_csum_error = 0;
- }
- } else {
- msg->kqm_cksum =
- kqswnal_csum_iov(msg->kqm_cksum, 0, payload_nob,
- payload_niov, payload_iov);
- if (*kqswnal_tunables.kqn_inject_csum_error == 2) {
- msg->kqm_cksum++;
- *kqswnal_tunables.kqn_inject_csum_error = 0;
- }
-#endif
- }
-
- } else if (payload_nob <= *kqswnal_tunables.kqn_tx_maxcontig) {
- lnet_hdr_t *mhdr;
- char *payload;
- kqswnal_msg_t *msg = (kqswnal_msg_t *)ktx->ktx_buffer;
-
- /* single frag copied into the pre-mapped buffer */
- msg->kqm_magic = LNET_PROTO_QSW_MAGIC;
- msg->kqm_version = QSWLND_PROTO_VERSION;
- msg->kqm_type = QSWLND_MSG_IMMEDIATE;
-
- mhdr = &msg->kqm_u.immediate.kqim_hdr;
- payload = msg->kqm_u.immediate.kqim_payload;
-
- *mhdr = *hdr;
- nob = (payload - ktx->ktx_buffer) + payload_nob;
-
- ep_nmd_subset(&ktx->ktx_frags[0], &ktx->ktx_ebuffer, 0, nob);
-
- if (payload_kiov != NULL)
- lnet_copy_kiov2flat(KQSW_TX_BUFFER_SIZE, payload, 0,
- payload_niov, payload_kiov,
- payload_offset, payload_nob);
- else
- lnet_copy_iov2flat(KQSW_TX_BUFFER_SIZE, payload, 0,
- payload_niov, payload_iov,
- payload_offset, payload_nob);
-#if KQSW_CKSUM
- msg->kqm_nob = nob;
- msg->kqm_cksum = 0;
- msg->kqm_cksum = kqswnal_csum(~0, (char *)msg, nob);
- if (*kqswnal_tunables.kqn_inject_csum_error == 1) {
- msg->kqm_cksum++;
- *kqswnal_tunables.kqn_inject_csum_error = 0;
- }
-#endif
- } else {
- lnet_hdr_t *mhdr;
- kqswnal_msg_t *msg = (kqswnal_msg_t *)ktx->ktx_buffer;
-
- /* multiple frags: first is hdr in pre-mapped buffer */
- msg->kqm_magic = LNET_PROTO_QSW_MAGIC;
- msg->kqm_version = QSWLND_PROTO_VERSION;
- msg->kqm_type = QSWLND_MSG_IMMEDIATE;
-
- mhdr = &msg->kqm_u.immediate.kqim_hdr;
- nob = offsetof(kqswnal_msg_t, kqm_u.immediate.kqim_payload);
-
- *mhdr = *hdr;
-
- ep_nmd_subset(&ktx->ktx_frags[0], &ktx->ktx_ebuffer, 0, nob);
-
- if (payload_kiov != NULL)
- rc = kqswnal_map_tx_kiov (ktx, payload_offset, payload_nob,
- payload_niov, payload_kiov);
- else
- rc = kqswnal_map_tx_iov (ktx, payload_offset, payload_nob,
- payload_niov, payload_iov);
- if (rc != 0)
- goto out;
-
-#if KQSW_CKSUM
- msg->kqm_nob = nob + payload_nob;
- msg->kqm_cksum = 0;
- msg->kqm_cksum = kqswnal_csum(~0, (char *)msg, nob);
-
- msg->kqm_cksum = (payload_kiov != NULL) ?
- kqswnal_csum_kiov(msg->kqm_cksum,
- payload_offset, payload_nob,
- payload_niov, payload_kiov) :
- kqswnal_csum_iov(msg->kqm_cksum,
- payload_offset, payload_nob,
- payload_niov, payload_iov);
-
- if (*kqswnal_tunables.kqn_inject_csum_error == 1) {
- msg->kqm_cksum++;
- *kqswnal_tunables.kqn_inject_csum_error = 0;
- }
-#endif
- nob += payload_nob;
- }
-
- ktx->ktx_port = (nob <= KQSW_SMALLMSG) ?
- EP_MSG_SVC_PORTALS_SMALL : EP_MSG_SVC_PORTALS_LARGE;
-
- rc = kqswnal_launch (ktx);
-
- out:
- CDEBUG_LIMIT(rc == 0 ? D_NET : D_NETERROR, "%s %d bytes to %s%s: rc %d\n",
- routing ? (rc == 0 ? "Routed" : "Failed to route") :
- (rc == 0 ? "Sent" : "Failed to send"),
- nob, libcfs_nid2str(target.nid),
- target_is_router ? "(router)" : "", rc);
-
- if (rc != 0) {
- lnet_msg_t *repmsg = (lnet_msg_t *)ktx->ktx_args[2];
- int state = ktx->ktx_state;
-
- kqswnal_put_idle_tx (ktx);
-
- if (state == KTX_GETTING && repmsg != NULL) {
- /* We committed to reply, but there was a problem
- * launching the GET. We can't avoid delivering a
- * REPLY event since we committed above, so we
- * pretend the GET succeeded but the REPLY
- * failed. */
- rc = 0;
- lnet_finalize (kqswnal_data.kqn_ni, lntmsg, 0);
- lnet_finalize (kqswnal_data.kqn_ni, repmsg, -EIO);
- }
-
- }
-
- atomic_dec(&kqswnal_data.kqn_pending_txs);
- return (rc == 0 ? 0 : -EIO);
-}
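The send path above chooses among three strategies: an RDMA message when the payload clears the optimized_gets/optimized_puts thresholds, a single copy into the pre-mapped buffer when it fits tx_maxcontig, and a multi-fragment send otherwise. A minimal sketch of just that policy, with hypothetical names (the real GET case additionally requires that this node is the source and that target.nid is the final destination):

/* Sketch only: distills the three-way dispatch in kqswnal_send() above.
 * choose_tx_path() and its parameters are hypothetical; the thresholds
 * correspond to the module tunables defined later in this patch. */
enum tx_path { TX_RDMA, TX_COPY_CONTIG, TX_MAP_FRAGS };

static enum tx_path
choose_tx_path(int type, int payload_nob, int md_length,
	       int opt_gets, int opt_puts, int tx_maxcontig)
{
	if (type == LNET_MSG_GET &&
	    opt_gets != 0 && md_length >= opt_gets)
		return TX_RDMA;		/* peer DMAs the reply directly */

	if ((type == LNET_MSG_PUT || type == LNET_MSG_REPLY) &&
	    opt_puts != 0 && payload_nob >= opt_puts)
		return TX_RDMA;		/* peer DMAs the payload directly */

	if (payload_nob <= tx_maxcontig)
		return TX_COPY_CONTIG;	/* one copy into pre-mapped buffer */

	return TX_MAP_FRAGS;		/* header pre-mapped, payload mapped */
}

With the module defaults below (tx_maxcontig = 1K, optimized_puts = 32K, optimized_gets = 2K), a 64K PUT goes out as an RDMA message while a 512-byte PUT is copied inline.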
-
-void
-kqswnal_requeue_rx (kqswnal_rx_t *krx)
-{
- LASSERT (atomic_read(&krx->krx_refcount) == 0);
- LASSERT (!krx->krx_rpc_reply_needed);
-
- krx->krx_state = KRX_POSTED;
-
- if (kqswnal_data.kqn_shuttingdown) {
- /* free EKC rxd on shutdown */
- ep_complete_receive(krx->krx_rxd);
- } else {
- /* repost receive */
- ep_requeue_receive(krx->krx_rxd,
- kqswnal_rxhandler, krx,
- &krx->krx_elanbuffer, 0);
- }
-}
-
-void
-kqswnal_rpc_complete (EP_RXD *rxd)
-{
- int status = ep_rxd_status(rxd);
- kqswnal_rx_t *krx = (kqswnal_rx_t *)ep_rxd_arg(rxd);
-
- CDEBUG((status == EP_SUCCESS) ? D_NET : D_ERROR,
- "rxd %p, krx %p, status %d\n", rxd, krx, status);
-
- LASSERT (krx->krx_rxd == rxd);
- LASSERT (krx->krx_rpc_reply_needed);
-
- krx->krx_rpc_reply_needed = 0;
- kqswnal_requeue_rx (krx);
-}
-
-void
-kqswnal_rx_done (kqswnal_rx_t *krx)
-{
- int rc;
-
- LASSERT (atomic_read(&krx->krx_refcount) == 0);
-
- if (krx->krx_rpc_reply_needed) {
- /* We've not completed the peer's RPC yet... */
- krx->krx_rpc_reply.msg.magic = LNET_PROTO_QSW_MAGIC;
- krx->krx_rpc_reply.msg.version = QSWLND_PROTO_VERSION;
-
- LASSERT (!in_interrupt());
-
- rc = ep_complete_rpc(krx->krx_rxd,
- kqswnal_rpc_complete, krx,
- &krx->krx_rpc_reply.ep_statusblk,
- NULL, NULL, 0);
- if (rc == EP_SUCCESS)
- return;
-
- CERROR("can't complete RPC: %d\n", rc);
- krx->krx_rpc_reply_needed = 0;
- }
-
- kqswnal_requeue_rx(krx);
-}
-
-void
-kqswnal_parse (kqswnal_rx_t *krx)
-{
- lnet_ni_t *ni = kqswnal_data.kqn_ni;
- kqswnal_msg_t *msg = (kqswnal_msg_t *)page_address(krx->krx_kiov[0].kiov_page);
- lnet_nid_t fromnid = kqswnal_rx_nid(krx);
- int swab;
- int n;
- int i;
- int nob;
- int rc;
-
- LASSERT (atomic_read(&krx->krx_refcount) == 1);
-
- if (krx->krx_nob < offsetof(kqswnal_msg_t, kqm_u)) {
- CERROR("Short message %d received from %s\n",
- krx->krx_nob, libcfs_nid2str(fromnid));
- goto done;
- }
-
- swab = msg->kqm_magic == __swab32(LNET_PROTO_QSW_MAGIC);
-
- if (swab || msg->kqm_magic == LNET_PROTO_QSW_MAGIC) {
-#if KQSW_CKSUM
- __u32 csum0;
- __u32 csum1;
-
- /* csum byte array before swab */
- csum1 = msg->kqm_cksum;
- msg->kqm_cksum = 0;
- csum0 = kqswnal_csum_kiov(~0, 0, krx->krx_nob,
- krx->krx_npages, krx->krx_kiov);
- msg->kqm_cksum = csum1;
-#endif
-
- if (swab) {
- __swab16s(&msg->kqm_version);
- __swab16s(&msg->kqm_type);
-#if KQSW_CKSUM
- __swab32s(&msg->kqm_cksum);
- __swab32s(&msg->kqm_nob);
-#endif
- }
-
- if (msg->kqm_version != QSWLND_PROTO_VERSION) {
- /* Future protocol version compatibility support!
- * The next qswlnd-specific protocol rev will first
- * send an RPC to check version.
- * 1.4.6 and 1.4.7.early reply with a status
- * block containing its current version.
- * Later versions send a failure (-ve) status +
- * magic/version */
-
- if (!krx->krx_rpc_reply_needed) {
- CERROR("Unexpected version %d from %s\n",
- msg->kqm_version, libcfs_nid2str(fromnid));
- goto done;
- }
-
- LASSERT (krx->krx_rpc_reply.msg.status == -EPROTO);
- goto done;
- }
-
- switch (msg->kqm_type) {
- default:
- CERROR("Bad request type %x from %s\n",
- msg->kqm_type, libcfs_nid2str(fromnid));
- goto done;
-
- case QSWLND_MSG_IMMEDIATE:
- if (krx->krx_rpc_reply_needed) {
- /* Should have been a simple message */
- CERROR("IMMEDIATE sent as RPC from %s\n",
- libcfs_nid2str(fromnid));
- goto done;
- }
-
- nob = offsetof(kqswnal_msg_t, kqm_u.immediate.kqim_payload);
- if (krx->krx_nob < nob) {
- CERROR("Short IMMEDIATE %d(%d) from %s\n",
- krx->krx_nob, nob, libcfs_nid2str(fromnid));
- goto done;
- }
-
-#if KQSW_CKSUM
- if (csum0 != msg->kqm_cksum) {
- CERROR("Bad IMMEDIATE checksum %08x(%08x) from %s\n",
- csum0, msg->kqm_cksum, libcfs_nid2str(fromnid));
- CERROR("nob %d (%d)\n", krx->krx_nob, msg->kqm_nob);
- goto done;
- }
-#endif
- rc = lnet_parse(ni, &msg->kqm_u.immediate.kqim_hdr,
- fromnid, krx, 0);
- if (rc < 0)
- goto done;
- return;
-
- case QSWLND_MSG_RDMA:
- if (!krx->krx_rpc_reply_needed) {
- /* Should have been an RPC */
- CERROR("RDMA sent as simple message from %s\n",
- libcfs_nid2str(fromnid));
- goto done;
- }
-
- nob = offsetof(kqswnal_msg_t,
- kqm_u.rdma.kqrm_rmd.kqrmd_frag[0]);
- if (krx->krx_nob < nob) {
- CERROR("Short RDMA message %d(%d) from %s\n",
- krx->krx_nob, nob, libcfs_nid2str(fromnid));
- goto done;
- }
-
- if (swab)
- __swab32s(&msg->kqm_u.rdma.kqrm_rmd.kqrmd_nfrag);
-
- n = msg->kqm_u.rdma.kqrm_rmd.kqrmd_nfrag;
- nob = offsetof(kqswnal_msg_t,
- kqm_u.rdma.kqrm_rmd.kqrmd_frag[n]);
-
- if (krx->krx_nob < nob) {
- CERROR("short RDMA message %d(%d) from %s\n",
- krx->krx_nob, nob, libcfs_nid2str(fromnid));
- goto done;
- }
-
- if (swab) {
- for (i = 0; i < n; i++) {
- EP_NMD *nmd = &msg->kqm_u.rdma.kqrm_rmd.kqrmd_frag[i];
-
- __swab32s(&nmd->nmd_addr);
- __swab32s(&nmd->nmd_len);
- __swab32s(&nmd->nmd_attr);
- }
- }
-
-#if KQSW_CKSUM
- krx->krx_cksum = csum0; /* stash checksum so far */
-#endif
- rc = lnet_parse(ni, &msg->kqm_u.rdma.kqrm_hdr,
- fromnid, krx, 1);
- if (rc < 0)
- goto done;
- return;
- }
- /* Not Reached */
- }
-
- if (msg->kqm_magic == LNET_PROTO_MAGIC ||
- msg->kqm_magic == __swab32(LNET_PROTO_MAGIC)) {
- /* Future protocol version compatibility support!
- * When LNET unifies protocols over all LNDs, the first thing a
- * peer will send will be a version query RPC.
- * 1.4.6 and 1.4.7.early reply with a status block containing
- * LNET_PROTO_QSW_MAGIC.
- * Later versions send a failure (-ve) status +
- * magic/version */
-
- if (!krx->krx_rpc_reply_needed) {
- CERROR("Unexpected magic %08x from %s\n",
- msg->kqm_magic, libcfs_nid2str(fromnid));
- goto done;
- }
-
- LASSERT (krx->krx_rpc_reply.msg.status == -EPROTO);
- goto done;
- }
-
- CERROR("Unrecognised magic %08x from %s\n",
- msg->kqm_magic, libcfs_nid2str(fromnid));
- done:
- kqswnal_rx_decref(krx);
-}
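kqswnal_parse() above detects the sender's byte order by testing the magic against its byte-swapped value, and swabs multi-byte header fields only when the peer's order differs. The pattern in isolation, as a sketch (PROTO_MAGIC and struct proto_msg are stand-ins, not part of the removed code):

/* Sketch of the wire-endianness detection used by kqswnal_parse() above. */
static int
detect_and_swab(struct proto_msg *msg)
{
	int swab = (msg->magic == __swab32(PROTO_MAGIC));

	if (!swab && msg->magic != PROTO_MAGIC)
		return -EPROTO;			/* not this protocol at all */

	if (swab) {				/* opposite-endian peer */
		__swab16s(&msg->version);
		__swab16s(&msg->type);
	}
	return 0;				/* now in host byte order */
}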
-
-/* Receive Interrupt Handler: posts to schedulers */
-void
-kqswnal_rxhandler(EP_RXD *rxd)
-{
- unsigned long flags;
- int nob = ep_rxd_len (rxd);
- int status = ep_rxd_status (rxd);
- kqswnal_rx_t *krx = (kqswnal_rx_t *)ep_rxd_arg (rxd);
- CDEBUG(D_NET, "kqswnal_rxhandler: rxd %p, krx %p, nob %d, status %d\n",
- rxd, krx, nob, status);
-
- LASSERT (krx != NULL);
- LASSERT (krx->krx_state == KRX_POSTED);
-
- krx->krx_state = KRX_PARSE;
- krx->krx_rxd = rxd;
- krx->krx_nob = nob;
-
- /* RPC reply iff rpc request received without error */
- krx->krx_rpc_reply_needed = ep_rxd_isrpc(rxd) &&
- (status == EP_SUCCESS ||
- status == EP_MSG_TOO_BIG);
-
- /* Default to failure if an RPC reply is requested but not handled */
- krx->krx_rpc_reply.msg.status = -EPROTO;
- atomic_set (&krx->krx_refcount, 1);
-
- if (status != EP_SUCCESS) {
- /* receives complete with failure when receiver is removed */
- if (status == EP_SHUTDOWN)
- LASSERT (kqswnal_data.kqn_shuttingdown);
- else
- CERROR("receive status failed with status %d nob %d\n",
- ep_rxd_status(rxd), nob);
- kqswnal_rx_decref(krx);
- return;
- }
-
- if (!in_interrupt()) {
- kqswnal_parse(krx);
- return;
- }
-
- spin_lock_irqsave(&kqswnal_data.kqn_sched_lock, flags);
-
- cfs_list_add_tail(&krx->krx_list, &kqswnal_data.kqn_readyrxds);
- wake_up(&kqswnal_data.kqn_sched_waitq);
-
- spin_unlock_irqrestore(&kqswnal_data.kqn_sched_lock, flags);
-}
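kqswnal_rxhandler() may be called in interrupt context, so it parses inline only when running in thread context and otherwise queues the receive for a scheduler thread. The handoff reduced to its essentials (all names here are hypothetical):

/* Sketch of the interrupt-to-thread handoff in kqswnal_rxhandler() above. */
static void
rx_event(struct rx_item *rx)
{
	unsigned long flags;

	if (!in_interrupt()) {
		process_rx(rx);			/* may block/kmap safely */
		return;
	}

	spin_lock_irqsave(&sched_lock, flags);	/* defer to scheduler */
	list_add_tail(&rx->list, &ready_list);
	wake_up(&sched_waitq);
	spin_unlock_irqrestore(&sched_lock, flags);
}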
-
-int
-kqswnal_recv (lnet_ni_t *ni,
- void *private,
- lnet_msg_t *lntmsg,
- int delayed,
- unsigned int niov,
- struct iovec *iov,
- lnet_kiov_t *kiov,
- unsigned int offset,
- unsigned int mlen,
- unsigned int rlen)
-{
- kqswnal_rx_t *krx = (kqswnal_rx_t *)private;
- lnet_nid_t fromnid;
- kqswnal_msg_t *msg;
- lnet_hdr_t *hdr;
- kqswnal_remotemd_t *rmd;
- int msg_offset;
- int rc;
-
- LASSERT (!in_interrupt ()); /* OK to map */
- /* Either all pages or all vaddrs */
- LASSERT (!(kiov != NULL && iov != NULL));
-
- fromnid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), ep_rxd_node(krx->krx_rxd));
- msg = (kqswnal_msg_t *)page_address(krx->krx_kiov[0].kiov_page);
-
- if (krx->krx_rpc_reply_needed) {
- /* optimized (rdma) request sent as RPC */
-
- LASSERT (msg->kqm_type == QSWLND_MSG_RDMA);
- hdr = &msg->kqm_u.rdma.kqrm_hdr;
- rmd = &msg->kqm_u.rdma.kqrm_rmd;
-
- /* NB header is still in wire byte order */
-
- switch (le32_to_cpu(hdr->type)) {
- case LNET_MSG_PUT:
- case LNET_MSG_REPLY:
- /* This is an optimized PUT/REPLY */
- rc = kqswnal_rdma(krx, lntmsg,
- KTX_RDMA_FETCH, rmd,
- niov, iov, kiov, offset, mlen);
- break;
-
- case LNET_MSG_GET:
-#if KQSW_CKSUM
- if (krx->krx_cksum != msg->kqm_cksum) {
- CERROR("Bad GET checksum %08x(%08x) from %s\n",
- krx->krx_cksum, msg->kqm_cksum,
- libcfs_nid2str(fromnid));
- rc = -EIO;
- break;
- }
-#endif
- if (lntmsg == NULL) {
- /* No buffer match: my decref will
- * complete the RPC with failure */
- rc = 0;
- } else {
- /* Matched something! */
- rc = kqswnal_rdma(krx, lntmsg,
- KTX_RDMA_STORE, rmd,
- lntmsg->msg_niov,
- lntmsg->msg_iov,
- lntmsg->msg_kiov,
- lntmsg->msg_offset,
- lntmsg->msg_len);
- }
- break;
-
- default:
- CERROR("Bad RPC type %d\n",
- le32_to_cpu(hdr->type));
- rc = -EPROTO;
- break;
- }
-
- kqswnal_rx_decref(krx);
- return rc;
- }
-
- LASSERT (msg->kqm_type == QSWLND_MSG_IMMEDIATE);
- msg_offset = offsetof(kqswnal_msg_t, kqm_u.immediate.kqim_payload);
-
- if (krx->krx_nob < msg_offset + rlen) {
- CERROR("Bad message size from %s: have %d, need %d + %d\n",
- libcfs_nid2str(fromnid), krx->krx_nob,
- msg_offset, rlen);
- kqswnal_rx_decref(krx);
- return -EPROTO;
- }
-
- if (kiov != NULL)
- lnet_copy_kiov2kiov(niov, kiov, offset,
- krx->krx_npages, krx->krx_kiov,
- msg_offset, mlen);
- else
- lnet_copy_kiov2iov(niov, iov, offset,
- krx->krx_npages, krx->krx_kiov,
- msg_offset, mlen);
-
- lnet_finalize(ni, lntmsg, 0);
- kqswnal_rx_decref(krx);
- return 0;
-}
-
-int
-kqswnal_thread_start(int (*fn)(void *arg), void *arg, char *name)
-{
- struct task_struct *task = kthread_run(fn, arg, name);
-
- if (IS_ERR(task))
- return PTR_ERR(task);
-
- atomic_inc(&kqswnal_data.kqn_nthreads);
- return 0;
-}
-
-void
-kqswnal_thread_fini (void)
-{
- atomic_dec (&kqswnal_data.kqn_nthreads);
-}
-
-int
-kqswnal_scheduler (void *arg)
-{
- kqswnal_rx_t *krx;
- kqswnal_tx_t *ktx;
- unsigned long flags;
- int rc;
- int counter = 0;
- int did_something;
-
- cfs_block_allsigs ();
-
- spin_lock_irqsave(&kqswnal_data.kqn_sched_lock, flags);
-
- for (;;)
- {
- did_something = 0;
-
- if (!cfs_list_empty (&kqswnal_data.kqn_readyrxds))
- {
- krx = cfs_list_entry(kqswnal_data.kqn_readyrxds.next,
- kqswnal_rx_t, krx_list);
- cfs_list_del (&krx->krx_list);
- spin_unlock_irqrestore(&kqswnal_data.kqn_sched_lock,
- flags);
-
- LASSERT (krx->krx_state == KRX_PARSE);
- kqswnal_parse (krx);
-
- did_something = 1;
- spin_lock_irqsave(&kqswnal_data.kqn_sched_lock,
- flags);
- }
-
- if (!cfs_list_empty (&kqswnal_data.kqn_donetxds))
- {
- ktx = cfs_list_entry(kqswnal_data.kqn_donetxds.next,
- kqswnal_tx_t, ktx_schedlist);
- cfs_list_del_init (&ktx->ktx_schedlist);
- spin_unlock_irqrestore(&kqswnal_data.kqn_sched_lock,
- flags);
-
- kqswnal_tx_done_in_thread_context(ktx);
-
- did_something = 1;
- spin_lock_irqsave(&kqswnal_data.kqn_sched_lock,
- flags);
- }
-
- if (!cfs_list_empty (&kqswnal_data.kqn_delayedtxds))
- {
- ktx = cfs_list_entry(kqswnal_data.kqn_delayedtxds.next,
- kqswnal_tx_t, ktx_schedlist);
- cfs_list_del_init (&ktx->ktx_schedlist);
- spin_unlock_irqrestore(&kqswnal_data.kqn_sched_lock,
- flags);
-
- rc = kqswnal_launch (ktx);
- if (rc != 0) {
- CERROR("Failed delayed transmit to %s: %d\n",
- libcfs_nid2str(ktx->ktx_nid), rc);
- kqswnal_tx_done (ktx, rc);
- }
- atomic_dec (&kqswnal_data.kqn_pending_txs);
-
- did_something = 1;
- spin_lock_irqsave(&kqswnal_data.kqn_sched_lock,
- flags);
- }
-
- /* nothing to do or hogging CPU */
- if (!did_something || counter++ == KQSW_RESCHED) {
- spin_unlock_irqrestore(&kqswnal_data.kqn_sched_lock,
- flags);
-
- counter = 0;
-
- if (!did_something) {
- if (kqswnal_data.kqn_shuttingdown == 2) {
- /* We only exit in stage 2 of shutdown
- * when there's nothing left to do */
- break;
- }
- rc = wait_event_interruptible_exclusive (
- kqswnal_data.kqn_sched_waitq,
- kqswnal_data.kqn_shuttingdown == 2 ||
- !cfs_list_empty(&kqswnal_data.kqn_readyrxds) ||
- !cfs_list_empty(&kqswnal_data.kqn_donetxds) ||
- !cfs_list_empty(&kqswnal_data.kqn_delayedtxds));
- LASSERT (rc == 0);
- } else if (need_resched())
- schedule ();
-
- spin_lock_irqsave(&kqswnal_data.kqn_sched_lock,
- flags);
- }
- }
-
- kqswnal_thread_fini ();
- return 0;
-}
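The scheduler loop above has the usual pop-one-item-under-the-lock shape: dequeue, drop the lock for the possibly blocking work, retake it, and yield after KQSW_RESCHED iterations so one busy queue cannot monopolize the CPU. A skeleton of that shape, with hypothetical names and the stage-2 shutdown exit elided:

/* Skeleton of the kqswnal_scheduler() loop above; names hypothetical. */
static int
scheduler_skeleton(void *arg)
{
	unsigned long flags;
	int counter = 0;

	spin_lock_irqsave(&lock, flags);
	for (;;) {
		int did_something = 0;

		if (!list_empty(&queue)) {
			struct item *it = list_first_entry(&queue,
							   struct item, list);

			list_del(&it->list);
			spin_unlock_irqrestore(&lock, flags);

			handle(it);		/* may block */

			did_something = 1;
			spin_lock_irqsave(&lock, flags);
		}

		if (!did_something || counter++ == RESCHED_LIMIT) {
			spin_unlock_irqrestore(&lock, flags);
			counter = 0;
			if (!did_something)
				wait_for_work();	/* sleep until woken */
			else if (need_resched())
				schedule();
			spin_lock_irqsave(&lock, flags);
		}
	}
	/* not reached: real code breaks out on stage-2 shutdown */
	return 0;
}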
+++ /dev/null
-/*
- * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
- *
- * Author: Eric Barton <eric@bartonsoftware.com>
- *
- * This file is part of Portals, http://www.lustre.org
- *
- * Portals is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * Portals is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with Portals; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- */
-
-#include "qswlnd.h"
-
-static int tx_maxcontig = (1<<10);
-CFS_MODULE_PARM(tx_maxcontig, "i", int, 0444,
- "maximum payload to de-fragment");
-
-static int ntxmsgs = 512;
-CFS_MODULE_PARM(ntxmsgs, "i", int, 0444,
- "# tx msg buffers");
-
-static int credits = 128;
-CFS_MODULE_PARM(credits, "i", int, 0444,
- "# concurrent sends");
-
-static int peer_credits = 8;
-CFS_MODULE_PARM(peer_credits, "i", int, 0444,
- "# per-peer concurrent sends");
-
-static int nrxmsgs_large = 64;
-CFS_MODULE_PARM(nrxmsgs_large, "i", int, 0444,
- "# 'large' rx msg buffers");
-
-static int ep_envelopes_large = 256;
-CFS_MODULE_PARM(ep_envelopes_large, "i", int, 0444,
- "# 'large' rx msg envelope buffers");
-
-static int nrxmsgs_small = 256;
-CFS_MODULE_PARM(nrxmsgs_small, "i", int, 0444,
- "# 'small' rx msg buffers");
-
-static int ep_envelopes_small = 2048;
-CFS_MODULE_PARM(ep_envelopes_small, "i", int, 0444,
- "# 'small' rx msg envelope buffers");
-
-static int optimized_puts = (32<<10);
-CFS_MODULE_PARM(optimized_puts, "i", int, 0644,
- "zero-copy puts >= this size");
-
-static int optimized_gets = 2048;
-CFS_MODULE_PARM(optimized_gets, "i", int, 0644,
- "zero-copy gets >= this size");
-
-#if KQSW_CKSUM
-static int inject_csum_error = 0;
-CFS_MODULE_PARM(inject_csum_error, "i", int, 0644,
- "test checksumming");
-#endif
-
-kqswnal_tunables_t kqswnal_tunables = {
- .kqn_tx_maxcontig = &tx_maxcontig,
- .kqn_ntxmsgs = &ntxmsgs,
- .kqn_credits = &credits,
- .kqn_peercredits = &peer_credits,
- .kqn_nrxmsgs_large = &nrxmsgs_large,
- .kqn_ep_envelopes_large = &ep_envelopes_large,
- .kqn_nrxmsgs_small = &nrxmsgs_small,
- .kqn_ep_envelopes_small = &ep_envelopes_small,
- .kqn_optimized_puts = &optimized_puts,
- .kqn_optimized_gets = &optimized_gets,
-#if KQSW_CKSUM
- .kqn_inject_csum_error = &inject_csum_error,
-#endif
-};
-
-#if defined(CONFIG_SYSCTL) && !CFS_SYSFS_MODULE_PARM
-
-static struct ctl_table kqswnal_ctl_table[] = {
- {
- INIT_CTL_NAME
- .procname = "tx_maxcontig",
- .data = &tx_maxcontig,
- .maxlen = sizeof (int),
- .mode = 0444,
- .proc_handler = &proc_dointvec
- },
- {
- INIT_CTL_NAME
- .procname = "ntxmsgs",
- .data = &ntxmsgs,
- .maxlen = sizeof (int),
- .mode = 0444,
- .proc_handler = &proc_dointvec
- },
- {
- INIT_CTL_NAME
- .procname = "credits",
- .data = &credits,
- .maxlen = sizeof (int),
- .mode = 0444,
- .proc_handler = &proc_dointvec
- },
- {
- INIT_CTL_NAME
- .procname = "peer_credits",
- .data = &peer_credits,
- .maxlen = sizeof (int),
- .mode = 0444,
- .proc_handler = &proc_dointvec
- },
- {
- INIT_CTL_NAME
- .procname = "nrxmsgs_large",
- .data = &nrxmsgs_large,
- .maxlen = sizeof (int),
- .mode = 0444,
- .proc_handler = &proc_dointvec
- },
- {
- INIT_CTL_NAME
- .procname = "ep_envelopes_large",
- .data = &ep_envelopes_large,
- .maxlen = sizeof (int),
- .mode = 0444,
- .proc_handler = &proc_dointvec
- },
- {
- INIT_CTL_NAME
- .procname = "nrxmsgs_small",
- .data = &nrxmsgs_small,
- .maxlen = sizeof (int),
- .mode = 0444,
- .proc_handler = &proc_dointvec
- },
- {
- INIT_CTL_NAME
- .procname = "ep_envelopes_small",
- .data = &ep_envelopes_small,
- .maxlen = sizeof (int),
- .mode = 0444,
- .proc_handler = &proc_dointvec
- },
- {
- INIT_CTL_NAME
- .procname = "optimized_puts",
- .data = &optimized_puts,
- .maxlen = sizeof (int),
- .mode = 0644,
- .proc_handler = &proc_dointvec
- },
- {
- INIT_CTL_NAME
- .procname = "optimized_gets",
- .data = &optimized_gets,
- .maxlen = sizeof (int),
- .mode = 0644,
- .proc_handler = &proc_dointvec
- },
-#if KQSW_CKSUM
- {
- INIT_CTL_NAME
- .procname = "inject_csum_error",
- .data = &inject_csum_error,
- .maxlen = sizeof (int),
- .mode = 0644,
- .proc_handler = &proc_dointvec
- },
-#endif
- { 0 }
-};
-
-static struct ctl_table kqswnal_top_ctl_table[] = {
- {
- INIT_CTL_NAME
- .procname = "qswnal",
- .data = NULL,
- .maxlen = 0,
- .mode = 0555,
- .child = kqswnal_ctl_table
- },
- { 0 }
-};
-
-int
-kqswnal_tunables_init ()
-{
- kqswnal_tunables.kqn_sysctl =
- register_sysctl_table(kqswnal_top_ctl_table);
-
- if (kqswnal_tunables.kqn_sysctl == NULL)
- CWARN("Can't setup /proc tunables\n");
-
- return 0;
-}
-
-void kqswnal_tunables_fini()
-{
- if (kqswnal_tunables.kqn_sysctl != NULL)
- unregister_sysctl_table(kqswnal_tunables.kqn_sysctl);
-}
-#else
-int
-kqswnal_tunables_init ()
-{
- return 0;
-}
-
-void
-kqswnal_tunables_fini ()
-{
-}
-#endif
+++ /dev/null
-MODULES := kralnd
-kralnd-objs := ralnd.o ralnd_cb.o ralnd_modparams.o
-
-EXTRA_POST_CFLAGS := @RACPPFLAGS@
-
-@INCLUDE_RULES@
+++ /dev/null
-#
-# GPL HEADER START
-#
-# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License version 2 only,
-# as published by the Free Software Foundation.
-#
-# This program is distributed in the hope that it will be useful, but
-# WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-# General Public License version 2 for more details (a copy is included
-# in the LICENSE file that accompanied this code).
-#
-# You should have received a copy of the GNU General Public License
-# version 2 along with this program; If not, see
-# http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
-#
-# Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
-# CA 95054 USA or visit www.sun.com if you need additional information or
-# have any questions.
-#
-# GPL HEADER END
-#
-
-#
-# Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
-# Use is subject to license terms.
-#
-
-#
-# This file is part of Lustre, http://www.lustre.org/
-# Lustre is a trademark of Sun Microsystems, Inc.
-#
-
-if MODULES
-if BUILD_RALND
-modulenet_DATA = kralnd$(KMODEXT)
-endif
-endif
-
-MOSTLYCLEANFILES = @MOSTLYCLEANFILES@
-EXTRA_DIST = $(kralnd-objs:%.o=%.c) ralnd.h
+++ /dev/null
-/*
- * GPL HEADER START
- *
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 only,
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License version 2 for more details (a copy is included
- * in the LICENSE file that accompanied this code).
- *
- * You should have received a copy of the GNU General Public License
- * version 2 along with this program; If not, see
- * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
- *
- * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
- * CA 95054 USA or visit www.sun.com if you need additional information or
- * have any questions.
- *
- * GPL HEADER END
- */
-/*
- * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
- * Use is subject to license terms.
- *
- * Copyright (c) 2012, 2014, Intel Corporation.
- */
-/*
- * This file is part of Lustre, http://www.lustre.org/
- * Lustre is a trademark of Sun Microsystems, Inc.
- *
- * lnet/klnds/ralnd/ralnd.c
- *
- * Author: Eric Barton <eric@bartonsoftware.com>
- */
-#include "ralnd.h"
-
-static int kranal_devids[RANAL_MAXDEVS] = {RAPK_MAIN_DEVICE_ID,
- RAPK_EXPANSION_DEVICE_ID};
-
-lnd_t the_kralnd = {
- .lnd_type = RALND,
- .lnd_startup = kranal_startup,
- .lnd_shutdown = kranal_shutdown,
- .lnd_ctl = kranal_ctl,
- .lnd_send = kranal_send,
- .lnd_recv = kranal_recv,
- .lnd_eager_recv = kranal_eager_recv,
- .lnd_accept = kranal_accept,
-};
-
-kra_data_t kranal_data;
-
-void
-kranal_pack_connreq(kra_connreq_t *connreq, kra_conn_t *conn, lnet_nid_t dstnid)
-{
- RAP_RETURN rrc;
-
- memset(connreq, 0, sizeof(*connreq));
-
- connreq->racr_magic = RANAL_MSG_MAGIC;
- connreq->racr_version = RANAL_MSG_VERSION;
-
- if (conn == NULL) /* prepping a "stub" reply */
- return;
-
- connreq->racr_devid = conn->rac_device->rad_id;
- connreq->racr_srcnid = kranal_data.kra_ni->ni_nid;
- connreq->racr_dstnid = dstnid;
- connreq->racr_peerstamp = kranal_data.kra_peerstamp;
- connreq->racr_connstamp = conn->rac_my_connstamp;
- connreq->racr_timeout = conn->rac_timeout;
-
- rrc = RapkGetRiParams(conn->rac_rihandle, &connreq->racr_riparams);
- LASSERT(rrc == RAP_SUCCESS);
-}
-
-int
-kranal_recv_connreq(struct socket *sock, kra_connreq_t *connreq, int active)
-{
- int timeout = active ? *kranal_tunables.kra_timeout :
- lnet_acceptor_timeout();
- int swab;
- int rc;
-
- /* return 0 on success, -ve on error, +ve to tell the peer I'm "old" */
-
- rc = libcfs_sock_read(sock, &connreq->racr_magic,
- sizeof(connreq->racr_magic), timeout);
- if (rc != 0) {
- CERROR("Read(magic) failed(1): %d\n", rc);
- return -EIO;
- }
-
- if (connreq->racr_magic != RANAL_MSG_MAGIC &&
- connreq->racr_magic != __swab32(RANAL_MSG_MAGIC)) {
- /* Unexpected magic! */
- if (!active &&
- (connreq->racr_magic == LNET_PROTO_MAGIC ||
- connreq->racr_magic == __swab32(LNET_PROTO_MAGIC))) {
- /* future protocol version compatibility!
- * When LNET unifies protocols over all LNDs, the first
- * thing sent will be a version query. +ve rc means I
- * reply with my current magic/version */
- return EPROTO;
- }
-
- CERROR("Unexpected magic %08x (%s)\n",
- connreq->racr_magic, active ? "active" : "passive");
- return -EPROTO;
- }
-
- swab = (connreq->racr_magic == __swab32(RANAL_MSG_MAGIC));
-
- rc = libcfs_sock_read(sock, &connreq->racr_version,
- sizeof(connreq->racr_version), timeout);
- if (rc != 0) {
- CERROR("Read(version) failed: %d\n", rc);
- return -EIO;
- }
-
- if (swab)
- __swab16s(&connreq->racr_version);
-
- if (connreq->racr_version != RANAL_MSG_VERSION) {
- if (active) {
- CERROR("Unexpected version %d\n", connreq->racr_version);
- return -EPROTO;
- }
- /* If this is a future version of the ralnd protocol, and I'm
- * passive (accepted the connection), tell my peer I'm "old"
- * (+ve rc) */
- return EPROTO;
- }
-
- rc = libcfs_sock_read(sock, &connreq->racr_devid,
- sizeof(*connreq) -
- offsetof(kra_connreq_t, racr_devid),
- timeout);
- if (rc != 0) {
- CERROR("Read(body) failed: %d\n", rc);
- return -EIO;
- }
-
- if (swab) {
- __swab16s(&connreq->racr_devid);
- __swab64s(&connreq->racr_srcnid);
- __swab64s(&connreq->racr_dstnid);
- __swab64s(&connreq->racr_peerstamp);
- __swab64s(&connreq->racr_connstamp);
- __swab32s(&connreq->racr_timeout);
-
- __swab32s(&connreq->racr_riparams.HostId);
- __swab32s(&connreq->racr_riparams.FmaDomainHndl);
- __swab32s(&connreq->racr_riparams.PTag);
- __swab32s(&connreq->racr_riparams.CompletionCookie);
- }
-
- if (connreq->racr_srcnid == LNET_NID_ANY ||
- connreq->racr_dstnid == LNET_NID_ANY) {
- CERROR("Received LNET_NID_ANY\n");
- return -EPROTO;
- }
-
- if (connreq->racr_timeout < RANAL_MIN_TIMEOUT) {
- CERROR("Received timeout %d < MIN %d\n",
- connreq->racr_timeout, RANAL_MIN_TIMEOUT);
- return -EPROTO;
- }
-
- return 0;
-}
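Note the tri-state return convention documented at the top of kranal_recv_connreq(): 0 for a fully read, validated and byte-swapped connreq, a negative errno on failure, and positive EPROTO to tell a newer peer "I'm old". Callers must therefore test the sign, not just non-zero; a sketch of the passive-side usage, where send_stub_reply() is a hypothetical stand-in for the stub-reply sequence in kranal_passive_conn_handshake() below:

/* Sketch: consuming kranal_recv_connreq()'s tri-state return. */
static int
passive_read_connreq(struct socket *sock, kra_connreq_t *connreq)
{
	int rc = kranal_recv_connreq(sock, connreq, 0 /* passive */);

	if (rc < 0)
		return rc;		/* hard error: drop the connection */
	if (rc > 0) {
		send_stub_reply(sock);	/* newer peer: advertise my version */
		return -EPROTO;
	}
	return 0;			/* connreq now in host byte order */
}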
-
-int
-kranal_close_stale_conns_locked (kra_peer_t *peer, kra_conn_t *newconn)
-{
- kra_conn_t *conn;
- cfs_list_t *ctmp;
- cfs_list_t *cnxt;
- int loopback;
- int count = 0;
-
- loopback = peer->rap_nid == kranal_data.kra_ni->ni_nid;
-
- cfs_list_for_each_safe (ctmp, cnxt, &peer->rap_conns) {
- conn = cfs_list_entry(ctmp, kra_conn_t, rac_list);
-
- if (conn == newconn)
- continue;
-
- if (conn->rac_peerstamp != newconn->rac_peerstamp) {
- CDEBUG(D_NET, "Closing stale conn nid: %s "
- " peerstamp:"LPX64"("LPX64")\n",
- libcfs_nid2str(peer->rap_nid),
- conn->rac_peerstamp, newconn->rac_peerstamp);
- LASSERT (conn->rac_peerstamp < newconn->rac_peerstamp);
- count++;
- kranal_close_conn_locked(conn, -ESTALE);
- continue;
- }
-
- if (conn->rac_device != newconn->rac_device)
- continue;
-
- if (loopback &&
- newconn->rac_my_connstamp == conn->rac_peer_connstamp &&
- newconn->rac_peer_connstamp == conn->rac_my_connstamp)
- continue;
-
- LASSERT (conn->rac_peer_connstamp < newconn->rac_peer_connstamp);
-
- CDEBUG(D_NET, "Closing stale conn nid: %s"
- " connstamp:"LPX64"("LPX64")\n",
- libcfs_nid2str(peer->rap_nid),
- conn->rac_peer_connstamp, newconn->rac_peer_connstamp);
-
- count++;
- kranal_close_conn_locked(conn, -ESTALE);
- }
-
- return count;
-}
-
-int
-kranal_conn_isdup_locked(kra_peer_t *peer, kra_conn_t *newconn)
-{
- kra_conn_t *conn;
- cfs_list_t *tmp;
- int loopback;
-
- loopback = peer->rap_nid == kranal_data.kra_ni->ni_nid;
-
- cfs_list_for_each(tmp, &peer->rap_conns) {
- conn = cfs_list_entry(tmp, kra_conn_t, rac_list);
-
- /* 'newconn' is from an earlier version of 'peer'!!! */
- if (newconn->rac_peerstamp < conn->rac_peerstamp)
- return 1;
-
- /* 'conn' is from an earlier version of 'peer': it will be
- * removed when we cull stale conns later on... */
- if (newconn->rac_peerstamp > conn->rac_peerstamp)
- continue;
-
- /* Different devices are OK */
- if (conn->rac_device != newconn->rac_device)
- continue;
-
- /* It's me connecting to myself */
- if (loopback &&
- newconn->rac_my_connstamp == conn->rac_peer_connstamp &&
- newconn->rac_peer_connstamp == conn->rac_my_connstamp)
- continue;
-
- /* 'newconn' is an earlier connection from 'peer'!!! */
- if (newconn->rac_peer_connstamp < conn->rac_peer_connstamp)
- return 2;
-
- /* 'conn' is an earlier connection from 'peer': it will be
- * removed when we cull stale conns later on... */
- if (newconn->rac_peer_connstamp > conn->rac_peer_connstamp)
- continue;
-
- /* 'newconn' has the SAME connection stamp; 'peer' isn't
- * playing the game... */
- return 3;
- }
-
- return 0;
-}
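The return codes of kranal_conn_isdup_locked() are easier to audit as a decision table; the comment below restates the logic above and is not part of the removed code:

/* Per existing conn, first matching row wins:
 *
 *	newconn peerstamp < conn's	-> 1  (newconn from a stale peer)
 *	newconn peerstamp > conn's	-> skip (conn will be culled later)
 *	different device		-> skip (may coexist)
 *	loopback with mirrored stamps	-> skip (connecting to myself)
 *	newconn connstamp < conn's	-> 2  (newconn is an older conn)
 *	newconn connstamp > conn's	-> skip (conn will be culled later)
 *	stamps equal			-> 3  (peer not playing the game)
 */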
-
-void
-kranal_set_conn_uniqueness (kra_conn_t *conn)
-{
- unsigned long flags;
-
- write_lock_irqsave(&kranal_data.kra_global_lock, flags);
-
- conn->rac_my_connstamp = kranal_data.kra_connstamp++;
-
- do { /* allocate a unique cqid */
- conn->rac_cqid = kranal_data.kra_next_cqid++;
- } while (kranal_cqid2conn_locked(conn->rac_cqid) != NULL);
-
- write_unlock_irqrestore(&kranal_data.kra_global_lock, flags);
-}
-
-int
-kranal_create_conn(kra_conn_t **connp, kra_device_t *dev)
-{
- kra_conn_t *conn;
- RAP_RETURN rrc;
-
- LASSERT (!in_interrupt());
- LIBCFS_ALLOC(conn, sizeof(*conn));
-
- if (conn == NULL)
- return -ENOMEM;
-
- memset(conn, 0, sizeof(*conn));
- atomic_set(&conn->rac_refcount, 1);
- CFS_INIT_LIST_HEAD(&conn->rac_list);
- CFS_INIT_LIST_HEAD(&conn->rac_hashlist);
- CFS_INIT_LIST_HEAD(&conn->rac_schedlist);
- CFS_INIT_LIST_HEAD(&conn->rac_fmaq);
- CFS_INIT_LIST_HEAD(&conn->rac_rdmaq);
- CFS_INIT_LIST_HEAD(&conn->rac_replyq);
- spin_lock_init(&conn->rac_lock);
-
- kranal_set_conn_uniqueness(conn);
-
- conn->rac_device = dev;
- conn->rac_timeout = MAX(*kranal_tunables.kra_timeout, RANAL_MIN_TIMEOUT);
- kranal_update_reaper_timeout(conn->rac_timeout);
-
- rrc = RapkCreateRi(dev->rad_handle, conn->rac_cqid,
- &conn->rac_rihandle);
- if (rrc != RAP_SUCCESS) {
- CERROR("RapkCreateRi failed: %d\n", rrc);
- LIBCFS_FREE(conn, sizeof(*conn));
- return -ENETDOWN;
- }
-
- atomic_inc(&kranal_data.kra_nconns);
- *connp = conn;
- return 0;
-}
-
-void
-kranal_destroy_conn(kra_conn_t *conn)
-{
- RAP_RETURN rrc;
-
- LASSERT (!in_interrupt());
- LASSERT (!conn->rac_scheduled);
- LASSERT (cfs_list_empty(&conn->rac_list));
- LASSERT (cfs_list_empty(&conn->rac_hashlist));
- LASSERT (cfs_list_empty(&conn->rac_schedlist));
- LASSERT (atomic_read(&conn->rac_refcount) == 0);
- LASSERT (cfs_list_empty(&conn->rac_fmaq));
- LASSERT (cfs_list_empty(&conn->rac_rdmaq));
- LASSERT (cfs_list_empty(&conn->rac_replyq));
-
- rrc = RapkDestroyRi(conn->rac_device->rad_handle,
- conn->rac_rihandle);
- LASSERT (rrc == RAP_SUCCESS);
-
- if (conn->rac_peer != NULL)
- kranal_peer_decref(conn->rac_peer);
-
- LIBCFS_FREE(conn, sizeof(*conn));
- atomic_dec(&kranal_data.kra_nconns);
-}
-
-void
-kranal_terminate_conn_locked (kra_conn_t *conn)
-{
- LASSERT (!in_interrupt());
- LASSERT (conn->rac_state == RANAL_CONN_CLOSING);
- LASSERT (!cfs_list_empty(&conn->rac_hashlist));
- LASSERT (cfs_list_empty(&conn->rac_list));
-
- /* Remove from conn hash table: no new callbacks */
- cfs_list_del_init(&conn->rac_hashlist);
- kranal_conn_decref(conn);
-
- conn->rac_state = RANAL_CONN_CLOSED;
-
- /* schedule to clear out all uncompleted comms in context of dev's
- * scheduler */
- kranal_schedule_conn(conn);
-}
-
-void
-kranal_close_conn_locked (kra_conn_t *conn, int error)
-{
- kra_peer_t *peer = conn->rac_peer;
-
- CDEBUG_LIMIT(error == 0 ? D_NET : D_NETERROR,
- "closing conn to %s: error %d\n",
- libcfs_nid2str(peer->rap_nid), error);
-
- LASSERT (!in_interrupt());
- LASSERT (conn->rac_state == RANAL_CONN_ESTABLISHED);
- LASSERT (!cfs_list_empty(&conn->rac_hashlist));
- LASSERT (!cfs_list_empty(&conn->rac_list));
-
- cfs_list_del_init(&conn->rac_list);
-
- if (cfs_list_empty(&peer->rap_conns) &&
- peer->rap_persistence == 0) {
- /* Non-persistent peer with no more conns... */
- kranal_unlink_peer_locked(peer);
- }
-
- /* Reset RX timeout to ensure we wait for an incoming CLOSE for the
- * full timeout. If we get a CLOSE we know the peer has stopped all
- * RDMA. Otherwise if we wait for the full timeout we can also be sure
- * all RDMA has stopped. */
- conn->rac_last_rx = jiffies;
- smp_mb();
-
- conn->rac_state = RANAL_CONN_CLOSING;
- kranal_schedule_conn(conn); /* schedule sending CLOSE */
-
- kranal_conn_decref(conn); /* lose peer's ref */
-}
-
-void
-kranal_close_conn (kra_conn_t *conn, int error)
-{
- unsigned long flags;
-
- write_lock_irqsave(&kranal_data.kra_global_lock, flags);
-
- if (conn->rac_state == RANAL_CONN_ESTABLISHED)
- kranal_close_conn_locked(conn, error);
-
- write_unlock_irqrestore(&kranal_data.kra_global_lock, flags);
-}
-
-int
-kranal_set_conn_params(kra_conn_t *conn, kra_connreq_t *connreq,
- __u32 peer_ip, int peer_port)
-{
- kra_device_t *dev = conn->rac_device;
- unsigned long flags;
- RAP_RETURN rrc;
-
- /* CAVEAT EMPTOR: we're really overloading rac_last_tx + rac_keepalive
- * to do RapkCompleteSync() timekeeping (see kranal_scheduler). */
- conn->rac_last_tx = jiffies;
- conn->rac_keepalive = 0;
-
- rrc = RapkSetRiParams(conn->rac_rihandle, &connreq->racr_riparams);
- if (rrc != RAP_SUCCESS) {
- CERROR("Error setting riparams from %u.%u.%u.%u/%d: %d\n",
- HIPQUAD(peer_ip), peer_port, rrc);
- return -ECONNABORTED;
- }
-
- /* Schedule conn on rad_new_conns */
- kranal_conn_addref(conn);
- spin_lock_irqsave(&dev->rad_lock, flags);
- cfs_list_add_tail(&conn->rac_schedlist, &dev->rad_new_conns);
- wake_up(&dev->rad_waitq);
- spin_unlock_irqrestore(&dev->rad_lock, flags);
-
- rrc = RapkWaitToConnect(conn->rac_rihandle);
- if (rrc != RAP_SUCCESS) {
- CERROR("Error waiting to connect to %u.%u.%u.%u/%d: %d\n",
- HIPQUAD(peer_ip), peer_port, rrc);
- return -ECONNABORTED;
- }
-
- /* The scheduler doesn't touch conn except to deschedule and decref it
- * after RapkCompleteSync() returns success, so conn is all mine */
-
- conn->rac_peerstamp = connreq->racr_peerstamp;
- conn->rac_peer_connstamp = connreq->racr_connstamp;
- conn->rac_keepalive = RANAL_TIMEOUT2KEEPALIVE(connreq->racr_timeout);
- kranal_update_reaper_timeout(conn->rac_keepalive);
- return 0;
-}
-
-int
-kranal_passive_conn_handshake (struct socket *sock, lnet_nid_t *src_nidp,
- lnet_nid_t *dst_nidp, kra_conn_t **connp)
-{
- __u32 peer_ip;
- unsigned int peer_port;
- kra_connreq_t rx_connreq;
- kra_connreq_t tx_connreq;
- kra_conn_t *conn;
- kra_device_t *dev;
- int rc;
- int i;
-
- rc = libcfs_sock_getaddr(sock, 1, &peer_ip, &peer_port);
- if (rc != 0) {
- CERROR("Can't get peer's IP: %d\n", rc);
- return rc;
- }
-
- rc = kranal_recv_connreq(sock, &rx_connreq, 0);
-
- if (rc < 0) {
- CERROR("Can't rx connreq from %u.%u.%u.%u/%d: %d\n",
- HIPQUAD(peer_ip), peer_port, rc);
- return rc;
- }
-
- if (rc > 0) {
- /* Request from "new" peer: send reply with my MAGIC/VERSION to
- * tell her I'm old... */
- kranal_pack_connreq(&tx_connreq, NULL, LNET_NID_ANY);
-
- rc = libcfs_sock_write(sock, &tx_connreq, sizeof(tx_connreq),
- lnet_acceptor_timeout());
- if (rc != 0)
- CERROR("Can't tx stub connreq to %u.%u.%u.%u/%d: %d\n",
- HIPQUAD(peer_ip), peer_port, rc);
-
- return -EPROTO;
- }
-
- for (i = 0; ; i++) {
- if (i == kranal_data.kra_ndevs) {
- CERROR("Can't match dev %d from %u.%u.%u.%u/%d\n",
- rx_connreq.racr_devid, HIPQUAD(peer_ip), peer_port);
- return -ENODEV;
- }
- dev = &kranal_data.kra_devices[i];
- if (dev->rad_id == rx_connreq.racr_devid)
- break;
- }
-
- rc = kranal_create_conn(&conn, dev);
- if (rc != 0)
- return rc;
-
- kranal_pack_connreq(&tx_connreq, conn, rx_connreq.racr_srcnid);
-
- rc = libcfs_sock_write(sock, &tx_connreq, sizeof(tx_connreq),
- lnet_acceptor_timeout());
- if (rc != 0) {
- CERROR("Can't tx connreq to %u.%u.%u.%u/%d: %d\n",
- HIPQUAD(peer_ip), peer_port, rc);
- kranal_conn_decref(conn);
- return rc;
- }
-
- rc = kranal_set_conn_params(conn, &rx_connreq, peer_ip, peer_port);
- if (rc != 0) {
- kranal_conn_decref(conn);
- return rc;
- }
-
- *connp = conn;
- *src_nidp = rx_connreq.racr_srcnid;
- *dst_nidp = rx_connreq.racr_dstnid;
- return 0;
-}
-
-int
-kranal_active_conn_handshake(kra_peer_t *peer,
- lnet_nid_t *dst_nidp, kra_conn_t **connp)
-{
- kra_connreq_t connreq;
- kra_conn_t *conn;
- kra_device_t *dev;
- struct socket *sock;
- int rc;
- unsigned int idx;
-
- /* spread connections over all devices using both peer NIDs to ensure
- * all nids use all devices */
- idx = peer->rap_nid + kranal_data.kra_ni->ni_nid;
- dev = &kranal_data.kra_devices[idx % kranal_data.kra_ndevs];
-
- rc = kranal_create_conn(&conn, dev);
- if (rc != 0)
- return rc;
-
- kranal_pack_connreq(&connreq, conn, peer->rap_nid);
-
- if (the_lnet.ln_testprotocompat != 0) {
- /* single-shot proto test */
- LNET_LOCK();
- if ((the_lnet.ln_testprotocompat & 1) != 0) {
- connreq.racr_version++;
- the_lnet.ln_testprotocompat &= ~1;
- }
- if ((the_lnet.ln_testprotocompat & 2) != 0) {
- connreq.racr_magic = LNET_PROTO_MAGIC;
- the_lnet.ln_testprotocompat &= ~2;
- }
- LNET_UNLOCK();
- }
-
- rc = lnet_connect(&sock, peer->rap_nid,
- 0, peer->rap_ip, peer->rap_port);
- if (rc != 0)
- goto failed_0;
-
- /* CAVEAT EMPTOR: the passive side receives with a SHORT rx timeout
- * immediately after accepting a connection, so we connect and then
- * send immediately. */
-
- rc = libcfs_sock_write(sock, &connreq, sizeof(connreq),
- lnet_acceptor_timeout());
- if (rc != 0) {
- CERROR("Can't tx connreq to %u.%u.%u.%u/%d: %d\n",
- HIPQUAD(peer->rap_ip), peer->rap_port, rc);
- goto failed_2;
- }
-
- rc = kranal_recv_connreq(sock, &connreq, 1);
- if (rc != 0) {
- CERROR("Can't rx connreq from %u.%u.%u.%u/%d: %d\n",
- HIPQUAD(peer->rap_ip), peer->rap_port, rc);
- goto failed_2;
- }
-
- libcfs_sock_release(sock);
- rc = -EPROTO;
-
- if (connreq.racr_srcnid != peer->rap_nid) {
- CERROR("Unexpected srcnid from %u.%u.%u.%u/%d: "
- "received %s expected %s\n",
- HIPQUAD(peer->rap_ip), peer->rap_port,
- libcfs_nid2str(connreq.racr_srcnid),
- libcfs_nid2str(peer->rap_nid));
- goto failed_1;
- }
-
- if (connreq.racr_devid != dev->rad_id) {
- CERROR("Unexpected device id from %u.%u.%u.%u/%d: "
- "received %d expected %d\n",
- HIPQUAD(peer->rap_ip), peer->rap_port,
- connreq.racr_devid, dev->rad_id);
- goto failed_1;
- }
-
- rc = kranal_set_conn_params(conn, &connreq,
- peer->rap_ip, peer->rap_port);
- if (rc != 0)
- goto failed_1;
-
- *connp = conn;
- *dst_nidp = connreq.racr_dstnid;
- return 0;
-
- failed_2:
- libcfs_sock_release(sock);
- failed_1:
- lnet_connect_console_error(rc, peer->rap_nid,
- peer->rap_ip, peer->rap_port);
- failed_0:
- kranal_conn_decref(conn);
- return rc;
-}
-
-int
-kranal_conn_handshake (struct socket *sock, kra_peer_t *peer)
-{
- kra_peer_t *peer2;
- kra_tx_t *tx;
- lnet_nid_t peer_nid;
- lnet_nid_t dst_nid;
- unsigned long flags;
- kra_conn_t *conn;
- int rc;
- int nstale;
- int new_peer = 0;
-
- if (sock == NULL) {
- /* active: connd wants to connect to 'peer' */
- LASSERT (peer != NULL);
- LASSERT (peer->rap_connecting);
-
- rc = kranal_active_conn_handshake(peer, &dst_nid, &conn);
- if (rc != 0)
- return rc;
-
- write_lock_irqsave(&kranal_data.kra_global_lock, flags);
-
- if (!kranal_peer_active(peer)) {
- /* raced with peer getting unlinked */
- write_unlock_irqrestore(&kranal_data.kra_global_lock,
- flags);
- kranal_conn_decref(conn);
- return -ESTALE;
- }
-
- peer_nid = peer->rap_nid;
- } else {
- /* passive: listener accepted 'sock' */
- LASSERT (peer == NULL);
-
- rc = kranal_passive_conn_handshake(sock, &peer_nid,
- &dst_nid, &conn);
- if (rc != 0)
- return rc;
-
- /* assume this is a new peer */
- rc = kranal_create_peer(&peer, peer_nid);
- if (rc != 0) {
- CERROR("Can't create conn for %s\n",
- libcfs_nid2str(peer_nid));
- kranal_conn_decref(conn);
- return -ENOMEM;
- }
-
- write_lock_irqsave(&kranal_data.kra_global_lock, flags);
-
- peer2 = kranal_find_peer_locked(peer_nid);
- if (peer2 == NULL) {
- new_peer = 1;
- } else {
- /* peer_nid already in the peer table */
- kranal_peer_decref(peer);
- peer = peer2;
- }
- }
-
- LASSERT ((!new_peer) != (!kranal_peer_active(peer)));
-
- /* Refuse connection if peer thinks we are a different NID. We check
- * this while holding the global lock, to synch with connection
- * destruction on NID change. */
- if (kranal_data.kra_ni->ni_nid != dst_nid) {
- write_unlock_irqrestore(&kranal_data.kra_global_lock,
- flags);
-
- CERROR("Stale/bad connection with %s: dst_nid %s, expected %s\n",
- libcfs_nid2str(peer_nid), libcfs_nid2str(dst_nid),
- libcfs_nid2str(kranal_data.kra_ni->ni_nid));
- rc = -ESTALE;
- goto failed;
- }
-
- /* Refuse to duplicate an existing connection (both sides might try to
- * connect at once). NB we return success! We _are_ connected so we
- * _don't_ have any blocked txs to complete with failure. */
- rc = kranal_conn_isdup_locked(peer, conn);
- if (rc != 0) {
- LASSERT (!cfs_list_empty(&peer->rap_conns));
- LASSERT (cfs_list_empty(&peer->rap_tx_queue));
- write_unlock_irqrestore(&kranal_data.kra_global_lock,
- flags);
- CWARN("Not creating duplicate connection to %s: %d\n",
- libcfs_nid2str(peer_nid), rc);
- rc = 0;
- goto failed;
- }
-
- if (new_peer) {
- /* peer table takes my ref on the new peer */
- cfs_list_add_tail(&peer->rap_list,
- kranal_nid2peerlist(peer_nid));
- }
-
- /* initialise timestamps before reaper looks at them */
- conn->rac_last_tx = conn->rac_last_rx = jiffies;
-
- kranal_peer_addref(peer); /* +1 ref for conn */
- conn->rac_peer = peer;
- cfs_list_add_tail(&conn->rac_list, &peer->rap_conns);
-
- kranal_conn_addref(conn); /* +1 ref for conn table */
- cfs_list_add_tail(&conn->rac_hashlist,
- kranal_cqid2connlist(conn->rac_cqid));
-
- /* Schedule all packets blocking for a connection */
- while (!cfs_list_empty(&peer->rap_tx_queue)) {
- tx = cfs_list_entry(peer->rap_tx_queue.next,
- kra_tx_t, tx_list);
-
- cfs_list_del(&tx->tx_list);
- kranal_post_fma(conn, tx);
- }
-
- nstale = kranal_close_stale_conns_locked(peer, conn);
-
- write_unlock_irqrestore(&kranal_data.kra_global_lock, flags);
-
- /* CAVEAT EMPTOR: passive peer can disappear NOW */
-
- if (nstale != 0)
- CWARN("Closed %d stale conns to %s\n", nstale,
- libcfs_nid2str(peer_nid));
-
- CWARN("New connection to %s on devid[%d] = %d\n",
- libcfs_nid2str(peer_nid),
- conn->rac_device->rad_idx, conn->rac_device->rad_id);
-
- /* Ensure conn gets checked. Transmits may have been queued and an
- * FMA event may have happened before it got in the cq hash table */
- kranal_schedule_conn(conn);
- return 0;
-
- failed:
- if (new_peer)
- kranal_peer_decref(peer);
- kranal_conn_decref(conn);
- return rc;
-}
-
-void
-kranal_connect (kra_peer_t *peer)
-{
- kra_tx_t *tx;
- unsigned long flags;
- cfs_list_t zombies;
- int rc;
-
- LASSERT (peer->rap_connecting);
-
- CDEBUG(D_NET, "About to handshake %s\n",
- libcfs_nid2str(peer->rap_nid));
-
- rc = kranal_conn_handshake(NULL, peer);
-
- CDEBUG(D_NET, "Done handshake %s:%d \n",
- libcfs_nid2str(peer->rap_nid), rc);
-
- write_lock_irqsave(&kranal_data.kra_global_lock, flags);
-
- LASSERT (peer->rap_connecting);
- peer->rap_connecting = 0;
-
- if (rc == 0) {
- /* kranal_conn_handshake() queues blocked txs immediately on
- * success to avoid messages jumping the queue */
- LASSERT (cfs_list_empty(&peer->rap_tx_queue));
-
- peer->rap_reconnect_interval = 0; /* OK to reconnect at any time */
-
- write_unlock_irqrestore(&kranal_data.kra_global_lock,
- flags);
- return;
- }
-
- peer->rap_reconnect_interval *= 2;
- peer->rap_reconnect_interval =
- MAX(peer->rap_reconnect_interval,
- *kranal_tunables.kra_min_reconnect_interval);
- peer->rap_reconnect_interval =
- MIN(peer->rap_reconnect_interval,
- *kranal_tunables.kra_max_reconnect_interval);
-
- peer->rap_reconnect_time = jiffies +
- msecs_to_jiffies(peer->rap_reconnect_interval * MSEC_PER_SEC);
-
- /* Grab all blocked packets while we have the global lock */
- cfs_list_add(&zombies, &peer->rap_tx_queue);
- cfs_list_del_init(&peer->rap_tx_queue);
-
- write_unlock_irqrestore(&kranal_data.kra_global_lock, flags);
-
- if (cfs_list_empty(&zombies))
- return;
-
- CNETERR("Dropping packets for %s: connection failed\n",
- libcfs_nid2str(peer->rap_nid));
-
- do {
- tx = cfs_list_entry(zombies.next, kra_tx_t, tx_list);
-
- cfs_list_del(&tx->tx_list);
- kranal_tx_done(tx, -EHOSTUNREACH);
-
- } while (!cfs_list_empty(&zombies));
-}
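kranal_connect() throttles reconnection with a clamped exponential backoff: double the interval, then bound it by the min/max reconnect tunables. Because the interval starts at 0, the first failure jumps straight to the minimum. The same computation as a standalone helper (a sketch; the name is hypothetical):

/* Sketch: the clamped exponential backoff used by kranal_connect() above. */
static unsigned int
next_reconnect_interval(unsigned int interval,
			unsigned int min_iv, unsigned int max_iv)
{
	interval *= 2;
	if (interval < min_iv)		/* MAX(interval, min) */
		interval = min_iv;
	if (interval > max_iv)		/* MIN(interval, max) */
		interval = max_iv;
	return interval;
}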
-
-void
-kranal_free_acceptsock (kra_acceptsock_t *ras)
-{
- libcfs_sock_release(ras->ras_sock);
- LIBCFS_FREE(ras, sizeof(*ras));
-}
-
-int
-kranal_accept (lnet_ni_t *ni, struct socket *sock)
-{
- kra_acceptsock_t *ras;
- int rc;
- __u32 peer_ip;
- int peer_port;
- unsigned long flags;
-
- rc = libcfs_sock_getaddr(sock, 1, &peer_ip, &peer_port);
- LASSERT (rc == 0); /* we succeeded before */
-
- LIBCFS_ALLOC(ras, sizeof(*ras));
- if (ras == NULL) {
- CERROR("ENOMEM allocating connection request from "
- "%u.%u.%u.%u\n", HIPQUAD(peer_ip));
- return -ENOMEM;
- }
-
- ras->ras_sock = sock;
-
- spin_lock_irqsave(&kranal_data.kra_connd_lock, flags);
-
- cfs_list_add_tail(&ras->ras_list, &kranal_data.kra_connd_acceptq);
- wake_up(&kranal_data.kra_connd_waitq);
-
- spin_unlock_irqrestore(&kranal_data.kra_connd_lock, flags);
- return 0;
-}
-
-int
-kranal_create_peer (kra_peer_t **peerp, lnet_nid_t nid)
-{
- kra_peer_t *peer;
- unsigned long flags;
-
- LASSERT (nid != LNET_NID_ANY);
-
- LIBCFS_ALLOC(peer, sizeof(*peer));
- if (peer == NULL)
- return -ENOMEM;
-
- memset(peer, 0, sizeof(*peer)); /* zero flags etc */
-
- peer->rap_nid = nid;
- atomic_set(&peer->rap_refcount, 1); /* 1 ref for caller */
-
- CFS_INIT_LIST_HEAD(&peer->rap_list);
- CFS_INIT_LIST_HEAD(&peer->rap_connd_list);
- CFS_INIT_LIST_HEAD(&peer->rap_conns);
- CFS_INIT_LIST_HEAD(&peer->rap_tx_queue);
-
- peer->rap_reconnect_interval = 0; /* OK to connect at any time */
-
- write_lock_irqsave(&kranal_data.kra_global_lock, flags);
-
- if (kranal_data.kra_nonewpeers) {
- /* shutdown has started already */
- write_unlock_irqrestore(&kranal_data.kra_global_lock,
- flags);
-
- LIBCFS_FREE(peer, sizeof(*peer));
- CERROR("Can't create peer: network shutdown\n");
- return -ESHUTDOWN;
- }
-
- atomic_inc(&kranal_data.kra_npeers);
-
- write_unlock_irqrestore(&kranal_data.kra_global_lock, flags);
-
- *peerp = peer;
- return 0;
-}
-
-void
-kranal_destroy_peer (kra_peer_t *peer)
-{
- CDEBUG(D_NET, "peer %s %p deleted\n",
- libcfs_nid2str(peer->rap_nid), peer);
-
- LASSERT (atomic_read(&peer->rap_refcount) == 0);
- LASSERT (peer->rap_persistence == 0);
- LASSERT (!kranal_peer_active(peer));
- LASSERT (!peer->rap_connecting);
- LASSERT (cfs_list_empty(&peer->rap_conns));
- LASSERT (cfs_list_empty(&peer->rap_tx_queue));
- LASSERT (cfs_list_empty(&peer->rap_connd_list));
-
- LIBCFS_FREE(peer, sizeof(*peer));
-
- /* NB a peer's connections keep a reference on their peer until
- * they are destroyed, so we can be assured that _all_ state to do
- * with this peer has been cleaned up when its refcount drops to
- * zero. */
- atomic_dec(&kranal_data.kra_npeers);
-}
-
-kra_peer_t *
-kranal_find_peer_locked (lnet_nid_t nid)
-{
- cfs_list_t *peer_list = kranal_nid2peerlist(nid);
- cfs_list_t *tmp;
- kra_peer_t *peer;
-
- cfs_list_for_each (tmp, peer_list) {
-
- peer = cfs_list_entry(tmp, kra_peer_t, rap_list);
-
- LASSERT (peer->rap_persistence > 0 || /* persistent peer */
- !cfs_list_empty(&peer->rap_conns)); /* active conn */
-
- if (peer->rap_nid != nid)
- continue;
-
- CDEBUG(D_NET, "got peer [%p] -> %s (%d)\n",
- peer, libcfs_nid2str(nid),
- atomic_read(&peer->rap_refcount));
- return peer;
- }
- return NULL;
-}
-
-kra_peer_t *
-kranal_find_peer (lnet_nid_t nid)
-{
- kra_peer_t *peer;
-
- read_lock(&kranal_data.kra_global_lock);
- peer = kranal_find_peer_locked(nid);
- if (peer != NULL) /* +1 ref for caller? */
- kranal_peer_addref(peer);
- read_unlock(&kranal_data.kra_global_lock);
-
- return peer;
-}
-
-void
-kranal_unlink_peer_locked (kra_peer_t *peer)
-{
- LASSERT (peer->rap_persistence == 0);
- LASSERT (cfs_list_empty(&peer->rap_conns));
-
- LASSERT (kranal_peer_active(peer));
- cfs_list_del_init(&peer->rap_list);
-
- /* lose peerlist's ref */
- kranal_peer_decref(peer);
-}
-
-int
-kranal_get_peer_info (int index, lnet_nid_t *nidp, __u32 *ipp, int *portp,
- int *persistencep)
-{
- kra_peer_t *peer;
- cfs_list_t *ptmp;
- int i;
-
- read_lock(&kranal_data.kra_global_lock);
-
- for (i = 0; i < kranal_data.kra_peer_hash_size; i++) {
-
- cfs_list_for_each(ptmp, &kranal_data.kra_peers[i]) {
-
- peer = cfs_list_entry(ptmp, kra_peer_t, rap_list);
- LASSERT (peer->rap_persistence > 0 ||
- !cfs_list_empty(&peer->rap_conns));
-
- if (index-- > 0)
- continue;
-
- *nidp = peer->rap_nid;
- *ipp = peer->rap_ip;
- *portp = peer->rap_port;
- *persistencep = peer->rap_persistence;
-
- read_unlock(&kranal_data.kra_global_lock);
- return 0;
- }
- }
-
- read_unlock(&kranal_data.kra_global_lock);
- return -ENOENT;
-}
-
-int
-kranal_add_persistent_peer (lnet_nid_t nid, __u32 ip, int port)
-{
- unsigned long flags;
- kra_peer_t *peer;
- kra_peer_t *peer2;
- int rc;
-
- if (nid == LNET_NID_ANY)
- return -EINVAL;
-
- rc = kranal_create_peer(&peer, nid);
- if (rc != 0)
- return rc;
-
- write_lock_irqsave(&kranal_data.kra_global_lock, flags);
-
- peer2 = kranal_find_peer_locked(nid);
- if (peer2 != NULL) {
- kranal_peer_decref(peer);
- peer = peer2;
- } else {
- /* peer table takes existing ref on peer */
- cfs_list_add_tail(&peer->rap_list,
- kranal_nid2peerlist(nid));
- }
-
- peer->rap_ip = ip;
- peer->rap_port = port;
- peer->rap_persistence++;
-
- write_unlock_irqrestore(&kranal_data.kra_global_lock, flags);
- return 0;
-}
-
-void
-kranal_del_peer_locked (kra_peer_t *peer)
-{
- cfs_list_t *ctmp;
- cfs_list_t *cnxt;
- kra_conn_t *conn;
-
- peer->rap_persistence = 0;
-
- if (cfs_list_empty(&peer->rap_conns)) {
- kranal_unlink_peer_locked(peer);
- } else {
- cfs_list_for_each_safe(ctmp, cnxt, &peer->rap_conns) {
- conn = cfs_list_entry(ctmp, kra_conn_t, rac_list);
-
- kranal_close_conn_locked(conn, 0);
- }
- /* peer unlinks itself when last conn is closed */
- }
-}
-
-int
-kranal_del_peer (lnet_nid_t nid)
-{
- unsigned long flags;
- cfs_list_t *ptmp;
- cfs_list_t *pnxt;
- kra_peer_t *peer;
- int lo;
- int hi;
- int i;
- int rc = -ENOENT;
-
- write_lock_irqsave(&kranal_data.kra_global_lock, flags);
-
- if (nid != LNET_NID_ANY)
- lo = hi = kranal_nid2peerlist(nid) - kranal_data.kra_peers;
- else {
- lo = 0;
- hi = kranal_data.kra_peer_hash_size - 1;
- }
-
- for (i = lo; i <= hi; i++) {
- cfs_list_for_each_safe (ptmp, pnxt, &kranal_data.kra_peers[i]) {
- peer = cfs_list_entry(ptmp, kra_peer_t, rap_list);
- LASSERT (peer->rap_persistence > 0 ||
- !cfs_list_empty(&peer->rap_conns));
-
- if (!(nid == LNET_NID_ANY || peer->rap_nid == nid))
- continue;
-
- kranal_del_peer_locked(peer);
- rc = 0; /* matched something */
- }
- }
-
- write_unlock_irqrestore(&kranal_data.kra_global_lock, flags);
-
- return rc;
-}
-
-kra_conn_t *
-kranal_get_conn_by_idx (int index)
-{
- kra_peer_t *peer;
- cfs_list_t *ptmp;
- kra_conn_t *conn;
- cfs_list_t *ctmp;
- int i;
-
- read_lock(&kranal_data.kra_global_lock);
-
- for (i = 0; i < kranal_data.kra_peer_hash_size; i++) {
- cfs_list_for_each (ptmp, &kranal_data.kra_peers[i]) {
-
- peer = cfs_list_entry(ptmp, kra_peer_t, rap_list);
- LASSERT (peer->rap_persistence > 0 ||
- !cfs_list_empty(&peer->rap_conns));
-
- cfs_list_for_each (ctmp, &peer->rap_conns) {
- if (index-- > 0)
- continue;
-
- conn = cfs_list_entry(ctmp, kra_conn_t,
- rac_list);
- CDEBUG(D_NET, "++conn[%p] -> %s (%d)\n", conn,
- libcfs_nid2str(conn->rac_peer->rap_nid),
- atomic_read(&conn->rac_refcount));
- atomic_inc(&conn->rac_refcount);
- read_unlock(&kranal_data.kra_global_lock);
- return conn;
- }
- }
- }
-
- read_unlock(&kranal_data.kra_global_lock);
- return NULL;
-}
-
-int
-kranal_close_peer_conns_locked (kra_peer_t *peer, int why)
-{
- kra_conn_t *conn;
- cfs_list_t *ctmp;
- cfs_list_t *cnxt;
- int count = 0;
-
- cfs_list_for_each_safe (ctmp, cnxt, &peer->rap_conns) {
- conn = cfs_list_entry(ctmp, kra_conn_t, rac_list);
-
- count++;
- kranal_close_conn_locked(conn, why);
- }
-
- return count;
-}
-
-int
-kranal_close_matching_conns (lnet_nid_t nid)
-{
- unsigned long flags;
- kra_peer_t *peer;
- cfs_list_t *ptmp;
- cfs_list_t *pnxt;
- int lo;
- int hi;
- int i;
- int count = 0;
-
- write_lock_irqsave(&kranal_data.kra_global_lock, flags);
-
- if (nid != LNET_NID_ANY)
- lo = hi = kranal_nid2peerlist(nid) - kranal_data.kra_peers;
- else {
- lo = 0;
- hi = kranal_data.kra_peer_hash_size - 1;
- }
-
- for (i = lo; i <= hi; i++) {
- cfs_list_for_each_safe (ptmp, pnxt, &kranal_data.kra_peers[i]) {
-
- peer = cfs_list_entry(ptmp, kra_peer_t, rap_list);
- LASSERT (peer->rap_persistence > 0 ||
- !cfs_list_empty(&peer->rap_conns));
-
- if (!(nid == LNET_NID_ANY || nid == peer->rap_nid))
- continue;
-
- count += kranal_close_peer_conns_locked(peer, 0);
- }
- }
-
- write_unlock_irqrestore(&kranal_data.kra_global_lock, flags);
-
- /* wildcards always succeed */
- if (nid == LNET_NID_ANY)
- return 0;
-
- return (count == 0) ? -ENOENT : 0;
-}
-
-int
-kranal_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg)
-{
- struct libcfs_ioctl_data *data = arg;
- int rc = -EINVAL;
-
- LASSERT (ni == kranal_data.kra_ni);
-
- switch(cmd) {
- case IOC_LIBCFS_GET_PEER: {
- lnet_nid_t nid = 0;
- __u32 ip = 0;
- int port = 0;
- int share_count = 0;
-
- rc = kranal_get_peer_info(data->ioc_count,
- &nid, &ip, &port, &share_count);
- data->ioc_nid = nid;
- data->ioc_count = share_count;
- data->ioc_u32[0] = ip;
- data->ioc_u32[1] = port;
- break;
- }
- case IOC_LIBCFS_ADD_PEER: {
- rc = kranal_add_persistent_peer(data->ioc_nid,
- data->ioc_u32[0], /* IP */
- data->ioc_u32[1]); /* port */
- break;
- }
- case IOC_LIBCFS_DEL_PEER: {
- rc = kranal_del_peer(data->ioc_nid);
- break;
- }
- case IOC_LIBCFS_GET_CONN: {
- kra_conn_t *conn = kranal_get_conn_by_idx(data->ioc_count);
-
- if (conn == NULL)
- rc = -ENOENT;
- else {
- rc = 0;
- data->ioc_nid = conn->rac_peer->rap_nid;
- data->ioc_u32[0] = conn->rac_device->rad_id;
- kranal_conn_decref(conn);
- }
- break;
- }
- case IOC_LIBCFS_CLOSE_CONNECTION: {
- rc = kranal_close_matching_conns(data->ioc_nid);
- break;
- }
- case IOC_LIBCFS_REGISTER_MYNID: {
- /* Ignore if this is a noop */
- if (data->ioc_nid == ni->ni_nid) {
- rc = 0;
- } else {
- CERROR("obsolete IOC_LIBCFS_REGISTER_MYNID: %s(%s)\n",
- libcfs_nid2str(data->ioc_nid),
- libcfs_nid2str(ni->ni_nid));
- rc = -EINVAL;
- }
- break;
- }
- }
-
- return rc;
-}
-
-void
-kranal_free_txdescs(cfs_list_t *freelist)
-{
- kra_tx_t *tx;
-
- while (!cfs_list_empty(freelist)) {
- tx = cfs_list_entry(freelist->next, kra_tx_t, tx_list);
-
- cfs_list_del(&tx->tx_list);
- LIBCFS_FREE(tx->tx_phys, LNET_MAX_IOV * sizeof(*tx->tx_phys));
- LIBCFS_FREE(tx, sizeof(*tx));
- }
-}
-
-int
-kranal_alloc_txdescs(cfs_list_t *freelist, int n)
-{
- int i;
- kra_tx_t *tx;
-
- LASSERT (freelist == &kranal_data.kra_idle_txs);
- LASSERT (cfs_list_empty(freelist));
-
- for (i = 0; i < n; i++) {
-
- LIBCFS_ALLOC(tx, sizeof(*tx));
- if (tx == NULL) {
- CERROR("Can't allocate tx[%d]\n", i);
- kranal_free_txdescs(freelist);
- return -ENOMEM;
- }
-
- LIBCFS_ALLOC(tx->tx_phys,
- LNET_MAX_IOV * sizeof(*tx->tx_phys));
- if (tx->tx_phys == NULL) {
- CERROR("Can't allocate tx[%d]->tx_phys\n", i);
-
- LIBCFS_FREE(tx, sizeof(*tx));
- kranal_free_txdescs(freelist);
- return -ENOMEM;
- }
-
- tx->tx_buftype = RANAL_BUF_NONE;
- tx->tx_msg.ram_type = RANAL_MSG_NONE;
-
- cfs_list_add(&tx->tx_list, freelist);
- }
-
- return 0;
-}
-
-int
-kranal_device_init(int id, kra_device_t *dev)
-{
- int total_ntx = *kranal_tunables.kra_ntx;
- RAP_RETURN rrc;
-
- dev->rad_id = id;
- rrc = RapkGetDeviceByIndex(id, kranal_device_callback,
- &dev->rad_handle);
- if (rrc != RAP_SUCCESS) {
- CERROR("Can't get Rapidarray Device %d: %d\n", id, rrc);
- goto failed_0;
- }
-
- rrc = RapkReserveRdma(dev->rad_handle, total_ntx);
- if (rrc != RAP_SUCCESS) {
- CERROR("Can't reserve %d RDMA descriptors"
- " for device %d: %d\n", total_ntx, id, rrc);
- goto failed_1;
- }
-
- rrc = RapkCreateCQ(dev->rad_handle, total_ntx, RAP_CQTYPE_SEND,
- &dev->rad_rdma_cqh);
- if (rrc != RAP_SUCCESS) {
- CERROR("Can't create rdma cq size %d for device %d: %d\n",
- total_ntx, id, rrc);
- goto failed_1;
- }
-
- rrc = RapkCreateCQ(dev->rad_handle,
- *kranal_tunables.kra_fma_cq_size,
- RAP_CQTYPE_RECV, &dev->rad_fma_cqh);
- if (rrc != RAP_SUCCESS) {
- CERROR("Can't create fma cq size %d for device %d: %d\n",
- *kranal_tunables.kra_fma_cq_size, id, rrc);
- goto failed_2;
- }
-
- return 0;
-
- failed_2:
- RapkDestroyCQ(dev->rad_handle, dev->rad_rdma_cqh);
- failed_1:
- RapkReleaseDevice(dev->rad_handle);
- failed_0:
- return -ENODEV;
-}
-
-void
-kranal_device_fini(kra_device_t *dev)
-{
- LASSERT (cfs_list_empty(&dev->rad_ready_conns));
- LASSERT (cfs_list_empty(&dev->rad_new_conns));
- LASSERT (dev->rad_nphysmap == 0);
- LASSERT (dev->rad_nppphysmap == 0);
- LASSERT (dev->rad_nvirtmap == 0);
- LASSERT (dev->rad_nobvirtmap == 0);
-
- LASSERT(dev->rad_scheduler == NULL);
- RapkDestroyCQ(dev->rad_handle, dev->rad_fma_cqh);
- RapkDestroyCQ(dev->rad_handle, dev->rad_rdma_cqh);
- RapkReleaseDevice(dev->rad_handle);
-}
-
-void
-kranal_shutdown (lnet_ni_t *ni)
-{
- int i;
- unsigned long flags;
-
- CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n",
- atomic_read(&libcfs_kmemory));
-
- LASSERT (ni == kranal_data.kra_ni);
- LASSERT (ni->ni_data == &kranal_data);
-
- switch (kranal_data.kra_init) {
- default:
- CERROR("Unexpected state %d\n", kranal_data.kra_init);
- LBUG();
-
- case RANAL_INIT_ALL:
- /* Prevent new peers from being created */
- write_lock_irqsave(&kranal_data.kra_global_lock, flags);
- kranal_data.kra_nonewpeers = 1;
- write_unlock_irqrestore(&kranal_data.kra_global_lock,
- flags);
-
- /* Remove all existing peers from the peer table */
- kranal_del_peer(LNET_NID_ANY);
-
- /* Wait for pending conn reqs to be handled */
- i = 2;
- spin_lock_irqsave(&kranal_data.kra_connd_lock, flags);
- while (!cfs_list_empty(&kranal_data.kra_connd_acceptq)) {
- spin_unlock_irqrestore(&kranal_data.kra_connd_lock,
- flags);
- i++;
- CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* 2**n */
- "waiting for conn reqs to clean up\n");
- cfs_pause(cfs_time_seconds(1));
-
- spin_lock_irqsave(&kranal_data.kra_connd_lock,
- flags);
- }
- spin_unlock_irqrestore(&kranal_data.kra_connd_lock, flags);
-
- /* Wait for all peers to be freed */
- i = 2;
- while (atomic_read(&kranal_data.kra_npeers) != 0) {
- i++;
- CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* 2**n */
- "waiting for %d peers to close down\n",
- atomic_read(&kranal_data.kra_npeers));
- cfs_pause(cfs_time_seconds(1));
- }
- /* fall through */
-
- case RANAL_INIT_DATA:
- break;
- }
-
- /* Peer state all cleaned up BEFORE setting shutdown, so threads don't
- * have to worry about shutdown races. NB connections may be created
- * while there are still active connds, but these will be temporary
- * since peer creation always fails after the listener has started to
- * shut down. */
- LASSERT (atomic_read(&kranal_data.kra_npeers) == 0);
-
- /* Flag threads to terminate */
- kranal_data.kra_shutdown = 1;
-
- for (i = 0; i < kranal_data.kra_ndevs; i++) {
- kra_device_t *dev = &kranal_data.kra_devices[i];
-
- spin_lock_irqsave(&dev->rad_lock, flags);
- wake_up(&dev->rad_waitq);
- spin_unlock_irqrestore(&dev->rad_lock, flags);
- }
-
- spin_lock_irqsave(&kranal_data.kra_reaper_lock, flags);
- wake_up_all(&kranal_data.kra_reaper_waitq);
- spin_unlock_irqrestore(&kranal_data.kra_reaper_lock, flags);
-
- LASSERT (cfs_list_empty(&kranal_data.kra_connd_peers));
- spin_lock_irqsave(&kranal_data.kra_connd_lock, flags);
- wake_up_all(&kranal_data.kra_connd_waitq);
- spin_unlock_irqrestore(&kranal_data.kra_connd_lock, flags);
-
- /* Wait for threads to exit */
- i = 2;
- while (atomic_read(&kranal_data.kra_nthreads) != 0) {
- i++;
- CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
- "Waiting for %d threads to terminate\n",
- atomic_read(&kranal_data.kra_nthreads));
- cfs_pause(cfs_time_seconds(1));
- }
-
- LASSERT (atomic_read(&kranal_data.kra_npeers) == 0);
- if (kranal_data.kra_peers != NULL) {
- for (i = 0; i < kranal_data.kra_peer_hash_size; i++)
- LASSERT (cfs_list_empty(&kranal_data.kra_peers[i]));
-
- LIBCFS_FREE(kranal_data.kra_peers,
- sizeof (cfs_list_t) *
- kranal_data.kra_peer_hash_size);
- }
-
- LASSERT (atomic_read(&kranal_data.kra_nconns) == 0);
- if (kranal_data.kra_conns != NULL) {
- for (i = 0; i < kranal_data.kra_conn_hash_size; i++)
- LASSERT (cfs_list_empty(&kranal_data.kra_conns[i]));
-
- LIBCFS_FREE(kranal_data.kra_conns,
- sizeof (cfs_list_t) *
- kranal_data.kra_conn_hash_size);
- }
-
- for (i = 0; i < kranal_data.kra_ndevs; i++)
- kranal_device_fini(&kranal_data.kra_devices[i]);
-
- kranal_free_txdescs(&kranal_data.kra_idle_txs);
-
- CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n",
- atomic_read(&libcfs_kmemory));
-
- kranal_data.kra_init = RANAL_INIT_NOTHING;
- module_put(THIS_MODULE);
-}
-
-int
-kranal_startup (lnet_ni_t *ni)
-{
- struct timeval tv;
- int pkmem = atomic_read(&libcfs_kmemory);
- int rc;
- int i;
- kra_device_t *dev;
- char name[16];
-
- LASSERT (ni->ni_lnd == &the_kralnd);
-
- /* Only 1 instance supported */
- if (kranal_data.kra_init != RANAL_INIT_NOTHING) {
- CERROR ("Only 1 instance supported\n");
- return -EPERM;
- }
-
- if (lnet_set_ip_niaddr(ni) != 0) {
- CERROR ("Can't determine my NID\n");
- return -EPERM;
- }
-
- if (*kranal_tunables.kra_credits > *kranal_tunables.kra_ntx) {
- CERROR ("Can't set credits(%d) > ntx(%d)\n",
- *kranal_tunables.kra_credits,
- *kranal_tunables.kra_ntx);
- return -EINVAL;
- }
-
- memset(&kranal_data, 0, sizeof(kranal_data)); /* zero pointers, flags etc */
-
- ni->ni_maxtxcredits = *kranal_tunables.kra_credits;
- ni->ni_peertxcredits = *kranal_tunables.kra_peercredits;
-
- ni->ni_data = &kranal_data;
- kranal_data.kra_ni = ni;
-
- /* CAVEAT EMPTOR: Every 'Fma' message includes the sender's NID and
- * a unique (for all time) connstamp so we can uniquely identify
- * the sender. The connstamp is an incrementing counter
- * initialised with seconds + microseconds at startup time. So we
- * rely on NOT creating connections more frequently on average than
- * 1MHz to ensure we don't use old connstamps when we reboot. */
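-        /* e.g. booting at tv = {1234567890, 654321} gives an initial
-         * connstamp of 1234567890654321; a later boot restarts from its
-         * own secs * 10^6 + usecs, so stamps keep increasing provided
-         * conns were created at under ~1MHz on average in the interim. */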
- do_gettimeofday(&tv);
- kranal_data.kra_connstamp =
- kranal_data.kra_peerstamp = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
-
- rwlock_init(&kranal_data.kra_global_lock);
-
- for (i = 0; i < RANAL_MAXDEVS; i++ ) {
- kra_device_t *dev = &kranal_data.kra_devices[i];
-
- dev->rad_idx = i;
- CFS_INIT_LIST_HEAD(&dev->rad_ready_conns);
- CFS_INIT_LIST_HEAD(&dev->rad_new_conns);
- init_waitqueue_head(&dev->rad_waitq);
- spin_lock_init(&dev->rad_lock);
- }
-
- kranal_data.kra_new_min_timeout = MAX_SCHEDULE_TIMEOUT;
- init_waitqueue_head(&kranal_data.kra_reaper_waitq);
- spin_lock_init(&kranal_data.kra_reaper_lock);
-
- CFS_INIT_LIST_HEAD(&kranal_data.kra_connd_acceptq);
- CFS_INIT_LIST_HEAD(&kranal_data.kra_connd_peers);
- init_waitqueue_head(&kranal_data.kra_connd_waitq);
- spin_lock_init(&kranal_data.kra_connd_lock);
-
- CFS_INIT_LIST_HEAD(&kranal_data.kra_idle_txs);
- spin_lock_init(&kranal_data.kra_tx_lock);
-
- /* OK to call kranal_api_shutdown() to cleanup now */
- kranal_data.kra_init = RANAL_INIT_DATA;
- try_module_get(THIS_MODULE);
-
- kranal_data.kra_peer_hash_size = RANAL_PEER_HASH_SIZE;
- LIBCFS_ALLOC(kranal_data.kra_peers,
- sizeof(cfs_list_t) *
- kranal_data.kra_peer_hash_size);
- if (kranal_data.kra_peers == NULL)
- goto failed;
-
- for (i = 0; i < kranal_data.kra_peer_hash_size; i++)
- CFS_INIT_LIST_HEAD(&kranal_data.kra_peers[i]);
-
- kranal_data.kra_conn_hash_size = RANAL_PEER_HASH_SIZE;
- LIBCFS_ALLOC(kranal_data.kra_conns,
- sizeof(cfs_list_t) *
- kranal_data.kra_conn_hash_size);
- if (kranal_data.kra_conns == NULL)
- goto failed;
-
- for (i = 0; i < kranal_data.kra_conn_hash_size; i++)
- CFS_INIT_LIST_HEAD(&kranal_data.kra_conns[i]);
-
- rc = kranal_alloc_txdescs(&kranal_data.kra_idle_txs,
- *kranal_tunables.kra_ntx);
- if (rc != 0)
- goto failed;
-
- rc = kranal_thread_start(kranal_reaper, NULL, "kranal_reaper");
- if (rc != 0) {
- CERROR("Can't spawn ranal reaper: %d\n", rc);
- goto failed;
- }
-
- for (i = 0; i < *kranal_tunables.kra_n_connd; i++) {
-                snprintf(name, sizeof(name), "kranal_connd_%02d", i);
- rc = kranal_thread_start(kranal_connd,
- (void *)(unsigned long)i, name);
- if (rc != 0) {
- CERROR("Can't spawn ranal connd[%d]: %d\n",
- i, rc);
- goto failed;
- }
- }
-
- LASSERT (kranal_data.kra_ndevs == 0);
-
- /* Use all available RapidArray devices */
- for (i = 0; i < RANAL_MAXDEVS; i++) {
- dev = &kranal_data.kra_devices[kranal_data.kra_ndevs];
-
- rc = kranal_device_init(kranal_devids[i], dev);
- if (rc == 0)
- kranal_data.kra_ndevs++;
- }
-
- if (kranal_data.kra_ndevs == 0) {
- CERROR("Can't initialise any RapidArray devices\n");
- goto failed;
- }
-
- for (i = 0; i < kranal_data.kra_ndevs; i++) {
- dev = &kranal_data.kra_devices[i];
- snprintf(name, sizeof(name), "kranal_sd_%02d", dev->rad_idx);
- rc = kranal_thread_start(kranal_scheduler, dev, name);
- if (rc != 0) {
- CERROR("Can't spawn ranal scheduler[%d]: %d\n",
- i, rc);
- goto failed;
- }
- }
-
- /* flag everything initialised */
- kranal_data.kra_init = RANAL_INIT_ALL;
- /*****************************************************/
-
- CDEBUG(D_MALLOC, "initial kmem %d\n", pkmem);
- return 0;
-
- failed:
- kranal_shutdown(ni);
- return -ENETDOWN;
-}
-
-void __exit
-kranal_module_fini (void)
-{
- lnet_unregister_lnd(&the_kralnd);
- kranal_tunables_fini();
-}
-
-int __init
-kranal_module_init (void)
-{
- int rc;
-
- rc = kranal_tunables_init();
- if (rc != 0)
- return rc;
-
- lnet_register_lnd(&the_kralnd);
-
- return 0;
-}
-
-MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
-MODULE_DESCRIPTION("Kernel RapidArray LND v0.01");
-MODULE_LICENSE("GPL");
-
-module_init(kranal_module_init);
-module_exit(kranal_module_fini);
+++ /dev/null
-/*
- * GPL HEADER START
- *
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 only,
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License version 2 for more details (a copy is included
- * in the LICENSE file that accompanied this code).
- *
- * You should have received a copy of the GNU General Public License
- * version 2 along with this program; If not, see
- * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
- *
- * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
- * CA 95054 USA or visit www.sun.com if you need additional information or
- * have any questions.
- *
- * GPL HEADER END
- */
-/*
- * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
- * Use is subject to license terms.
- *
- * Copyright (c) 2012, Intel Corporation.
- */
-/*
- * This file is part of Lustre, http://www.lustre.org/
- * Lustre is a trademark of Sun Microsystems, Inc.
- *
- * lnet/klnds/ralnd/ralnd.h
- *
- * Author: Eric Barton <eric@bartonsoftware.com>
- */
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/string.h>
-#include <linux/stat.h>
-#include <linux/errno.h>
-#include <linux/unistd.h>
-#include <linux/uio.h>
-
-#include <asm/uaccess.h>
-#include <asm/io.h>
-
-#include <linux/init.h>
-#include <linux/fs.h>
-#include <linux/file.h>
-#include <linux/stat.h>
-#include <linux/list.h>
-#include <linux/kmod.h>
-#include <linux/sysctl.h>
-
-#include <net/sock.h>
-#include <linux/in.h>
-
-#define DEBUG_SUBSYSTEM S_LND
-
-#include <libcfs/libcfs.h>
-#include <lnet/lnet.h>
-#include <lnet/lib-lnet.h>
-
-#include <rapl.h>
-
-/* tunables determined at compile time */
-#define RANAL_RESCHED 100 /* # scheduler loops before reschedule */
-
-#define RANAL_PEER_HASH_SIZE 101 /* # peer lists */
-#define RANAL_CONN_HASH_SIZE 101 /* # conn lists */
-
-#define RANAL_MIN_TIMEOUT 5 /* minimum timeout interval (seconds) */
-#define RANAL_TIMEOUT2KEEPALIVE(t) (((t)+1)/2) /* timeout -> keepalive interval */
-
-/* fixed constants */
-#define RANAL_MAXDEVS 2 /* max # devices RapidArray supports */
-#define RANAL_FMA_MAX_PREFIX 232 /* max bytes in FMA "Prefix" we can use */
-#define RANAL_FMA_MAX_DATA ((7<<10)-256) /* Max FMA MSG is 7K including prefix */
-
-
-typedef struct
-{
- int *kra_n_connd; /* # connection daemons */
- int *kra_min_reconnect_interval; /* first failed connection retry... */
- int *kra_max_reconnect_interval; /* ...exponentially increasing to this */
- int *kra_ntx; /* # tx descs */
- int *kra_credits; /* # concurrent sends */
- int *kra_peercredits; /* # concurrent sends to 1 peer */
- int *kra_fma_cq_size; /* # entries in receive CQ */
- int *kra_timeout; /* comms timeout (seconds) */
- int *kra_max_immediate; /* immediate payload breakpoint */
-
-#if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM
- struct ctl_table_header *kra_sysctl; /* sysctl interface */
-#endif
-} kra_tunables_t;
-
-typedef struct
-{
- RAP_PVOID rad_handle; /* device handle */
- RAP_PVOID rad_fma_cqh; /* FMA completion queue handle */
- RAP_PVOID rad_rdma_cqh; /* rdma completion queue handle */
- int rad_id; /* device id */
- int rad_idx; /* index in kra_devices */
- int rad_ready; /* set by device callback */
- cfs_list_t rad_ready_conns;/* connections ready to tx/rx */
- cfs_list_t rad_new_conns; /* new connections to complete */
- wait_queue_head_t rad_waitq; /* scheduler waits here */
- spinlock_t rad_lock; /* serialise */
- void *rad_scheduler; /* scheduling thread */
- unsigned int rad_nphysmap; /* # phys mappings */
- unsigned int rad_nppphysmap;/* # phys pages mapped */
- unsigned int rad_nvirtmap; /* # virt mappings */
- unsigned long rad_nobvirtmap;/* # virt bytes mapped */
-} kra_device_t;
-
-typedef struct
-{
- int kra_init; /* initialisation state */
- int kra_shutdown; /* shut down? */
- atomic_t kra_nthreads; /* # live threads */
- lnet_ni_t *kra_ni; /* _the_ nal instance */
-
- kra_device_t kra_devices[RANAL_MAXDEVS]; /* device/ptag/cq */
- int kra_ndevs; /* # devices */
-
- rwlock_t kra_global_lock; /* stabilize peer/conn ops */
-
- cfs_list_t *kra_peers; /* hash table of all my known peers */
- int kra_peer_hash_size; /* size of kra_peers */
- atomic_t kra_npeers; /* # peers extant */
- int kra_nonewpeers; /* prevent new peers */
-
- cfs_list_t *kra_conns; /* conns hashed by cqid */
- int kra_conn_hash_size; /* size of kra_conns */
- __u64 kra_peerstamp; /* when I started up */
- __u64 kra_connstamp; /* conn stamp generator */
- int kra_next_cqid; /* cqid generator */
- atomic_t kra_nconns; /* # connections extant */
-
- long kra_new_min_timeout; /* minimum timeout on any new conn */
- wait_queue_head_t kra_reaper_waitq; /* reaper sleeps here */
- spinlock_t kra_reaper_lock; /* serialise */
-
- cfs_list_t kra_connd_peers; /* peers waiting for a connection */
- cfs_list_t kra_connd_acceptq; /* accepted sockets to handshake */
- wait_queue_head_t kra_connd_waitq; /* connection daemons sleep here */
- spinlock_t kra_connd_lock; /* serialise */
-
- cfs_list_t kra_idle_txs; /* idle tx descriptors */
- __u64 kra_next_tx_cookie; /* RDMA completion cookie */
- spinlock_t kra_tx_lock; /* serialise */
-} kra_data_t;
-
-#define RANAL_INIT_NOTHING 0
-#define RANAL_INIT_DATA 1
-#define RANAL_INIT_ALL 2
-
-typedef struct kra_acceptsock /* accepted socket queued for connd */
-{
- cfs_list_t ras_list; /* queue for attention */
- struct socket *ras_sock; /* the accepted socket */
-} kra_acceptsock_t;
-
-/************************************************************************
- * Wire message structs. These are sent in sender's byte order
- * (i.e. receiver checks magic and flips if required).
- */
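-
-/* For example (a sketch, not code from these files), a receiver can decide
- * whether to byte-swap an incoming message by testing the magic both ways:
- *
- *      if (msg->ram_magic == RANAL_MSG_MAGIC)
- *              ;                               // same byte order
- *      else if (msg->ram_magic == __swab32(RANAL_MSG_MAGIC))
- *              flip each multi-byte field;     // peer is opposite-endian
- *      else
- *              reject;                         // not a ranal message
- */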
-
-typedef struct kra_connreq /* connection request/response */
-{ /* (sent via socket) */
-        __u32               racr_magic;     /* I'm a ranal connreq */
- __u16 racr_version; /* this is my version number */
- __u16 racr_devid; /* sender's device ID */
- __u64 racr_srcnid; /* sender's NID */
- __u64 racr_dstnid; /* who sender expects to listen */
- __u64 racr_peerstamp; /* sender's instance stamp */
- __u64 racr_connstamp; /* sender's connection stamp */
- __u32 racr_timeout; /* sender's timeout */
- RAP_RI_PARAMETERS racr_riparams; /* sender's endpoint info */
-} kra_connreq_t;
-
-typedef struct
-{
- RAP_MEM_KEY rard_key;
- RAP_PVOID64 rard_addr;
- RAP_UINT32 rard_nob;
-} kra_rdma_desc_t;
-
-typedef struct
-{
- lnet_hdr_t raim_hdr; /* portals header */
- /* Portals payload is in FMA "Message Data" */
-} kra_immediate_msg_t;
-
-typedef struct
-{
- lnet_hdr_t raprm_hdr; /* portals header */
- __u64 raprm_cookie; /* opaque completion cookie */
-} kra_putreq_msg_t;
-
-typedef struct
-{
- __u64 rapam_src_cookie; /* reflected completion cookie */
- __u64 rapam_dst_cookie; /* opaque completion cookie */
- kra_rdma_desc_t rapam_desc; /* sender's sink buffer */
-} kra_putack_msg_t;
-
-typedef struct
-{
- lnet_hdr_t ragm_hdr; /* portals header */
- __u64 ragm_cookie; /* opaque completion cookie */
- kra_rdma_desc_t ragm_desc; /* sender's sink buffer */
-} kra_get_msg_t;
-
-typedef struct
-{
- __u64 racm_cookie; /* reflected completion cookie */
-} kra_completion_msg_t;
-
-typedef struct /* NB must fit in FMA "Prefix" */
-{
-        __u32             ram_magic;      /* I'm a ranal message */
- __u16 ram_version; /* this is my version number */
- __u16 ram_type; /* msg type */
- __u64 ram_srcnid; /* sender's NID */
- __u64 ram_connstamp; /* sender's connection stamp */
- union {
- kra_immediate_msg_t immediate;
- kra_putreq_msg_t putreq;
- kra_putack_msg_t putack;
- kra_get_msg_t get;
- kra_completion_msg_t completion;
- } ram_u;
- __u32 ram_seq; /* incrementing sequence number */
-} kra_msg_t;
-
-#define RANAL_MSG_MAGIC LNET_PROTO_RA_MAGIC /* unique magic */
-#define RANAL_MSG_VERSION 1 /* current protocol version */
-
-#define RANAL_MSG_FENCE 0x80 /* fence RDMA */
-
-#define RANAL_MSG_NONE 0x00 /* illegal message */
-#define RANAL_MSG_NOOP 0x01 /* empty ram_u (keepalive) */
-#define RANAL_MSG_IMMEDIATE 0x02 /* ram_u.immediate */
-#define RANAL_MSG_PUT_REQ 0x03 /* ram_u.putreq (src->sink) */
-#define RANAL_MSG_PUT_NAK 0x04 /* ram_u.completion (no PUT match: sink->src) */
-#define RANAL_MSG_PUT_ACK 0x05 /* ram_u.putack (PUT matched: sink->src) */
-#define RANAL_MSG_PUT_DONE 0x86 /* ram_u.completion (src->sink) */
-#define RANAL_MSG_GET_REQ 0x07 /* ram_u.get (sink->src) */
-#define RANAL_MSG_GET_NAK 0x08 /* ram_u.completion (no GET match: src->sink) */
-#define RANAL_MSG_GET_DONE 0x89 /* ram_u.completion (src->sink) */
-#define RANAL_MSG_CLOSE 0x8a /* empty ram_u */
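-
-/* NB the types with the RANAL_MSG_FENCE bit set (PUT_DONE 0x86, GET_DONE
- * 0x89, CLOSE 0x8a) signal RDMA completion, so the sender must fence them
- * behind the RDMA rather than let them overtake it. */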
-
-/***********************************************************************/
-
-typedef struct kra_tx /* message descriptor */
-{
- cfs_list_t tx_list; /* queue on idle_txs/rac_sendq/rac_waitq */
- struct kra_conn *tx_conn; /* owning conn */
- lnet_msg_t *tx_lntmsg[2]; /* ptl msgs to finalize on completion */
- unsigned long tx_qtime; /* when tx started to wait for something (jiffies) */
- int tx_nob; /* # bytes of payload */
- int tx_buftype; /* payload buffer type */
- void *tx_buffer; /* source/sink buffer */
- int tx_phys_offset; /* first page offset (if phys) */
- int tx_phys_npages; /* # physical pages */
- RAP_PHYS_REGION *tx_phys; /* page descriptors */
- RAP_MEM_KEY tx_map_key; /* mapping key */
- RAP_RDMA_DESCRIPTOR tx_rdma_desc; /* rdma descriptor */
- __u64 tx_cookie; /* identify this tx to peer */
- kra_msg_t tx_msg; /* FMA message buffer */
-} kra_tx_t;
-
-#define RANAL_BUF_NONE 0 /* buffer type not set */
-#define RANAL_BUF_IMMEDIATE 1 /* immediate data */
-#define RANAL_BUF_PHYS_UNMAPPED 2 /* physical: not mapped yet */
-#define RANAL_BUF_PHYS_MAPPED 3 /* physical: mapped already */
-#define RANAL_BUF_VIRT_UNMAPPED 4 /* virtual: not mapped yet */
-#define RANAL_BUF_VIRT_MAPPED 5 /* virtual: mapped already */
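-
-/* NB a tx buffer moves NONE -> IMMEDIATE, or NONE -> *_UNMAPPED ->
- * *_MAPPED (kranal_map_buffer) and back (kranal_unmap_buffer); it is
- * always NONE again by the end of kranal_tx_done() */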
-
-typedef struct kra_conn
-{
- struct kra_peer *rac_peer; /* owning peer */
- cfs_list_t rac_list; /* stash on peer's conn list */
- cfs_list_t rac_hashlist; /* stash in connection hash table */
- cfs_list_t rac_schedlist; /* schedule (on rad_???_conns) for attention */
- cfs_list_t rac_fmaq; /* txs queued for FMA */
- cfs_list_t rac_rdmaq; /* txs awaiting RDMA completion */
- cfs_list_t rac_replyq; /* txs awaiting replies */
- __u64 rac_peerstamp; /* peer's unique stamp */
- __u64 rac_peer_connstamp;/* peer's unique connection stamp */
- __u64 rac_my_connstamp; /* my unique connection stamp */
- unsigned long rac_last_tx; /* when I last sent an FMA message (jiffies) */
-        unsigned long       rac_last_rx;    /* when I last received an FMA message (jiffies) */
- long rac_keepalive; /* keepalive interval (seconds) */
- long rac_timeout; /* infer peer death if no rx for this many seconds */
- __u32 rac_cqid; /* my completion callback id (non-unique) */
- __u32 rac_tx_seq; /* tx msg sequence number */
- __u32 rac_rx_seq; /* rx msg sequence number */
- atomic_t rac_refcount; /* # users */
- unsigned int rac_close_sent; /* I've sent CLOSE */
- unsigned int rac_close_recvd; /* I've received CLOSE */
- unsigned int rac_state; /* connection state */
-        unsigned int        rac_scheduled;  /* being attended to */
- spinlock_t rac_lock; /* serialise */
- kra_device_t *rac_device; /* which device */
- RAP_PVOID rac_rihandle; /* RA endpoint */
- kra_msg_t *rac_rxmsg; /* incoming message (FMA prefix) */
- kra_msg_t rac_msg; /* keepalive/CLOSE message buffer */
-} kra_conn_t;
-
-#define RANAL_CONN_ESTABLISHED 0
-#define RANAL_CONN_CLOSING 1
-#define RANAL_CONN_CLOSED 2
-
-typedef struct kra_peer {
- cfs_list_t rap_list; /* stash on global peer list */
- cfs_list_t rap_connd_list; /* schedule on kra_connd_peers */
- cfs_list_t rap_conns; /* all active connections */
- cfs_list_t rap_tx_queue; /* msgs waiting for a conn */
- lnet_nid_t rap_nid; /* who's on the other end(s) */
- __u32 rap_ip; /* IP address of peer */
- int rap_port; /* port on which peer listens */
- atomic_t rap_refcount; /* # users */
- int rap_persistence; /* "known" peer refs */
- int rap_connecting; /* connection forming */
-        unsigned long       rap_reconnect_time;     /* jiffies when reconnect OK */
- unsigned long rap_reconnect_interval; /* exponential backoff */
-} kra_peer_t;
-
-extern kra_data_t kranal_data;
-extern kra_tunables_t kranal_tunables;
-
-extern void kranal_destroy_peer(kra_peer_t *peer);
-extern void kranal_destroy_conn(kra_conn_t *conn);
-
-static inline void
-kranal_peer_addref(kra_peer_t *peer)
-{
- CDEBUG(D_NET, "%p->%s\n", peer, libcfs_nid2str(peer->rap_nid));
- LASSERT(atomic_read(&peer->rap_refcount) > 0);
- atomic_inc(&peer->rap_refcount);
-}
-
-static inline void
-kranal_peer_decref(kra_peer_t *peer)
-{
- CDEBUG(D_NET, "%p->%s\n", peer, libcfs_nid2str(peer->rap_nid));
- LASSERT(atomic_read(&peer->rap_refcount) > 0);
- if (atomic_dec_and_test(&peer->rap_refcount))
- kranal_destroy_peer(peer);
-}
-
-static inline cfs_list_t *
-kranal_nid2peerlist (lnet_nid_t nid)
-{
- unsigned int hash = ((unsigned int)nid) % kranal_data.kra_peer_hash_size;
-
- return (&kranal_data.kra_peers[hash]);
-}
-
-static inline int
-kranal_peer_active(kra_peer_t *peer)
-{
- /* Am I in the peer hash table? */
- return (!cfs_list_empty(&peer->rap_list));
-}
-
-static inline void
-kranal_conn_addref(kra_conn_t *conn)
-{
- CDEBUG(D_NET, "%p->%s\n", conn,
- libcfs_nid2str(conn->rac_peer->rap_nid));
- LASSERT(atomic_read(&conn->rac_refcount) > 0);
- atomic_inc(&conn->rac_refcount);
-}
-
-static inline void
-kranal_conn_decref(kra_conn_t *conn)
-{
- CDEBUG(D_NET, "%p->%s\n", conn,
- libcfs_nid2str(conn->rac_peer->rap_nid));
- LASSERT(atomic_read(&conn->rac_refcount) > 0);
- if (atomic_dec_and_test(&conn->rac_refcount))
- kranal_destroy_conn(conn);
-}
-
-static inline cfs_list_t *
-kranal_cqid2connlist (__u32 cqid)
-{
- unsigned int hash = cqid % kranal_data.kra_conn_hash_size;
-
- return (&kranal_data.kra_conns [hash]);
-}
-
-static inline kra_conn_t *
-kranal_cqid2conn_locked (__u32 cqid)
-{
- cfs_list_t *conns = kranal_cqid2connlist(cqid);
- cfs_list_t *tmp;
- kra_conn_t *conn;
-
- cfs_list_for_each(tmp, conns) {
- conn = cfs_list_entry(tmp, kra_conn_t, rac_hashlist);
-
- if (conn->rac_cqid == cqid)
- return conn;
- }
-
- return NULL;
-}
-
-static inline int
-kranal_tx_mapped (kra_tx_t *tx)
-{
- return (tx->tx_buftype == RANAL_BUF_VIRT_MAPPED ||
- tx->tx_buftype == RANAL_BUF_PHYS_MAPPED);
-}
-
-int kranal_startup (lnet_ni_t *ni);
-void kranal_shutdown (lnet_ni_t *ni);
-int kranal_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg);
-int kranal_send (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg);
-int kranal_eager_recv(lnet_ni_t *ni, void *private,
- lnet_msg_t *lntmsg, void **new_private);
-int kranal_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg,
- int delayed, unsigned int niov,
- struct iovec *iov, lnet_kiov_t *kiov,
- unsigned int offset, unsigned int mlen, unsigned int rlen);
-int kranal_accept(lnet_ni_t *ni, struct socket *sock);
-
-extern void kranal_free_acceptsock (kra_acceptsock_t *ras);
-extern int kranal_listener_procint (struct ctl_table *table,
- int write, struct file *filp,
- void *buffer, size_t *lenp);
-extern void kranal_update_reaper_timeout (long timeout);
-extern void kranal_tx_done (kra_tx_t *tx, int completion);
-extern void kranal_unlink_peer_locked (kra_peer_t *peer);
-extern void kranal_schedule_conn (kra_conn_t *conn);
-extern int kranal_create_peer (kra_peer_t **peerp, lnet_nid_t nid);
-extern int kranal_add_persistent_peer (lnet_nid_t nid, __u32 ip, int port);
-extern kra_peer_t *kranal_find_peer_locked (lnet_nid_t nid);
-extern void kranal_post_fma (kra_conn_t *conn, kra_tx_t *tx);
-extern int kranal_del_peer (lnet_nid_t nid);
-extern void kranal_device_callback (RAP_INT32 devid, RAP_PVOID arg);
-extern int kranal_thread_start(int(*fn)(void *arg), void *arg, char *name);
-extern int kranal_connd (void *arg);
-extern int kranal_reaper (void *arg);
-extern int kranal_scheduler (void *arg);
-extern void kranal_close_conn_locked (kra_conn_t *conn, int error);
-extern void kranal_close_conn (kra_conn_t *conn, int error);
-extern void kranal_terminate_conn_locked (kra_conn_t *conn);
-extern void kranal_connect (kra_peer_t *peer);
-extern int kranal_conn_handshake (struct socket *sock, kra_peer_t *peer);
-extern int kranal_tunables_init(void);
-extern void kranal_tunables_fini(void);
-extern void kranal_init_msg(kra_msg_t *msg, int type);
+++ /dev/null
-/*
- * GPL HEADER START
- *
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 only,
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License version 2 for more details (a copy is included
- * in the LICENSE file that accompanied this code).
- *
- * You should have received a copy of the GNU General Public License
- * version 2 along with this program; If not, see
- * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
- *
- * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
- * CA 95054 USA or visit www.sun.com if you need additional information or
- * have any questions.
- *
- * GPL HEADER END
- */
-/*
- * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
- * Use is subject to license terms.
- *
- * Copyright (c) 2012, 2014, Intel Corporation.
- */
-/*
- * This file is part of Lustre, http://www.lustre.org/
- * Lustre is a trademark of Sun Microsystems, Inc.
- *
- * lnet/klnds/ralnd/ralnd_cb.c
- *
- * Author: Eric Barton <eric@bartonsoftware.com>
- */
-
-#include <asm/page.h>
-#include "ralnd.h"
-
-void
-kranal_device_callback(RAP_INT32 devid, RAP_PVOID arg)
-{
- kra_device_t *dev;
- int i;
- unsigned long flags;
-
- CDEBUG(D_NET, "callback for device %d\n", devid);
-
- for (i = 0; i < kranal_data.kra_ndevs; i++) {
-
- dev = &kranal_data.kra_devices[i];
- if (dev->rad_id != devid)
- continue;
-
- spin_lock_irqsave(&dev->rad_lock, flags);
-
- if (!dev->rad_ready) {
- dev->rad_ready = 1;
- wake_up(&dev->rad_waitq);
- }
-
- spin_unlock_irqrestore(&dev->rad_lock, flags);
- return;
- }
-
- CWARN("callback for unknown device %d\n", devid);
-}
-
-void
-kranal_schedule_conn(kra_conn_t *conn)
-{
- kra_device_t *dev = conn->rac_device;
- unsigned long flags;
-
- spin_lock_irqsave(&dev->rad_lock, flags);
-
- if (!conn->rac_scheduled) {
- kranal_conn_addref(conn); /* +1 ref for scheduler */
- conn->rac_scheduled = 1;
- cfs_list_add_tail(&conn->rac_schedlist, &dev->rad_ready_conns);
- wake_up(&dev->rad_waitq);
- }
-
- spin_unlock_irqrestore(&dev->rad_lock, flags);
-}
-
-kra_tx_t *
-kranal_get_idle_tx (void)
-{
- unsigned long flags;
- kra_tx_t *tx;
-
- spin_lock_irqsave(&kranal_data.kra_tx_lock, flags);
-
- if (cfs_list_empty(&kranal_data.kra_idle_txs)) {
- spin_unlock_irqrestore(&kranal_data.kra_tx_lock, flags);
- return NULL;
- }
-
- tx = cfs_list_entry(kranal_data.kra_idle_txs.next, kra_tx_t, tx_list);
- cfs_list_del(&tx->tx_list);
-
- /* Allocate a new completion cookie. It might not be needed, but we've
- * got a lock right now... */
- tx->tx_cookie = kranal_data.kra_next_tx_cookie++;
-
- spin_unlock_irqrestore(&kranal_data.kra_tx_lock, flags);
-
- LASSERT (tx->tx_buftype == RANAL_BUF_NONE);
- LASSERT (tx->tx_msg.ram_type == RANAL_MSG_NONE);
- LASSERT (tx->tx_conn == NULL);
- LASSERT (tx->tx_lntmsg[0] == NULL);
- LASSERT (tx->tx_lntmsg[1] == NULL);
-
- return tx;
-}
-
-void
-kranal_init_msg(kra_msg_t *msg, int type)
-{
- msg->ram_magic = RANAL_MSG_MAGIC;
- msg->ram_version = RANAL_MSG_VERSION;
- msg->ram_type = type;
- msg->ram_srcnid = kranal_data.kra_ni->ni_nid;
- /* ram_connstamp gets set when FMA is sent */
-}
-
-kra_tx_t *
-kranal_new_tx_msg (int type)
-{
- kra_tx_t *tx = kranal_get_idle_tx();
-
- if (tx != NULL)
- kranal_init_msg(&tx->tx_msg, type);
-
- return tx;
-}
-
-int
-kranal_setup_immediate_buffer (kra_tx_t *tx,
- unsigned int niov, struct iovec *iov,
- int offset, int nob)
-{
- /* For now this is almost identical to kranal_setup_virt_buffer, but we
- * could "flatten" the payload into a single contiguous buffer ready
- * for sending direct over an FMA if we ever needed to. */
-
- LASSERT (tx->tx_buftype == RANAL_BUF_NONE);
- LASSERT (nob >= 0);
-
- if (nob == 0) {
- tx->tx_buffer = NULL;
- } else {
- LASSERT (niov > 0);
-
- while (offset >= iov->iov_len) {
- offset -= iov->iov_len;
- niov--;
- iov++;
- LASSERT (niov > 0);
- }
-
- if (nob > iov->iov_len - offset) {
- CERROR("Can't handle multiple vaddr fragments\n");
- return -EMSGSIZE;
- }
-
- tx->tx_buffer = (void *)(((unsigned long)iov->iov_base) + offset);
- }
-
- tx->tx_buftype = RANAL_BUF_IMMEDIATE;
- tx->tx_nob = nob;
- return 0;
-}
-
-int
-kranal_setup_virt_buffer (kra_tx_t *tx,
- unsigned int niov, struct iovec *iov,
- int offset, int nob)
-{
- LASSERT (nob > 0);
- LASSERT (niov > 0);
- LASSERT (tx->tx_buftype == RANAL_BUF_NONE);
-
- while (offset >= iov->iov_len) {
- offset -= iov->iov_len;
- niov--;
- iov++;
- LASSERT (niov > 0);
- }
-
- if (nob > iov->iov_len - offset) {
- CERROR("Can't handle multiple vaddr fragments\n");
- return -EMSGSIZE;
- }
-
- tx->tx_buftype = RANAL_BUF_VIRT_UNMAPPED;
- tx->tx_nob = nob;
- tx->tx_buffer = (void *)(((unsigned long)iov->iov_base) + offset);
- return 0;
-}
-
-int
-kranal_setup_phys_buffer (kra_tx_t *tx, int nkiov, lnet_kiov_t *kiov,
- int offset, int nob)
-{
- RAP_PHYS_REGION *phys = tx->tx_phys;
- int resid;
-
- CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob);
-
- LASSERT (nob > 0);
- LASSERT (nkiov > 0);
- LASSERT (tx->tx_buftype == RANAL_BUF_NONE);
-
- while (offset >= kiov->kiov_len) {
- offset -= kiov->kiov_len;
- nkiov--;
- kiov++;
- LASSERT (nkiov > 0);
- }
-
- tx->tx_buftype = RANAL_BUF_PHYS_UNMAPPED;
- tx->tx_nob = nob;
- tx->tx_buffer = (void *)((unsigned long)(kiov->kiov_offset + offset));
-
- phys->Address = page_to_phys(kiov->kiov_page);
- phys++;
-
- resid = nob - (kiov->kiov_len - offset);
- while (resid > 0) {
- kiov++;
- nkiov--;
- LASSERT (nkiov > 0);
-
- if (kiov->kiov_offset != 0 ||
- ((resid > PAGE_SIZE) &&
- kiov->kiov_len < PAGE_SIZE)) {
- /* Can't have gaps */
-                        CERROR("Can't make payload contiguous in I/O VM: "
-                               "page %d, offset %d, len %d\n",
- (int)(phys - tx->tx_phys),
- kiov->kiov_offset, kiov->kiov_len);
- return -EINVAL;
- }
-
- if ((phys - tx->tx_phys) == LNET_MAX_IOV) {
- CERROR ("payload too big (%d)\n", (int)(phys - tx->tx_phys));
- return -EMSGSIZE;
- }
-
- phys->Address = page_to_phys(kiov->kiov_page);
- phys++;
-
- resid -= PAGE_SIZE;
- }
-
- tx->tx_phys_npages = phys - tx->tx_phys;
- return 0;
-}
-
-static inline int
-kranal_setup_rdma_buffer (kra_tx_t *tx, unsigned int niov,
- struct iovec *iov, lnet_kiov_t *kiov,
- int offset, int nob)
-{
- LASSERT ((iov == NULL) != (kiov == NULL));
-
- if (kiov != NULL)
- return kranal_setup_phys_buffer(tx, niov, kiov, offset, nob);
-
- return kranal_setup_virt_buffer(tx, niov, iov, offset, nob);
-}
-
-int
-kranal_map_buffer (kra_tx_t *tx)
-{
- kra_conn_t *conn = tx->tx_conn;
- kra_device_t *dev = conn->rac_device;
- RAP_RETURN rrc;
-
- LASSERT (current == dev->rad_scheduler);
-
- switch (tx->tx_buftype) {
- default:
- LBUG();
-
- case RANAL_BUF_NONE:
- case RANAL_BUF_IMMEDIATE:
- case RANAL_BUF_PHYS_MAPPED:
- case RANAL_BUF_VIRT_MAPPED:
- return 0;
-
- case RANAL_BUF_PHYS_UNMAPPED:
- rrc = RapkRegisterPhys(dev->rad_handle,
- tx->tx_phys, tx->tx_phys_npages,
- &tx->tx_map_key);
- if (rrc != RAP_SUCCESS) {
- CERROR ("Can't map %d pages: dev %d "
- "phys %u pp %u, virt %u nob %lu\n",
- tx->tx_phys_npages, dev->rad_id,
- dev->rad_nphysmap, dev->rad_nppphysmap,
- dev->rad_nvirtmap, dev->rad_nobvirtmap);
- return -ENOMEM; /* assume insufficient resources */
- }
-
- dev->rad_nphysmap++;
- dev->rad_nppphysmap += tx->tx_phys_npages;
-
- tx->tx_buftype = RANAL_BUF_PHYS_MAPPED;
- return 0;
-
- case RANAL_BUF_VIRT_UNMAPPED:
- rrc = RapkRegisterMemory(dev->rad_handle,
- tx->tx_buffer, tx->tx_nob,
- &tx->tx_map_key);
- if (rrc != RAP_SUCCESS) {
- CERROR ("Can't map %d bytes: dev %d "
- "phys %u pp %u, virt %u nob %lu\n",
- tx->tx_nob, dev->rad_id,
- dev->rad_nphysmap, dev->rad_nppphysmap,
- dev->rad_nvirtmap, dev->rad_nobvirtmap);
- return -ENOMEM; /* assume insufficient resources */
- }
-
- dev->rad_nvirtmap++;
- dev->rad_nobvirtmap += tx->tx_nob;
-
- tx->tx_buftype = RANAL_BUF_VIRT_MAPPED;
- return 0;
- }
-}
-
-void
-kranal_unmap_buffer (kra_tx_t *tx)
-{
- kra_device_t *dev;
- RAP_RETURN rrc;
-
- switch (tx->tx_buftype) {
- default:
- LBUG();
-
- case RANAL_BUF_NONE:
- case RANAL_BUF_IMMEDIATE:
- case RANAL_BUF_PHYS_UNMAPPED:
- case RANAL_BUF_VIRT_UNMAPPED:
- break;
-
- case RANAL_BUF_PHYS_MAPPED:
- LASSERT (tx->tx_conn != NULL);
- dev = tx->tx_conn->rac_device;
- LASSERT (current == dev->rad_scheduler);
- rrc = RapkDeregisterMemory(dev->rad_handle, NULL,
- &tx->tx_map_key);
- LASSERT (rrc == RAP_SUCCESS);
-
- dev->rad_nphysmap--;
- dev->rad_nppphysmap -= tx->tx_phys_npages;
-
- tx->tx_buftype = RANAL_BUF_PHYS_UNMAPPED;
- break;
-
- case RANAL_BUF_VIRT_MAPPED:
- LASSERT (tx->tx_conn != NULL);
- dev = tx->tx_conn->rac_device;
- LASSERT (current == dev->rad_scheduler);
- rrc = RapkDeregisterMemory(dev->rad_handle, tx->tx_buffer,
- &tx->tx_map_key);
- LASSERT (rrc == RAP_SUCCESS);
-
- dev->rad_nvirtmap--;
- dev->rad_nobvirtmap -= tx->tx_nob;
-
- tx->tx_buftype = RANAL_BUF_VIRT_UNMAPPED;
- break;
- }
-}
-
-void
-kranal_tx_done (kra_tx_t *tx, int completion)
-{
- lnet_msg_t *lnetmsg[2];
- unsigned long flags;
- int i;
-
- LASSERT (!in_interrupt());
-
- kranal_unmap_buffer(tx);
-
- lnetmsg[0] = tx->tx_lntmsg[0]; tx->tx_lntmsg[0] = NULL;
- lnetmsg[1] = tx->tx_lntmsg[1]; tx->tx_lntmsg[1] = NULL;
-
- tx->tx_buftype = RANAL_BUF_NONE;
- tx->tx_msg.ram_type = RANAL_MSG_NONE;
- tx->tx_conn = NULL;
-
- spin_lock_irqsave(&kranal_data.kra_tx_lock, flags);
-
- cfs_list_add_tail(&tx->tx_list, &kranal_data.kra_idle_txs);
-
- spin_unlock_irqrestore(&kranal_data.kra_tx_lock, flags);
-
-        /* finalize AFTER the tx is back on the idle list */
- for (i = 0; i < 2; i++) {
- if (lnetmsg[i] == NULL)
- continue;
-
- lnet_finalize(kranal_data.kra_ni, lnetmsg[i], completion);
- }
-}
-
-kra_conn_t *
-kranal_find_conn_locked (kra_peer_t *peer)
-{
- cfs_list_t *tmp;
-
- /* just return the first connection */
- cfs_list_for_each (tmp, &peer->rap_conns) {
- return cfs_list_entry(tmp, kra_conn_t, rac_list);
- }
-
- return NULL;
-}
-
-void
-kranal_post_fma (kra_conn_t *conn, kra_tx_t *tx)
-{
- unsigned long flags;
-
- tx->tx_conn = conn;
-
- spin_lock_irqsave(&conn->rac_lock, flags);
- cfs_list_add_tail(&tx->tx_list, &conn->rac_fmaq);
- tx->tx_qtime = jiffies;
- spin_unlock_irqrestore(&conn->rac_lock, flags);
-
- kranal_schedule_conn(conn);
-}
-
-void
-kranal_launch_tx (kra_tx_t *tx, lnet_nid_t nid)
-{
- unsigned long flags;
- kra_peer_t *peer;
- kra_conn_t *conn;
- int rc;
- int retry;
- rwlock_t *g_lock = &kranal_data.kra_global_lock;
-
- /* If I get here, I've committed to send, so I complete the tx with
- * failure on any problems */
-
- LASSERT (tx->tx_conn == NULL); /* only set when assigned a conn */
-
- for (retry = 0; ; retry = 1) {
-
- read_lock(g_lock);
-
- peer = kranal_find_peer_locked(nid);
- if (peer != NULL) {
- conn = kranal_find_conn_locked(peer);
- if (conn != NULL) {
- kranal_post_fma(conn, tx);
- read_unlock(g_lock);
- return;
- }
- }
-
- /* Making connections; I'll need a write lock... */
- read_unlock(g_lock);
- write_lock_irqsave(g_lock, flags);
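-                /* NB the peer table may have changed while no lock was
-                 * held, so the peer must be looked up again under the
-                 * write lock before use */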
-
- peer = kranal_find_peer_locked(nid);
- if (peer != NULL)
- break;
-
- write_unlock_irqrestore(g_lock, flags);
-
- if (retry) {
- CERROR("Can't find peer %s\n", libcfs_nid2str(nid));
- kranal_tx_done(tx, -EHOSTUNREACH);
- return;
- }
-
- rc = kranal_add_persistent_peer(nid, LNET_NIDADDR(nid),
- lnet_acceptor_port());
- if (rc != 0) {
- CERROR("Can't add peer %s: %d\n",
- libcfs_nid2str(nid), rc);
- kranal_tx_done(tx, rc);
- return;
- }
- }
-
- conn = kranal_find_conn_locked(peer);
- if (conn != NULL) {
- /* Connection exists; queue message on it */
- kranal_post_fma(conn, tx);
- write_unlock_irqrestore(g_lock, flags);
- return;
- }
-
- LASSERT (peer->rap_persistence > 0);
-
- if (!peer->rap_connecting) {
- LASSERT (cfs_list_empty(&peer->rap_tx_queue));
-
- if (!(peer->rap_reconnect_interval == 0 || /* first attempt */
- cfs_time_aftereq(jiffies, peer->rap_reconnect_time))) {
- write_unlock_irqrestore(g_lock, flags);
- kranal_tx_done(tx, -EHOSTUNREACH);
- return;
- }
-
- peer->rap_connecting = 1;
- kranal_peer_addref(peer); /* extra ref for connd */
-
- spin_lock(&kranal_data.kra_connd_lock);
-
- cfs_list_add_tail(&peer->rap_connd_list,
- &kranal_data.kra_connd_peers);
- wake_up(&kranal_data.kra_connd_waitq);
-
- spin_unlock(&kranal_data.kra_connd_lock);
- }
-
- /* A connection is being established; queue the message... */
- cfs_list_add_tail(&tx->tx_list, &peer->rap_tx_queue);
-
- write_unlock_irqrestore(g_lock, flags);
-}
-
-void
-kranal_rdma(kra_tx_t *tx, int type,
- kra_rdma_desc_t *sink, int nob, __u64 cookie)
-{
- kra_conn_t *conn = tx->tx_conn;
- RAP_RETURN rrc;
- unsigned long flags;
-
- LASSERT (kranal_tx_mapped(tx));
- LASSERT (nob <= sink->rard_nob);
- LASSERT (nob <= tx->tx_nob);
-
- /* No actual race with scheduler sending CLOSE (I'm she!) */
- LASSERT (current == conn->rac_device->rad_scheduler);
-
- memset(&tx->tx_rdma_desc, 0, sizeof(tx->tx_rdma_desc));
- tx->tx_rdma_desc.SrcPtr.AddressBits = (__u64)((unsigned long)tx->tx_buffer);
- tx->tx_rdma_desc.SrcKey = tx->tx_map_key;
- tx->tx_rdma_desc.DstPtr = sink->rard_addr;
- tx->tx_rdma_desc.DstKey = sink->rard_key;
- tx->tx_rdma_desc.Length = nob;
- tx->tx_rdma_desc.AppPtr = tx;
-
- /* prep final completion message */
- kranal_init_msg(&tx->tx_msg, type);
- tx->tx_msg.ram_u.completion.racm_cookie = cookie;
-
- if (nob == 0) { /* Immediate completion */
- kranal_post_fma(conn, tx);
- return;
- }
-
- LASSERT (!conn->rac_close_sent); /* Don't lie (CLOSE == RDMA idle) */
-
- rrc = RapkPostRdma(conn->rac_rihandle, &tx->tx_rdma_desc);
- LASSERT (rrc == RAP_SUCCESS);
-
- spin_lock_irqsave(&conn->rac_lock, flags);
- cfs_list_add_tail(&tx->tx_list, &conn->rac_rdmaq);
- tx->tx_qtime = jiffies;
- spin_unlock_irqrestore(&conn->rac_lock, flags);
-}
-
-int
-kranal_consume_rxmsg (kra_conn_t *conn, void *buffer, int nob)
-{
- __u32 nob_received = nob;
- RAP_RETURN rrc;
-
- LASSERT (conn->rac_rxmsg != NULL);
- CDEBUG(D_NET, "Consuming %p\n", conn);
-
- rrc = RapkFmaCopyOut(conn->rac_rihandle, buffer,
- &nob_received, sizeof(kra_msg_t));
- LASSERT (rrc == RAP_SUCCESS);
-
- conn->rac_rxmsg = NULL;
-
- if (nob_received < nob) {
- CWARN("Incomplete immediate msg from %s: expected %d, got %d\n",
- libcfs_nid2str(conn->rac_peer->rap_nid),
- nob, nob_received);
- return -EPROTO;
- }
-
- return 0;
-}
-
-int
-kranal_send (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg)
-{
- lnet_hdr_t *hdr = &lntmsg->msg_hdr;
- int type = lntmsg->msg_type;
- lnet_process_id_t target = lntmsg->msg_target;
- int target_is_router = lntmsg->msg_target_is_router;
- int routing = lntmsg->msg_routing;
- unsigned int niov = lntmsg->msg_niov;
- struct iovec *iov = lntmsg->msg_iov;
- lnet_kiov_t *kiov = lntmsg->msg_kiov;
- unsigned int offset = lntmsg->msg_offset;
- unsigned int nob = lntmsg->msg_len;
- kra_tx_t *tx;
- int rc;
-
- /* NB 'private' is different depending on what we're sending.... */
-
- CDEBUG(D_NET, "sending %d bytes in %d frags to %s\n",
- nob, niov, libcfs_id2str(target));
-
- LASSERT (nob == 0 || niov > 0);
- LASSERT (niov <= LNET_MAX_IOV);
-
- LASSERT (!in_interrupt());
- /* payload is either all vaddrs or all pages */
- LASSERT (!(kiov != NULL && iov != NULL));
-
- if (routing) {
- CERROR ("Can't route\n");
- return -EIO;
- }
-
- switch(type) {
- default:
- LBUG();
-
- case LNET_MSG_ACK:
- LASSERT (nob == 0);
- break;
-
- case LNET_MSG_GET:
- LASSERT (niov == 0);
- LASSERT (nob == 0);
- /* We have to consider the eventual sink buffer rather than any
- * payload passed here (there isn't any, and strictly, looking
- * inside lntmsg is a layering violation). We send a simple
- * IMMEDIATE GET if the sink buffer is mapped already and small
- * enough for FMA */
-
- if (routing || target_is_router)
- break; /* send IMMEDIATE */
-
- if ((lntmsg->msg_md->md_options & LNET_MD_KIOV) == 0 &&
- lntmsg->msg_md->md_length <= RANAL_FMA_MAX_DATA &&
- lntmsg->msg_md->md_length <= *kranal_tunables.kra_max_immediate)
- break; /* send IMMEDIATE */
-
- tx = kranal_new_tx_msg(RANAL_MSG_GET_REQ);
- if (tx == NULL)
- return -ENOMEM;
-
- if ((lntmsg->msg_md->md_options & LNET_MD_KIOV) == 0)
- rc = kranal_setup_virt_buffer(tx, lntmsg->msg_md->md_niov,
- lntmsg->msg_md->md_iov.iov,
- 0, lntmsg->msg_md->md_length);
- else
- rc = kranal_setup_phys_buffer(tx, lntmsg->msg_md->md_niov,
- lntmsg->msg_md->md_iov.kiov,
- 0, lntmsg->msg_md->md_length);
- if (rc != 0) {
- kranal_tx_done(tx, rc);
- return -EIO;
- }
-
- tx->tx_lntmsg[1] = lnet_create_reply_msg(ni, lntmsg);
- if (tx->tx_lntmsg[1] == NULL) {
- CERROR("Can't create reply for GET to %s\n",
- libcfs_nid2str(target.nid));
- kranal_tx_done(tx, rc);
- return -EIO;
- }
-
- tx->tx_lntmsg[0] = lntmsg;
- tx->tx_msg.ram_u.get.ragm_hdr = *hdr;
- /* rest of tx_msg is setup just before it is sent */
- kranal_launch_tx(tx, target.nid);
- return 0;
-
- case LNET_MSG_REPLY:
- case LNET_MSG_PUT:
- if (kiov == NULL && /* not paged */
- nob <= RANAL_FMA_MAX_DATA && /* small enough */
- nob <= *kranal_tunables.kra_max_immediate)
- break; /* send IMMEDIATE */
-
- tx = kranal_new_tx_msg(RANAL_MSG_PUT_REQ);
- if (tx == NULL)
- return -ENOMEM;
-
- rc = kranal_setup_rdma_buffer(tx, niov, iov, kiov, offset, nob);
- if (rc != 0) {
- kranal_tx_done(tx, rc);
- return -EIO;
- }
-
- tx->tx_lntmsg[0] = lntmsg;
- tx->tx_msg.ram_u.putreq.raprm_hdr = *hdr;
- /* rest of tx_msg is setup just before it is sent */
- kranal_launch_tx(tx, target.nid);
- return 0;
- }
-
- /* send IMMEDIATE */
-
- LASSERT (kiov == NULL);
- LASSERT (nob <= RANAL_FMA_MAX_DATA);
-
- tx = kranal_new_tx_msg(RANAL_MSG_IMMEDIATE);
- if (tx == NULL)
- return -ENOMEM;
-
- rc = kranal_setup_immediate_buffer(tx, niov, iov, offset, nob);
- if (rc != 0) {
- kranal_tx_done(tx, rc);
- return -EIO;
- }
-
- tx->tx_msg.ram_u.immediate.raim_hdr = *hdr;
- tx->tx_lntmsg[0] = lntmsg;
- kranal_launch_tx(tx, target.nid);
- return 0;
-}
-
-void
-kranal_reply(lnet_ni_t *ni, kra_conn_t *conn, lnet_msg_t *lntmsg)
-{
- kra_msg_t *rxmsg = conn->rac_rxmsg;
- unsigned int niov = lntmsg->msg_niov;
- struct iovec *iov = lntmsg->msg_iov;
- lnet_kiov_t *kiov = lntmsg->msg_kiov;
- unsigned int offset = lntmsg->msg_offset;
- unsigned int nob = lntmsg->msg_len;
- kra_tx_t *tx;
- int rc;
-
- tx = kranal_get_idle_tx();
- if (tx == NULL)
- goto failed_0;
-
- rc = kranal_setup_rdma_buffer(tx, niov, iov, kiov, offset, nob);
- if (rc != 0)
- goto failed_1;
-
- tx->tx_conn = conn;
-
- rc = kranal_map_buffer(tx);
- if (rc != 0)
- goto failed_1;
-
- tx->tx_lntmsg[0] = lntmsg;
-
- kranal_rdma(tx, RANAL_MSG_GET_DONE,
- &rxmsg->ram_u.get.ragm_desc, nob,
- rxmsg->ram_u.get.ragm_cookie);
- return;
-
- failed_1:
- kranal_tx_done(tx, -EIO);
- failed_0:
- lnet_finalize(ni, lntmsg, -EIO);
-}
-
-int
-kranal_eager_recv (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg,
- void **new_private)
-{
- kra_conn_t *conn = (kra_conn_t *)private;
-
- LCONSOLE_ERROR_MSG(0x12b, "Dropping message from %s: no buffers free.\n",
- libcfs_nid2str(conn->rac_peer->rap_nid));
-
- return -EDEADLK;
-}
-
-int
-kranal_recv (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg,
- int delayed, unsigned int niov,
- struct iovec *iov, lnet_kiov_t *kiov,
- unsigned int offset, unsigned int mlen, unsigned int rlen)
-{
- kra_conn_t *conn = private;
- kra_msg_t *rxmsg = conn->rac_rxmsg;
- kra_tx_t *tx;
- void *buffer;
- int rc;
-
- LASSERT (mlen <= rlen);
- LASSERT (!in_interrupt());
- /* Either all pages or all vaddrs */
- LASSERT (!(kiov != NULL && iov != NULL));
-
- CDEBUG(D_NET, "conn %p, rxmsg %p, lntmsg %p\n", conn, rxmsg, lntmsg);
-
- switch(rxmsg->ram_type) {
- default:
- LBUG();
-
- case RANAL_MSG_IMMEDIATE:
- if (mlen == 0) {
- buffer = NULL;
- } else if (kiov != NULL) {
- CERROR("Can't recv immediate into paged buffer\n");
- return -EIO;
- } else {
- LASSERT (niov > 0);
- while (offset >= iov->iov_len) {
- offset -= iov->iov_len;
- iov++;
- niov--;
- LASSERT (niov > 0);
- }
- if (mlen > iov->iov_len - offset) {
- CERROR("Can't handle immediate frags\n");
- return -EIO;
- }
- buffer = ((char *)iov->iov_base) + offset;
- }
- rc = kranal_consume_rxmsg(conn, buffer, mlen);
- lnet_finalize(ni, lntmsg, (rc == 0) ? 0 : -EIO);
- return 0;
-
- case RANAL_MSG_PUT_REQ:
- tx = kranal_new_tx_msg(RANAL_MSG_PUT_ACK);
- if (tx == NULL) {
- kranal_consume_rxmsg(conn, NULL, 0);
- return -ENOMEM;
- }
-
- rc = kranal_setup_rdma_buffer(tx, niov, iov, kiov, offset, mlen);
- if (rc != 0) {
- kranal_tx_done(tx, rc);
- kranal_consume_rxmsg(conn, NULL, 0);
- return -EIO;
- }
-
- tx->tx_conn = conn;
- rc = kranal_map_buffer(tx);
- if (rc != 0) {
- kranal_tx_done(tx, rc);
- kranal_consume_rxmsg(conn, NULL, 0);
- return -EIO;
- }
-
- tx->tx_msg.ram_u.putack.rapam_src_cookie =
- conn->rac_rxmsg->ram_u.putreq.raprm_cookie;
- tx->tx_msg.ram_u.putack.rapam_dst_cookie = tx->tx_cookie;
- tx->tx_msg.ram_u.putack.rapam_desc.rard_key = tx->tx_map_key;
- tx->tx_msg.ram_u.putack.rapam_desc.rard_addr.AddressBits =
- (__u64)((unsigned long)tx->tx_buffer);
- tx->tx_msg.ram_u.putack.rapam_desc.rard_nob = mlen;
-
- tx->tx_lntmsg[0] = lntmsg; /* finalize this on RDMA_DONE */
-
- kranal_post_fma(conn, tx);
- kranal_consume_rxmsg(conn, NULL, 0);
- return 0;
-
- case RANAL_MSG_GET_REQ:
- if (lntmsg != NULL) {
- /* Matched! */
- kranal_reply(ni, conn, lntmsg);
- } else {
- /* No match */
- tx = kranal_new_tx_msg(RANAL_MSG_GET_NAK);
- if (tx != NULL) {
- tx->tx_msg.ram_u.completion.racm_cookie =
- rxmsg->ram_u.get.ragm_cookie;
- kranal_post_fma(conn, tx);
- }
- }
- kranal_consume_rxmsg(conn, NULL, 0);
- return 0;
- }
-}
-
-int
-kranal_thread_start(int(*fn)(void *arg), void *arg, char *name)
-{
- struct task_struct *task = cfs_thread_run(fn, arg, name);
-
-        if (IS_ERR(task))
-                return PTR_ERR(task);
-
-        atomic_inc(&kranal_data.kra_nthreads);
-        return 0;
-}
-
-void
-kranal_thread_fini (void)
-{
- atomic_dec(&kranal_data.kra_nthreads);
-}
-
-int
-kranal_check_conn_timeouts (kra_conn_t *conn)
-{
- kra_tx_t *tx;
- cfs_list_t *ttmp;
- unsigned long flags;
- long timeout;
- unsigned long now = jiffies;
-
- LASSERT (conn->rac_state == RANAL_CONN_ESTABLISHED ||
- conn->rac_state == RANAL_CONN_CLOSING);
-
- if (!conn->rac_close_sent &&
- cfs_time_aftereq(now, conn->rac_last_tx +
- msecs_to_jiffies(conn->rac_keepalive *
- MSEC_PER_SEC))) {
- /* not sent in a while; schedule conn so scheduler sends a keepalive */
- CDEBUG(D_NET, "Scheduling keepalive %p->%s\n",
- conn, libcfs_nid2str(conn->rac_peer->rap_nid));
- kranal_schedule_conn(conn);
- }
-
- timeout = msecs_to_jiffies(conn->rac_timeout * MSEC_PER_SEC);
-
- if (!conn->rac_close_recvd &&
- cfs_time_aftereq(now, conn->rac_last_rx + timeout)) {
- CERROR("%s received from %s within %lu seconds\n",
- (conn->rac_state == RANAL_CONN_ESTABLISHED) ?
- "Nothing" : "CLOSE not",
- libcfs_nid2str(conn->rac_peer->rap_nid),
- jiffies_to_msecs(now - conn->rac_last_rx)/MSEC_PER_SEC);
- return -ETIMEDOUT;
- }
-
- if (conn->rac_state != RANAL_CONN_ESTABLISHED)
- return 0;
-
- /* Check the conn's queues are moving. These are "belt+braces" checks,
- * in case of hardware/software errors that make this conn seem
- * responsive even though it isn't progressing its message queues. */
-
- spin_lock_irqsave(&conn->rac_lock, flags);
-
- cfs_list_for_each (ttmp, &conn->rac_fmaq) {
- tx = cfs_list_entry(ttmp, kra_tx_t, tx_list);
-
- if (cfs_time_aftereq(now, tx->tx_qtime + timeout)) {
- spin_unlock_irqrestore(&conn->rac_lock, flags);
- CERROR("tx on fmaq for %s blocked %lu seconds\n",
- libcfs_nid2str(conn->rac_peer->rap_nid),
- jiffies_to_msecs(now-tx->tx_qtime)/MSEC_PER_SEC);
- return -ETIMEDOUT;
- }
- }
-
- cfs_list_for_each (ttmp, &conn->rac_rdmaq) {
- tx = cfs_list_entry(ttmp, kra_tx_t, tx_list);
-
- if (cfs_time_aftereq(now, tx->tx_qtime + timeout)) {
- spin_unlock_irqrestore(&conn->rac_lock, flags);
- CERROR("tx on rdmaq for %s blocked %lu seconds\n",
- libcfs_nid2str(conn->rac_peer->rap_nid),
- jiffies_to_msecs(now-tx->tx_qtime)/MSEC_PER_SEC);
- return -ETIMEDOUT;
- }
- }
-
- cfs_list_for_each (ttmp, &conn->rac_replyq) {
- tx = cfs_list_entry(ttmp, kra_tx_t, tx_list);
-
- if (cfs_time_aftereq(now, tx->tx_qtime + timeout)) {
- spin_unlock_irqrestore(&conn->rac_lock, flags);
- CERROR("tx on replyq for %s blocked %lu seconds\n",
- libcfs_nid2str(conn->rac_peer->rap_nid),
- jiffies_to_msecs(now-tx->tx_qtime)/MSEC_PER_SEC);
- return -ETIMEDOUT;
- }
- }
-
- spin_unlock_irqrestore(&conn->rac_lock, flags);
- return 0;
-}
-
-void
-kranal_reaper_check (int idx, unsigned long *min_timeoutp)
-{
- cfs_list_t *conns = &kranal_data.kra_conns[idx];
- cfs_list_t *ctmp;
- kra_conn_t *conn;
- unsigned long flags;
- int rc;
-
- again:
- /* NB. We expect to check all the conns and not find any problems, so
- * we just use a shared lock while we take a look... */
- read_lock(&kranal_data.kra_global_lock);
-
- cfs_list_for_each (ctmp, conns) {
- conn = cfs_list_entry(ctmp, kra_conn_t, rac_hashlist);
-
-                if (conn->rac_timeout < *min_timeoutp)
-                        *min_timeoutp = conn->rac_timeout;
-                if (conn->rac_keepalive < *min_timeoutp)
-                        *min_timeoutp = conn->rac_keepalive;
-
- rc = kranal_check_conn_timeouts(conn);
- if (rc == 0)
- continue;
-
- kranal_conn_addref(conn);
- read_unlock(&kranal_data.kra_global_lock);
-
- CERROR("Conn to %s, cqid %d timed out\n",
- libcfs_nid2str(conn->rac_peer->rap_nid),
- conn->rac_cqid);
-
- write_lock_irqsave(&kranal_data.kra_global_lock, flags);
-
- switch (conn->rac_state) {
- default:
- LBUG();
-
- case RANAL_CONN_ESTABLISHED:
- kranal_close_conn_locked(conn, -ETIMEDOUT);
- break;
-
- case RANAL_CONN_CLOSING:
- kranal_terminate_conn_locked(conn);
- break;
- }
-
- write_unlock_irqrestore(&kranal_data.kra_global_lock,
- flags);
-
- kranal_conn_decref(conn);
-
- /* start again now I've dropped the lock */
- goto again;
- }
-
- read_unlock(&kranal_data.kra_global_lock);
-}
-
-int
-kranal_connd (void *arg)
-{
- long id = (long)arg;
- wait_queue_t wait;
- unsigned long flags;
- kra_peer_t *peer;
- kra_acceptsock_t *ras;
- int did_something;
-
- cfs_block_allsigs();
-
- init_waitqueue_entry_current(&wait);
-
- spin_lock_irqsave(&kranal_data.kra_connd_lock, flags);
-
- while (!kranal_data.kra_shutdown) {
- did_something = 0;
-
- if (!cfs_list_empty(&kranal_data.kra_connd_acceptq)) {
- ras = cfs_list_entry(kranal_data.kra_connd_acceptq.next,
- kra_acceptsock_t, ras_list);
- cfs_list_del(&ras->ras_list);
-
- spin_unlock_irqrestore(&kranal_data.kra_connd_lock,
- flags);
-
- CDEBUG(D_NET,"About to handshake someone\n");
-
- kranal_conn_handshake(ras->ras_sock, NULL);
- kranal_free_acceptsock(ras);
-
- CDEBUG(D_NET,"Finished handshaking someone\n");
-
- spin_lock_irqsave(&kranal_data.kra_connd_lock,
- flags);
- did_something = 1;
- }
-
- if (!cfs_list_empty(&kranal_data.kra_connd_peers)) {
- peer = cfs_list_entry(kranal_data.kra_connd_peers.next,
- kra_peer_t, rap_connd_list);
-
- cfs_list_del_init(&peer->rap_connd_list);
- spin_unlock_irqrestore(&kranal_data.kra_connd_lock,
- flags);
-
- kranal_connect(peer);
- kranal_peer_decref(peer);
-
- spin_lock_irqsave(&kranal_data.kra_connd_lock,
- flags);
- did_something = 1;
- }
-
- if (did_something)
- continue;
-
- set_current_state(TASK_INTERRUPTIBLE);
- add_wait_queue_exclusive(&kranal_data.kra_connd_waitq, &wait);
-
- spin_unlock_irqrestore(&kranal_data.kra_connd_lock, flags);
-
- waitq_wait(&wait, TASK_INTERRUPTIBLE);
-
- set_current_state(TASK_RUNNING);
- remove_wait_queue(&kranal_data.kra_connd_waitq, &wait);
-
- spin_lock_irqsave(&kranal_data.kra_connd_lock, flags);
- }
-
- spin_unlock_irqrestore(&kranal_data.kra_connd_lock, flags);
-
- kranal_thread_fini();
- return 0;
-}
-
-void
-kranal_update_reaper_timeout(long timeout)
-{
- unsigned long flags;
-
- LASSERT (timeout > 0);
-
- spin_lock_irqsave(&kranal_data.kra_reaper_lock, flags);
-
- if (timeout < kranal_data.kra_new_min_timeout)
- kranal_data.kra_new_min_timeout = timeout;
-
- spin_unlock_irqrestore(&kranal_data.kra_reaper_lock, flags);
-}
-
-int
-kranal_reaper (void *arg)
-{
- wait_queue_t wait;
- unsigned long flags;
- long timeout;
- int i;
- int conn_entries = kranal_data.kra_conn_hash_size;
- int conn_index = 0;
- int base_index = conn_entries - 1;
- unsigned long next_check_time = jiffies;
- long next_min_timeout = MAX_SCHEDULE_TIMEOUT;
- long current_min_timeout = 1;
-
- cfs_block_allsigs();
-
- init_waitqueue_entry_current(&wait);
-
- spin_lock_irqsave(&kranal_data.kra_reaper_lock, flags);
-
- while (!kranal_data.kra_shutdown) {
- /* I wake up every 'p' seconds to check for timeouts on some
- * more peers. I try to check every connection 'n' times
- * within the global minimum of all keepalive and timeout
- * intervals, to ensure I attend to every connection within
- * (n+1)/n times its timeout intervals. */
- const int p = 1;
- const int n = 3;
- unsigned long min_timeout;
- int chunk;
-
- /* careful with the jiffy wrap... */
- timeout = (long)(next_check_time - jiffies);
- if (timeout > 0) {
- set_current_state(TASK_INTERRUPTIBLE);
- add_wait_queue(&kranal_data.kra_reaper_waitq, &wait);
-
- spin_unlock_irqrestore(&kranal_data.kra_reaper_lock,
- flags);
-
- waitq_timedwait(&wait, TASK_INTERRUPTIBLE,
- timeout);
-
- spin_lock_irqsave(&kranal_data.kra_reaper_lock,
- flags);
-
- set_current_state(TASK_RUNNING);
- remove_wait_queue(&kranal_data.kra_reaper_waitq, &wait);
- continue;
- }
-
- if (kranal_data.kra_new_min_timeout !=
- MAX_SCHEDULE_TIMEOUT) {
- /* new min timeout set: restart min timeout scan */
- next_min_timeout = MAX_SCHEDULE_TIMEOUT;
- base_index = conn_index - 1;
- if (base_index < 0)
- base_index = conn_entries - 1;
-
- if (kranal_data.kra_new_min_timeout <
- current_min_timeout) {
- current_min_timeout =
- kranal_data.kra_new_min_timeout;
- CDEBUG(D_NET, "Set new min timeout %ld\n",
- current_min_timeout);
- }
-
- kranal_data.kra_new_min_timeout =
- MAX_SCHEDULE_TIMEOUT;
- }
- min_timeout = current_min_timeout;
-
- spin_unlock_irqrestore(&kranal_data.kra_reaper_lock, flags);
-
- LASSERT (min_timeout > 0);
-
-                /* Compute how many table entries to check now so I get round
-                 * the whole table fast enough given that I do this at fixed
-                 * intervals of 'p' seconds */
- chunk = conn_entries;
- if (min_timeout > n * p)
- chunk = (chunk * n * p) / min_timeout;
- if (chunk == 0)
- chunk = 1;
-
- for (i = 0; i < chunk; i++) {
- kranal_reaper_check(conn_index,
- &next_min_timeout);
- conn_index = (conn_index + 1) % conn_entries;
- }
-
- next_check_time += msecs_to_jiffies(p * MSEC_PER_SEC);
-
- spin_lock_irqsave(&kranal_data.kra_reaper_lock, flags);
-
-                if ((conn_index - chunk <= base_index &&
-                     base_index < conn_index) ||
-                    (conn_index - conn_entries - chunk <= base_index &&
-                     base_index < conn_index - conn_entries)) {
-
- /* Scanned all conns: set current_min_timeout... */
- if (current_min_timeout != next_min_timeout) {
- current_min_timeout = next_min_timeout;
- CDEBUG(D_NET, "Set new min timeout %ld\n",
- current_min_timeout);
- }
-
- /* ...and restart min timeout scan */
- next_min_timeout = MAX_SCHEDULE_TIMEOUT;
- base_index = conn_index - 1;
- if (base_index < 0)
- base_index = conn_entries - 1;
- }
- }
-
- kranal_thread_fini();
- return 0;
-}
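
The reaper above amortizes its sweep: every p seconds it checks just enough hash-table entries to cover the whole table n times per minimum timeout. A standalone sketch of that chunk arithmetic, with conn_entries and min_timeout chosen purely for illustration:

#include <stdio.h>

int main(void)
{
        const int p = 1;        /* reaper wake interval (seconds) */
        const int n = 3;        /* desired sweeps per timeout interval */
        int conn_entries = 256; /* assumed conn hash table size */
        long min_timeout = 30;  /* assumed global min timeout (seconds) */
        int chunk = conn_entries;

        /* Check enough entries per wakeup to cover the whole table
         * n times within min_timeout seconds */
        if (min_timeout > n * p)
                chunk = (chunk * n * p) / min_timeout;
        if (chunk == 0)
                chunk = 1;

        printf("check %d of %d entries every %d second(s)\n",
               chunk, conn_entries, p);
        return 0;
}
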
-
-void
-kranal_check_rdma_cq (kra_device_t *dev)
-{
- kra_conn_t *conn;
- kra_tx_t *tx;
- RAP_RETURN rrc;
- unsigned long flags;
- RAP_RDMA_DESCRIPTOR *desc;
- __u32 cqid;
- __u32 event_type;
-
- for (;;) {
- rrc = RapkCQDone(dev->rad_rdma_cqh, &cqid, &event_type);
- if (rrc == RAP_NOT_DONE) {
- CDEBUG(D_NET, "RDMA CQ %d empty\n", dev->rad_id);
- return;
- }
-
- LASSERT (rrc == RAP_SUCCESS);
- LASSERT ((event_type & RAPK_CQ_EVENT_OVERRUN) == 0);
-
- read_lock(&kranal_data.kra_global_lock);
-
- conn = kranal_cqid2conn_locked(cqid);
- if (conn == NULL) {
- /* Conn was destroyed? */
- CDEBUG(D_NET, "RDMA CQID lookup %d failed\n", cqid);
- read_unlock(&kranal_data.kra_global_lock);
- continue;
- }
-
- rrc = RapkRdmaDone(conn->rac_rihandle, &desc);
- LASSERT (rrc == RAP_SUCCESS);
-
- CDEBUG(D_NET, "Completed %p\n",
- cfs_list_entry(conn->rac_rdmaq.next, kra_tx_t, tx_list));
-
- spin_lock_irqsave(&conn->rac_lock, flags);
-
- LASSERT (!cfs_list_empty(&conn->rac_rdmaq));
- tx = cfs_list_entry(conn->rac_rdmaq.next, kra_tx_t, tx_list);
- cfs_list_del(&tx->tx_list);
-
- LASSERT(desc->AppPtr == (void *)tx);
- LASSERT(tx->tx_msg.ram_type == RANAL_MSG_PUT_DONE ||
- tx->tx_msg.ram_type == RANAL_MSG_GET_DONE);
-
- cfs_list_add_tail(&tx->tx_list, &conn->rac_fmaq);
- tx->tx_qtime = jiffies;
-
- spin_unlock_irqrestore(&conn->rac_lock, flags);
-
-                /* Get conn's fmaq processed, now that I've just put
-                 * something there */
- kranal_schedule_conn(conn);
-
- read_unlock(&kranal_data.kra_global_lock);
- }
-}
-
-void
-kranal_check_fma_cq (kra_device_t *dev)
-{
- kra_conn_t *conn;
- RAP_RETURN rrc;
- __u32 cqid;
- __u32 event_type;
- cfs_list_t *conns;
- cfs_list_t *tmp;
- int i;
-
- for (;;) {
- rrc = RapkCQDone(dev->rad_fma_cqh, &cqid, &event_type);
- if (rrc == RAP_NOT_DONE) {
- CDEBUG(D_NET, "FMA CQ %d empty\n", dev->rad_id);
- return;
- }
-
- LASSERT (rrc == RAP_SUCCESS);
-
- if ((event_type & RAPK_CQ_EVENT_OVERRUN) == 0) {
-
- read_lock(&kranal_data.kra_global_lock);
-
- conn = kranal_cqid2conn_locked(cqid);
- if (conn == NULL) {
- CDEBUG(D_NET, "FMA CQID lookup %d failed\n",
- cqid);
- } else {
- CDEBUG(D_NET, "FMA completed: %p CQID %d\n",
- conn, cqid);
- kranal_schedule_conn(conn);
- }
-
- read_unlock(&kranal_data.kra_global_lock);
- continue;
- }
-
- /* FMA CQ has overflowed: check ALL conns */
- CWARN("FMA CQ overflow: scheduling ALL conns on device %d\n",
- dev->rad_id);
-
- for (i = 0; i < kranal_data.kra_conn_hash_size; i++) {
-
- read_lock(&kranal_data.kra_global_lock);
-
- conns = &kranal_data.kra_conns[i];
-
- cfs_list_for_each (tmp, conns) {
- conn = cfs_list_entry(tmp, kra_conn_t,
- rac_hashlist);
-
- if (conn->rac_device == dev)
- kranal_schedule_conn(conn);
- }
-
- /* don't block write lockers for too long... */
- read_unlock(&kranal_data.kra_global_lock);
- }
- }
-}
-
-int
-kranal_sendmsg(kra_conn_t *conn, kra_msg_t *msg,
- void *immediate, int immediatenob)
-{
- int sync = (msg->ram_type & RANAL_MSG_FENCE) != 0;
- RAP_RETURN rrc;
-
- CDEBUG(D_NET,"%p sending msg %p %02x%s [%p for %d]\n",
- conn, msg, msg->ram_type, sync ? "(sync)" : "",
- immediate, immediatenob);
-
- LASSERT (sizeof(*msg) <= RANAL_FMA_MAX_PREFIX);
- LASSERT ((msg->ram_type == RANAL_MSG_IMMEDIATE) ?
- immediatenob <= RANAL_FMA_MAX_DATA :
- immediatenob == 0);
-
- msg->ram_connstamp = conn->rac_my_connstamp;
- msg->ram_seq = conn->rac_tx_seq;
-
- if (sync)
- rrc = RapkFmaSyncSend(conn->rac_rihandle,
- immediate, immediatenob,
- msg, sizeof(*msg));
- else
- rrc = RapkFmaSend(conn->rac_rihandle,
- immediate, immediatenob,
- msg, sizeof(*msg));
-
- switch (rrc) {
- default:
- LBUG();
-
- case RAP_SUCCESS:
- conn->rac_last_tx = jiffies;
- conn->rac_tx_seq++;
- return 0;
-
- case RAP_NOT_DONE:
- if (cfs_time_aftereq(jiffies,
- conn->rac_last_tx +
- msecs_to_jiffies(conn->rac_keepalive *
- MSEC_PER_SEC)))
- CWARN("EAGAIN sending %02x (idle %lu secs)\n",
- msg->ram_type,
- jiffies_to_msecs(jiffies - conn->rac_last_tx) /
- MSEC_PER_SEC);
- return -EAGAIN;
- }
-}
-
-void
-kranal_process_fmaq (kra_conn_t *conn)
-{
- unsigned long flags;
- int more_to_do;
- kra_tx_t *tx;
- int rc;
- int expect_reply;
-
- /* NB 1. kranal_sendmsg() may fail if I'm out of credits right now.
- * However I will be rescheduled by an FMA completion event
- * when I eventually get some.
- * NB 2. Sampling rac_state here races with setting it elsewhere.
- * But it doesn't matter if I try to send a "real" message just
- * as I start closing because I'll get scheduled to send the
- * close anyway. */
-
- /* Not racing with incoming message processing! */
- LASSERT (current == conn->rac_device->rad_scheduler);
-
- if (conn->rac_state != RANAL_CONN_ESTABLISHED) {
- if (!cfs_list_empty(&conn->rac_rdmaq)) {
- /* RDMAs in progress */
- LASSERT (!conn->rac_close_sent);
-
- if (cfs_time_aftereq(jiffies,
- conn->rac_last_tx +
- msecs_to_jiffies(conn->rac_keepalive *
- MSEC_PER_SEC))) {
- CDEBUG(D_NET, "sending NOOP (rdma in progress)\n");
- kranal_init_msg(&conn->rac_msg, RANAL_MSG_NOOP);
- kranal_sendmsg(conn, &conn->rac_msg, NULL, 0);
- }
- return;
- }
-
- if (conn->rac_close_sent)
- return;
-
- CWARN("sending CLOSE to %s\n",
- libcfs_nid2str(conn->rac_peer->rap_nid));
- kranal_init_msg(&conn->rac_msg, RANAL_MSG_CLOSE);
- rc = kranal_sendmsg(conn, &conn->rac_msg, NULL, 0);
- if (rc != 0)
- return;
-
- conn->rac_close_sent = 1;
- if (!conn->rac_close_recvd)
- return;
-
- write_lock_irqsave(&kranal_data.kra_global_lock, flags);
-
- if (conn->rac_state == RANAL_CONN_CLOSING)
- kranal_terminate_conn_locked(conn);
-
- write_unlock_irqrestore(&kranal_data.kra_global_lock,
- flags);
- return;
- }
-
- spin_lock_irqsave(&conn->rac_lock, flags);
-
- if (cfs_list_empty(&conn->rac_fmaq)) {
-
- spin_unlock_irqrestore(&conn->rac_lock, flags);
-
- if (cfs_time_aftereq(jiffies,
- conn->rac_last_tx +
- msecs_to_jiffies(conn->rac_keepalive *
- MSEC_PER_SEC))) {
- CDEBUG(D_NET, "sending NOOP -> %s (%p idle %lu(%ld))\n",
- libcfs_nid2str(conn->rac_peer->rap_nid), conn,
- jiffies_to_msecs(jiffies - conn->rac_last_tx) /
- MSEC_PER_SEC,
- conn->rac_keepalive);
- kranal_init_msg(&conn->rac_msg, RANAL_MSG_NOOP);
- kranal_sendmsg(conn, &conn->rac_msg, NULL, 0);
- }
- return;
- }
-
- tx = cfs_list_entry(conn->rac_fmaq.next, kra_tx_t, tx_list);
- cfs_list_del(&tx->tx_list);
- more_to_do = !cfs_list_empty(&conn->rac_fmaq);
-
- spin_unlock_irqrestore(&conn->rac_lock, flags);
-
- expect_reply = 0;
- CDEBUG(D_NET, "sending regular msg: %p, type %02x, cookie "LPX64"\n",
- tx, tx->tx_msg.ram_type, tx->tx_cookie);
- switch (tx->tx_msg.ram_type) {
- default:
- LBUG();
-
- case RANAL_MSG_IMMEDIATE:
- rc = kranal_sendmsg(conn, &tx->tx_msg,
- tx->tx_buffer, tx->tx_nob);
- break;
-
- case RANAL_MSG_PUT_NAK:
- case RANAL_MSG_PUT_DONE:
- case RANAL_MSG_GET_NAK:
- case RANAL_MSG_GET_DONE:
- rc = kranal_sendmsg(conn, &tx->tx_msg, NULL, 0);
- break;
-
- case RANAL_MSG_PUT_REQ:
- rc = kranal_map_buffer(tx);
- LASSERT (rc != -EAGAIN);
- if (rc != 0)
- break;
-
- tx->tx_msg.ram_u.putreq.raprm_cookie = tx->tx_cookie;
- rc = kranal_sendmsg(conn, &tx->tx_msg, NULL, 0);
- expect_reply = 1;
- break;
-
- case RANAL_MSG_PUT_ACK:
- rc = kranal_sendmsg(conn, &tx->tx_msg, NULL, 0);
- expect_reply = 1;
- break;
-
- case RANAL_MSG_GET_REQ:
- rc = kranal_map_buffer(tx);
- LASSERT (rc != -EAGAIN);
- if (rc != 0)
- break;
-
- tx->tx_msg.ram_u.get.ragm_cookie = tx->tx_cookie;
- tx->tx_msg.ram_u.get.ragm_desc.rard_key = tx->tx_map_key;
- tx->tx_msg.ram_u.get.ragm_desc.rard_addr.AddressBits =
- (__u64)((unsigned long)tx->tx_buffer);
- tx->tx_msg.ram_u.get.ragm_desc.rard_nob = tx->tx_nob;
- rc = kranal_sendmsg(conn, &tx->tx_msg, NULL, 0);
- expect_reply = 1;
- break;
- }
-
- if (rc == -EAGAIN) {
- /* I need credits to send this. Replace tx at the head of the
- * fmaq and I'll get rescheduled when credits appear */
- CDEBUG(D_NET, "EAGAIN on %p\n", conn);
- spin_lock_irqsave(&conn->rac_lock, flags);
- cfs_list_add(&tx->tx_list, &conn->rac_fmaq);
- spin_unlock_irqrestore(&conn->rac_lock, flags);
- return;
- }
-
- if (!expect_reply || rc != 0) {
- kranal_tx_done(tx, rc);
- } else {
- /* LASSERT(current) above ensures this doesn't race with reply
- * processing */
- spin_lock_irqsave(&conn->rac_lock, flags);
- cfs_list_add_tail(&tx->tx_list, &conn->rac_replyq);
- tx->tx_qtime = jiffies;
- spin_unlock_irqrestore(&conn->rac_lock, flags);
- }
-
- if (more_to_do) {
- CDEBUG(D_NET, "Rescheduling %p (more to do)\n", conn);
- kranal_schedule_conn(conn);
- }
-}
-
-static inline void
-kranal_swab_rdma_desc (kra_rdma_desc_t *d)
-{
- __swab64s(&d->rard_key.Key);
- __swab16s(&d->rard_key.Cookie);
- __swab16s(&d->rard_key.MdHandle);
- __swab32s(&d->rard_key.Flags);
- __swab64s(&d->rard_addr.AddressBits);
- __swab32s(&d->rard_nob);
-}
-
-kra_tx_t *
-kranal_match_reply(kra_conn_t *conn, int type, __u64 cookie)
-{
- cfs_list_t *ttmp;
- kra_tx_t *tx;
- unsigned long flags;
-
- spin_lock_irqsave(&conn->rac_lock, flags);
-
- cfs_list_for_each(ttmp, &conn->rac_replyq) {
- tx = cfs_list_entry(ttmp, kra_tx_t, tx_list);
-
- CDEBUG(D_NET,"Checking %p %02x/"LPX64"\n",
- tx, tx->tx_msg.ram_type, tx->tx_cookie);
-
- if (tx->tx_cookie != cookie)
- continue;
-
- if (tx->tx_msg.ram_type != type) {
- spin_unlock_irqrestore(&conn->rac_lock, flags);
- CWARN("Unexpected type %x (%x expected) "
- "matched reply from %s\n",
- tx->tx_msg.ram_type, type,
- libcfs_nid2str(conn->rac_peer->rap_nid));
- return NULL;
- }
-
- cfs_list_del(&tx->tx_list);
- spin_unlock_irqrestore(&conn->rac_lock, flags);
- return tx;
- }
-
- spin_unlock_irqrestore(&conn->rac_lock, flags);
- CWARN("Unmatched reply %02x/"LPX64" from %s\n",
- type, cookie, libcfs_nid2str(conn->rac_peer->rap_nid));
- return NULL;
-}
-
-void
-kranal_check_fma_rx (kra_conn_t *conn)
-{
- unsigned long flags;
- __u32 seq;
- kra_tx_t *tx;
- kra_msg_t *msg;
- void *prefix;
- RAP_RETURN rrc = RapkFmaGetPrefix(conn->rac_rihandle, &prefix);
- kra_peer_t *peer = conn->rac_peer;
- int rc = 0;
- int repost = 1;
-
- if (rrc == RAP_NOT_DONE)
- return;
-
- CDEBUG(D_NET, "RX on %p\n", conn);
-
- LASSERT (rrc == RAP_SUCCESS);
- conn->rac_last_rx = jiffies;
- seq = conn->rac_rx_seq++;
- msg = (kra_msg_t *)prefix;
-
-        /* stash message for portals callbacks; they'll NULL
-         * rac_rxmsg if they consume it */
- LASSERT (conn->rac_rxmsg == NULL);
- conn->rac_rxmsg = msg;
-
- if (msg->ram_magic != RANAL_MSG_MAGIC) {
- if (__swab32(msg->ram_magic) != RANAL_MSG_MAGIC) {
- CERROR("Unexpected magic %08x from %s\n",
- msg->ram_magic, libcfs_nid2str(peer->rap_nid));
- rc = -EPROTO;
- goto out;
- }
-
- __swab32s(&msg->ram_magic);
- __swab16s(&msg->ram_version);
- __swab16s(&msg->ram_type);
- __swab64s(&msg->ram_srcnid);
- __swab64s(&msg->ram_connstamp);
- __swab32s(&msg->ram_seq);
-
- /* NB message type checked below; NOT here... */
- switch (msg->ram_type) {
- case RANAL_MSG_PUT_ACK:
- kranal_swab_rdma_desc(&msg->ram_u.putack.rapam_desc);
- break;
-
- case RANAL_MSG_GET_REQ:
- kranal_swab_rdma_desc(&msg->ram_u.get.ragm_desc);
- break;
-
- default:
- break;
- }
- }
-
- if (msg->ram_version != RANAL_MSG_VERSION) {
- CERROR("Unexpected protocol version %d from %s\n",
- msg->ram_version, libcfs_nid2str(peer->rap_nid));
- rc = -EPROTO;
- goto out;
- }
-
- if (msg->ram_srcnid != peer->rap_nid) {
- CERROR("Unexpected peer %s from %s\n",
- libcfs_nid2str(msg->ram_srcnid),
- libcfs_nid2str(peer->rap_nid));
- rc = -EPROTO;
- goto out;
- }
-
- if (msg->ram_connstamp != conn->rac_peer_connstamp) {
- CERROR("Unexpected connstamp "LPX64"("LPX64
- " expected) from %s\n",
- msg->ram_connstamp, conn->rac_peer_connstamp,
- libcfs_nid2str(peer->rap_nid));
- rc = -EPROTO;
- goto out;
- }
-
- if (msg->ram_seq != seq) {
- CERROR("Unexpected sequence number %d(%d expected) from %s\n",
- msg->ram_seq, seq, libcfs_nid2str(peer->rap_nid));
- rc = -EPROTO;
- goto out;
- }
-
- if ((msg->ram_type & RANAL_MSG_FENCE) != 0) {
- /* This message signals RDMA completion... */
- rrc = RapkFmaSyncWait(conn->rac_rihandle);
- if (rrc != RAP_SUCCESS) {
- CERROR("RapkFmaSyncWait failed: %d\n", rrc);
- rc = -ENETDOWN;
- goto out;
- }
- }
-
- if (conn->rac_close_recvd) {
- CERROR("Unexpected message %d after CLOSE from %s\n",
- msg->ram_type, libcfs_nid2str(conn->rac_peer->rap_nid));
- rc = -EPROTO;
- goto out;
- }
-
- if (msg->ram_type == RANAL_MSG_CLOSE) {
- CWARN("RX CLOSE from %s\n", libcfs_nid2str(conn->rac_peer->rap_nid));
- conn->rac_close_recvd = 1;
- write_lock_irqsave(&kranal_data.kra_global_lock, flags);
-
- if (conn->rac_state == RANAL_CONN_ESTABLISHED)
- kranal_close_conn_locked(conn, 0);
- else if (conn->rac_state == RANAL_CONN_CLOSING &&
- conn->rac_close_sent)
- kranal_terminate_conn_locked(conn);
-
- write_unlock_irqrestore(&kranal_data.kra_global_lock,
- flags);
- goto out;
- }
-
- if (conn->rac_state != RANAL_CONN_ESTABLISHED)
- goto out;
-
- switch (msg->ram_type) {
- case RANAL_MSG_NOOP:
- /* Nothing to do; just a keepalive */
- CDEBUG(D_NET, "RX NOOP on %p\n", conn);
- break;
-
- case RANAL_MSG_IMMEDIATE:
- CDEBUG(D_NET, "RX IMMEDIATE on %p\n", conn);
- rc = lnet_parse(kranal_data.kra_ni, &msg->ram_u.immediate.raim_hdr,
- msg->ram_srcnid, conn, 0);
- repost = rc < 0;
- break;
-
- case RANAL_MSG_PUT_REQ:
- CDEBUG(D_NET, "RX PUT_REQ on %p\n", conn);
- rc = lnet_parse(kranal_data.kra_ni, &msg->ram_u.putreq.raprm_hdr,
- msg->ram_srcnid, conn, 1);
- repost = rc < 0;
- break;
-
- case RANAL_MSG_PUT_NAK:
- CDEBUG(D_NET, "RX PUT_NAK on %p\n", conn);
- tx = kranal_match_reply(conn, RANAL_MSG_PUT_REQ,
- msg->ram_u.completion.racm_cookie);
- if (tx == NULL)
- break;
-
- LASSERT (tx->tx_buftype == RANAL_BUF_PHYS_MAPPED ||
- tx->tx_buftype == RANAL_BUF_VIRT_MAPPED);
- kranal_tx_done(tx, -ENOENT); /* no match */
- break;
-
- case RANAL_MSG_PUT_ACK:
- CDEBUG(D_NET, "RX PUT_ACK on %p\n", conn);
- tx = kranal_match_reply(conn, RANAL_MSG_PUT_REQ,
- msg->ram_u.putack.rapam_src_cookie);
- if (tx == NULL)
- break;
-
- kranal_rdma(tx, RANAL_MSG_PUT_DONE,
- &msg->ram_u.putack.rapam_desc,
- msg->ram_u.putack.rapam_desc.rard_nob,
- msg->ram_u.putack.rapam_dst_cookie);
- break;
-
- case RANAL_MSG_PUT_DONE:
- CDEBUG(D_NET, "RX PUT_DONE on %p\n", conn);
- tx = kranal_match_reply(conn, RANAL_MSG_PUT_ACK,
- msg->ram_u.completion.racm_cookie);
- if (tx == NULL)
- break;
-
- LASSERT (tx->tx_buftype == RANAL_BUF_PHYS_MAPPED ||
- tx->tx_buftype == RANAL_BUF_VIRT_MAPPED);
- kranal_tx_done(tx, 0);
- break;
-
- case RANAL_MSG_GET_REQ:
- CDEBUG(D_NET, "RX GET_REQ on %p\n", conn);
- rc = lnet_parse(kranal_data.kra_ni, &msg->ram_u.get.ragm_hdr,
- msg->ram_srcnid, conn, 1);
- repost = rc < 0;
- break;
-
- case RANAL_MSG_GET_NAK:
- CDEBUG(D_NET, "RX GET_NAK on %p\n", conn);
- tx = kranal_match_reply(conn, RANAL_MSG_GET_REQ,
- msg->ram_u.completion.racm_cookie);
- if (tx == NULL)
- break;
-
- LASSERT (tx->tx_buftype == RANAL_BUF_PHYS_MAPPED ||
- tx->tx_buftype == RANAL_BUF_VIRT_MAPPED);
- kranal_tx_done(tx, -ENOENT); /* no match */
- break;
-
- case RANAL_MSG_GET_DONE:
- CDEBUG(D_NET, "RX GET_DONE on %p\n", conn);
- tx = kranal_match_reply(conn, RANAL_MSG_GET_REQ,
- msg->ram_u.completion.racm_cookie);
- if (tx == NULL)
- break;
-
- LASSERT (tx->tx_buftype == RANAL_BUF_PHYS_MAPPED ||
- tx->tx_buftype == RANAL_BUF_VIRT_MAPPED);
-#if 0
- /* completion message should send rdma length if we ever allow
- * GET truncation */
- lnet_set_reply_msg_len(kranal_data.kra_ni, tx->tx_lntmsg[1], ???);
-#endif
- kranal_tx_done(tx, 0);
- break;
- }
-
- out:
- if (rc < 0) /* protocol/comms error */
- kranal_close_conn (conn, rc);
-
- if (repost && conn->rac_rxmsg != NULL)
- kranal_consume_rxmsg(conn, NULL, 0);
-
- /* check again later */
- kranal_schedule_conn(conn);
-}
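
The ram_magic test above is the usual trick for detecting a peer of the opposite endianness: if the magic matches only after byte-swapping, every multi-byte field in the message must be swabbed before use. A minimal sketch of the check (the MAGIC value below is illustrative, not the actual wire constant):

#include <stdint.h>
#include <stdio.h>

#define MAGIC 0x0be91b92u       /* assumed wire magic, for illustration */

static uint32_t swab32(uint32_t v)
{
        return ((v & 0x000000ffu) << 24) | ((v & 0x0000ff00u) << 8) |
               ((v & 0x00ff0000u) >> 8)  | ((v & 0xff000000u) >> 24);
}

int main(void)
{
        uint32_t wire = swab32(MAGIC);  /* pretend the peer is other-endian */

        if (wire == MAGIC)
                printf("same endianness: no swab needed\n");
        else if (swab32(wire) == MAGIC)
                printf("peer is other-endian: swab every field\n");
        else
                printf("not a protocol message: reject with -EPROTO\n");
        return 0;
}
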
-
-void
-kranal_complete_closed_conn (kra_conn_t *conn)
-{
- kra_tx_t *tx;
- int nfma;
- int nreplies;
-
- LASSERT (conn->rac_state == RANAL_CONN_CLOSED);
- LASSERT (cfs_list_empty(&conn->rac_list));
- LASSERT (cfs_list_empty(&conn->rac_hashlist));
-
- for (nfma = 0; !cfs_list_empty(&conn->rac_fmaq); nfma++) {
- tx = cfs_list_entry(conn->rac_fmaq.next, kra_tx_t, tx_list);
-
- cfs_list_del(&tx->tx_list);
- kranal_tx_done(tx, -ECONNABORTED);
- }
-
- LASSERT (cfs_list_empty(&conn->rac_rdmaq));
-
- for (nreplies = 0; !cfs_list_empty(&conn->rac_replyq); nreplies++) {
- tx = cfs_list_entry(conn->rac_replyq.next, kra_tx_t, tx_list);
-
- cfs_list_del(&tx->tx_list);
- kranal_tx_done(tx, -ECONNABORTED);
- }
-
- CWARN("Closed conn %p -> %s: nmsg %d nreplies %d\n",
- conn, libcfs_nid2str(conn->rac_peer->rap_nid), nfma, nreplies);
-}
-
-int kranal_process_new_conn (kra_conn_t *conn)
-{
- RAP_RETURN rrc;
-
- rrc = RapkCompleteSync(conn->rac_rihandle, 1);
- if (rrc == RAP_SUCCESS)
- return 0;
-
- LASSERT (rrc == RAP_NOT_DONE);
- if (!cfs_time_aftereq(jiffies, conn->rac_last_tx +
- msecs_to_jiffies(conn->rac_timeout*MSEC_PER_SEC)))
- return -EAGAIN;
-
- /* Too late */
- rrc = RapkCompleteSync(conn->rac_rihandle, 0);
- LASSERT (rrc == RAP_SUCCESS);
- return -ETIMEDOUT;
-}
-
-int
-kranal_scheduler (void *arg)
-{
- kra_device_t *dev = (kra_device_t *)arg;
- wait_queue_t wait;
- kra_conn_t *conn;
- unsigned long flags;
- unsigned long deadline;
- unsigned long soonest;
- int nsoonest;
- long timeout;
- cfs_list_t *tmp;
- cfs_list_t *nxt;
- int rc;
- int dropped_lock;
- int busy_loops = 0;
-
- cfs_block_allsigs();
-
- dev->rad_scheduler = current;
- init_waitqueue_entry_current(&wait);
-
- spin_lock_irqsave(&dev->rad_lock, flags);
-
- while (!kranal_data.kra_shutdown) {
- /* Safe: kra_shutdown only set when quiescent */
-
- if (busy_loops++ >= RANAL_RESCHED) {
- spin_unlock_irqrestore(&dev->rad_lock, flags);
-
- cond_resched();
- busy_loops = 0;
-
- spin_lock_irqsave(&dev->rad_lock, flags);
- }
-
- dropped_lock = 0;
-
- if (dev->rad_ready) {
- /* Device callback fired since I last checked it */
- dev->rad_ready = 0;
- spin_unlock_irqrestore(&dev->rad_lock, flags);
- dropped_lock = 1;
-
- kranal_check_rdma_cq(dev);
- kranal_check_fma_cq(dev);
-
- spin_lock_irqsave(&dev->rad_lock, flags);
- }
-
- cfs_list_for_each_safe(tmp, nxt, &dev->rad_ready_conns) {
- conn = cfs_list_entry(tmp, kra_conn_t, rac_schedlist);
-
- cfs_list_del_init(&conn->rac_schedlist);
- LASSERT (conn->rac_scheduled);
- conn->rac_scheduled = 0;
- spin_unlock_irqrestore(&dev->rad_lock, flags);
- dropped_lock = 1;
-
- kranal_check_fma_rx(conn);
- kranal_process_fmaq(conn);
-
- if (conn->rac_state == RANAL_CONN_CLOSED)
- kranal_complete_closed_conn(conn);
-
- kranal_conn_decref(conn);
- spin_lock_irqsave(&dev->rad_lock, flags);
- }
-
- nsoonest = 0;
- soonest = jiffies;
-
- cfs_list_for_each_safe(tmp, nxt, &dev->rad_new_conns) {
- conn = cfs_list_entry(tmp, kra_conn_t, rac_schedlist);
-
- deadline = conn->rac_last_tx + conn->rac_keepalive;
- if (cfs_time_aftereq(jiffies, deadline)) {
- /* Time to process this new conn */
- spin_unlock_irqrestore(&dev->rad_lock,
- flags);
- dropped_lock = 1;
-
- rc = kranal_process_new_conn(conn);
- if (rc != -EAGAIN) {
- /* All done with this conn */
- spin_lock_irqsave(&dev->rad_lock,
- flags);
- cfs_list_del_init(&conn->rac_schedlist);
-                                        spin_unlock_irqrestore(&dev->rad_lock,
-                                                               flags);
-
- kranal_conn_decref(conn);
- spin_lock_irqsave(&dev->rad_lock,
- flags);
- continue;
- }
-
- /* retry with exponential backoff until HZ */
- if (conn->rac_keepalive == 0)
- conn->rac_keepalive = 1;
- else if (conn->rac_keepalive <=
- msecs_to_jiffies(MSEC_PER_SEC))
- conn->rac_keepalive *= 2;
- else
- conn->rac_keepalive +=
- msecs_to_jiffies(MSEC_PER_SEC);
-
- deadline = conn->rac_last_tx + conn->rac_keepalive;
- spin_lock_irqsave(&dev->rad_lock, flags);
- }
-
- /* Does this conn need attention soonest? */
- if (nsoonest++ == 0 ||
- !cfs_time_aftereq(deadline, soonest))
- soonest = deadline;
- }
-
- if (dropped_lock) /* may sleep iff I didn't drop the lock */
- continue;
-
- set_current_state(TASK_INTERRUPTIBLE);
- add_wait_queue_exclusive(&dev->rad_waitq, &wait);
- spin_unlock_irqrestore(&dev->rad_lock, flags);
-
- if (nsoonest == 0) {
- busy_loops = 0;
- waitq_wait(&wait, TASK_INTERRUPTIBLE);
- } else {
- timeout = (long)(soonest - jiffies);
- if (timeout > 0) {
- busy_loops = 0;
- waitq_timedwait(&wait,
- TASK_INTERRUPTIBLE,
- timeout);
- }
- }
-
- remove_wait_queue(&dev->rad_waitq, &wait);
- set_current_state(TASK_RUNNING);
- spin_lock_irqsave(&dev->rad_lock, flags);
- }
-
- spin_unlock_irqrestore(&dev->rad_lock, flags);
-
- dev->rad_scheduler = NULL;
- kranal_thread_fini();
- return 0;
-}
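
The new-conn retry path above doubles rac_keepalive until it reaches a second's worth of jiffies, then grows it linearly. A small sketch of that backoff schedule, assuming HZ is 1000 for illustration:

#include <stdio.h>

#define HZ 1000 /* assumed jiffies per second, for illustration */

static long next_keepalive(long ka)
{
        if (ka == 0)
                return 1;
        if (ka <= HZ)
                return ka * 2;  /* exponential phase */
        return ka + HZ;         /* linear phase past one second */
}

int main(void)
{
        long ka = 0;
        int i;

        for (i = 0; i < 15; i++) {
                ka = next_keepalive(ka);
                printf("retry %d: wait %ld jiffies\n", i, ka);
        }
        return 0;
}
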
+++ /dev/null
-/*
- * GPL HEADER START
- *
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 only,
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License version 2 for more details (a copy is included
- * in the LICENSE file that accompanied this code).
- *
- * You should have received a copy of the GNU General Public License
- * version 2 along with this program; If not, see
- * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
- *
- * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
- * CA 95054 USA or visit www.sun.com if you need additional information or
- * have any questions.
- *
- * GPL HEADER END
- */
-/*
- * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
- * Use is subject to license terms.
- */
-/*
- * This file is part of Lustre, http://www.lustre.org/
- * Lustre is a trademark of Sun Microsystems, Inc.
- *
- * lnet/klnds/ralnd/ralnd_modparams.c
- *
- * Author: Eric Barton <eric@bartonsoftware.com>
- */
-
-#include "ralnd.h"
-
-static int n_connd = 4;
-CFS_MODULE_PARM(n_connd, "i", int, 0444,
- "# of connection daemons");
-
-static int min_reconnect_interval = 1;
-CFS_MODULE_PARM(min_reconnect_interval, "i", int, 0644,
- "minimum connection retry interval (seconds)");
-
-static int max_reconnect_interval = 60;
-CFS_MODULE_PARM(max_reconnect_interval, "i", int, 0644,
- "maximum connection retry interval (seconds)");
-
-static int ntx = 256;
-CFS_MODULE_PARM(ntx, "i", int, 0444,
- "# of transmit descriptors");
-
-static int credits = 128;
-CFS_MODULE_PARM(credits, "i", int, 0444,
- "# concurrent sends");
-
-static int peer_credits = 32;
-CFS_MODULE_PARM(peer_credits, "i", int, 0444,
- "# concurrent sends to 1 peer");
-
-static int fma_cq_size = 8192;
-CFS_MODULE_PARM(fma_cq_size, "i", int, 0444,
- "size of the completion queue");
-
-static int timeout = 30;
-CFS_MODULE_PARM(timeout, "i", int, 0644,
- "communications timeout (seconds)");
-
-static int max_immediate = (2<<10);
-CFS_MODULE_PARM(max_immediate, "i", int, 0644,
- "immediate/RDMA breakpoint");
-
-kra_tunables_t kranal_tunables = {
- .kra_n_connd = &n_connd,
- .kra_min_reconnect_interval = &min_reconnect_interval,
- .kra_max_reconnect_interval = &max_reconnect_interval,
- .kra_ntx = &ntx,
- .kra_credits = &credits,
- .kra_peercredits = &peer_credits,
- .kra_fma_cq_size = &fma_cq_size,
- .kra_timeout = &timeout,
- .kra_max_immediate = &max_immediate,
-};
-
-#if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM
-
-static struct ctl_table kranal_ctl_table[] = {
- {
- INIT_CTL_NAME
- .procname = "n_connd",
- .data = &n_connd,
- .maxlen = sizeof(int),
- .mode = 0444,
- .proc_handler = &proc_dointvec
- },
- {
- INIT_CTL_NAME
- .procname = "min_reconnect_interval",
- .data = &min_reconnect_interval,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec
- },
- {
- INIT_CTL_NAME
- .procname = "max_reconnect_interval",
- .data = &max_reconnect_interval,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec
- },
- {
- INIT_CTL_NAME
- .procname = "ntx",
- .data = &ntx,
- .maxlen = sizeof(int),
- .mode = 0444,
- .proc_handler = &proc_dointvec
- },
- {
- INIT_CTL_NAME
- .procname = "credits",
- .data = &credits,
- .maxlen = sizeof(int),
- .mode = 0444,
- .proc_handler = &proc_dointvec
- },
- {
- INIT_CTL_NAME
- .procname = "peer_credits",
- .data = &peer_credits,
- .maxlen = sizeof(int),
- .mode = 0444,
- .proc_handler = &proc_dointvec
- },
- {
- INIT_CTL_NAME
- .procname = "fma_cq_size",
- .data = &fma_cq_size,
- .maxlen = sizeof(int),
- .mode = 0444,
- .proc_handler = &proc_dointvec
- },
- {
- INIT_CTL_NAME
- .procname = "timeout",
- .data = &timeout,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec
- },
- {
- INIT_CTL_NAME
- .procname = "max_immediate",
- .data = &max_immediate,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec
- },
- { 0 }
-};
-
-static struct ctl_table kranal_top_ctl_table[] = {
- {
- INIT_CTL_NAME
- .procname = "ranal",
- .data = NULL,
- .maxlen = 0,
- .mode = 0555,
- .child = kranal_ctl_table
- },
- { 0 }
-};
-
-int
-kranal_tunables_init ()
-{
- kranal_tunables.kra_sysctl =
- register_sysctl_table(kranal_top_ctl_table);
-
- if (kranal_tunables.kra_sysctl == NULL)
- CWARN("Can't setup /proc tunables\n");
-
- return 0;
-}
-
-void kranal_tunables_fini()
-{
- if (kranal_tunables.kra_sysctl != NULL)
- unregister_sysctl_table(kranal_tunables.kra_sysctl);
-}
-
-#else
-
-int
-kranal_tunables_init ()
-{
- return 0;
-}
-
-void
-kranal_tunables_fini ()
-{
-}
-
-#endif
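
For context, the CFS_MODULE_PARM wrappers above correspond to the stock Linux module_param()/MODULE_PARM_DESC() pair; a plain-kernel sketch for the timeout tunable alone might read:

#include <linux/module.h>
#include <linux/moduleparam.h>

/* Sketch of what one CFS_MODULE_PARM above expands to on Linux;
 * shown for the 'timeout' tunable only. */
static int timeout = 30;
module_param(timeout, int, 0644);
MODULE_PARM_DESC(timeout, "communications timeout (seconds)");
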
return 0;
}
-
-int
-lnet_set_ip_niaddr (lnet_ni_t *ni)
-{
- __u32 net = LNET_NIDNET(ni->ni_nid);
- char **names;
- int n;
- __u32 ip;
- __u32 netmask;
- int up;
- int i;
- int rc;
-
- /* Convenience for LNDs that use the IP address of a local interface as
- * the local address part of their NID */
-
- if (ni->ni_interfaces[0] != NULL) {
-
- CLASSERT (LNET_MAX_INTERFACES > 1);
-
- if (ni->ni_interfaces[1] != NULL) {
- CERROR("Net %s doesn't support multiple interfaces\n",
- libcfs_net2str(net));
- return -EPERM;
- }
-
- rc = libcfs_ipif_query(ni->ni_interfaces[0],
- &up, &ip, &netmask);
- if (rc != 0) {
- CERROR("Net %s can't query interface %s: %d\n",
- libcfs_net2str(net), ni->ni_interfaces[0], rc);
- return -EPERM;
- }
-
- if (!up) {
- CERROR("Net %s can't use interface %s: it's down\n",
- libcfs_net2str(net), ni->ni_interfaces[0]);
- return -ENETDOWN;
- }
-
- ni->ni_nid = LNET_MKNID(net, ip);
- return 0;
- }
-
- n = libcfs_ipif_enumerate(&names);
- if (n <= 0) {
- CERROR("Net %s can't enumerate interfaces: %d\n",
- libcfs_net2str(net), n);
- return 0;
- }
-
- for (i = 0; i < n; i++) {
- if (!strcmp(names[i], "lo")) /* skip the loopback IF */
- continue;
-
- rc = libcfs_ipif_query(names[i], &up, &ip, &netmask);
-
- if (rc != 0) {
- CWARN("Net %s can't query interface %s: %d\n",
- libcfs_net2str(net), names[i], rc);
- continue;
- }
-
- if (!up) {
- CWARN("Net %s ignoring interface %s (down)\n",
- libcfs_net2str(net), names[i]);
- continue;
- }
-
- libcfs_ipif_free_enumeration(names, n);
- ni->ni_nid = LNET_MKNID(net, ip);
- return 0;
- }
-
- CERROR("Net %s can't find any interfaces\n", libcfs_net2str(net));
- libcfs_ipif_free_enumeration(names, n);
- return -ENOENT;
-}
-EXPORT_SYMBOL(lnet_set_ip_niaddr);
-
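
lnet_set_ip_niaddr() above builds the NID from the network number and the interface's IPv4 address via LNET_MKNID(). A sketch of that packing, assuming the conventional LNet layout (network number in the high 32 bits, host address in the low 32):

#include <stdint.h>

/* Sketch only: mirrors the assumed LNET_MKNID() layout. */
static uint64_t mknid(uint32_t net, uint32_t addr)
{
        return ((uint64_t)net << 32) | addr;
}
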
} mod_paths[] = {
{ "libcfs", "libcfs/libcfs" },
{ "lnet", "lnet/lnet" },
- { "kmxlnd", "lnet/klnds/mxlnd" },
{ "ko2iblnd", "lnet/klnds/o2iblnd" },
{ "kgnilnd", "lnet/klnds/gnilnd"},
- { "kqswlnd", "lnet/klnds/qswlnd" },
- { "kralnd", "lnet/klnds/ralnd" },
{ "ksocklnd", "lnet/klnds/socklnd" },
- { "ktdilnd", "lnet/klnds/tdilnd" },
{ "obdclass", "lustre/obdclass" },
{ "llog_test", "lustre/obdclass" },
{ "ptlrpc_gss", "lustre/ptlrpc/gss" },
int index;
int rc;
- if (!g_net_is_compatible (argv[0], SOCKLND, RALND, MXLND,
- O2IBLND, GNILND, 0))
+ if (!g_net_is_compatible (argv[0], SOCKLND, O2IBLND, GNILND, 0))
return -1;
for (index = 0;;index++) {
id.nid = data.ioc_nid;
id.pid = data.ioc_u32[4];
printf ("%-20s [%d]%s->%s:%d #%d\n",
- libcfs_id2str(id),
+ libcfs_id2str(id),
data.ioc_count, /* persistence */
/* my ip */
ptl_ipaddr_2_str(data.ioc_u32[2], buffer[0],
sizeof(buffer[1]), 1),
data.ioc_u32[1], /* peer port */
data.ioc_u32[3]); /* conn_count */
- } else if (g_net_is_compatible(NULL, RALND, 0)) {
- printf ("%-20s [%d]@%s:%d\n",
- libcfs_nid2str(data.ioc_nid), /* peer nid */
- data.ioc_count, /* peer persistence */
- /* peer ip */
- ptl_ipaddr_2_str(data.ioc_u32[0], buffer[1],
- sizeof(buffer[1]), 1),
- data.ioc_u32[1]); /* peer port */
} else if (g_net_is_compatible(NULL, GNILND, 0)) {
int disconn = data.ioc_flags >> 16;
char *state;
int port = 0;
int rc;
- if (!g_net_is_compatible (argv[0], SOCKLND, RALND,
- GNILND, 0))
+ if (!g_net_is_compatible(argv[0], SOCKLND, GNILND, 0))
return -1;
if (argc != 4) {
__u32 ip = 0;
int rc;
- if (!g_net_is_compatible (argv[0], SOCKLND, RALND, MXLND,
- O2IBLND, GNILND, 0))
+ if (!g_net_is_compatible(argv[0], SOCKLND, O2IBLND, GNILND, 0))
return -1;
if (g_net_is_compatible(NULL, SOCKLND, 0)) {
int index;
int rc;
- if (!g_net_is_compatible (argv[0], SOCKLND, RALND, MXLND, O2IBLND,
- GNILND, 0))
+ if (!g_net_is_compatible(argv[0], SOCKLND, O2IBLND, GNILND, 0))
return -1;
for (index = 0; ; index++) {
data.ioc_count, /* tx buffer size */
data.ioc_u32[5], /* rx buffer size */
data.ioc_flags ? "nagle" : "nonagle");
- } else if (g_net_is_compatible (NULL, RALND, 0)) {
- printf ("%-20s [%d]\n",
- libcfs_nid2str(data.ioc_nid),
- data.ioc_u32[0] /* device id */);
} else if (g_net_is_compatible (NULL, O2IBLND, 0)) {
printf ("%s mtu %d\n",
libcfs_nid2str(data.ioc_nid),
return 0;
}
- if (!g_net_is_compatible (NULL, SOCKLND, RALND, MXLND, O2IBLND,
- GNILND, 0))
+ if (!g_net_is_compatible(NULL, SOCKLND, O2IBLND, GNILND, 0))
return 0;
if (argc >= 2 &&
return 0;
}
-int
-jt_ptl_print_active_txs (int argc, char **argv)
-{
- struct libcfs_ioctl_data data;
- int index;
- int rc;
-
- if (!g_net_is_compatible (argv[0], QSWLND, 0))
- return -1;
-
- for (index = 0;;index++) {
- LIBCFS_IOC_INIT(data);
- data.ioc_net = g_net;
- data.ioc_count = index;
-
- rc = l_ioctl(LNET_DEV_ID, IOC_LIBCFS_GET_TXDESC, &data);
- if (rc != 0)
- break;
-
- printf ("type %u payload %6d to %s via %s by pid %6d: "
- "%s, %s, state %d\n",
- data.ioc_u32[0],
- data.ioc_count,
- libcfs_nid2str(data.ioc_nid),
- libcfs_nid2str(data.ioc_u64[0]),
- data.ioc_u32[1],
- (data.ioc_flags & 1) ? "delayed" : "immediate",
- (data.ioc_flags & 2) ? "nblk" : "normal",
- data.ioc_flags >> 2);
- }
-
- if (index == 0) {
- if (errno == ENOENT) {
- printf ("<no active descs>\n");
- } else {
- fprintf(stderr, "Error getting active transmits list: "
- "%s: check dmesg.\n",
- strerror(errno));
- }
- }
- return 0;
-}
-
int jt_ptl_ping(int argc, char **argv)
{
int rc;
Print all the connected remote NIDs for a given
.B network
type.
-.TP
-.BI active_tx
-This command should print active transmits, and it is only used for elan network type.
.TP
.BI route_list
Print the complete routing table.
{"conn_list", jt_ptl_print_connections, 0,
"print all the connected remote nid\n"
"usage: conn_list"},
- {"active_tx", jt_ptl_print_active_txs, 0, "print active transmits\n"
- "usage: active_tx"},
{"route_list", jt_ptl_print_routes, 0,
"print the portals routing table, same as show_route\n"
"usage: route_list"},