From: James Simmons Date: Tue, 10 Feb 2015 02:28:45 +0000 (-0500) Subject: LU-6209 lnet: Delete all obsolete LND drivers X-Git-Tag: 2.7.51~59 X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=8be9e41369659b55e6609d7c40e457824b0d6b24 LU-6209 lnet: Delete all obsolete LND drivers Remove ralnd, mxlnd, qswlnd drivers. They are no longer supported and have not even been buildable for a long time. Change-Id: I9c88b446028e79122b5847448fdd23fb6cb5c530 Signed-off-by: James Simmons Reviewed-on: http://review.whamcloud.com/13663 Tested-by: Jenkins Reviewed-by: Isaac Huang Tested-by: Maloo Reviewed-by: Doug Oucharek Reviewed-by: Andreas Dilger Reviewed-by: Oleg Drokin --- diff --git a/libcfs/include/libcfs/libcfs_ioctl.h b/libcfs/include/libcfs/libcfs_ioctl.h index d8206fb..6d8781f 100644 --- a/libcfs/include/libcfs/libcfs_ioctl.h +++ b/libcfs/include/libcfs/libcfs_ioctl.h @@ -150,7 +150,7 @@ struct libcfs_ioctl_handler { #define IOC_LIBCFS_DEL_PEER _IOWR('e', 74, IOCTL_LIBCFS_TYPE) #define IOC_LIBCFS_ADD_PEER _IOWR('e', 75, IOCTL_LIBCFS_TYPE) #define IOC_LIBCFS_GET_PEER _IOWR('e', 76, IOCTL_LIBCFS_TYPE) -#define IOC_LIBCFS_GET_TXDESC _IOWR('e', 77, IOCTL_LIBCFS_TYPE) +/* ioctl 77 is free for use */ #define IOC_LIBCFS_ADD_INTERFACE _IOWR('e', 78, IOCTL_LIBCFS_TYPE) #define IOC_LIBCFS_DEL_INTERFACE _IOWR('e', 79, IOCTL_LIBCFS_TYPE) #define IOC_LIBCFS_GET_INTERFACE _IOWR('e', 80, IOCTL_LIBCFS_TYPE) diff --git a/lnet/autoconf/lustre-lnet.m4 b/lnet/autoconf/lustre-lnet.m4 index 3621261..5d57ff8 100644 --- a/lnet/autoconf/lustre-lnet.m4 +++ b/lnet/autoconf/lustre-lnet.m4 @@ -146,125 +146,6 @@ AC_DEFUN([LN_CONFIG_DLC], [ ]) # -# LN_CONFIG_QUADRICS -# -# check if quadrics support is in this kernel -# -AC_DEFUN([LN_CONFIG_QUADRICS], [ -AC_MSG_CHECKING([for QsNet sources]) -AC_ARG_WITH([qsnet], - AC_HELP_STRING([--with-qsnet=path], - [set path to qsnet source (default=$LINUX)]), - [QSNET=$with_qsnet], [QSNET=$LINUX]) -AC_MSG_RESULT([$QSNET]) - -QSWLND="" -QSWCPPFLAGS="" -AC_MSG_CHECKING([if quadrics kernel headers are present]) -AS_IF([test -d $QSNET/drivers/net/qsnet], [ - AC_MSG_RESULT([yes]) - QSWLND="qswlnd" - AC_MSG_CHECKING([for multirail EKC]) - AS_IF([test -f $QSNET/include/elan/epcomms.h], [ - AC_MSG_RESULT([supported]) - QSNET=$(readlink --canonicalize $QSNET) - QSWCPPFLAGS="-I$QSNET/include -DMULTIRAIL_EKC=1" - ], [ - AC_MSG_RESULT([not supported]) - AC_MSG_ERROR([Need multirail EKC]) - ]) - - AS_IF([test x$QSNET = x$LINUX], [ - LB_CHECK_CONFIG([QSNET], [], [ - LB_CHECK_CONFIG([QSNET_MODULE], [], [ - AC_MSG_WARN([QSNET is not enabled in this kernel; not building qswlnd.]) - QSWLND="" - QSWCPPFLAGS="" - ]) - ]) - ]) -], [ - AC_MSG_RESULT([no]) -]) -AC_SUBST(QSWLND) -AC_SUBST(QSWCPPFLAGS) -]) # LN_CONFIG_QUADRICS - -# -# LN_CONFIG_MX -# -AC_DEFUN([LN_CONFIG_MX], [ -# set default -MXPATH="/opt/mx" -AC_MSG_CHECKING([whether to enable Myrinet MX support]) -AC_ARG_WITH([mx], - AC_HELP_STRING([--with-mx=path], - [build mxlnd against path]), - [ - case $with_mx in - yes) ENABLEMX=2 ;; - no) ENABLEMX=0 ;; - *) ENABLEMX=3; MXPATH=$with_mx ;; - esac - ],[ - ENABLEMX=1 - ]) -AS_IF([test $ENABLEMX -eq 0], [ - AC_MSG_RESULT([disabled]) -], [test ! \( -f ${MXPATH}/include/myriexpress.h -a \ - -f ${MXPATH}/include/mx_kernel_api.h -a \ - -f ${MXPATH}/include/mx_pin.h \)], [ - AC_MSG_RESULT([no]) - case $ENABLEMX in - 1) ;; - 2) AC_MSG_ERROR([Myrinet MX kernel headers not present]) ;; - 3) AC_MSG_ERROR([bad --with-mx path]) ;; - *) AC_MSG_ERROR([internal error]) ;; - esac -], [ - AC_MSG_RESULT([check]) - MXPATH=$(readlink --canonicalize $MXPATH) - MXCPPFLAGS="-I$MXPATH/include" - MXLIBS="-L$MXPATH/lib" - EXTRA_KCFLAGS_save="$EXTRA_KCFLAGS" - EXTRA_KCFLAGS="$EXTRA_KCFLAGS $MXCPPFLAGS" - LB_CHECK_COMPILE([if have Myrinet MX support], - myrinet_mx_support, [ - #define MX_KERNEL 1 - #include - #include - ],[ - mx_endpoint_t end; - mx_status_t status; - mx_request_t request; - int result; - mx_init(); - mx_open_endpoint(MX_ANY_NIC, MX_ANY_ENDPOINT, 0, NULL, 0, &end); - mx_register_unexp_handler(end, (mx_unexp_handler_t) NULL, NULL); - mx_wait_any(end, MX_INFINITE, 0LL, 0LL, &status, &result); - mx_iconnect(end, 0LL, 0, 0, 0, NULL, &request); - return 0; - ],[ - MXLND="mxlnd" - ],[ - case $ENABLEMX in - 1) ;; - 2) AC_MSG_ERROR([can't compile with Myrinet MX kernel headers]) ;; - 3) AC_MSG_ERROR([can't compile with Myrinet MX headers under $MXPATH]) ;; - *) AC_MSG_ERROR([internal error]) ;; - esac - MXCPPFLAGS="" - MXLIBS="" - MXLND="" - ]) - EXTRA_KCFLAGS="$EXTRA_KCFLAGS_save" -]) -AC_SUBST(MXCPPFLAGS) -AC_SUBST(MXLIBS) -AC_SUBST(MXLND) -]) # LN_CONFIG_MX - -# # LN_CONFIG_O2IB # AC_DEFUN([LN_CONFIG_O2IB], [ @@ -437,35 +318,6 @@ AS_IF([test $ENABLEO2IB -ne 0], [ ]) # LN_CONFIG_O2IB # -# LN_CONFIG_RALND -# -# check whether to use the RapidArray lnd -# -AC_DEFUN([LN_CONFIG_RALND], [ -RALND="" -RACPPFLAGS="-I${LINUX}/drivers/xd1/include" -EXTRA_KCFLAGS_save="$EXTRA_KCFLAGS" -EXTRA_KCFLAGS="$EXTRA_KCFLAGS $RACPPFLAGS" -LB_CHECK_COMPILE([if 'RapidArray' kernel headers are present], -RapkGetDeviceByIndex, [ - #include - #include -],[ - RAP_RETURN rc; - RAP_PVOID dev_handle; - rc = RapkGetDeviceByIndex(0, NULL, &dev_handle); - return rc == RAP_SUCCESS ? 0 : 1; -],[ - RALND="ralnd" -],[ - RACPPFLAGS="" -]) -EXTRA_KCFLAGS="$EXTRA_KCFLAGS_save" -AC_SUBST(RACPPFLAGS) -AC_SUBST(RALND) -]) # LN_CONFIG_RALND - -# # LN_CONFIG_GNILND # # check whether to use the Gemini Network Interface lnd @@ -581,11 +433,8 @@ AC_MSG_NOTICE([LNet kernel checks LN_FUNC_DEV_GET_BY_NAME_2ARG LN_CONFIG_AFFINITY LN_CONFIG_BACKOFF -LN_CONFIG_QUADRICS LN_CONFIG_O2IB -LN_CONFIG_RALND LN_CONFIG_GNILND -LN_CONFIG_MX # 2.6.36 LN_CONFIG_TCP_SENDPAGE # 3.15 @@ -673,10 +522,7 @@ LN_CONFIG_DLC # AM_CONDITOINAL defines for lnet # AC_DEFUN([LN_CONDITIONALS], [ -AM_CONDITIONAL(BUILD_QSWLND, test x$QSWLND = "xqswlnd") -AM_CONDITIONAL(BUILD_MXLND, test x$MXLND = "xmxlnd") AM_CONDITIONAL(BUILD_O2IBLND, test x$O2IBLND = "xo2iblnd") -AM_CONDITIONAL(BUILD_RALND, test x$RALND = "xralnd") AM_CONDITIONAL(BUILD_GNILND, test x$GNILND = "xgnilnd") AM_CONDITIONAL(BUILD_GNILND_RCA, test x$GNILNDRCA = "xgnilndrca") AM_CONDITIONAL(BUILD_DLC, test x$USE_DLC = "xyes") @@ -697,14 +543,8 @@ lnet/include/Makefile lnet/include/lnet/Makefile lnet/klnds/Makefile lnet/klnds/autoMakefile -lnet/klnds/mxlnd/autoMakefile -lnet/klnds/mxlnd/Makefile lnet/klnds/o2iblnd/Makefile lnet/klnds/o2iblnd/autoMakefile -lnet/klnds/qswlnd/Makefile -lnet/klnds/qswlnd/autoMakefile -lnet/klnds/ralnd/Makefile -lnet/klnds/ralnd/autoMakefile lnet/klnds/gnilnd/Makefile lnet/klnds/gnilnd/autoMakefile lnet/klnds/socklnd/Makefile diff --git a/lnet/include/lnet/lib-lnet.h b/lnet/include/lnet/lib-lnet.h index a78da4f..3daa92c 100644 --- a/lnet/include/lnet/lib-lnet.h +++ b/lnet/include/lnet/lib-lnet.h @@ -692,7 +692,6 @@ void lnet_md_deconstruct(lnet_libmd_t *lmd, lnet_md_t *umd); void lnet_register_lnd(lnd_t *lnd); void lnet_unregister_lnd(lnd_t *lnd); -int lnet_set_ip_niaddr (lnet_ni_t *ni); int lnet_connect(cfs_socket_t **sockp, lnet_nid_t peer_nid, __u32 local_ip, __u32 peer_ip, int peer_port); diff --git a/lnet/include/lnet/lnetctl.h b/lnet/include/lnet/lnetctl.h index 4daff78..be19661 100644 --- a/lnet/include/lnet/lnetctl.h +++ b/lnet/include/lnet/lnetctl.h @@ -152,7 +152,6 @@ int jt_ptl_del_peer (int argc, char **argv); int jt_ptl_print_connections (int argc, char **argv); int jt_ptl_disconnect(int argc, char **argv); int jt_ptl_push_connection(int argc, char **argv); -int jt_ptl_print_active_txs(int argc, char **argv); int jt_ptl_ping(int argc, char **argv); int jt_ptl_mynid(int argc, char **argv); int jt_ptl_add_uuid(int argc, char **argv); diff --git a/lnet/klnds/Makefile.in b/lnet/klnds/Makefile.in index ad17897..6fc3744 100644 --- a/lnet/klnds/Makefile.in +++ b/lnet/klnds/Makefile.in @@ -1,8 +1,5 @@ -@BUILD_MXLND_TRUE@subdir-m += mxlnd -@BUILD_RALND_TRUE@subdir-m += ralnd @BUILD_GNILND_TRUE@subdir-m += gnilnd @BUILD_O2IBLND_TRUE@subdir-m += o2iblnd -@BUILD_QSWLND_TRUE@subdir-m += qswlnd subdir-m += socklnd @INCLUDE_RULES@ diff --git a/lnet/klnds/autoMakefile.am b/lnet/klnds/autoMakefile.am index 78eb985..ce24433 100644 --- a/lnet/klnds/autoMakefile.am +++ b/lnet/klnds/autoMakefile.am @@ -34,4 +34,4 @@ # Lustre is a trademark of Sun Microsystems, Inc. # -SUBDIRS = socklnd qswlnd mxlnd ralnd gnilnd o2iblnd +SUBDIRS = socklnd gnilnd o2iblnd diff --git a/lnet/klnds/mxlnd/Makefile.in b/lnet/klnds/mxlnd/Makefile.in deleted file mode 100644 index 378dbdd..0000000 --- a/lnet/klnds/mxlnd/Makefile.in +++ /dev/null @@ -1,6 +0,0 @@ -MODULES := kmxlnd -kmxlnd-objs := mxlnd.o mxlnd_cb.o mxlnd_modparams.o - -EXTRA_POST_CFLAGS := @MXCPPFLAGS@ - -@INCLUDE_RULES@ diff --git a/lnet/klnds/mxlnd/README b/lnet/klnds/mxlnd/README deleted file mode 100644 index 7467b42..0000000 --- a/lnet/klnds/mxlnd/README +++ /dev/null @@ -1,175 +0,0 @@ -************************************************************************* -* * -* Myrinet Express Lustre Networking Driver (MXLND) documentation * -* * -************************************************************************* - -README of MXLND - -MXLND provides support for Myricom's Myrinet Express (MX) communication -layer in Lustre. - -MXLND may be used with either MX-10G or MX-2G. See MX's README for -supported NICs. - -Table of Contents: - I. Installation - 1. Configuring and compiling - 2. Module Parameters - II. MXLND Performance - III. Caveats - 1. Systems with different page sizes - 2. Multi-homing - 3. MX endpoint collision - IV. License - V. Support - -================ -I. Installation -================ - -MXLND is supported on Linux 2.6. It may be possible to run it on 2.4, -but it has not been tested. MXLND requires Myricom's MX version 1.2.8 -or higher. See MX's README for the supported list of processors. - -MXLND requires the optional MX kernel library interface. MX must be compiled -with --enable-kernel-lib. - -1. Configuring and compiling - -MXLND should be already integrated into the Lustre build process. To -build MXLND, you will need to set the path to your MX installation -in Lustre's ./configure: - - --with-mx=/opt/mx - -replacing /opt with the actual path. Configure will check to ensure that -the MX version has the required functions. If not, it will fail to build. -To check if MXLND built, look for: - - checking whether to enable Myrinet MX support... yes - -in configure's output or the presence of Makefile in -$LUSTRE/lnet/klnds/mxlnd. - -2. Module Parameters - -MXLND supports a number of load-time parameters using Linux's module -parameter system. On our test systems, we created the following file: - - /etc/modprobe.d/kmxlnd - -On some (older?) systems, you may need to modify /etc/modprobe.conf. - -The available options are: - - n_waitd # of completion daemons - cksum set non-zero to enable small message (< 4KB) checksums - ntx # of total tx message descriptors - peercredits # concurrent sends to one peer - board index value of the Myrinet board - ep_id MX endpoint ID - ipif_name IPoMX interface name - polling Use 0 to block (wait). A value > 0 will poll that many times before blocking - - credits Unused - was # concurrent sends to all peers - max_peers Unused - was maximum number of peers that may connect - hosts Unused - was IP-to-hostname resolution file - -You may want to vary the options to obtain the optimal performance for your -platform. - - n_waitd sets the number of threads that process completed MX requests -(sends and receives). In our testing, the default of 1 performed best. - - cksum turns on small message checksums. It can be used to aid in trouble- -shooting. MX also provides an optional checksumming feature which can check -all messages (large and small). See the MX README for details. - - ntx is the number of total sends in flight from this machine. - - peercredits is the number of in-flight messages for a specific peer. This is part -of the flow-control system in Lustre. Increasing this value may improve performance -but it requires more memory since each message requires at least one page. - - board is the index of the Myricom NIC. Hosts can have multiple Myricom NICs -and this identifies which one MXLND should use. - - ep_id is the MX endpoint ID. Each process that uses MX is required to have at -least one MX endpoint to access the MX library and NIC. The ID is a simple index -starting at 0. When used on a server, the server will attempt to use this end- -point. When used on a client, it specifies the endpoint to connect to on the -management server. - - ipif_name is the name of the Ethernet interface over MX. Generally, it is -myriN, where N matches the MX board index. - - polling determines whether this host will poll or block for MX request com- -pletions. A value of 0 blocks and any positive value will poll that many times -before blocking. Since polling increases CPU usage, we suggest you set this to -0 on the client and experiment with different values for servers. - -===================== -II. MXLND Performance -===================== - -On MX-2G systems, MXLND should easily saturate the link and use minimal CPU -(5-10% for read and write operations). On MX-10G systems, MXLND can saturate -the link and use moderate CPU resources (20-30% for read and write operations). -MX-10G relies on PCI-Express which is relatively new and performance varies -considerably by processor, motherboard and PCI-E chipset. Refer to Myricom's -website for the latest DMA read/write performance results by motherboard. The -DMA results will place an upper-bound on MXLND performance. - -============ -III. Caveats -============ - -1. Systems with different page sizes - -MXLND will set the maximum small message size equal to the kernel's page size. -This means that machines running MXLND that have different page sizes are not -able to communicate with each other. If you wish to run MXLND in this case, -send email to help@myri.com. - -2. Multi-homing - -At this time, the MXLND does not support more than one interface at a time. -Thus, a single Lustre router cannot route between two MX-10G, between two -MX-2G, or between MX-10G and MX-2G fabrics. - -3. MX endpoint collision - -Each process that uses MX is required to have at least one MX endpoint to -access the MX library and NIC. Other processes may need to use MX and no two -processes can use the same endpoint ID. MPICH-MX dynamically chooses one at -MPI startup and should not interfere with MXLND. Sockets-MX, on the other hand, -is hard coded to use 0 for its ID. If it is possible that anyone will want to -run Sockets-MX on this system, use a non-0 value for MXLND's endpoint ID. - - -=========== -IV. License -=========== - -MXLND is copyright (C) 2006 of Myricom, Inc. - -MXLND is part of Lustre, http://www.lustre.org. - -MXLND is free software; you can redistribute it and/or modify it under the -terms of version 2 of the GNU General Public License as published by the Free -Software Foundation. - -MXLND is distributed in the hope that it will be useful, but WITHOUT ANY -WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A -PARTICULAR PURPOSE. See the GNU General Public License for more details. - -You should have received a copy of the GNU General Public License along with -Lustre; if not, write to the Free Software Foundation, Inc., 675 Mass Ave, -Cambridge, MA 02139, USA. - -========== -V. Support -========== - -If you have questions about MXLND, please contact help@myri.com. diff --git a/lnet/klnds/mxlnd/autoMakefile.am b/lnet/klnds/mxlnd/autoMakefile.am deleted file mode 100644 index e5efec3..0000000 --- a/lnet/klnds/mxlnd/autoMakefile.am +++ /dev/null @@ -1,44 +0,0 @@ -# -# GPL HEADER START -# -# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License version 2 only, -# as published by the Free Software Foundation. -# -# This program is distributed in the hope that it will be useful, but -# WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# General Public License version 2 for more details (a copy is included -# in the LICENSE file that accompanied this code). -# -# You should have received a copy of the GNU General Public License -# version 2 along with this program; If not, see -# http://www.sun.com/software/products/lustre/docs/GPLv2.pdf -# -# Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, -# CA 95054 USA or visit www.sun.com if you need additional information or -# have any questions. -# -# GPL HEADER END -# - -# -# Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. -# Use is subject to license terms. -# - -# -# This file is part of Lustre, http://www.lustre.org/ -# Lustre is a trademark of Sun Microsystems, Inc. -# - -if MODULES -if BUILD_MXLND -modulenet_DATA = kmxlnd$(KMODEXT) -endif -endif - -MOSTLYCLEANFILES = @MOSTLYCLEANFILES@ -EXTRA_DIST = $(kmxlnd-objs:%.o=%.c) mxlnd.h diff --git a/lnet/klnds/mxlnd/mxlnd.c b/lnet/klnds/mxlnd/mxlnd.c deleted file mode 100644 index 0b494c9..0000000 --- a/lnet/klnds/mxlnd/mxlnd.c +++ /dev/null @@ -1,715 +0,0 @@ -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf - * - * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, - * CA 95054 USA or visit www.sun.com if you need additional information or - * have any questions. - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2012, 2014, Intel Corporation. - * - * Copyright (C) 2006 Myricom, Inc. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lnet/klnds/mxlnd/mxlnd.c - * - * Author: Eric Barton - * Author: Scott Atchley - */ - -#include "mxlnd.h" - -lnd_t the_kmxlnd = { - .lnd_type = MXLND, - .lnd_startup = mxlnd_startup, - .lnd_shutdown = mxlnd_shutdown, - .lnd_ctl = mxlnd_ctl, - .lnd_send = mxlnd_send, - .lnd_recv = mxlnd_recv, -}; - -kmx_data_t kmxlnd_data; - -void -mxlnd_free_pages(kmx_pages_t *p) -{ - int npages = p->mxg_npages; - int i; - - CDEBUG(D_MALLOC, "freeing %d pages\n", npages); - - for (i = 0; i < npages; i++) { - if (p->mxg_pages[i] != NULL) { - __free_page(p->mxg_pages[i]); - spin_lock(&kmxlnd_data.kmx_mem_lock); - kmxlnd_data.kmx_mem_used -= PAGE_SIZE; - spin_unlock(&kmxlnd_data.kmx_mem_lock); - } - } - - MXLND_FREE(p, offsetof(kmx_pages_t, mxg_pages[npages])); -} - -int -mxlnd_alloc_pages(kmx_pages_t **pp, int npages) -{ - kmx_pages_t *p = NULL; - int i = 0; - - CDEBUG(D_MALLOC, "allocing %d pages\n", npages); - - MXLND_ALLOC(p, offsetof(kmx_pages_t, mxg_pages[npages])); - if (p == NULL) { - CERROR("Can't allocate descriptor for %d pages\n", npages); - return -ENOMEM; - } - - memset(p, 0, offsetof(kmx_pages_t, mxg_pages[npages])); - p->mxg_npages = npages; - - for (i = 0; i < npages; i++) { - p->mxg_pages[i] = alloc_page(GFP_KERNEL); - if (p->mxg_pages[i] == NULL) { - CERROR("Can't allocate page %d of %d\n", i, npages); - mxlnd_free_pages(p); - return -ENOMEM; - } - spin_lock(&kmxlnd_data.kmx_mem_lock); - kmxlnd_data.kmx_mem_used += PAGE_SIZE; - spin_unlock(&kmxlnd_data.kmx_mem_lock); - } - - *pp = p; - return 0; -} - -/** - * mxlnd_ctx_init - reset ctx struct to the default values - * @ctx - a kmx_ctx pointer - */ -void -mxlnd_ctx_init(kmx_ctx_t *ctx) -{ - if (ctx == NULL) return; - - /* do not change mxc_type */ - ctx->mxc_incarnation = 0; - ctx->mxc_deadline = 0; - ctx->mxc_state = MXLND_CTX_IDLE; - if (!cfs_list_empty(&ctx->mxc_list)) - cfs_list_del_init(&ctx->mxc_list); - /* ignore mxc_rx_list */ - if (ctx->mxc_type == MXLND_REQ_TX) { - ctx->mxc_nid = 0; - ctx->mxc_peer = NULL; - ctx->mxc_conn = NULL; - } - /* ignore mxc_msg */ - ctx->mxc_lntmsg[0] = NULL; - ctx->mxc_lntmsg[1] = NULL; - ctx->mxc_msg_type = 0; - ctx->mxc_cookie = 0LL; - ctx->mxc_match = 0LL; - /* ctx->mxc_seg.segment_ptr points to backing page */ - ctx->mxc_seg.segment_length = 0; - if (ctx->mxc_seg_list != NULL) { - LASSERT(ctx->mxc_nseg > 0); - MXLND_FREE(ctx->mxc_seg_list, ctx->mxc_nseg * sizeof(mx_ksegment_t)); - } - ctx->mxc_seg_list = NULL; - ctx->mxc_nseg = 0; - ctx->mxc_nob = 0; - memset(&ctx->mxc_mxreq, 0, sizeof(mx_request_t)); - memset(&ctx->mxc_status, 0, sizeof(mx_status_t)); - ctx->mxc_errno = 0; - /* ctx->mxc_get */ - /* ctx->mxc_put */ - - ctx->mxc_msg->mxm_type = 0; - ctx->mxc_msg->mxm_credits = 0; - ctx->mxc_msg->mxm_nob = 0; - - return; -} - -/** - * mxlnd_free_txs - free kmx_txs and associated pages - * - * Called from mxlnd_shutdown() - */ -void -mxlnd_free_txs(void) -{ - int i = 0; - kmx_ctx_t *tx = NULL; - - if (kmxlnd_data.kmx_tx_pages) { - for (i = 0; i < MXLND_TX_MSGS(); i++) { - tx = &kmxlnd_data.kmx_txs[i]; - if (tx->mxc_seg_list != NULL) { - LASSERT(tx->mxc_nseg > 0); - MXLND_FREE(tx->mxc_seg_list, - tx->mxc_nseg * - sizeof(*tx->mxc_seg_list)); - } - } - MXLND_FREE(kmxlnd_data.kmx_txs, - MXLND_TX_MSGS() * sizeof(kmx_ctx_t)); - mxlnd_free_pages(kmxlnd_data.kmx_tx_pages); - } - - return; -} - -/** - * mxlnd_init_txs - allocate tx descriptors then stash on txs and idle tx lists - * - * Called from mxlnd_startup() - * returns 0 on success, else -ENOMEM - */ -int -mxlnd_init_txs(void) -{ - int ret = 0; - int i = 0; - int ipage = 0; - int offset = 0; - void *addr = NULL; - kmx_ctx_t *tx = NULL; - kmx_pages_t *pages = NULL; - struct page *page = NULL; - - /* pre-mapped messages are not bigger than 1 page */ - CLASSERT(MXLND_MSG_SIZE <= PAGE_SIZE); - - /* No fancy arithmetic when we do the buffer calculations */ - CLASSERT (PAGE_SIZE % MXLND_MSG_SIZE == 0); - - ret = mxlnd_alloc_pages(&pages, MXLND_TX_MSG_PAGES()); - if (ret != 0) { - CERROR("Can't allocate tx pages\n"); - return -ENOMEM; - } - kmxlnd_data.kmx_tx_pages = pages; - - MXLND_ALLOC(kmxlnd_data.kmx_txs, MXLND_TX_MSGS() * sizeof(kmx_ctx_t)); - if (&kmxlnd_data.kmx_txs == NULL) { - CERROR("Can't allocate %d tx descriptors\n", MXLND_TX_MSGS()); - mxlnd_free_pages(pages); - return -ENOMEM; - } - - memset(kmxlnd_data.kmx_txs, 0, MXLND_TX_MSGS() * sizeof(kmx_ctx_t)); - - for (i = 0; i < MXLND_TX_MSGS(); i++) { - - tx = &kmxlnd_data.kmx_txs[i]; - tx->mxc_type = MXLND_REQ_TX; - - CFS_INIT_LIST_HEAD(&tx->mxc_list); - - /* map mxc_msg to page */ - page = pages->mxg_pages[ipage]; - addr = page_address(page); - LASSERT(addr != NULL); - tx->mxc_msg = (kmx_msg_t *)(addr + offset); - tx->mxc_seg.segment_ptr = MX_PA_TO_U64(virt_to_phys(tx->mxc_msg)); - - mxlnd_ctx_init(tx); - - offset += MXLND_MSG_SIZE; - LASSERT (offset <= PAGE_SIZE); - - if (offset == PAGE_SIZE) { - offset = 0; - ipage++; - LASSERT (ipage <= MXLND_TX_MSG_PAGES()); - } - - /* in startup(), no locks required */ - cfs_list_add_tail(&tx->mxc_list, &kmxlnd_data.kmx_tx_idle); - } - - return 0; -} - -/** - * mxlnd_free_peers - free peers - * - * Called from mxlnd_shutdown() - */ -void -mxlnd_free_peers(void) -{ - int i = 0; - int count = 0; - kmx_peer_t *peer = NULL; - kmx_peer_t *next = NULL; - - for (i = 0; i < MXLND_HASH_SIZE; i++) { - cfs_list_for_each_entry_safe(peer, next, - &kmxlnd_data.kmx_peers[i], - mxp_list) { - cfs_list_del_init(&peer->mxp_list); - if (peer->mxp_conn) mxlnd_conn_decref(peer->mxp_conn); - mxlnd_peer_decref(peer); - count++; - } - } - CDEBUG(D_NET, "%s: freed %d peers\n", __func__, count); -} - -/** - * mxlnd_init_mx - open the endpoint, set our ID, register the EAGER callback - * @ni - the network interface - * - * Returns 0 on success, -1 on failure - */ -int -mxlnd_init_mx(lnet_ni_t *ni) -{ - int ret = 0; - mx_return_t mxret; - u32 board = *kmxlnd_tunables.kmx_board; - u32 ep_id = *kmxlnd_tunables.kmx_ep_id; - u64 nic_id = 0LL; - char *ifname = NULL; - __u32 ip; - __u32 netmask; - int if_up = 0; - - mxret = mx_init(); - if (mxret != MX_SUCCESS) { - CERROR("mx_init() failed with %s (%d)\n", mx_strerror(mxret), mxret); - return -1; - } - - if (ni->ni_interfaces[0] != NULL) { - /* Use the IPoMX interface specified in 'networks=' */ - - CLASSERT (LNET_MAX_INTERFACES > 1); - if (ni->ni_interfaces[1] != NULL) { - CERROR("Multiple interfaces not supported\n"); - goto failed_with_init; - } - - ifname = ni->ni_interfaces[0]; - } else { - ifname = *kmxlnd_tunables.kmx_default_ipif; - } - - ret = libcfs_ipif_query(ifname, &if_up, &ip, &netmask); - if (ret != 0) { - CERROR("Can't query IPoMX interface %s: %d\n", - ifname, ret); - goto failed_with_init; - } - - if (!if_up) { - CERROR("Can't query IPoMX interface %s: it's down\n", - ifname); - goto failed_with_init; - } - - mxret = mx_open_endpoint(board, ep_id, MXLND_MSG_MAGIC, - NULL, 0, &kmxlnd_data.kmx_endpt); - if (mxret != MX_SUCCESS) { - CERROR("mx_open_endpoint() failed with %d\n", mxret); - goto failed_with_init; - } - - mx_get_endpoint_addr(kmxlnd_data.kmx_endpt, &kmxlnd_data.kmx_epa); - mx_decompose_endpoint_addr(kmxlnd_data.kmx_epa, &nic_id, &ep_id); - mxret = mx_connect(kmxlnd_data.kmx_endpt, nic_id, ep_id, - MXLND_MSG_MAGIC, - jiffies_to_msecs(MXLND_CONNECT_TIMEOUT), - &kmxlnd_data.kmx_epa); - if (mxret != MX_SUCCESS) { - CNETERR("unable to connect to myself (%s)\n", mx_strerror(mxret)); - goto failed_with_endpoint; - } - - ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), ip); - CDEBUG(D_NET, "My NID is 0x%llx\n", ni->ni_nid); - - /* this will catch all unexpected receives. */ - mxret = mx_register_unexp_handler(kmxlnd_data.kmx_endpt, - (mx_unexp_handler_t) mxlnd_unexpected_recv, - NULL); - if (mxret != MX_SUCCESS) { - CERROR("mx_register_unexp_callback() failed with %s\n", - mx_strerror(mxret)); - goto failed_with_endpoint; - } - mxret = mx_set_request_timeout(kmxlnd_data.kmx_endpt, NULL, - jiffies_to_msecs(MXLND_COMM_TIMEOUT)); - if (mxret != MX_SUCCESS) { - CERROR("mx_set_request_timeout() failed with %s\n", - mx_strerror(mxret)); - goto failed_with_endpoint; - } - return 0; - -failed_with_endpoint: - mx_close_endpoint(kmxlnd_data.kmx_endpt); -failed_with_init: - mx_finalize(); - return -1; -} - - -/** - * mxlnd_thread_start - spawn a kernel thread with this function - * @fn - function pointer - * @arg - pointer to the parameter data - * @name - name of new thread - * - * Returns 0 on success and a negative value on failure - */ -int -mxlnd_thread_start(int (*fn)(void *arg), void *arg, char *name) -{ - cfs_task *task; - int i = (int) ((long) arg); - - atomic_inc(&kmxlnd_data.kmx_nthreads); - init_completion(&kmxlnd_data.kmx_completions[i]); - - task = kthread_run(fn, arg, name); - if (IS_ERR(task)) { - CERROR("cfs_create_thread() failed with %d\n", PTR_ERR(task)); - atomic_dec(&kmxlnd_data.kmx_nthreads); - } - return PTR_ERR(task); -} - -/** - * mxlnd_thread_stop - decrement thread counter - * - * The thread returns 0 when it detects shutdown. - * We are simply decrementing the thread counter. - */ -void -mxlnd_thread_stop(long id) -{ - int i = (int) id; - atomic_dec (&kmxlnd_data.kmx_nthreads); - complete(&kmxlnd_data.kmx_completions[i]); -} - -/** - * mxlnd_shutdown - stop IO, clean up state - * @ni - LNET interface handle - * - * No calls to the LND should be made after calling this function. - */ -void -mxlnd_shutdown (lnet_ni_t *ni) -{ - int i = 0; - int nthreads = MXLND_NDAEMONS + *kmxlnd_tunables.kmx_n_waitd; - - LASSERT (ni == kmxlnd_data.kmx_ni); - LASSERT (ni->ni_data == &kmxlnd_data); - CDEBUG(D_NET, "in shutdown()\n"); - - CDEBUG(D_MALLOC, "before MXLND cleanup: libcfs_kmemory %d " - "kmx_mem_used %ld\n", atomic_read(&libcfs_kmemory), - kmxlnd_data.kmx_mem_used); - - - CDEBUG(D_NET, "setting shutdown = 1\n"); - atomic_set(&kmxlnd_data.kmx_shutdown, 1); - - switch (kmxlnd_data.kmx_init) { - - case MXLND_INIT_ALL: - - /* calls write_[un]lock(kmx_global_lock) */ - mxlnd_del_peer(LNET_NID_ANY); - - /* wakeup request_waitds */ - mx_wakeup(kmxlnd_data.kmx_endpt); - up(&kmxlnd_data.kmx_tx_queue_sem); - up(&kmxlnd_data.kmx_conn_sem); - mxlnd_sleep(msecs_to_jiffies(2 * MSEC_PER_SEC)); - - /* fall through */ - - case MXLND_INIT_THREADS: - - CDEBUG(D_NET, "waiting on threads\n"); - /* wait for threads to complete */ - for (i = 0; i < nthreads; i++) { - wait_for_completion(&kmxlnd_data.kmx_completions[i]); - } - LASSERT(atomic_read(&kmxlnd_data.kmx_nthreads) == 0); - - CDEBUG(D_NET, "freeing completions\n"); - MXLND_FREE(kmxlnd_data.kmx_completions, - nthreads * sizeof(struct completion)); - - /* fall through */ - - case MXLND_INIT_MX: - - CDEBUG(D_NET, "stopping mx\n"); - - /* no peers left, close the endpoint */ - mx_close_endpoint(kmxlnd_data.kmx_endpt); - mx_finalize(); - - /* fall through */ - - case MXLND_INIT_TXS: - - CDEBUG(D_NET, "freeing txs\n"); - - /* free all txs and associated pages */ - mxlnd_free_txs(); - - /* fall through */ - - case MXLND_INIT_DATA: - - CDEBUG(D_NET, "freeing peers\n"); - - /* peers should be gone, but check again */ - mxlnd_free_peers(); - - /* conn zombies should be gone, but check again */ - mxlnd_free_conn_zombies(); - - /* fall through */ - - case MXLND_INIT_NOTHING: - break; - } - CDEBUG(D_NET, "shutdown complete\n"); - - CDEBUG(D_MALLOC, "after MXLND cleanup: libcfs_kmemory %d " - "kmx_mem_used %ld\n", atomic_read(&libcfs_kmemory), - kmxlnd_data.kmx_mem_used); - - kmxlnd_data.kmx_init = MXLND_INIT_NOTHING; - module_put(THIS_MODULE); - return; -} - -/** - * mxlnd_startup - initialize state, open an endpoint, start IO - * @ni - LNET interface handle - * - * Initialize state, open an endpoint, start monitoring threads. - * Should only be called once. - */ -int -mxlnd_startup (lnet_ni_t *ni) -{ - int i = 0; - int ret = 0; - int nthreads = MXLND_NDAEMONS /* tx_queued, timeoutd, connd */ - + *kmxlnd_tunables.kmx_n_waitd; - struct timeval tv; - - LASSERT (ni->ni_lnd == &the_kmxlnd); - - if (kmxlnd_data.kmx_init != MXLND_INIT_NOTHING) { - CERROR("Only 1 instance supported\n"); - return -EPERM; - } - CDEBUG(D_MALLOC, "before MXLND startup: libcfs_kmemory %d " - "kmx_mem_used %ld\n", atomic_read(&libcfs_kmemory), - kmxlnd_data.kmx_mem_used); - - ni->ni_maxtxcredits = MXLND_TX_MSGS(); - ni->ni_peertxcredits = *kmxlnd_tunables.kmx_peercredits; - if (ni->ni_maxtxcredits < ni->ni_peertxcredits) - ni->ni_maxtxcredits = ni->ni_peertxcredits; - - try_module_get(THIS_MODULE); - memset (&kmxlnd_data, 0, sizeof (kmxlnd_data)); - - kmxlnd_data.kmx_ni = ni; - ni->ni_data = &kmxlnd_data; - - do_gettimeofday(&tv); - kmxlnd_data.kmx_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec; - CDEBUG(D_NET, "my incarnation is %llu\n", kmxlnd_data.kmx_incarnation); - - rwlock_init (&kmxlnd_data.kmx_global_lock); - spin_lock_init (&kmxlnd_data.kmx_mem_lock); - - CFS_INIT_LIST_HEAD (&kmxlnd_data.kmx_conn_reqs); - CFS_INIT_LIST_HEAD (&kmxlnd_data.kmx_conn_zombies); - CFS_INIT_LIST_HEAD (&kmxlnd_data.kmx_orphan_msgs); - spin_lock_init (&kmxlnd_data.kmx_conn_lock); - sema_init(&kmxlnd_data.kmx_conn_sem, 0); - - for (i = 0; i < MXLND_HASH_SIZE; i++) { - CFS_INIT_LIST_HEAD (&kmxlnd_data.kmx_peers[i]); - } - - CFS_INIT_LIST_HEAD (&kmxlnd_data.kmx_tx_idle); - spin_lock_init (&kmxlnd_data.kmx_tx_idle_lock); - kmxlnd_data.kmx_tx_next_cookie = 1; - CFS_INIT_LIST_HEAD (&kmxlnd_data.kmx_tx_queue); - spin_lock_init (&kmxlnd_data.kmx_tx_queue_lock); - sema_init(&kmxlnd_data.kmx_tx_queue_sem, 0); - - kmxlnd_data.kmx_init = MXLND_INIT_DATA; - /*****************************************************/ - - ret = mxlnd_init_txs(); - if (ret != 0) { - CERROR("Can't alloc tx descs: %d\n", ret); - goto failed; - } - kmxlnd_data.kmx_init = MXLND_INIT_TXS; - /*****************************************************/ - - ret = mxlnd_init_mx(ni); - if (ret != 0) { - CERROR("Can't init mx\n"); - goto failed; - } - - kmxlnd_data.kmx_init = MXLND_INIT_MX; - /*****************************************************/ - - /* start threads */ - - MXLND_ALLOC(kmxlnd_data.kmx_completions, - nthreads * sizeof(struct completion)); - if (kmxlnd_data.kmx_completions == NULL) { - CERROR("failed to alloc kmxlnd_data.kmx_completions\n"); - goto failed; - } - memset(kmxlnd_data.kmx_completions, 0, - nthreads * sizeof(struct completion)); - - CDEBUG(D_NET, "using %d %s in mx_wait_any()\n", - *kmxlnd_tunables.kmx_n_waitd, - *kmxlnd_tunables.kmx_n_waitd == 1 ? "thread" : "threads"); - - for (i = 0; i < *kmxlnd_tunables.kmx_n_waitd; i++) { - char name[24]; - memset(name, 0, sizeof(name)); - snprintf(name, sizeof(name), "mxlnd_request_waitd_%02ld", i); - ret = mxlnd_thread_start(mxlnd_request_waitd, (void*)((long)i)); - if (ret < 0) { - CERROR("Starting mxlnd_request_waitd[%d] " - "failed with %d\n", i, ret); - atomic_set(&kmxlnd_data.kmx_shutdown, 1); - mx_wakeup(kmxlnd_data.kmx_endpt); - for (--i; i >= 0; i--) { - wait_for_completion(&kmxlnd_data.kmx_completions[i]); - } - LASSERT(atomic_read(&kmxlnd_data.kmx_nthreads) == 0); - MXLND_FREE(kmxlnd_data.kmx_completions, - nthreads * sizeof(struct completion)); - - goto failed; - } - } - ret = mxlnd_thread_start(mxlnd_tx_queued, (void *)((long)i++), - "mxlnd_tx_queued"); - if (ret < 0) { - CERROR("Starting mxlnd_tx_queued failed with %d\n", ret); - atomic_set(&kmxlnd_data.kmx_shutdown, 1); - mx_wakeup(kmxlnd_data.kmx_endpt); - for (--i; i >= 0; i--) { - wait_for_completion(&kmxlnd_data.kmx_completions[i]); - } - LASSERT(atomic_read(&kmxlnd_data.kmx_nthreads) == 0); - MXLND_FREE(kmxlnd_data.kmx_completions, - nthreads * sizeof(struct completion)); - goto failed; - } - ret = mxlnd_thread_start(mxlnd_timeoutd, (void *)((long)i++), - "mxlnd_timeoutd"); - if (ret < 0) { - CERROR("Starting mxlnd_timeoutd failed with %d\n", ret); - atomic_set(&kmxlnd_data.kmx_shutdown, 1); - mx_wakeup(kmxlnd_data.kmx_endpt); - up(&kmxlnd_data.kmx_tx_queue_sem); - for (--i; i >= 0; i--) { - wait_for_completion(&kmxlnd_data.kmx_completions[i]); - } - LASSERT(atomic_read(&kmxlnd_data.kmx_nthreads) == 0); - MXLND_FREE(kmxlnd_data.kmx_completions, - nthreads * sizeof(struct completion)); - goto failed; - } - ret = mxlnd_thread_start(mxlnd_connd, (void *)((long)i++), - "mxlnd_connd"); - if (ret < 0) { - CERROR("Starting mxlnd_connd failed with %d\n", ret); - atomic_set(&kmxlnd_data.kmx_shutdown, 1); - mx_wakeup(kmxlnd_data.kmx_endpt); - up(&kmxlnd_data.kmx_tx_queue_sem); - for (--i; i >= 0; i--) { - wait_for_completion(&kmxlnd_data.kmx_completions[i]); - } - LASSERT(atomic_read(&kmxlnd_data.kmx_nthreads) == 0); - MXLND_FREE(kmxlnd_data.kmx_completions, - nthreads * sizeof(struct completion)); - goto failed; - } - - kmxlnd_data.kmx_init = MXLND_INIT_THREADS; - /*****************************************************/ - - kmxlnd_data.kmx_init = MXLND_INIT_ALL; - CDEBUG(D_MALLOC, "startup complete (kmx_mem_used %ld)\n", kmxlnd_data.kmx_mem_used); - - return 0; -failed: - CERROR("mxlnd_startup failed\n"); - mxlnd_shutdown(ni); - return (-ENETDOWN); -} - -static int mxlnd_init(void) -{ - lnet_register_lnd(&the_kmxlnd); - return 0; -} - -static void mxlnd_exit(void) -{ - lnet_unregister_lnd(&the_kmxlnd); - return; -} - -module_init(mxlnd_init); -module_exit(mxlnd_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Myricom, Inc. - help@myri.com"); -MODULE_DESCRIPTION("Kernel MyrinetExpress LND"); -MODULE_VERSION("0.6.0"); diff --git a/lnet/klnds/mxlnd/mxlnd.h b/lnet/klnds/mxlnd/mxlnd.h deleted file mode 100644 index 43d5c13..0000000 --- a/lnet/klnds/mxlnd/mxlnd.h +++ /dev/null @@ -1,566 +0,0 @@ -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf - * - * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, - * CA 95054 USA or visit www.sun.com if you need additional information or - * have any questions. - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2012, 2014, Intel Corporation. - * - * Copyright (C) 2006 Myricom, Inc. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lnet/klnds/mxlnd/mxlnd.h - * - * Author: Eric Barton - * Author: Scott Atchley - */ - -#include /* module */ -#include /* module */ -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include /* module */ -#include -#include -#include -#include -#include -#include -#include -#include -#include /* msecs_to_jiffies */ -#include - -#include -#include - -#include /* __LITTLE_ENDIAN */ -#include /* arp table */ -#include /* get_device_by_name */ -#include /* neigh_lookup, etc. */ -#include /* sock_create_kern, kernel_connect, sock_release */ - -#define DEBUG_SUBSYSTEM S_LND - -#include "libcfs/libcfs.h" -#include "lnet/lnet.h" -#include "lnet/lib-lnet.h" - -#define MX_KERNEL 1 -#include "mx_extensions.h" -#include "myriexpress.h" - -#if LNET_MAX_IOV > MX_MAX_SEGMENTS - #error LNET_MAX_IOV is greater then MX_MAX_SEGMENTS -#endif - -#define MXLND_MSG_MAGIC 0x4d583130 /* unique magic 'MX10' */ -#define MXLND_MSG_VERSION 0x03 - -/* Using MX's 64 match bits - * We are using the match bits to specify message type and the cookie. The - * highest four bits (60-63) are reserved for message type. Below we specify - * the types. We reserve the remaining combinations for future use. The next 8 - * bits (52-59) are reserved for returning a status code for failed GET_DATA - * (payload) messages. The last 52 bits are used for cookies. That should allow - * unique cookies for 4 KB messages at 10 Gbps line rate without rollover for - * about 8 years. That should be enough. */ - -#define MXLND_MSG_OFFSET 60 /* msg type offset */ -#define MXLND_MSG_BITS (64 - MXLND_MSG_OFFSET) -#define MXLND_MSG_MASK (((1ULL<> MXLND_MSG_OFFSET) - -#define MXLND_ERROR_OFFSET 52 /* error value offset */ -#define MXLND_ERROR_BITS (MXLND_MSG_OFFSET - MXLND_ERROR_OFFSET) -#define MXLND_ERROR_MASK (((1ULL<> MXLND_ERROR_OFFSET) - -/* message types */ -#define MXLND_MSG_ICON_REQ 0xb /* mx_iconnect() before CONN_REQ */ -#define MXLND_MSG_CONN_REQ 0xc /* connection request */ -#define MXLND_MSG_ICON_ACK 0x9 /* mx_iconnect() before CONN_ACK */ -#define MXLND_MSG_CONN_ACK 0xa /* connection request response */ -#define MXLND_MSG_BYE 0xd /* disconnect msg */ -#define MXLND_MSG_EAGER 0xe /* eager message */ -#define MXLND_MSG_NOOP 0x1 /* no msg, return credits */ -#define MXLND_MSG_PUT_REQ 0x2 /* put request src->sink */ -#define MXLND_MSG_PUT_ACK 0x3 /* put ack src<-sink */ -#define MXLND_MSG_PUT_DATA 0x4 /* put payload src->sink */ -#define MXLND_MSG_GET_REQ 0x5 /* get request sink->src */ -#define MXLND_MSG_GET_DATA 0x6 /* get payload sink<-src */ - -/* when to roll-over the cookie value */ -#define MXLND_MAX_COOKIE ((1ULL << MXLND_ERROR_OFFSET) - 1) - -/* defaults for configurable parameters */ -#define MXLND_N_SCHED 1 /* # schedulers (mx_wait_any() threads) */ -#define MXLND_NDAEMONS 3 /* connd, timeoutd, tx_queued */ -#define MXLND_MX_BOARD 0 /* Use the first MX NIC if more than 1 avail */ -#define MXLND_MX_EP_ID 0 /* MX endpoint ID */ -/* timeout for send/recv (jiffies) */ -#define MXLND_COMM_TIMEOUT msecs_to_jiffies(20 * MSEC_PER_SEC) -/* timeout for wait (jiffies) */ -#define MXLND_WAIT_TIMEOUT msecs_to_jiffies(MSEC_PER_SEC) -/* timeout for connections (jiffies) */ -#define MXLND_CONNECT_TIMEOUT msecs_to_jiffies(5 * MSEC_PER_SEC) -#define MXLND_POLLING 1000 /* poll iterations before blocking */ -#define MXLND_LOOKUP_COUNT 5 /* how many times to try to resolve MAC */ -#define MXLND_MAX_PEERS 1024 /* number of nodes talking to me */ - -#define MXLND_MSG_SIZE (4<<10) /* pre-posted eager message size */ -#define MXLND_MSG_QUEUE_DEPTH 8 /* default msg queue depth */ -#define MXLND_NTX 256 /* default # of tx msg descriptors */ - -#define MXLND_HASH_BITS 6 /* the number of bits to hash over */ -#define MXLND_HASH_SIZE (1< 0, poll this many - iterations before blocking */ -} kmx_tunables_t; - -typedef struct -{ - int mxg_npages; /* # pages */ - struct page *mxg_pages[0]; -} kmx_pages_t; - -/* global interface state */ -typedef struct kmx_data -{ - int kmx_init; /* initialization state */ - atomic_t kmx_shutdown; /* shutting down? */ - atomic_t kmx_nthreads; /* number of threads */ - struct completion *kmx_completions; /* array of completion struct */ - lnet_ni_t *kmx_ni; /* the LND instance */ - u64 kmx_incarnation; /* my incarnation value */ - long kmx_mem_used; /* memory used */ - mx_endpoint_t kmx_endpt; /* the MX endpoint */ - mx_endpoint_addr_t kmx_epa; /* the MX endpoint address */ - - rwlock_t kmx_global_lock; /* global lock */ - spinlock_t kmx_mem_lock; /* memory accounting lock */ - - cfs_list_t kmx_conn_reqs; /* list of connection reqs */ - spinlock_t kmx_conn_lock; /* connection list lock */ - struct semaphore kmx_conn_sem; /* connection request list */ - cfs_list_t kmx_conn_zombies; /* list of zombie connections */ - cfs_list_t kmx_orphan_msgs; /* list of txs to cancel */ - - /* list of all known peers */ - cfs_list_t kmx_peers[MXLND_HASH_SIZE]; - atomic_t kmx_npeers; /* number of peers */ - - kmx_pages_t *kmx_tx_pages; /* tx msg pages */ - - struct kmx_ctx *kmx_txs; /* all tx descriptors */ - cfs_list_t kmx_tx_idle; /* list of idle tx */ - spinlock_t kmx_tx_idle_lock; /* lock for idle tx list */ - s32 kmx_tx_used; /* txs in use */ - u64 kmx_tx_next_cookie; /* unique id for tx */ - cfs_list_t kmx_tx_queue; /* generic send queue */ - spinlock_t kmx_tx_queue_lock; /* lock for generic sends */ - struct semaphore kmx_tx_queue_sem; /* semaphore for tx queue */ -} kmx_data_t; - -#define MXLND_INIT_NOTHING 0 /* in the beginning, there was nothing... */ -#define MXLND_INIT_DATA 1 /* main data structures created */ -#define MXLND_INIT_TXS 2 /* tx descriptors created */ -#define MXLND_INIT_MX 3 /* initiate MX library, open endpoint, get NIC id */ -#define MXLND_INIT_THREADS 4 /* waitd, timeoutd, tx_queued threads */ -#define MXLND_INIT_ALL 5 /* startup completed */ - -/************************************************************************ - * MXLND Wire message format. - * These are sent in sender's byte order (i.e. receiver flips). - */ - -typedef struct kmx_connreq_msg -{ - u32 mxcrm_queue_depth; /* per peer max messages in flight */ - u32 mxcrm_eager_size; /* size of preposted eager messages */ -} WIRE_ATTR kmx_connreq_msg_t; - -typedef struct kmx_eager_msg -{ - lnet_hdr_t mxem_hdr; /* lnet header */ - char mxem_payload[0]; /* piggy-backed payload */ -} WIRE_ATTR kmx_eager_msg_t; - -typedef struct kmx_putreq_msg -{ - lnet_hdr_t mxprm_hdr; /* lnet header */ - u64 mxprm_cookie; /* opaque completion cookie */ -} WIRE_ATTR kmx_putreq_msg_t; - -typedef struct kmx_putack_msg -{ - u64 mxpam_src_cookie; /* reflected completion cookie */ - u64 mxpam_dst_cookie; /* opaque completion cookie */ -} WIRE_ATTR kmx_putack_msg_t; - -typedef struct kmx_getreq_msg -{ - lnet_hdr_t mxgrm_hdr; /* lnet header */ - u64 mxgrm_cookie; /* opaque completion cookie */ -} WIRE_ATTR kmx_getreq_msg_t; - -typedef struct kmx_msg -{ - /* First two fields fixed for all time */ - u32 mxm_magic; /* MXLND message */ - u16 mxm_version; /* version number */ - - u8 mxm_type; /* message type */ - u8 mxm_credits; /* returned credits */ - u32 mxm_nob; /* # of bytes in whole message */ - u32 mxm_cksum; /* checksum (0 == no checksum) */ - u64 mxm_srcnid; /* sender's NID */ - u64 mxm_srcstamp; /* sender's incarnation */ - u64 mxm_dstnid; /* destination's NID */ - u64 mxm_dststamp; /* destination's incarnation */ - - union { - kmx_connreq_msg_t conn_req; - kmx_eager_msg_t eager; - kmx_putreq_msg_t put_req; - kmx_putack_msg_t put_ack; - kmx_getreq_msg_t get_req; - } WIRE_ATTR mxm_u; -} WIRE_ATTR kmx_msg_t; - -/***********************************************************************/ - -enum kmx_req_type { - MXLND_REQ_TX = 0, - MXLND_REQ_RX = 1, -}; - -/* The life cycle of a request */ -enum kmx_req_state { - MXLND_CTX_INIT = 0, /* just created */ - MXLND_CTX_IDLE = 1, /* available for use */ - MXLND_CTX_PREP = 2, /* getting ready for send/recv */ - MXLND_CTX_PENDING = 3, /* mx_isend() or mx_irecv() called */ - MXLND_CTX_COMPLETED = 4, /* cleaning up after completion or timeout */ - MXLND_CTX_CANCELED = 5, /* timed out but still in ctx list */ -}; - -/* Context Structure - generic tx/rx descriptor - * It represents the context (or state) of each send or receive request. - * In other LNDs, they have separate TX and RX descriptors and this replaces both. - * - * The txs live on the global kmx_txs array for cleanup. The rxs are managed - * per struct kmx_conn. We will move them between the rx/tx idle lists and the - * pending list which is monitored by mxlnd_timeoutd(). - */ -typedef struct kmx_ctx -{ - enum kmx_req_type mxc_type; /* TX or RX */ - u64 mxc_incarnation; /* store the peer's incarnation here - to verify before changing flow - control credits after completion */ - unsigned long mxc_deadline; /* request time out in absolute jiffies */ - enum kmx_req_state mxc_state; /* what is the state of the request? */ - cfs_list_t mxc_list; /* place on rx/tx idle list, tx q, peer tx */ - cfs_list_t mxc_rx_list; /* place on mxp_rx_posted list */ - - lnet_nid_t mxc_nid; /* dst's NID if peer is not known */ - struct kmx_peer *mxc_peer; /* owning peer */ - struct kmx_conn *mxc_conn; /* owning conn */ - kmx_msg_t *mxc_msg; /* msg hdr mapped to mxc_page */ - lnet_msg_t *mxc_lntmsg[2]; /* lnet msgs to finalize */ - - u8 mxc_msg_type; /* what type of message is this? */ - u64 mxc_cookie; /* completion cookie */ - u64 mxc_match; /* MX match info */ - mx_ksegment_t mxc_seg; /* local MX ksegment for non-DATA */ - mx_ksegment_t *mxc_seg_list; /* MX ksegment array for DATA */ - int mxc_nseg; /* number of segments */ - unsigned long mxc_pin_type; /* MX_PIN_PHYSICAL [| MX_PIN_FULLPAGES] */ - u32 mxc_nob; /* number of bytes sent/received */ - mx_request_t mxc_mxreq; /* MX request */ - mx_status_t mxc_status; /* MX status */ - u32 mxc_errno; /* errno for LNET */ - u64 mxc_get; /* # of times returned from idle list */ - u64 mxc_put; /* # of times returned from idle list */ -} kmx_ctx_t; - -#define MXLND_CONN_DISCONNECT -2 /* conn is being destroyed - do not add txs */ -#define MXLND_CONN_FAIL -1 /* connect failed (bad handshake, unavail, etc.) */ -#define MXLND_CONN_INIT 0 /* in the beginning, there was nothing... */ -#define MXLND_CONN_REQ 1 /* a connection request message is needed */ -#define MXLND_CONN_ACK 2 /* a connection ack is needed */ -#define MXLND_CONN_WAIT 3 /* waiting for req or ack to complete */ -#define MXLND_CONN_READY 4 /* ready to send */ - -/* store all data from an unexpected CONN_[REQ|ACK] receive */ -typedef struct kmx_connparams -{ - cfs_list_t mxr_list; /* list to hang on kmx_conn_reqs */ - void *mxr_context; /* context - unused - will hold net */ - mx_endpoint_addr_t mxr_epa; /* the peer's epa */ - u64 mxr_match; /* the CONN_REQ's match bits */ - u32 mxr_nob; /* length of CONN_REQ message */ - struct kmx_peer *mxr_peer; /* peer if known */ - struct kmx_conn *mxr_conn; /* conn if known */ - kmx_msg_t mxr_msg; /* the msg header & connreq_msg_t */ -} kmx_connparams_t; - -/* connection state - queues for queued and pending msgs */ -typedef struct kmx_conn -{ - struct kmx_peer *mxk_peer; /* owning peer */ - cfs_list_t mxk_list; /* for placing on mxp_conns */ - cfs_list_t mxk_zombie; /* for placing on zombies list */ - u64 mxk_incarnation; /* connections's incarnation value */ - u32 mxk_sid; /* peer's MX session id */ - atomic_t mxk_refcount; /* reference counting */ - int mxk_status; /* can we send messages? MXLND_CONN_* */ - - mx_endpoint_addr_t mxk_epa; /* peer's endpoint address */ - - spinlock_t mxk_lock; /* lock */ - unsigned long mxk_timeout; /* expiration of oldest pending tx/rx */ - unsigned long mxk_last_tx; /* when last tx completed with success */ - unsigned long mxk_last_rx; /* when last rx completed */ - - kmx_pages_t *mxk_rx_pages; /* rx msg pages */ - kmx_ctx_t *mxk_rxs; /* the rx descriptors */ - cfs_list_t mxk_rx_idle; /* list of idle rx */ - - int mxk_credits; /* # of my credits for sending to peer */ - int mxk_outstanding; /* # of credits to return */ - - cfs_list_t mxk_tx_credit_queue; /* send queue for peer */ - cfs_list_t mxk_tx_free_queue; /* send queue for peer */ - int mxk_ntx_msgs; /* # of msgs on tx queues */ - int mxk_ntx_data ; /* # of DATA on tx queues */ - int mxk_ntx_posted; /* # of tx msgs in flight */ - int mxk_data_posted; /* # of tx data payloads in flight */ - - cfs_list_t mxk_pending; /* in flight rxs and txs */ -} kmx_conn_t; - -/* peer state */ -typedef struct kmx_peer -{ - cfs_list_t mxp_list; /* for placing on kmx_peers */ - lnet_nid_t mxp_nid; /* peer's LNET NID */ - lnet_ni_t *mxp_ni; /* LNET interface */ - atomic_t mxp_refcount; /* reference counts */ - - cfs_list_t mxp_conns; /* list of connections */ - kmx_conn_t *mxp_conn; /* current connection */ - cfs_list_t mxp_tx_queue; /* msgs waiting for a conn */ - - u32 mxp_board; /* peer's board rank */ - u32 mxp_ep_id; /* peer's MX endpoint ID */ - u64 mxp_nic_id; /* remote's MX nic_id for mx_connect() */ - - unsigned long mxp_reconnect_time; /* when to retry connect */ - int mxp_incompatible; /* incorrect conn_req values */ -} kmx_peer_t; - -extern kmx_data_t kmxlnd_data; -extern kmx_tunables_t kmxlnd_tunables; - -/* required for the LNET API */ -int mxlnd_startup(lnet_ni_t *ni); -void mxlnd_shutdown(lnet_ni_t *ni); -int mxlnd_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg); -int mxlnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg); -int mxlnd_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, int delayed, - unsigned int niov, struct iovec *iov, lnet_kiov_t *kiov, - unsigned int offset, unsigned int mlen, unsigned int rlen); - -/* in mxlnd.c */ -extern void mxlnd_thread_stop(long id); -extern void mxlnd_ctx_init(kmx_ctx_t *ctx); -extern int mxlnd_peer_alloc(kmx_peer_t **peerp, lnet_nid_t nid, - u32 board, u32 ep_id, u64 nic_id); -extern int mxlnd_alloc_pages(kmx_pages_t **pp, int npages); -extern void mxlnd_free_pages(kmx_pages_t *p); - -/* in mxlnd_cb.c */ -void mxlnd_eager_recv(void *context, uint64_t match_value, uint32_t length); -extern mx_unexp_handler_action_t mxlnd_unexpected_recv(void *context, - mx_endpoint_addr_t source, uint64_t match_value, uint32_t length, - void *data_if_available); -extern void mxlnd_peer_free(kmx_peer_t *peer); -extern void mxlnd_conn_free_locked(kmx_conn_t *conn); -extern void mxlnd_conn_disconnect(kmx_conn_t *conn, int mx_dis, int send_bye); -extern int mxlnd_close_matching_conns(lnet_nid_t nid); -extern void mxlnd_sleep(unsigned long timeout); -extern int mxlnd_tx_queued(void *arg); -extern void mxlnd_handle_rx_completion(kmx_ctx_t *rx); -extern int mxlnd_check_sends(kmx_peer_t *peer); -extern int mxlnd_tx_peer_queued(void *arg); -extern int mxlnd_request_waitd(void *arg); -extern int mxlnd_unex_recvd(void *arg); -extern int mxlnd_timeoutd(void *arg); -extern int mxlnd_free_conn_zombies(void); -extern int mxlnd_connd(void *arg); -extern int mxlnd_del_peer(lnet_nid_t nid); - - -/** - * mxlnd_nid_to_hash - hash the nid - * @nid - LNET ID - * - * Takes the u64 nid and XORs the lowest N bits by the next lowest N bits. - */ -static inline int -mxlnd_nid_to_hash(lnet_nid_t nid) -{ - return (nid & MXLND_HASH_MASK) ^ - ((nid & (MXLND_HASH_MASK << MXLND_HASH_BITS)) >> MXLND_HASH_BITS); -} - - -#define mxlnd_peer_addref(peer) \ -do { \ - LASSERT(peer != NULL); \ - LASSERT(atomic_read(&(peer)->mxp_refcount) > 0); \ - atomic_inc(&(peer)->mxp_refcount); \ -} while (0) - - -#define mxlnd_peer_decref(peer) \ -do { \ - LASSERT(atomic_read(&(peer)->mxp_refcount) > 0); \ - if (atomic_dec_and_test(&(peer)->mxp_refcount)) \ - mxlnd_peer_free(peer); \ -} while (0) - -#define mxlnd_conn_addref(conn) \ -do { \ - LASSERT(conn != NULL); \ - LASSERT(atomic_read(&(conn)->mxk_refcount) > 0); \ - atomic_inc(&(conn)->mxk_refcount); \ -} while (0) - - -#define mxlnd_conn_decref(conn) \ -do { \ - LASSERT(conn != NULL); \ - LASSERT(atomic_read(&(conn)->mxk_refcount) > 0); \ - if (atomic_dec_and_test(&(conn)->mxk_refcount)) { \ - spin_lock(&kmxlnd_data.kmx_conn_lock); \ - LASSERT((conn)->mxk_status == MXLND_CONN_DISCONNECT); \ - CDEBUG(D_NET, "adding conn %p to zombies\n", (conn)); \ - cfs_list_add_tail(&(conn)->mxk_zombie, \ - &kmxlnd_data.kmx_conn_zombies); \ - spin_unlock(&kmxlnd_data.kmx_conn_lock); \ - up(&kmxlnd_data.kmx_conn_sem); \ - } \ -} while (0) - -#define mxlnd_valid_msg_type(type) \ -do { \ - LASSERT((type) == MXLND_MSG_EAGER || \ - (type) == MXLND_MSG_ICON_REQ || \ - (type) == MXLND_MSG_CONN_REQ || \ - (type) == MXLND_MSG_ICON_ACK || \ - (type) == MXLND_MSG_CONN_ACK || \ - (type) == MXLND_MSG_BYE || \ - (type) == MXLND_MSG_NOOP || \ - (type) == MXLND_MSG_PUT_REQ || \ - (type) == MXLND_MSG_PUT_ACK || \ - (type) == MXLND_MSG_PUT_DATA || \ - (type) == MXLND_MSG_GET_REQ || \ - (type) == MXLND_MSG_GET_DATA); \ -} while (0) diff --git a/lnet/klnds/mxlnd/mxlnd_cb.c b/lnet/klnds/mxlnd/mxlnd_cb.c deleted file mode 100644 index d40d1c6..0000000 --- a/lnet/klnds/mxlnd/mxlnd_cb.c +++ /dev/null @@ -1,4088 +0,0 @@ -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf - * - * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, - * CA 95054 USA or visit www.sun.com if you need additional information or - * have any questions. - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2012, 2014, Intel Corporation. - * - * Copyright (C) 2006 Myricom, Inc. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lnet/klnds/mxlnd/mxlnd.c - * - * Author: Eric Barton - * Author: Scott Atchley - */ - -#include -#include "mxlnd.h" - -mx_endpoint_addr_t MX_EPA_NULL; /* use to determine if an endpoint is NULL */ - -inline int -mxlnd_endpoint_addr_null(mx_endpoint_addr_t epa) -{ - /* if memcmp() == 0, it is NULL */ - return !(memcmp(&epa, &MX_EPA_NULL, sizeof(epa))); -} - -char * -mxlnd_ctxstate_to_str(int mxc_state) -{ - switch (mxc_state) { - case MXLND_CTX_INIT: - return "MXLND_CTX_INIT"; - case MXLND_CTX_IDLE: - return "MXLND_CTX_IDLE"; - case MXLND_CTX_PREP: - return "MXLND_CTX_PREP"; - case MXLND_CTX_PENDING: - return "MXLND_CTX_PENDING"; - case MXLND_CTX_COMPLETED: - return "MXLND_CTX_COMPLETED"; - case MXLND_CTX_CANCELED: - return "MXLND_CTX_CANCELED"; - default: - return "*unknown*"; - } -} - -char * -mxlnd_connstatus_to_str(int mxk_status) -{ - switch (mxk_status) { - case MXLND_CONN_READY: - return "MXLND_CONN_READY"; - case MXLND_CONN_INIT: - return "MXLND_CONN_INIT"; - case MXLND_CONN_WAIT: - return "MXLND_CONN_WAIT"; - case MXLND_CONN_DISCONNECT: - return "MXLND_CONN_DISCONNECT"; - case MXLND_CONN_FAIL: - return "MXLND_CONN_FAIL"; - default: - return "unknown"; - } -} - -char * -mxlnd_msgtype_to_str(int type) { - switch (type) { - case MXLND_MSG_EAGER: - return "MXLND_MSG_EAGER"; - case MXLND_MSG_CONN_REQ: - return "MXLND_MSG_CONN_REQ"; - case MXLND_MSG_CONN_ACK: - return "MXLND_MSG_CONN_ACK"; - case MXLND_MSG_BYE: - return "MXLND_MSG_BYE"; - case MXLND_MSG_NOOP: - return "MXLND_MSG_NOOP"; - case MXLND_MSG_PUT_REQ: - return "MXLND_MSG_PUT_REQ"; - case MXLND_MSG_PUT_ACK: - return "MXLND_MSG_PUT_ACK"; - case MXLND_MSG_PUT_DATA: - return "MXLND_MSG_PUT_DATA"; - case MXLND_MSG_GET_REQ: - return "MXLND_MSG_GET_REQ"; - case MXLND_MSG_GET_DATA: - return "MXLND_MSG_GET_DATA"; - default: - return "unknown"; - } -} - -char * -mxlnd_lnetmsg_to_str(int type) -{ - switch (type) { - case LNET_MSG_ACK: - return "LNET_MSG_ACK"; - case LNET_MSG_PUT: - return "LNET_MSG_PUT"; - case LNET_MSG_GET: - return "LNET_MSG_GET"; - case LNET_MSG_REPLY: - return "LNET_MSG_REPLY"; - case LNET_MSG_HELLO: - return "LNET_MSG_HELLO"; - default: - LBUG(); - return "*unknown*"; - } -} - -static inline u64 -mxlnd_create_match(kmx_ctx_t *ctx, u8 error) -{ - u64 type = (u64) ctx->mxc_msg_type; - u64 err = (u64) error; - u64 match = 0ULL; - - mxlnd_valid_msg_type(ctx->mxc_msg_type); - LASSERT(ctx->mxc_cookie >> MXLND_ERROR_OFFSET == 0); - match = (type << MXLND_MSG_OFFSET) | (err << MXLND_ERROR_OFFSET) | ctx->mxc_cookie; - return match; -} - -static inline void -mxlnd_parse_match(u64 match, u8 *msg_type, u8 *error, u64 *cookie) -{ - *msg_type = (u8) MXLND_MSG_TYPE(match); - *error = (u8) MXLND_ERROR_VAL(match); - *cookie = match & MXLND_MAX_COOKIE; - mxlnd_valid_msg_type(*msg_type); - return; -} - -kmx_ctx_t * -mxlnd_get_idle_rx(kmx_conn_t *conn) -{ - cfs_list_t *rxs = NULL; - kmx_ctx_t *rx = NULL; - - LASSERT(conn != NULL); - - rxs = &conn->mxk_rx_idle; - - spin_lock(&conn->mxk_lock); - - if (cfs_list_empty (rxs)) { - spin_unlock(&conn->mxk_lock); - return NULL; - } - - rx = cfs_list_entry (rxs->next, kmx_ctx_t, mxc_list); - cfs_list_del_init(&rx->mxc_list); - spin_unlock(&conn->mxk_lock); - -#if MXLND_DEBUG - if (rx->mxc_get != rx->mxc_put) { - CNETERR("*** RX get (%llu) != put (%llu) ***\n", rx->mxc_get, rx->mxc_put); - CNETERR("*** incarnation= %lld ***\n", rx->mxc_incarnation); - CNETERR("*** deadline= %ld ***\n", rx->mxc_deadline); - CNETERR("*** state= %s ***\n", mxlnd_ctxstate_to_str(rx->mxc_state)); - CNETERR("*** listed?= %d ***\n", !cfs_list_empty(&rx->mxc_list)); - CNETERR("*** nid= 0x%llx ***\n", rx->mxc_nid); - CNETERR("*** peer= 0x%p ***\n", rx->mxc_peer); - CNETERR("*** msg_type= %s ***\n", mxlnd_msgtype_to_str(rx->mxc_msg_type)); - CNETERR("*** cookie= 0x%llx ***\n", rx->mxc_cookie); - CNETERR("*** nob= %d ***\n", rx->mxc_nob); - } -#endif - LASSERT (rx->mxc_get == rx->mxc_put); - - rx->mxc_get++; - - LASSERT (rx->mxc_state == MXLND_CTX_IDLE); - rx->mxc_state = MXLND_CTX_PREP; - rx->mxc_deadline = jiffies + MXLND_COMM_TIMEOUT; - - return rx; -} - -int -mxlnd_put_idle_rx(kmx_ctx_t *rx) -{ - kmx_conn_t *conn = rx->mxc_conn; - cfs_list_t *rxs = &conn->mxk_rx_idle; - - LASSERT(rx->mxc_type == MXLND_REQ_RX); - - mxlnd_ctx_init(rx); - - rx->mxc_put++; - LASSERT(rx->mxc_get == rx->mxc_put); - - spin_lock(&conn->mxk_lock); - cfs_list_add(&rx->mxc_list, rxs); - spin_unlock(&conn->mxk_lock); - return 0; -} - -kmx_ctx_t * -mxlnd_get_idle_tx(void) -{ - cfs_list_t *tmp = &kmxlnd_data.kmx_tx_idle; - kmx_ctx_t *tx = NULL; - - spin_lock(&kmxlnd_data.kmx_tx_idle_lock); - - if (cfs_list_empty (&kmxlnd_data.kmx_tx_idle)) { - CNETERR("%d txs in use\n", kmxlnd_data.kmx_tx_used); - spin_unlock(&kmxlnd_data.kmx_tx_idle_lock); - return NULL; - } - - tmp = &kmxlnd_data.kmx_tx_idle; - tx = cfs_list_entry (tmp->next, kmx_ctx_t, mxc_list); - cfs_list_del_init(&tx->mxc_list); - - /* Allocate a new completion cookie. It might not be needed, - * but we've got a lock right now and we're unlikely to - * wrap... */ - tx->mxc_cookie = kmxlnd_data.kmx_tx_next_cookie++; - if (kmxlnd_data.kmx_tx_next_cookie > MXLND_MAX_COOKIE) { - kmxlnd_data.kmx_tx_next_cookie = 1; - } - kmxlnd_data.kmx_tx_used++; - spin_unlock(&kmxlnd_data.kmx_tx_idle_lock); - - LASSERT (tx->mxc_get == tx->mxc_put); - - tx->mxc_get++; - - LASSERT (tx->mxc_state == MXLND_CTX_IDLE); - LASSERT (tx->mxc_lntmsg[0] == NULL); - LASSERT (tx->mxc_lntmsg[1] == NULL); - - tx->mxc_state = MXLND_CTX_PREP; - tx->mxc_deadline = jiffies + MXLND_COMM_TIMEOUT; - - return tx; -} - -void -mxlnd_conn_disconnect(kmx_conn_t *conn, int mx_dis, int send_bye); - -int -mxlnd_put_idle_tx(kmx_ctx_t *tx) -{ - int result = 0; - lnet_msg_t *lntmsg[2]; - - LASSERT(tx->mxc_type == MXLND_REQ_TX); - - if (tx->mxc_status.code != MX_STATUS_SUCCESS || tx->mxc_errno != 0) { - kmx_conn_t *conn = tx->mxc_conn; - - result = -EIO; - if (tx->mxc_errno != 0) result = tx->mxc_errno; - /* FIXME should we set mx_dis? */ - mxlnd_conn_disconnect(conn, 0, 1); - } - - lntmsg[0] = tx->mxc_lntmsg[0]; - lntmsg[1] = tx->mxc_lntmsg[1]; - - mxlnd_ctx_init(tx); - - tx->mxc_put++; - LASSERT(tx->mxc_get == tx->mxc_put); - - spin_lock(&kmxlnd_data.kmx_tx_idle_lock); - cfs_list_add_tail(&tx->mxc_list, &kmxlnd_data.kmx_tx_idle); - kmxlnd_data.kmx_tx_used--; - spin_unlock(&kmxlnd_data.kmx_tx_idle_lock); - - if (lntmsg[0] != NULL) - lnet_finalize(kmxlnd_data.kmx_ni, lntmsg[0], result); - if (lntmsg[1] != NULL) - lnet_finalize(kmxlnd_data.kmx_ni, lntmsg[1], result); - return 0; -} - - -void -mxlnd_connparams_free(kmx_connparams_t *cp) -{ - LASSERT(cfs_list_empty(&cp->mxr_list)); - MXLND_FREE(cp, sizeof(*cp)); - return; -} - -int -mxlnd_connparams_alloc(kmx_connparams_t **cp, void *context, - mx_endpoint_addr_t epa, u64 match, u32 length, - kmx_conn_t *conn, kmx_peer_t *peer, void *data) -{ - kmx_connparams_t *c = NULL; - - MXLND_ALLOC(c, sizeof(*c)); - if (!c) return -ENOMEM; - - CFS_INIT_LIST_HEAD(&c->mxr_list); - c->mxr_context = context; - c->mxr_epa = epa; - c->mxr_match = match; - c->mxr_nob = length; - c->mxr_conn = conn; - c->mxr_peer = peer; - c->mxr_msg = *((kmx_msg_t *) data); - - *cp = c; - return 0; -} - -static inline void -mxlnd_set_conn_status(kmx_conn_t *conn, int status) -{ - conn->mxk_status = status; - smp_mb(); -} - -/** - * mxlnd_conn_free_locked - free the conn - * @conn - a kmx_conn pointer - * - * The calling function should remove the conn from the conns list first - * then destroy it. Caller should have write-locked kmx_global_lock. - */ -void -mxlnd_conn_free_locked(kmx_conn_t *conn) -{ - int valid = !mxlnd_endpoint_addr_null(conn->mxk_epa); - kmx_peer_t *peer = conn->mxk_peer; - - CDEBUG(D_NET, "freeing conn 0x%p *****\n", conn); - LASSERT (cfs_list_empty (&conn->mxk_tx_credit_queue) && - cfs_list_empty (&conn->mxk_tx_free_queue) && - cfs_list_empty (&conn->mxk_pending)); - if (!cfs_list_empty(&conn->mxk_list)) { - cfs_list_del_init(&conn->mxk_list); - if (peer->mxp_conn == conn) { - peer->mxp_conn = NULL; - if (valid) { - kmx_conn_t *temp = NULL; - - mx_get_endpoint_addr_context(conn->mxk_epa, - (void **) &temp); - if (conn == temp) { - mx_set_endpoint_addr_context(conn->mxk_epa, - (void *) NULL); - } - } - /* unlink from global list and drop its ref */ - cfs_list_del_init(&peer->mxp_list); - mxlnd_peer_decref(peer); - } - } - mxlnd_peer_decref(peer); /* drop conn's ref to peer */ - if (conn->mxk_rx_pages) { - LASSERT (conn->mxk_rxs != NULL); - mxlnd_free_pages(conn->mxk_rx_pages); - } - if (conn->mxk_rxs) { - int i = 0; - kmx_ctx_t *rx = NULL; - - for (i = 0; i < MXLND_RX_MSGS(); i++) { - rx = &conn->mxk_rxs[i]; - if (rx->mxc_seg_list != NULL) { - LASSERT(rx->mxc_nseg > 0); - MXLND_FREE(rx->mxc_seg_list, - rx->mxc_nseg * - sizeof(*rx->mxc_seg_list)); - } - } - MXLND_FREE(conn->mxk_rxs, MXLND_RX_MSGS() * sizeof(kmx_ctx_t)); - } - - MXLND_FREE(conn, sizeof (*conn)); - return; -} - - -int -mxlnd_conn_cancel_pending_rxs(kmx_conn_t *conn) -{ - int found = 0; - int count = 0; - kmx_ctx_t *ctx = NULL; - kmx_ctx_t *next = NULL; - mx_return_t mxret = MX_SUCCESS; - u32 result = 0; - - do { - found = 0; - spin_lock(&conn->mxk_lock); - cfs_list_for_each_entry_safe(ctx, next, &conn->mxk_pending, - mxc_list) { - cfs_list_del_init(&ctx->mxc_list); - if (ctx->mxc_type == MXLND_REQ_RX) { - found = 1; - mxret = mx_cancel(kmxlnd_data.kmx_endpt, - &ctx->mxc_mxreq, - &result); - if (mxret != MX_SUCCESS) { - CNETERR("mx_cancel() returned %s (%d)\n", mx_strerror(mxret), mxret); - } - if (result == 1) { - ctx->mxc_errno = -ECONNABORTED; - ctx->mxc_state = MXLND_CTX_CANCELED; - spin_unlock(&conn->mxk_lock); - spin_lock(&kmxlnd_data.kmx_conn_lock); - /* we may be holding the global lock, - * move to orphan list so that it can free it */ - cfs_list_add_tail(&ctx->mxc_list, - &kmxlnd_data.kmx_orphan_msgs); - count++; - spin_unlock(&kmxlnd_data.kmx_conn_lock); - spin_lock(&conn->mxk_lock); - } - break; - } - } - spin_unlock(&conn->mxk_lock); - } while (found); - - return count; -} - -int -mxlnd_cancel_queued_txs(kmx_conn_t *conn) -{ - int count = 0; - cfs_list_t *tmp = NULL; - - spin_lock(&conn->mxk_lock); - while (!cfs_list_empty(&conn->mxk_tx_free_queue) || - !cfs_list_empty(&conn->mxk_tx_credit_queue)) { - - kmx_ctx_t *tx = NULL; - - if (!cfs_list_empty(&conn->mxk_tx_free_queue)) { - tmp = &conn->mxk_tx_free_queue; - } else { - tmp = &conn->mxk_tx_credit_queue; - } - - tx = cfs_list_entry(tmp->next, kmx_ctx_t, mxc_list); - cfs_list_del_init(&tx->mxc_list); - spin_unlock(&conn->mxk_lock); - tx->mxc_errno = -ECONNABORTED; - tx->mxc_state = MXLND_CTX_CANCELED; - /* move to orphan list and then abort */ - spin_lock(&kmxlnd_data.kmx_conn_lock); - cfs_list_add_tail(&tx->mxc_list, &kmxlnd_data.kmx_orphan_msgs); - spin_unlock(&kmxlnd_data.kmx_conn_lock); - count++; - spin_lock(&conn->mxk_lock); - } - spin_unlock(&conn->mxk_lock); - - return count; -} - -void -mxlnd_send_message(mx_endpoint_addr_t epa, u8 msg_type, int error, u64 cookie) -{ - u64 match = (((u64) msg_type) << MXLND_MSG_OFFSET) | - (((u64) error) << MXLND_ERROR_OFFSET) | cookie; - - mx_kisend(kmxlnd_data.kmx_endpt, NULL, 0, MX_PIN_PHYSICAL, - epa, match, NULL, NULL); - return; -} - -/** - * mxlnd_conn_disconnect - shutdown a connection - * @conn - a kmx_conn pointer - * @mx_dis - call mx_disconnect() - * @send_bye - send peer a BYE msg - * - * This function sets the status to DISCONNECT, completes queued - * txs with failure, calls mx_disconnect, which will complete - * pending txs and matched rxs with failure. - */ -void -mxlnd_conn_disconnect(kmx_conn_t *conn, int mx_dis, int send_bye) -{ - mx_endpoint_addr_t epa = conn->mxk_epa; - int valid = !mxlnd_endpoint_addr_null(epa); - int count = 0; - - spin_lock(&conn->mxk_lock); - if (conn->mxk_status == MXLND_CONN_DISCONNECT) { - spin_unlock(&conn->mxk_lock); - return; - } - mxlnd_set_conn_status(conn, MXLND_CONN_DISCONNECT); - conn->mxk_timeout = 0; - spin_unlock(&conn->mxk_lock); - - count = mxlnd_cancel_queued_txs(conn); - count += mxlnd_conn_cancel_pending_rxs(conn); - - if (count) /* let connd call kmxlnd_abort_msgs() */ - up(&kmxlnd_data.kmx_conn_sem); - - if (send_bye && valid && - conn->mxk_peer->mxp_nid != kmxlnd_data.kmx_ni->ni_nid) { - /* send a BYE to the peer */ - CDEBUG(D_NET, "%s: sending a BYE msg to %s\n", __func__, - libcfs_nid2str(conn->mxk_peer->mxp_nid)); - mxlnd_send_message(epa, MXLND_MSG_BYE, 0, 0); - /* wait to allow the peer to ack our message */ - mxlnd_sleep(msecs_to_jiffies(20)); - } - - if (atomic_read(&kmxlnd_data.kmx_shutdown) != 1) { - unsigned long last_msg = 0; - - /* notify LNET that we are giving up on this peer */ - if (cfs_time_after(conn->mxk_last_rx, conn->mxk_last_tx)) - last_msg = conn->mxk_last_rx; - else - last_msg = conn->mxk_last_tx; - - lnet_notify(kmxlnd_data.kmx_ni, conn->mxk_peer->mxp_nid, 0, last_msg); - - if (mx_dis && valid && - (memcmp(&epa, &kmxlnd_data.kmx_epa, sizeof(epa) != 0))) - mx_disconnect(kmxlnd_data.kmx_endpt, epa); - } - mxlnd_conn_decref(conn); /* drop the owning peer's reference */ - - return; -} - -/** - * mxlnd_conn_alloc - allocate and initialize a new conn struct - * @connp - address of a kmx_conn pointer - * @peer - owning kmx_peer - * - * Returns 0 on success and -ENOMEM on failure - */ -int -mxlnd_conn_alloc_locked(kmx_conn_t **connp, kmx_peer_t *peer) -{ - int i = 0; - int ret = 0; - int ipage = 0; - int offset = 0; - void *addr = NULL; - kmx_conn_t *conn = NULL; - kmx_pages_t *pages = NULL; - struct page *page = NULL; - kmx_ctx_t *rx = NULL; - - LASSERT(peer != NULL); - - MXLND_ALLOC(conn, sizeof (*conn)); - if (conn == NULL) { - CNETERR("Cannot allocate conn\n"); - return -ENOMEM; - } - CDEBUG(D_NET, "allocated conn 0x%p for peer 0x%p\n", conn, peer); - - memset(conn, 0, sizeof(*conn)); - - ret = mxlnd_alloc_pages(&pages, MXLND_RX_MSG_PAGES()); - if (ret != 0) { - CERROR("Can't allocate rx pages\n"); - MXLND_FREE(conn, sizeof(*conn)); - return -ENOMEM; - } - conn->mxk_rx_pages = pages; - - MXLND_ALLOC(conn->mxk_rxs, MXLND_RX_MSGS() * sizeof(kmx_ctx_t)); - if (conn->mxk_rxs == NULL) { - CERROR("Can't allocate %d rx descriptors\n", MXLND_RX_MSGS()); - mxlnd_free_pages(pages); - MXLND_FREE(conn, sizeof(*conn)); - return -ENOMEM; - } - - memset(conn->mxk_rxs, 0, MXLND_RX_MSGS() * sizeof(kmx_ctx_t)); - - conn->mxk_peer = peer; - CFS_INIT_LIST_HEAD(&conn->mxk_list); - CFS_INIT_LIST_HEAD(&conn->mxk_zombie); - atomic_set(&conn->mxk_refcount, 2); /* ref for owning peer - and one for the caller */ - if (peer->mxp_nid == kmxlnd_data.kmx_ni->ni_nid) { - u64 nic_id = 0ULL; - u32 ep_id = 0; - - /* this is localhost, set the epa and status as up */ - mxlnd_set_conn_status(conn, MXLND_CONN_READY); - conn->mxk_epa = kmxlnd_data.kmx_epa; - mx_set_endpoint_addr_context(conn->mxk_epa, (void *) conn); - peer->mxp_reconnect_time = 0; - mx_decompose_endpoint_addr(kmxlnd_data.kmx_epa, &nic_id, &ep_id); - peer->mxp_nic_id = nic_id; - peer->mxp_ep_id = ep_id; - conn->mxk_incarnation = kmxlnd_data.kmx_incarnation; - conn->mxk_timeout = 0; - } else { - /* conn->mxk_incarnation = 0 - will be set by peer */ - /* conn->mxk_sid = 0 - will be set by peer */ - mxlnd_set_conn_status(conn, MXLND_CONN_INIT); - /* mxk_epa - to be set after mx_iconnect() */ - } - spin_lock_init(&conn->mxk_lock); - /* conn->mxk_timeout = 0 */ - /* conn->mxk_last_tx = 0 */ - /* conn->mxk_last_rx = 0 */ - CFS_INIT_LIST_HEAD(&conn->mxk_rx_idle); - - conn->mxk_credits = *kmxlnd_tunables.kmx_peercredits; - /* mxk_outstanding = 0 */ - - CFS_INIT_LIST_HEAD(&conn->mxk_tx_credit_queue); - CFS_INIT_LIST_HEAD(&conn->mxk_tx_free_queue); - /* conn->mxk_ntx_msgs = 0 */ - /* conn->mxk_ntx_data = 0 */ - /* conn->mxk_ntx_posted = 0 */ - /* conn->mxk_data_posted = 0 */ - CFS_INIT_LIST_HEAD(&conn->mxk_pending); - - for (i = 0; i < MXLND_RX_MSGS(); i++) { - - rx = &conn->mxk_rxs[i]; - rx->mxc_type = MXLND_REQ_RX; - CFS_INIT_LIST_HEAD(&rx->mxc_list); - - /* map mxc_msg to page */ - page = pages->mxg_pages[ipage]; - addr = page_address(page); - LASSERT(addr != NULL); - rx->mxc_msg = (kmx_msg_t *)(addr + offset); - rx->mxc_seg.segment_ptr = MX_PA_TO_U64(virt_to_phys(rx->mxc_msg)); - - rx->mxc_conn = conn; - rx->mxc_peer = peer; - rx->mxc_nid = peer->mxp_nid; - - mxlnd_ctx_init(rx); - - offset += MXLND_MSG_SIZE; - LASSERT (offset <= PAGE_SIZE); - - if (offset == PAGE_SIZE) { - offset = 0; - ipage++; - LASSERT (ipage <= MXLND_TX_MSG_PAGES()); - } - - cfs_list_add_tail(&rx->mxc_list, &conn->mxk_rx_idle); - } - - *connp = conn; - - mxlnd_peer_addref(peer); /* add a ref for this conn */ - - /* add to front of peer's conns list */ - cfs_list_add(&conn->mxk_list, &peer->mxp_conns); - peer->mxp_conn = conn; - return 0; -} - -int -mxlnd_conn_alloc(kmx_conn_t **connp, kmx_peer_t *peer) -{ - int ret = 0; - rwlock_t *g_lock = &kmxlnd_data.kmx_global_lock; - - write_lock(g_lock); - ret = mxlnd_conn_alloc_locked(connp, peer); - write_unlock(g_lock); - return ret; -} - -int -mxlnd_q_pending_ctx(kmx_ctx_t *ctx) -{ - int ret = 0; - kmx_conn_t *conn = ctx->mxc_conn; - - ctx->mxc_state = MXLND_CTX_PENDING; - if (conn != NULL) { - spin_lock(&conn->mxk_lock); - if (conn->mxk_status >= MXLND_CONN_INIT) { - cfs_list_add_tail(&ctx->mxc_list, &conn->mxk_pending); - if (conn->mxk_timeout == 0 || ctx->mxc_deadline < conn->mxk_timeout) { - conn->mxk_timeout = ctx->mxc_deadline; - } - } else { - ctx->mxc_state = MXLND_CTX_COMPLETED; - ret = -1; - } - spin_unlock(&conn->mxk_lock); - } - return ret; -} - -int -mxlnd_deq_pending_ctx(kmx_ctx_t *ctx) -{ - LASSERT(ctx->mxc_state == MXLND_CTX_PENDING || - ctx->mxc_state == MXLND_CTX_COMPLETED); - if (ctx->mxc_state != MXLND_CTX_PENDING && - ctx->mxc_state != MXLND_CTX_COMPLETED) { - CNETERR("deq ctx->mxc_state = %s\n", - mxlnd_ctxstate_to_str(ctx->mxc_state)); - } - ctx->mxc_state = MXLND_CTX_COMPLETED; - if (!cfs_list_empty(&ctx->mxc_list)) { - kmx_conn_t *conn = ctx->mxc_conn; - kmx_ctx_t *next = NULL; - - LASSERT(conn != NULL); - spin_lock(&conn->mxk_lock); - cfs_list_del_init(&ctx->mxc_list); - conn->mxk_timeout = 0; - if (!cfs_list_empty(&conn->mxk_pending)) { - next = cfs_list_entry(conn->mxk_pending.next, - kmx_ctx_t, mxc_list); - conn->mxk_timeout = next->mxc_deadline; - } - spin_unlock(&conn->mxk_lock); - } - return 0; -} - -/** - * mxlnd_peer_free - free the peer - * @peer - a kmx_peer pointer - * - * The calling function should decrement the rxs, drain the tx queues and - * remove the peer from the peers list first then destroy it. - */ -void -mxlnd_peer_free(kmx_peer_t *peer) -{ - CDEBUG(D_NET, "freeing peer 0x%p %s\n", peer, libcfs_nid2str(peer->mxp_nid)); - - LASSERT (atomic_read(&peer->mxp_refcount) == 0); - - if (!cfs_list_empty(&peer->mxp_list)) { - /* assume we are locked */ - cfs_list_del_init(&peer->mxp_list); - } - - MXLND_FREE(peer, sizeof (*peer)); - atomic_dec(&kmxlnd_data.kmx_npeers); - return; -} - -static int -mxlnd_lookup_mac(u32 ip, u64 *tmp_id) -{ - int ret = -EHOSTUNREACH; - unsigned char *haddr = NULL; - struct net_device *dev = NULL; - struct neighbour *n = NULL; - __be32 dst_ip = htonl(ip); - - dev = dev_get_by_name(*kmxlnd_tunables.kmx_default_ipif); - if (dev == NULL) - return -ENODEV; - - haddr = (unsigned char *) tmp_id + 2; /* MAC is only 6 bytes */ - - n = neigh_lookup(&arp_tbl, &dst_ip, dev); - if (n) { - n->used = jiffies; - if (n->nud_state & NUD_VALID) { - memcpy(haddr, n->ha, dev->addr_len); - neigh_release(n); - ret = 0; - } - } - - dev_put(dev); - - return ret; -} - - -/* We only want the MAC address of the peer's Myricom NIC. We - * require that each node has the IPoMX interface (myriN) up. - * We will not pass any traffic over IPoMX, but it allows us - * to get the MAC address. */ -static int -mxlnd_ip2nic_id(u32 ip, u64 *nic_id, int tries) -{ - int ret = 0; - int try = 1; - int fatal = 0; - u64 tmp_id = 0ULL; - cfs_socket_t *sock = NULL; - - do { - CDEBUG(D_NET, "try %d of %d tries\n", try, tries); - ret = mxlnd_lookup_mac(ip, &tmp_id); - if (ret == 0) { - break; - } else { - /* not found, try to connect (force an arp) */ - ret = libcfs_sock_connect(&sock, &fatal, 0, 0, ip, 987); - if (ret == -ECONNREFUSED) { - /* peer is there, get the MAC address */ - mxlnd_lookup_mac(ip, &tmp_id); - if (tmp_id != 0ULL) - ret = 0; - break; - } else if (ret == -EHOSTUNREACH && try < tries) { - /* add a little backoff */ - CDEBUG(D_NET, "sleeping for %lu jiffies\n", - msecs_to_jiffies(MSEC_PER_SEC / 4)); - mxlnd_sleep(msecs_to_jiffies(MSEC_PER_SEC / 4)); - } - } - } while (try++ < tries); - CDEBUG(D_NET, "done trying. ret = %d\n", ret); - - if (tmp_id == 0ULL) - ret = -EHOSTUNREACH; -#if __BYTE_ORDER == __LITTLE_ENDIAN - *nic_id = ___arch__swab64(tmp_id); -#else - *nic_id = tmp_id; -#endif - return ret; -} - -/** - * mxlnd_peer_alloc - allocate and initialize a new peer struct - * @peerp - address of a kmx_peer pointer - * @nid - LNET node id - * - * Returns 0 on success and -ENOMEM on failure - */ -int -mxlnd_peer_alloc(kmx_peer_t **peerp, lnet_nid_t nid, u32 board, u32 ep_id, u64 nic_id) -{ - int ret = 0; - u32 ip = LNET_NIDADDR(nid); - kmx_peer_t *peer = NULL; - - LASSERT (nid != LNET_NID_ANY && nid != 0LL); - - MXLND_ALLOC(peer, sizeof (*peer)); - if (peer == NULL) { - CNETERR("Cannot allocate peer for NID 0x%llx\n", - nid); - return -ENOMEM; - } - CDEBUG(D_NET, "allocated peer 0x%p for NID 0x%llx\n", peer, nid); - - memset(peer, 0, sizeof(*peer)); - - CFS_INIT_LIST_HEAD(&peer->mxp_list); - peer->mxp_nid = nid; - /* peer->mxp_ni unused - may be used for multi-rail */ - atomic_set(&peer->mxp_refcount, 1); /* ref for kmx_peers list */ - - peer->mxp_board = board; - peer->mxp_ep_id = ep_id; - peer->mxp_nic_id = nic_id; - - CFS_INIT_LIST_HEAD(&peer->mxp_conns); - ret = mxlnd_conn_alloc(&peer->mxp_conn, peer); /* adds 2nd conn ref here... */ - if (ret != 0) { - mxlnd_peer_decref(peer); - return ret; - } - CFS_INIT_LIST_HEAD(&peer->mxp_tx_queue); - - if (peer->mxp_nic_id != 0ULL) - nic_id = peer->mxp_nic_id; - - if (nic_id == 0ULL) { - ret = mxlnd_ip2nic_id(ip, &nic_id, 1); - if (ret == 0) { - peer->mxp_nic_id = nic_id; - mx_nic_id_to_board_number(nic_id, &peer->mxp_board); - } - } - - peer->mxp_nic_id = nic_id; /* may be 0ULL if ip2nic_id() failed */ - - /* peer->mxp_reconnect_time = 0 */ - /* peer->mxp_incompatible = 0 */ - - *peerp = peer; - return 0; -} - -static inline kmx_peer_t * -mxlnd_find_peer_by_nid_locked(lnet_nid_t nid) -{ - int found = 0; - int hash = 0; - kmx_peer_t *peer = NULL; - - hash = mxlnd_nid_to_hash(nid); - - cfs_list_for_each_entry(peer, &kmxlnd_data.kmx_peers[hash], mxp_list) { - if (peer->mxp_nid == nid) { - found = 1; - mxlnd_peer_addref(peer); - break; - } - } - return (found ? peer : NULL); -} - -static kmx_peer_t * -mxlnd_find_peer_by_nid(lnet_nid_t nid, int create) -{ - int ret = 0; - int hash = 0; - kmx_peer_t *peer = NULL; - kmx_peer_t *old = NULL; - rwlock_t *g_lock = &kmxlnd_data.kmx_global_lock; - - read_lock(g_lock); - peer = mxlnd_find_peer_by_nid_locked(nid); /* adds peer ref */ - - if ((peer && peer->mxp_conn) || /* found peer with conn or */ - (!peer && !create)) { /* did not find peer and do not create one */ - read_unlock(g_lock); - return peer; - } - - read_unlock(g_lock); - - /* if peer but _not_ conn */ - if (peer && !peer->mxp_conn) { - if (create) { - write_lock(g_lock); - if (!peer->mxp_conn) { /* check again */ - /* create the conn */ - ret = mxlnd_conn_alloc_locked(&peer->mxp_conn, peer); - if (ret != 0) { - /* we tried, return the peer only. - * the caller needs to see if the conn exists */ - CNETERR("%s: %s could not alloc conn\n", - __func__, libcfs_nid2str(peer->mxp_nid)); - } else { - /* drop extra conn ref */ - mxlnd_conn_decref(peer->mxp_conn); - } - } - write_unlock(g_lock); - } - return peer; - } - - /* peer not found and we need to create one */ - hash = mxlnd_nid_to_hash(nid); - - /* create peer (and conn) */ - /* adds conn ref for peer and one for this function */ - ret = mxlnd_peer_alloc(&peer, nid, *kmxlnd_tunables.kmx_board, - *kmxlnd_tunables.kmx_ep_id, 0ULL); - if (ret != 0) /* no memory, peer is NULL */ - return NULL; - - write_lock(g_lock); - - /* look again */ - old = mxlnd_find_peer_by_nid_locked(nid); - if (old) { - /* someone already created one */ - mxlnd_conn_decref(peer->mxp_conn); /* drop ref taken above.. */ - mxlnd_conn_decref(peer->mxp_conn); /* drop peer's ref */ - mxlnd_peer_decref(peer); - peer = old; - } else { - /* no other peer, use this one */ - cfs_list_add_tail(&peer->mxp_list, - &kmxlnd_data.kmx_peers[hash]); - atomic_inc(&kmxlnd_data.kmx_npeers); - mxlnd_peer_addref(peer); - mxlnd_conn_decref(peer->mxp_conn); /* drop ref from peer_alloc */ - } - - write_unlock(g_lock); - - return peer; -} - -static inline int -mxlnd_tx_requires_credit(kmx_ctx_t *tx) -{ - return (tx->mxc_msg_type == MXLND_MSG_EAGER || - tx->mxc_msg_type == MXLND_MSG_GET_REQ || - tx->mxc_msg_type == MXLND_MSG_PUT_REQ || - tx->mxc_msg_type == MXLND_MSG_NOOP); -} - -/** - * mxlnd_init_msg - set type and number of bytes - * @msg - msg pointer - * @type - of message - * @body_nob - bytes in msg body - */ -static inline void -mxlnd_init_msg(kmx_msg_t *msg, u8 type, int body_nob) -{ - msg->mxm_type = type; - msg->mxm_nob = offsetof(kmx_msg_t, mxm_u) + body_nob; -} - -static inline void -mxlnd_init_tx_msg (kmx_ctx_t *tx, u8 type, int body_nob, lnet_nid_t nid) -{ - int nob = offsetof (kmx_msg_t, mxm_u) + body_nob; - kmx_msg_t *msg = NULL; - - LASSERT (tx != NULL); - LASSERT (nob <= MXLND_MSG_SIZE); - - tx->mxc_nid = nid; - /* tx->mxc_peer should have already been set if we know it */ - tx->mxc_msg_type = type; - tx->mxc_nseg = 1; - /* tx->mxc_seg.segment_ptr is already pointing to mxc_page */ - tx->mxc_seg.segment_length = nob; - tx->mxc_pin_type = MX_PIN_PHYSICAL; - - msg = tx->mxc_msg; - msg->mxm_type = type; - msg->mxm_nob = nob; - - return; -} - -static inline __u32 -mxlnd_cksum (void *ptr, int nob) -{ - char *c = ptr; - __u32 sum = 0; - - while (nob-- > 0) - sum = ((sum << 1) | (sum >> 31)) + *c++; - - /* ensure I don't return 0 (== no checksum) */ - return (sum == 0) ? 1 : sum; -} - -/** - * mxlnd_pack_msg_locked - complete msg info - * @tx - msg to send - */ -static inline void -mxlnd_pack_msg_locked(kmx_ctx_t *tx) -{ - kmx_msg_t *msg = tx->mxc_msg; - - /* type and nob should already be set in init_msg() */ - msg->mxm_magic = MXLND_MSG_MAGIC; - msg->mxm_version = MXLND_MSG_VERSION; - /* mxm_type */ - /* don't use mxlnd_tx_requires_credit() since we want PUT_ACK to - * return credits as well */ - if (tx->mxc_msg_type != MXLND_MSG_CONN_REQ && - tx->mxc_msg_type != MXLND_MSG_CONN_ACK) { - msg->mxm_credits = tx->mxc_conn->mxk_outstanding; - tx->mxc_conn->mxk_outstanding = 0; - } else { - msg->mxm_credits = 0; - } - /* mxm_nob */ - msg->mxm_cksum = 0; - msg->mxm_srcnid = kmxlnd_data.kmx_ni->ni_nid; - msg->mxm_srcstamp = kmxlnd_data.kmx_incarnation; - msg->mxm_dstnid = tx->mxc_nid; - /* if it is a new peer, the dststamp will be 0 */ - msg->mxm_dststamp = tx->mxc_conn->mxk_incarnation; - - if (*kmxlnd_tunables.kmx_cksum) { - msg->mxm_cksum = mxlnd_cksum(msg, msg->mxm_nob); - } -} - -int -mxlnd_unpack_msg(kmx_msg_t *msg, int nob) -{ - const int hdr_size = offsetof(kmx_msg_t, mxm_u); - __u32 msg_cksum = 0; - int flip = 0; - int msg_nob = 0; - - /* 6 bytes are enough to have received magic + version */ - if (nob < 6) { - CNETERR("not enough bytes for magic + hdr: %d\n", nob); - return -EPROTO; - } - - if (msg->mxm_magic == MXLND_MSG_MAGIC) { - flip = 0; - } else if (msg->mxm_magic == __swab32(MXLND_MSG_MAGIC)) { - flip = 1; - } else { - CNETERR("Bad magic: %08x\n", msg->mxm_magic); - return -EPROTO; - } - - if (msg->mxm_version != - (flip ? __swab16(MXLND_MSG_VERSION) : MXLND_MSG_VERSION)) { - CNETERR("Bad version: %d\n", msg->mxm_version); - return -EPROTO; - } - - if (nob < hdr_size) { - CNETERR("not enough for a header: %d\n", nob); - return -EPROTO; - } - - msg_nob = flip ? __swab32(msg->mxm_nob) : msg->mxm_nob; - if (msg_nob > nob) { - CNETERR("Short message: got %d, wanted %d\n", nob, msg_nob); - return -EPROTO; - } - - /* checksum must be computed with mxm_cksum zero and BEFORE anything - * gets flipped */ - msg_cksum = flip ? __swab32(msg->mxm_cksum) : msg->mxm_cksum; - msg->mxm_cksum = 0; - if (msg_cksum != 0 && msg_cksum != mxlnd_cksum(msg, msg_nob)) { - CNETERR("Bad checksum\n"); - return -EPROTO; - } - msg->mxm_cksum = msg_cksum; - - if (flip) { - /* leave magic unflipped as a clue to peer endianness */ - __swab16s(&msg->mxm_version); - CLASSERT (sizeof(msg->mxm_type) == 1); - CLASSERT (sizeof(msg->mxm_credits) == 1); - msg->mxm_nob = msg_nob; - __swab64s(&msg->mxm_srcnid); - __swab64s(&msg->mxm_srcstamp); - __swab64s(&msg->mxm_dstnid); - __swab64s(&msg->mxm_dststamp); - } - - if (msg->mxm_srcnid == LNET_NID_ANY) { - CNETERR("Bad src nid: %s\n", libcfs_nid2str(msg->mxm_srcnid)); - return -EPROTO; - } - - switch (msg->mxm_type) { - default: - CNETERR("Unknown message type %x\n", msg->mxm_type); - return -EPROTO; - - case MXLND_MSG_NOOP: - break; - - case MXLND_MSG_EAGER: - if (msg_nob < offsetof(kmx_msg_t, mxm_u.eager.mxem_payload[0])) { - CNETERR("Short EAGER: %d(%d)\n", msg_nob, - (int)offsetof(kmx_msg_t, mxm_u.eager.mxem_payload[0])); - return -EPROTO; - } - break; - - case MXLND_MSG_PUT_REQ: - if (msg_nob < hdr_size + sizeof(msg->mxm_u.put_req)) { - CNETERR("Short PUT_REQ: %d(%d)\n", msg_nob, - (int)(hdr_size + sizeof(msg->mxm_u.put_req))); - return -EPROTO; - } - if (flip) - __swab64s(&msg->mxm_u.put_req.mxprm_cookie); - break; - - case MXLND_MSG_PUT_ACK: - if (msg_nob < hdr_size + sizeof(msg->mxm_u.put_ack)) { - CNETERR("Short PUT_ACK: %d(%d)\n", msg_nob, - (int)(hdr_size + sizeof(msg->mxm_u.put_ack))); - return -EPROTO; - } - if (flip) { - __swab64s(&msg->mxm_u.put_ack.mxpam_src_cookie); - __swab64s(&msg->mxm_u.put_ack.mxpam_dst_cookie); - } - break; - - case MXLND_MSG_GET_REQ: - if (msg_nob < hdr_size + sizeof(msg->mxm_u.get_req)) { - CNETERR("Short GET_REQ: %d(%d)\n", msg_nob, - (int)(hdr_size + sizeof(msg->mxm_u.get_req))); - return -EPROTO; - } - if (flip) { - __swab64s(&msg->mxm_u.get_req.mxgrm_cookie); - } - break; - - case MXLND_MSG_CONN_REQ: - case MXLND_MSG_CONN_ACK: - if (msg_nob < hdr_size + sizeof(msg->mxm_u.conn_req)) { - CNETERR("Short connreq/ack: %d(%d)\n", msg_nob, - (int)(hdr_size + sizeof(msg->mxm_u.conn_req))); - return -EPROTO; - } - if (flip) { - __swab32s(&msg->mxm_u.conn_req.mxcrm_queue_depth); - __swab32s(&msg->mxm_u.conn_req.mxcrm_eager_size); - } - break; - } - return 0; -} - - -/** - * mxlnd_recv_msg - * @lntmsg - the LNET msg that this is continuing. If EAGER, then NULL. - * @rx - * @msg_type - * @cookie - * @length - length of incoming message - * @pending - add to kmx_pending (0 is NO and 1 is YES) - * - * The caller gets the rx and sets nid, peer and conn if known. - * - * Returns 0 on success and -1 on failure - */ -int -mxlnd_recv_msg(lnet_msg_t *lntmsg, kmx_ctx_t *rx, u8 msg_type, u64 cookie, u32 length) -{ - int ret = 0; - mx_return_t mxret = MX_SUCCESS; - uint64_t mask = ~(MXLND_ERROR_MASK); - - rx->mxc_msg_type = msg_type; - rx->mxc_lntmsg[0] = lntmsg; /* may be NULL if EAGER */ - rx->mxc_cookie = cookie; - /* rx->mxc_match may already be set */ - /* rx->mxc_seg.segment_ptr is already set */ - rx->mxc_seg.segment_length = length; - ret = mxlnd_q_pending_ctx(rx); - if (ret == -1) { - /* the caller is responsible for calling conn_decref() if needed */ - return -1; - } - mxret = mx_kirecv(kmxlnd_data.kmx_endpt, &rx->mxc_seg, 1, MX_PIN_PHYSICAL, - cookie, mask, (void *) rx, &rx->mxc_mxreq); - if (mxret != MX_SUCCESS) { - mxlnd_deq_pending_ctx(rx); - CNETERR("mx_kirecv() failed with %s (%d)\n", - mx_strerror(mxret), (int) mxret); - return -1; - } - return 0; -} - - -/** - * mxlnd_unexpected_recv - this is the callback function that will handle - * unexpected receives - * @context - NULL, ignore - * @source - the peer's mx_endpoint_addr_t - * @match_value - the msg's bits, should be MXLND_MSG_EAGER - * @length - length of incoming message - * @data_if_available - used for CONN_[REQ|ACK] - * - * If it is an eager-sized msg, we will call recv_msg() with the actual - * length. If it is a large message, we will call recv_msg() with a - * length of 0 bytes to drop it because we should never have a large, - * unexpected message. - * - * NOTE - The MX library blocks until this function completes. Make it as fast as - * possible. DO NOT allocate memory which can block! - * - * If we cannot get a rx or the conn is closed, drop the message on the floor - * (i.e. recv 0 bytes and ignore). - */ -mx_unexp_handler_action_t -mxlnd_unexpected_recv(void *context, mx_endpoint_addr_t source, - uint64_t match_value, uint32_t length, void *data_if_available) -{ - int ret = 0; - kmx_ctx_t *rx = NULL; - mx_ksegment_t seg; - u8 msg_type = 0; - u8 error = 0; - u64 cookie = 0ULL; - kmx_conn_t *conn = NULL; - kmx_peer_t *peer = NULL; - u64 nic_id = 0ULL; - u32 ep_id = 0; - u32 sid = 0; - - /* TODO this will change to the net struct */ - if (context != NULL) { - CNETERR("non-NULL context\n"); - } - -#if MXLND_DEBUG - CDEBUG(D_NET, "bits=0x%llx length=%d\n", match_value, length); -#endif - - mx_decompose_endpoint_addr2(source, &nic_id, &ep_id, &sid); - mxlnd_parse_match(match_value, &msg_type, &error, &cookie); - read_lock(&kmxlnd_data.kmx_global_lock); - mx_get_endpoint_addr_context(source, (void **) &conn); - if (conn) { - mxlnd_conn_addref(conn); /* add ref for this function */ - peer = conn->mxk_peer; - } - read_unlock(&kmxlnd_data.kmx_global_lock); - - if (msg_type == MXLND_MSG_BYE) { - if (conn) { - CDEBUG(D_NET, "peer %s sent BYE msg\n", - libcfs_nid2str(peer->mxp_nid)); - mxlnd_conn_disconnect(conn, 1, 0); - mxlnd_conn_decref(conn); /* drop ref taken above */ - } - return MX_RECV_FINISHED; - } - - if (msg_type == MXLND_MSG_CONN_REQ) { - kmx_connparams_t *cp = NULL; - const int expected = offsetof(kmx_msg_t, mxm_u) + - sizeof(kmx_connreq_msg_t); - - if (conn) mxlnd_conn_decref(conn); /* drop ref taken above */ - if (unlikely(length != expected || !data_if_available)) { - CNETERR("received invalid CONN_REQ from %llx " - "length=%d (expected %d)\n", nic_id, length, expected); - mxlnd_send_message(source, MXLND_MSG_CONN_ACK, EPROTO, 0); - return MX_RECV_FINISHED; - } - - ret = mxlnd_connparams_alloc(&cp, context, source, match_value, length, - conn, peer, data_if_available); - if (unlikely(ret != 0)) { - CNETERR("unable to alloc CONN_REQ from %llx:%d\n", - nic_id, ep_id); - mxlnd_send_message(source, MXLND_MSG_CONN_ACK, ENOMEM, 0); - return MX_RECV_FINISHED; - } - spin_lock(&kmxlnd_data.kmx_conn_lock); - cfs_list_add_tail(&cp->mxr_list, &kmxlnd_data.kmx_conn_reqs); - spin_unlock(&kmxlnd_data.kmx_conn_lock); - up(&kmxlnd_data.kmx_conn_sem); - return MX_RECV_FINISHED; - } - if (msg_type == MXLND_MSG_CONN_ACK) { - kmx_connparams_t *cp = NULL; - const int expected = offsetof(kmx_msg_t, mxm_u) + - sizeof(kmx_connreq_msg_t); - - LASSERT(conn); - if (unlikely(error != 0)) { - CNETERR("received CONN_ACK from %s with error -%d\n", - libcfs_nid2str(peer->mxp_nid), (int) error); - mxlnd_conn_disconnect(conn, 1, 0); - } else if (unlikely(length != expected || !data_if_available)) { - CNETERR("received %s CONN_ACK from %s " - "length=%d (expected %d)\n", - data_if_available ? "short" : "missing", - libcfs_nid2str(peer->mxp_nid), length, expected); - mxlnd_conn_disconnect(conn, 1, 1); - } else { - /* peer is ready for messages */ - ret = mxlnd_connparams_alloc(&cp, context, source, match_value, length, - conn, peer, data_if_available); - if (unlikely(ret != 0)) { - CNETERR("unable to alloc kmx_connparams_t" - " from %llx:%d\n", nic_id, ep_id); - mxlnd_conn_disconnect(conn, 1, 1); - } else { - spin_lock(&kmxlnd_data.kmx_conn_lock); - cfs_list_add_tail(&cp->mxr_list, - &kmxlnd_data.kmx_conn_reqs); - spin_unlock(&kmxlnd_data.kmx_conn_lock); - up(&kmxlnd_data.kmx_conn_sem); - } - } - mxlnd_conn_decref(conn); /* drop ref taken above */ - - return MX_RECV_FINISHED; - } - - /* Handle unexpected messages (PUT_REQ and GET_REQ) */ - - LASSERT(peer != NULL && conn != NULL); - - rx = mxlnd_get_idle_rx(conn); - if (rx != NULL) { - if (length <= MXLND_MSG_SIZE) { - ret = mxlnd_recv_msg(NULL, rx, msg_type, match_value, length); - } else { - CNETERR("unexpected large receive with " - "match_value=0x%llx length=%d\n", - match_value, length); - ret = mxlnd_recv_msg(NULL, rx, msg_type, match_value, 0); - } - - if (ret == 0) { - /* hold conn ref until rx completes */ - rx->mxc_conn = conn; - rx->mxc_peer = peer; - rx->mxc_nid = peer->mxp_nid; - } else { - CNETERR("could not post receive\n"); - mxlnd_put_idle_rx(rx); - } - } - - /* Encountered error, drop incoming message on the floor */ - /* We could use MX_RECV_FINISHED but posting the receive of 0 bytes - * uses the standard code path and acks the sender normally */ - - if (rx == NULL || ret != 0) { - mxlnd_conn_decref(conn); /* drop ref taken above */ - if (rx == NULL) { - CNETERR("no idle rxs available - dropping rx" - " 0x%llx from %s\n", match_value, - libcfs_nid2str(peer->mxp_nid)); - } else { - /* ret != 0 */ - CNETERR("disconnected peer - dropping rx\n"); - } - seg.segment_ptr = 0ULL; - seg.segment_length = 0; - mx_kirecv(kmxlnd_data.kmx_endpt, &seg, 1, MX_PIN_PHYSICAL, - match_value, ~0ULL, NULL, NULL); - } - - return MX_RECV_CONTINUE; -} - - -int -mxlnd_get_peer_info(int index, lnet_nid_t *nidp, int *count) -{ - int i = 0; - int ret = -ENOENT; - kmx_peer_t *peer = NULL; - - read_lock(&kmxlnd_data.kmx_global_lock); - for (i = 0; i < MXLND_HASH_SIZE; i++) { - cfs_list_for_each_entry(peer, &kmxlnd_data.kmx_peers[i], - mxp_list) { - if (index-- == 0) { - *nidp = peer->mxp_nid; - *count = atomic_read(&peer->mxp_refcount); - ret = 0; - break; - } - } - } - read_unlock(&kmxlnd_data.kmx_global_lock); - - return ret; -} - -void -mxlnd_del_peer_locked(kmx_peer_t *peer) -{ - if (peer->mxp_conn) { - mxlnd_conn_disconnect(peer->mxp_conn, 1, 1); - } else { - cfs_list_del_init(&peer->mxp_list); /* remove from the global list */ - mxlnd_peer_decref(peer); /* drop global list ref */ - } - return; -} - -int -mxlnd_del_peer(lnet_nid_t nid) -{ - int i = 0; - int ret = 0; - kmx_peer_t *peer = NULL; - kmx_peer_t *next = NULL; - - if (nid != LNET_NID_ANY) { - peer = mxlnd_find_peer_by_nid(nid, 0); /* adds peer ref */ - } - write_lock(&kmxlnd_data.kmx_global_lock); - if (nid != LNET_NID_ANY) { - if (peer == NULL) { - ret = -ENOENT; - } else { - mxlnd_peer_decref(peer); /* and drops it */ - mxlnd_del_peer_locked(peer); - } - } else { /* LNET_NID_ANY */ - for (i = 0; i < MXLND_HASH_SIZE; i++) { - cfs_list_for_each_entry_safe(peer, next, - &kmxlnd_data.kmx_peers[i], - mxp_list) { - mxlnd_del_peer_locked(peer); - } - } - } - write_unlock(&kmxlnd_data.kmx_global_lock); - - return ret; -} - -kmx_conn_t * -mxlnd_get_conn_by_idx(int index) -{ - int i = 0; - kmx_peer_t *peer = NULL; - kmx_conn_t *conn = NULL; - - read_lock(&kmxlnd_data.kmx_global_lock); - for (i = 0; i < MXLND_HASH_SIZE; i++) { - cfs_list_for_each_entry(peer, &kmxlnd_data.kmx_peers[i], - mxp_list) { - cfs_list_for_each_entry(conn, &peer->mxp_conns, - mxk_list) { - if (index-- > 0) { - continue; - } - - mxlnd_conn_addref(conn); /* add ref here, dec in ctl() */ - read_unlock(&kmxlnd_data.kmx_global_lock); - return conn; - } - } - } - read_unlock(&kmxlnd_data.kmx_global_lock); - - return NULL; -} - -void -mxlnd_close_matching_conns_locked(kmx_peer_t *peer) -{ - kmx_conn_t *conn = NULL; - kmx_conn_t *next = NULL; - - cfs_list_for_each_entry_safe(conn, next, &peer->mxp_conns, mxk_list) - mxlnd_conn_disconnect(conn, 0, 1); - - return; -} - -int -mxlnd_close_matching_conns(lnet_nid_t nid) -{ - int i = 0; - int ret = 0; - kmx_peer_t *peer = NULL; - - write_lock(&kmxlnd_data.kmx_global_lock); - if (nid != LNET_NID_ANY) { - peer = mxlnd_find_peer_by_nid_locked(nid); /* adds peer ref */ - if (peer == NULL) { - ret = -ENOENT; - } else { - mxlnd_close_matching_conns_locked(peer); - mxlnd_peer_decref(peer); /* and drops it here */ - } - } else { /* LNET_NID_ANY */ - for (i = 0; i < MXLND_HASH_SIZE; i++) { - cfs_list_for_each_entry(peer, &kmxlnd_data.kmx_peers[i], mxp_list) - mxlnd_close_matching_conns_locked(peer); - } - } - write_unlock(&kmxlnd_data.kmx_global_lock); - - return ret; -} - -/** - * mxlnd_ctl - modify MXLND parameters - * @ni - LNET interface handle - * @cmd - command to change - * @arg - the ioctl data - */ -int -mxlnd_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg) -{ - struct libcfs_ioctl_data *data = arg; - int ret = -EINVAL; - - LASSERT (ni == kmxlnd_data.kmx_ni); - - switch (cmd) { - case IOC_LIBCFS_GET_PEER: { - lnet_nid_t nid = 0; - int count = 0; - - ret = mxlnd_get_peer_info(data->ioc_count, &nid, &count); - data->ioc_nid = nid; - data->ioc_count = count; - break; - } - case IOC_LIBCFS_DEL_PEER: { - ret = mxlnd_del_peer(data->ioc_nid); - break; - } - case IOC_LIBCFS_GET_CONN: { - kmx_conn_t *conn = NULL; - - conn = mxlnd_get_conn_by_idx(data->ioc_count); - if (conn == NULL) { - ret = -ENOENT; - } else { - ret = 0; - data->ioc_nid = conn->mxk_peer->mxp_nid; - mxlnd_conn_decref(conn); /* dec ref taken in get_conn_by_idx() */ - } - break; - } - case IOC_LIBCFS_CLOSE_CONNECTION: { - ret = mxlnd_close_matching_conns(data->ioc_nid); - break; - } - default: - CNETERR("unknown ctl(%d)\n", cmd); - break; - } - - return ret; -} - -/** - * mxlnd_peer_queue_tx_locked - add the tx to the peer's tx queue - * @tx - * - * Add the tx to the peer's msg or data queue. The caller has locked the peer. - */ -void -mxlnd_peer_queue_tx_locked(kmx_ctx_t *tx) -{ - u8 msg_type = tx->mxc_msg_type; - kmx_conn_t *conn = tx->mxc_conn; - - LASSERT (msg_type != 0); - LASSERT (tx->mxc_nid != 0); - LASSERT (tx->mxc_peer != NULL); - LASSERT (tx->mxc_conn != NULL); - - tx->mxc_incarnation = conn->mxk_incarnation; - - if (msg_type != MXLND_MSG_PUT_DATA && - msg_type != MXLND_MSG_GET_DATA) { - /* msg style tx */ - if (mxlnd_tx_requires_credit(tx)) { - cfs_list_add_tail(&tx->mxc_list, - &conn->mxk_tx_credit_queue); - conn->mxk_ntx_msgs++; - } else if (msg_type == MXLND_MSG_CONN_REQ || - msg_type == MXLND_MSG_CONN_ACK) { - /* put conn msgs at the front of the queue */ - cfs_list_add(&tx->mxc_list, &conn->mxk_tx_free_queue); - } else { - /* PUT_ACK, PUT_NAK */ - cfs_list_add_tail(&tx->mxc_list, - &conn->mxk_tx_free_queue); - conn->mxk_ntx_msgs++; - } - } else { - /* data style tx */ - cfs_list_add_tail(&tx->mxc_list, &conn->mxk_tx_free_queue); - conn->mxk_ntx_data++; - } - - return; -} - -/** - * mxlnd_peer_queue_tx - add the tx to the global tx queue - * @tx - * - * Add the tx to the peer's msg or data queue - */ -static inline void -mxlnd_peer_queue_tx(kmx_ctx_t *tx) -{ - LASSERT(tx->mxc_peer != NULL); - LASSERT(tx->mxc_conn != NULL); - spin_lock(&tx->mxc_conn->mxk_lock); - mxlnd_peer_queue_tx_locked(tx); - spin_unlock(&tx->mxc_conn->mxk_lock); - - return; -} - -/** - * mxlnd_queue_tx - add the tx to the global tx queue - * @tx - * - * Add the tx to the global queue and up the tx_queue_sem - */ -void -mxlnd_queue_tx(kmx_ctx_t *tx) -{ - kmx_peer_t *peer = tx->mxc_peer; - LASSERT (tx->mxc_nid != 0); - - if (peer != NULL) { - if (peer->mxp_incompatible && - tx->mxc_msg_type != MXLND_MSG_CONN_ACK) { - /* let this fail now */ - tx->mxc_errno = -ECONNABORTED; - mxlnd_conn_decref(peer->mxp_conn); - mxlnd_put_idle_tx(tx); - return; - } - if (tx->mxc_conn == NULL) { - int ret = 0; - kmx_conn_t *conn = NULL; - - ret = mxlnd_conn_alloc(&conn, peer); /* adds 2nd ref for tx... */ - if (ret != 0) { - tx->mxc_errno = ret; - mxlnd_put_idle_tx(tx); - goto done; - } - tx->mxc_conn = conn; - mxlnd_peer_decref(peer); /* and takes it from peer */ - } - LASSERT(tx->mxc_conn != NULL); - mxlnd_peer_queue_tx(tx); - mxlnd_check_sends(peer); - } else { - spin_lock(&kmxlnd_data.kmx_tx_queue_lock); - cfs_list_add_tail(&tx->mxc_list, &kmxlnd_data.kmx_tx_queue); - spin_unlock(&kmxlnd_data.kmx_tx_queue_lock); - up(&kmxlnd_data.kmx_tx_queue_sem); - } -done: - return; -} - -int -mxlnd_setup_iov(kmx_ctx_t *ctx, u32 niov, struct iovec *iov, u32 offset, u32 nob) -{ - int i = 0; - int sum = 0; - int old_sum = 0; - int nseg = 0; - int first_iov = -1; - int first_iov_offset = 0; - int first_found = 0; - int last_iov = -1; - int last_iov_length = 0; - mx_ksegment_t *seg = NULL; - - if (niov == 0) return 0; - LASSERT(iov != NULL); - - for (i = 0; i < niov; i++) { - sum = old_sum + (u32) iov[i].iov_len; - if (!first_found && (sum > offset)) { - first_iov = i; - first_iov_offset = offset - old_sum; - first_found = 1; - sum = (u32) iov[i].iov_len - first_iov_offset; - old_sum = 0; - } - if (sum >= nob) { - last_iov = i; - last_iov_length = (u32) iov[i].iov_len - (sum - nob); - if (first_iov == last_iov) last_iov_length -= first_iov_offset; - break; - } - old_sum = sum; - } - LASSERT(first_iov >= 0 && last_iov >= first_iov); - nseg = last_iov - first_iov + 1; - LASSERT(nseg > 0); - - MXLND_ALLOC(seg, nseg * sizeof(*seg)); - if (seg == NULL) { - CNETERR("MXLND_ALLOC() failed\n"); - return -1; - } - memset(seg, 0, nseg * sizeof(*seg)); - ctx->mxc_nseg = nseg; - sum = 0; - for (i = 0; i < nseg; i++) { - seg[i].segment_ptr = MX_PA_TO_U64(virt_to_phys(iov[first_iov + i].iov_base)); - seg[i].segment_length = (u32) iov[first_iov + i].iov_len; - if (i == 0) { - seg[i].segment_ptr += (u64) first_iov_offset; - seg[i].segment_length -= (u32) first_iov_offset; - } - if (i == (nseg - 1)) { - seg[i].segment_length = (u32) last_iov_length; - } - sum += seg[i].segment_length; - } - ctx->mxc_seg_list = seg; - ctx->mxc_pin_type = MX_PIN_PHYSICAL; -#ifdef MX_PIN_FULLPAGES - ctx->mxc_pin_type |= MX_PIN_FULLPAGES; -#endif - LASSERT(nob == sum); - return 0; -} - -int -mxlnd_setup_kiov(kmx_ctx_t *ctx, u32 niov, lnet_kiov_t *kiov, u32 offset, u32 nob) -{ - int i = 0; - int sum = 0; - int old_sum = 0; - int nseg = 0; - int first_kiov = -1; - int first_kiov_offset = 0; - int first_found = 0; - int last_kiov = -1; - int last_kiov_length = 0; - mx_ksegment_t *seg = NULL; - - if (niov == 0) return 0; - LASSERT(kiov != NULL); - - for (i = 0; i < niov; i++) { - sum = old_sum + kiov[i].kiov_len; - if (i == 0) sum -= kiov[i].kiov_offset; - if (!first_found && (sum > offset)) { - first_kiov = i; - first_kiov_offset = offset - old_sum; - if (i == 0) first_kiov_offset = kiov[i].kiov_offset; - first_found = 1; - sum = kiov[i].kiov_len - first_kiov_offset; - old_sum = 0; - } - if (sum >= nob) { - last_kiov = i; - last_kiov_length = kiov[i].kiov_len - (sum - nob); - if (first_kiov == last_kiov) last_kiov_length -= first_kiov_offset; - break; - } - old_sum = sum; - } - LASSERT(first_kiov >= 0 && last_kiov >= first_kiov); - nseg = last_kiov - first_kiov + 1; - LASSERT(nseg > 0); - - MXLND_ALLOC(seg, nseg * sizeof(*seg)); - if (seg == NULL) { - CNETERR("MXLND_ALLOC() failed\n"); - return -1; - } - memset(seg, 0, niov * sizeof(*seg)); - ctx->mxc_nseg = niov; - sum = 0; - for (i = 0; i < niov; i++) { - seg[i].segment_ptr = - page_to_phys(kiov[first_kiov + i].kiov_page); - seg[i].segment_length = kiov[first_kiov + i].kiov_len; - if (i == 0) { - seg[i].segment_ptr += (u64) first_kiov_offset; - /* we have to add back the original kiov_offset */ - seg[i].segment_length -= first_kiov_offset + - kiov[first_kiov].kiov_offset; - } - if (i == (nseg - 1)) { - seg[i].segment_length = last_kiov_length; - } - sum += seg[i].segment_length; - } - ctx->mxc_seg_list = seg; - ctx->mxc_pin_type = MX_PIN_PHYSICAL; -#ifdef MX_PIN_FULLPAGES - ctx->mxc_pin_type |= MX_PIN_FULLPAGES; -#endif - LASSERT(nob == sum); - return 0; -} - -void -mxlnd_send_nak(kmx_ctx_t *tx, lnet_nid_t nid, int type, int status, __u64 cookie) -{ - LASSERT(type == MXLND_MSG_PUT_ACK); - mxlnd_init_tx_msg(tx, type, sizeof(kmx_putack_msg_t), tx->mxc_nid); - tx->mxc_cookie = cookie; - tx->mxc_msg->mxm_u.put_ack.mxpam_src_cookie = cookie; - tx->mxc_msg->mxm_u.put_ack.mxpam_dst_cookie = ((u64) status << MXLND_ERROR_OFFSET); /* error code */ - tx->mxc_match = mxlnd_create_match(tx, status); - - mxlnd_queue_tx(tx); -} - - -/** - * mxlnd_send_data - get tx, map [k]iov, queue tx - * @ni - * @lntmsg - * @peer - * @msg_type - * @cookie - * - * This setups the DATA send for PUT or GET. - * - * On success, it queues the tx, on failure it calls lnet_finalize() - */ -void -mxlnd_send_data(lnet_ni_t *ni, lnet_msg_t *lntmsg, kmx_peer_t *peer, u8 msg_type, u64 cookie) -{ - int ret = 0; - lnet_process_id_t target = lntmsg->msg_target; - unsigned int niov = lntmsg->msg_niov; - struct iovec *iov = lntmsg->msg_iov; - lnet_kiov_t *kiov = lntmsg->msg_kiov; - unsigned int offset = lntmsg->msg_offset; - unsigned int nob = lntmsg->msg_len; - kmx_ctx_t *tx = NULL; - - LASSERT(lntmsg != NULL); - LASSERT(peer != NULL); - LASSERT(msg_type == MXLND_MSG_PUT_DATA || msg_type == MXLND_MSG_GET_DATA); - LASSERT((cookie>>MXLND_ERROR_OFFSET) == 0); - - tx = mxlnd_get_idle_tx(); - if (tx == NULL) { - CNETERR("Can't allocate %s tx for %s\n", - msg_type == MXLND_MSG_PUT_DATA ? "PUT_DATA" : "GET_DATA", - libcfs_nid2str(target.nid)); - goto failed_0; - } - tx->mxc_nid = target.nid; - /* NOTE called when we have a ref on the conn, get one for this tx */ - mxlnd_conn_addref(peer->mxp_conn); - tx->mxc_peer = peer; - tx->mxc_conn = peer->mxp_conn; - tx->mxc_msg_type = msg_type; - tx->mxc_lntmsg[0] = lntmsg; - tx->mxc_cookie = cookie; - tx->mxc_match = mxlnd_create_match(tx, 0); - - /* This setups up the mx_ksegment_t to send the DATA payload */ - if (nob == 0) { - /* do not setup the segments */ - CNETERR("nob = 0; why didn't we use an EAGER reply " - "to %s?\n", libcfs_nid2str(target.nid)); - ret = 0; - } else if (kiov == NULL) { - ret = mxlnd_setup_iov(tx, niov, iov, offset, nob); - } else { - ret = mxlnd_setup_kiov(tx, niov, kiov, offset, nob); - } - if (ret != 0) { - CNETERR("Can't setup send DATA for %s\n", - libcfs_nid2str(target.nid)); - tx->mxc_errno = -EIO; - goto failed_1; - } - mxlnd_queue_tx(tx); - return; - -failed_1: - mxlnd_conn_decref(peer->mxp_conn); - mxlnd_put_idle_tx(tx); - return; - -failed_0: - CNETERR("no tx avail\n"); - lnet_finalize(ni, lntmsg, -EIO); - return; -} - -/** - * mxlnd_recv_data - map [k]iov, post rx - * @ni - * @lntmsg - * @rx - * @msg_type - * @cookie - * - * This setups the DATA receive for PUT or GET. - * - * On success, it returns 0, on failure it returns -1 - */ -int -mxlnd_recv_data(lnet_ni_t *ni, lnet_msg_t *lntmsg, kmx_ctx_t *rx, u8 msg_type, u64 cookie) -{ - int ret = 0; - lnet_process_id_t target = lntmsg->msg_target; - unsigned int niov = lntmsg->msg_niov; - struct iovec *iov = lntmsg->msg_iov; - lnet_kiov_t *kiov = lntmsg->msg_kiov; - unsigned int offset = lntmsg->msg_offset; - unsigned int nob = lntmsg->msg_len; - mx_return_t mxret = MX_SUCCESS; - u64 mask = ~(MXLND_ERROR_MASK); - - /* above assumes MXLND_MSG_PUT_DATA */ - if (msg_type == MXLND_MSG_GET_DATA) { - niov = lntmsg->msg_md->md_niov; - iov = lntmsg->msg_md->md_iov.iov; - kiov = lntmsg->msg_md->md_iov.kiov; - offset = 0; - nob = lntmsg->msg_md->md_length; - } - - LASSERT(lntmsg != NULL); - LASSERT(rx != NULL); - LASSERT(msg_type == MXLND_MSG_PUT_DATA || msg_type == MXLND_MSG_GET_DATA); - LASSERT((cookie>>MXLND_ERROR_OFFSET) == 0); /* ensure top 12 bits are 0 */ - - rx->mxc_msg_type = msg_type; - rx->mxc_state = MXLND_CTX_PENDING; - rx->mxc_nid = target.nid; - /* if posting a GET_DATA, we may not yet know the peer */ - if (rx->mxc_peer != NULL) { - rx->mxc_conn = rx->mxc_peer->mxp_conn; - } - rx->mxc_lntmsg[0] = lntmsg; - rx->mxc_cookie = cookie; - rx->mxc_match = mxlnd_create_match(rx, 0); - /* This setups up the mx_ksegment_t to receive the DATA payload */ - if (kiov == NULL) { - ret = mxlnd_setup_iov(rx, niov, iov, offset, nob); - } else { - ret = mxlnd_setup_kiov(rx, niov, kiov, offset, nob); - } - if (msg_type == MXLND_MSG_GET_DATA) { - rx->mxc_lntmsg[1] = lnet_create_reply_msg(kmxlnd_data.kmx_ni, lntmsg); - if (rx->mxc_lntmsg[1] == NULL) { - CNETERR("Can't create reply for GET -> %s\n", - libcfs_nid2str(target.nid)); - ret = -1; - } - } - if (ret != 0) { - CNETERR("Can't setup %s rx for %s\n", - msg_type == MXLND_MSG_PUT_DATA ? "PUT_DATA" : "GET_DATA", - libcfs_nid2str(target.nid)); - return -1; - } - ret = mxlnd_q_pending_ctx(rx); - if (ret == -1) { - return -1; - } - CDEBUG(D_NET, "receiving %s 0x%llx\n", mxlnd_msgtype_to_str(msg_type), rx->mxc_cookie); - mxret = mx_kirecv(kmxlnd_data.kmx_endpt, - rx->mxc_seg_list, rx->mxc_nseg, - rx->mxc_pin_type, rx->mxc_match, - mask, (void *) rx, - &rx->mxc_mxreq); - if (mxret != MX_SUCCESS) { - if (rx->mxc_conn != NULL) { - mxlnd_deq_pending_ctx(rx); - } - CNETERR("mx_kirecv() failed with %d for %s\n", - (int) mxret, libcfs_nid2str(target.nid)); - return -1; - } - - return 0; -} - -/** - * mxlnd_send - the LND required send function - * @ni - * @private - * @lntmsg - * - * This must not block. Since we may not have a peer struct for the receiver, - * it will append send messages on a global tx list. We will then up the - * tx_queued's semaphore to notify it of the new send. - */ -int -mxlnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg) -{ - int ret = 0; - int type = lntmsg->msg_type; - lnet_hdr_t *hdr = &lntmsg->msg_hdr; - lnet_process_id_t target = lntmsg->msg_target; - lnet_nid_t nid = target.nid; - int target_is_router = lntmsg->msg_target_is_router; - int routing = lntmsg->msg_routing; - unsigned int payload_niov = lntmsg->msg_niov; - struct iovec *payload_iov = lntmsg->msg_iov; - lnet_kiov_t *payload_kiov = lntmsg->msg_kiov; - unsigned int payload_offset = lntmsg->msg_offset; - unsigned int payload_nob = lntmsg->msg_len; - kmx_ctx_t *tx = NULL; - kmx_msg_t *txmsg = NULL; - kmx_ctx_t *rx = (kmx_ctx_t *) private; /* for REPLY */ - kmx_ctx_t *rx_data = NULL; - kmx_conn_t *conn = NULL; - int nob = 0; - uint32_t length = 0; - kmx_peer_t *peer = NULL; - rwlock_t *g_lock =&kmxlnd_data.kmx_global_lock; - - CDEBUG(D_NET, "sending %d bytes in %d frags to %s\n", - payload_nob, payload_niov, libcfs_id2str(target)); - - LASSERT (payload_nob == 0 || payload_niov > 0); - LASSERT (payload_niov <= LNET_MAX_IOV); - /* payload is either all vaddrs or all pages */ - LASSERT (!(payload_kiov != NULL && payload_iov != NULL)); - - /* private is used on LNET_GET_REPLY only, NULL for all other cases */ - - /* NOTE we may not know the peer if it is the very first PUT_REQ or GET_REQ - * to a new peer, so create one if not found */ - peer = mxlnd_find_peer_by_nid(nid, 1); /* adds peer ref */ - if (peer == NULL || peer->mxp_conn == NULL) { - /* we could not find it nor could we create one or - * one exists but we cannot create a conn, - * fail this message */ - if (peer) { - /* found peer without conn, drop ref taken above */ - LASSERT(peer->mxp_conn == NULL); - mxlnd_peer_decref(peer); - } - return -ENOMEM; - } - - /* we have a peer with a conn */ - - if (unlikely(peer->mxp_incompatible)) { - mxlnd_peer_decref(peer); /* drop ref taken above */ - } else { - read_lock(g_lock); - conn = peer->mxp_conn; - if (conn && conn->mxk_status != MXLND_CONN_DISCONNECT) - mxlnd_conn_addref(conn); - else - conn = NULL; - read_unlock(g_lock); - mxlnd_peer_decref(peer); /* drop peer ref taken above */ - if (!conn) - return -ENOTCONN; - } - - LASSERT(peer && conn); - - CDEBUG(D_NET, "%s: peer 0x%llx is 0x%p\n", __func__, nid, peer); - - switch (type) { - case LNET_MSG_ACK: - LASSERT (payload_nob == 0); - break; - - case LNET_MSG_REPLY: - case LNET_MSG_PUT: - /* Is the payload small enough not to need DATA? */ - nob = offsetof(kmx_msg_t, mxm_u.eager.mxem_payload[payload_nob]); - if (nob <= MXLND_MSG_SIZE) - break; /* send EAGER */ - - tx = mxlnd_get_idle_tx(); - if (unlikely(tx == NULL)) { - CNETERR("Can't allocate %s tx for %s\n", - type == LNET_MSG_PUT ? "PUT" : "REPLY", - libcfs_nid2str(nid)); - if (conn) mxlnd_conn_decref(conn); - return -ENOMEM; - } - - tx->mxc_peer = peer; - tx->mxc_conn = conn; - /* we added a conn ref above */ - mxlnd_init_tx_msg (tx, MXLND_MSG_PUT_REQ, sizeof(kmx_putreq_msg_t), nid); - txmsg = tx->mxc_msg; - txmsg->mxm_u.put_req.mxprm_hdr = *hdr; - txmsg->mxm_u.put_req.mxprm_cookie = tx->mxc_cookie; - tx->mxc_match = mxlnd_create_match(tx, 0); - - /* we must post a receive _before_ sending the request. - * we need to determine how much to receive, it will be either - * a put_ack or a put_nak. The put_ack is larger, so use it. */ - - rx = mxlnd_get_idle_rx(conn); - if (unlikely(rx == NULL)) { - CNETERR("Can't allocate rx for PUT_ACK for %s\n", - libcfs_nid2str(nid)); - mxlnd_put_idle_tx(tx); - if (conn) mxlnd_conn_decref(conn); /* for the ref taken above */ - return -ENOMEM; - } - rx->mxc_nid = nid; - rx->mxc_peer = peer; - mxlnd_conn_addref(conn); /* for this rx */ - rx->mxc_conn = conn; - rx->mxc_msg_type = MXLND_MSG_PUT_ACK; - rx->mxc_cookie = tx->mxc_cookie; - rx->mxc_match = mxlnd_create_match(rx, 0); - - length = offsetof(kmx_msg_t, mxm_u) + sizeof(kmx_putack_msg_t); - ret = mxlnd_recv_msg(lntmsg, rx, MXLND_MSG_PUT_ACK, rx->mxc_match, length); - if (unlikely(ret != 0)) { - CNETERR("recv_msg() failed for PUT_ACK for %s\n", - libcfs_nid2str(nid)); - rx->mxc_lntmsg[0] = NULL; - mxlnd_put_idle_rx(rx); - mxlnd_put_idle_tx(tx); - mxlnd_conn_decref(conn); /* for the rx... */ - mxlnd_conn_decref(conn); /* and for the tx */ - return -EHOSTUNREACH; - } - - mxlnd_queue_tx(tx); - return 0; - - case LNET_MSG_GET: - if (routing || target_is_router) - break; /* send EAGER */ - - /* is the REPLY message too small for DATA? */ - nob = offsetof(kmx_msg_t, mxm_u.eager.mxem_payload[lntmsg->msg_md->md_length]); - if (nob <= MXLND_MSG_SIZE) - break; /* send EAGER */ - - /* get tx (we need the cookie) , post rx for incoming DATA, - * then post GET_REQ tx */ - tx = mxlnd_get_idle_tx(); - if (unlikely(tx == NULL)) { - CNETERR("Can't allocate GET tx for %s\n", - libcfs_nid2str(nid)); - mxlnd_conn_decref(conn); /* for the ref taken above */ - return -ENOMEM; - } - rx_data = mxlnd_get_idle_rx(conn); - if (unlikely(rx_data == NULL)) { - CNETERR("Can't allocate DATA rx for %s\n", - libcfs_nid2str(nid)); - mxlnd_put_idle_tx(tx); - mxlnd_conn_decref(conn); /* for the ref taken above */ - return -ENOMEM; - } - rx_data->mxc_peer = peer; - /* NOTE no need to lock peer before adding conn ref since we took - * a conn ref for the tx (it cannot be freed between there and here ) */ - mxlnd_conn_addref(conn); /* for the rx_data */ - rx_data->mxc_conn = conn; - - ret = mxlnd_recv_data(ni, lntmsg, rx_data, MXLND_MSG_GET_DATA, tx->mxc_cookie); - if (unlikely(ret != 0)) { - CNETERR("Can't setup GET sink for %s\n", - libcfs_nid2str(nid)); - mxlnd_put_idle_rx(rx_data); - mxlnd_put_idle_tx(tx); - mxlnd_conn_decref(conn); /* for the rx_data... */ - mxlnd_conn_decref(conn); /* and for the tx */ - return -EIO; - } - - tx->mxc_peer = peer; - tx->mxc_conn = conn; - /* conn ref taken above */ - mxlnd_init_tx_msg(tx, MXLND_MSG_GET_REQ, sizeof(kmx_getreq_msg_t), nid); - txmsg = tx->mxc_msg; - txmsg->mxm_u.get_req.mxgrm_hdr = *hdr; - txmsg->mxm_u.get_req.mxgrm_cookie = tx->mxc_cookie; - tx->mxc_match = mxlnd_create_match(tx, 0); - - mxlnd_queue_tx(tx); - return 0; - - default: - LBUG(); - mxlnd_conn_decref(conn); /* drop ref taken above */ - return -EIO; - } - - /* send EAGER */ - - LASSERT (offsetof(kmx_msg_t, mxm_u.eager.mxem_payload[payload_nob]) - <= MXLND_MSG_SIZE); - - tx = mxlnd_get_idle_tx(); - if (unlikely(tx == NULL)) { - CNETERR("Can't send %s to %s: tx descs exhausted\n", - mxlnd_lnetmsg_to_str(type), libcfs_nid2str(nid)); - mxlnd_conn_decref(conn); /* drop ref taken above */ - return -ENOMEM; - } - - tx->mxc_peer = peer; - tx->mxc_conn = conn; - /* conn ref taken above */ - nob = offsetof(kmx_eager_msg_t, mxem_payload[payload_nob]); - mxlnd_init_tx_msg (tx, MXLND_MSG_EAGER, nob, nid); - tx->mxc_match = mxlnd_create_match(tx, 0); - - txmsg = tx->mxc_msg; - txmsg->mxm_u.eager.mxem_hdr = *hdr; - - if (payload_kiov != NULL) - lnet_copy_kiov2flat(MXLND_MSG_SIZE, txmsg, - offsetof(kmx_msg_t, mxm_u.eager.mxem_payload), - payload_niov, payload_kiov, payload_offset, payload_nob); - else - lnet_copy_iov2flat(MXLND_MSG_SIZE, txmsg, - offsetof(kmx_msg_t, mxm_u.eager.mxem_payload), - payload_niov, payload_iov, payload_offset, payload_nob); - - tx->mxc_lntmsg[0] = lntmsg; /* finalise lntmsg on completion */ - mxlnd_queue_tx(tx); - return 0; -} - -/** - * mxlnd_recv - the LND required recv function - * @ni - * @private - * @lntmsg - * @delayed - * @niov - * @kiov - * @offset - * @mlen - * @rlen - * - * This must not block. - */ -int -mxlnd_recv (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, int delayed, - unsigned int niov, struct iovec *iov, lnet_kiov_t *kiov, - unsigned int offset, unsigned int mlen, unsigned int rlen) -{ - int ret = 0; - int nob = 0; - int len = 0; - kmx_ctx_t *rx = private; - kmx_msg_t *rxmsg = rx->mxc_msg; - lnet_nid_t nid = rx->mxc_nid; - kmx_ctx_t *tx = NULL; - kmx_msg_t *txmsg = NULL; - kmx_peer_t *peer = rx->mxc_peer; - kmx_conn_t *conn = peer->mxp_conn; - u64 cookie = 0ULL; - int msg_type = rxmsg->mxm_type; - int repost = 1; - int credit = 0; - int finalize = 0; - - LASSERT (mlen <= rlen); - /* Either all pages or all vaddrs */ - LASSERT (!(kiov != NULL && iov != NULL)); - LASSERT (peer && conn); - - /* conn_addref(conn) already taken for the primary rx */ - - switch (msg_type) { - case MXLND_MSG_EAGER: - nob = offsetof(kmx_msg_t, mxm_u.eager.mxem_payload[rlen]); - len = rx->mxc_status.xfer_length; - if (unlikely(nob > len)) { - CNETERR("Eager message from %s too big: %d(%d)\n", - libcfs_nid2str(nid), nob, len); - ret = -EPROTO; - break; - } - - if (kiov != NULL) - lnet_copy_flat2kiov(niov, kiov, offset, - MXLND_MSG_SIZE, rxmsg, - offsetof(kmx_msg_t, mxm_u.eager.mxem_payload), - mlen); - else - lnet_copy_flat2iov(niov, iov, offset, - MXLND_MSG_SIZE, rxmsg, - offsetof(kmx_msg_t, mxm_u.eager.mxem_payload), - mlen); - finalize = 1; - credit = 1; - break; - - case MXLND_MSG_PUT_REQ: - /* we are going to reuse the rx, store the needed info */ - cookie = rxmsg->mxm_u.put_req.mxprm_cookie; - - /* get tx, post rx, send PUT_ACK */ - - tx = mxlnd_get_idle_tx(); - if (unlikely(tx == NULL)) { - CNETERR("Can't allocate tx for %s\n", libcfs_nid2str(nid)); - /* Not replying will break the connection */ - ret = -ENOMEM; - break; - } - if (unlikely(mlen == 0)) { - finalize = 1; - tx->mxc_peer = peer; - tx->mxc_conn = conn; - mxlnd_send_nak(tx, nid, MXLND_MSG_PUT_ACK, 0, cookie); - /* repost = 1 */ - break; - } - - mxlnd_init_tx_msg(tx, MXLND_MSG_PUT_ACK, sizeof(kmx_putack_msg_t), nid); - tx->mxc_peer = peer; - tx->mxc_conn = conn; - /* no need to lock peer first since we already have a ref */ - mxlnd_conn_addref(conn); /* for the tx */ - txmsg = tx->mxc_msg; - txmsg->mxm_u.put_ack.mxpam_src_cookie = cookie; - txmsg->mxm_u.put_ack.mxpam_dst_cookie = tx->mxc_cookie; - tx->mxc_cookie = cookie; - tx->mxc_match = mxlnd_create_match(tx, 0); - - /* we must post a receive _before_ sending the PUT_ACK */ - mxlnd_ctx_init(rx); - rx->mxc_state = MXLND_CTX_PREP; - rx->mxc_peer = peer; - rx->mxc_conn = conn; - /* do not take another ref for this rx, it is already taken */ - rx->mxc_nid = peer->mxp_nid; - ret = mxlnd_recv_data(ni, lntmsg, rx, MXLND_MSG_PUT_DATA, - txmsg->mxm_u.put_ack.mxpam_dst_cookie); - - if (unlikely(ret != 0)) { - /* Notify peer that it's over */ - CNETERR("Can't setup PUT_DATA rx for %s: %d\n", - libcfs_nid2str(nid), ret); - mxlnd_ctx_init(tx); - tx->mxc_state = MXLND_CTX_PREP; - tx->mxc_peer = peer; - tx->mxc_conn = conn; - /* finalize = 0, let the PUT_ACK tx finalize this */ - tx->mxc_lntmsg[0] = rx->mxc_lntmsg[0]; - tx->mxc_lntmsg[1] = rx->mxc_lntmsg[1]; - /* conn ref already taken above */ - mxlnd_send_nak(tx, nid, MXLND_MSG_PUT_ACK, ret, cookie); - /* repost = 1 */ - break; - } - - mxlnd_queue_tx(tx); - /* do not return a credit until after PUT_DATA returns */ - repost = 0; - break; - - case MXLND_MSG_GET_REQ: - cookie = rxmsg->mxm_u.get_req.mxgrm_cookie; - - if (likely(lntmsg != NULL)) { - mxlnd_send_data(ni, lntmsg, rx->mxc_peer, MXLND_MSG_GET_DATA, - cookie); - } else { - /* GET didn't match anything */ - /* The initiator has a rx mapped to [k]iov. We cannot send a nak. - * We have to embed the error code in the match bits. - * Send the error in bits 52-59 and the cookie in bits 0-51 */ - tx = mxlnd_get_idle_tx(); - if (unlikely(tx == NULL)) { - CNETERR("Can't get tx for GET NAK for %s\n", - libcfs_nid2str(nid)); - /* we can't get a tx, notify the peer that the GET failed */ - mxlnd_send_message(conn->mxk_epa, MXLND_MSG_GET_DATA, - ENODATA, cookie); - ret = -ENOMEM; - break; - } - tx->mxc_msg_type = MXLND_MSG_GET_DATA; - tx->mxc_state = MXLND_CTX_PENDING; - tx->mxc_nid = nid; - tx->mxc_peer = peer; - tx->mxc_conn = conn; - /* no need to lock peer first since we already have a ref */ - mxlnd_conn_addref(conn); /* for this tx */ - tx->mxc_cookie = cookie; - tx->mxc_match = mxlnd_create_match(tx, ENODATA); - tx->mxc_pin_type = MX_PIN_PHYSICAL; - mxlnd_queue_tx(tx); - } - /* finalize lntmsg after tx completes */ - break; - - default: - LBUG(); - } - - if (repost) { - /* we received a message, increment peer's outstanding credits */ - if (credit == 1) { - spin_lock(&conn->mxk_lock); - conn->mxk_outstanding++; - spin_unlock(&conn->mxk_lock); - } - /* we are done with the rx */ - mxlnd_put_idle_rx(rx); - mxlnd_conn_decref(conn); - } - - if (finalize == 1) lnet_finalize(kmxlnd_data.kmx_ni, lntmsg, 0); - - /* we received a credit, see if we can use it to send a msg */ - if (credit) mxlnd_check_sends(peer); - - return ret; -} - -void -mxlnd_sleep(unsigned long timeout) -{ - set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(timeout); - return; -} - -/** - * mxlnd_tx_queued - the generic send queue thread - * @arg - thread id (as a void *) - * - * This thread moves send messages from the global tx_queue to the owning - * peer's tx_[msg|data]_queue. If the peer does not exist, it creates one and adds - * it to the global peer list. - */ -int -mxlnd_tx_queued(void *arg) -{ - long id = (long) arg; - int ret = 0; - int found = 0; - kmx_ctx_t *tx = NULL; - kmx_peer_t *peer = NULL; - cfs_list_t *queue = &kmxlnd_data.kmx_tx_queue; - spinlock_t *tx_q_lock = &kmxlnd_data.kmx_tx_queue_lock; - rwlock_t *g_lock = &kmxlnd_data.kmx_global_lock; - - while (!(atomic_read(&kmxlnd_data.kmx_shutdown))) { - ret = down_interruptible(&kmxlnd_data.kmx_tx_queue_sem); - if (atomic_read(&kmxlnd_data.kmx_shutdown)) - break; - if (ret != 0) /* Should we check for -EINTR? */ - continue; - spin_lock(tx_q_lock); - if (cfs_list_empty(&kmxlnd_data.kmx_tx_queue)) { - spin_unlock(tx_q_lock); - continue; - } - tx = cfs_list_entry(queue->next, kmx_ctx_t, mxc_list); - cfs_list_del_init(&tx->mxc_list); - spin_unlock(tx_q_lock); - - found = 0; - peer = mxlnd_find_peer_by_nid(tx->mxc_nid, 0); /* adds ref*/ - if (peer != NULL) { - tx->mxc_peer = peer; - write_lock(g_lock); - if (peer->mxp_conn == NULL) { - ret = mxlnd_conn_alloc_locked(&peer->mxp_conn, - peer); - if (ret != 0) { - /* out of memory: give up, fail tx */ - tx->mxc_errno = -ENOMEM; - mxlnd_peer_decref(peer); - write_unlock(g_lock); - mxlnd_put_idle_tx(tx); - continue; - } - } - tx->mxc_conn = peer->mxp_conn; - mxlnd_conn_addref(tx->mxc_conn); /* for this tx */ - mxlnd_peer_decref(peer); /* drop peer ref taken above */ - write_unlock(g_lock); - mxlnd_queue_tx(tx); - found = 1; - } - if (found == 0) { - int hash = 0; - kmx_peer_t *peer = NULL; - kmx_peer_t *old = NULL; - - hash = mxlnd_nid_to_hash(tx->mxc_nid); - - LASSERT(tx->mxc_msg_type != MXLND_MSG_PUT_DATA && - tx->mxc_msg_type != MXLND_MSG_GET_DATA); - /* create peer */ - /* adds conn ref for this function */ - ret = mxlnd_peer_alloc(&peer, tx->mxc_nid, - *kmxlnd_tunables.kmx_board, - *kmxlnd_tunables.kmx_ep_id, 0ULL); - if (ret != 0) { - /* finalize message */ - tx->mxc_errno = ret; - mxlnd_put_idle_tx(tx); - continue; - } - tx->mxc_peer = peer; - tx->mxc_conn = peer->mxp_conn; - /* this tx will keep the conn ref taken in peer_alloc() */ - - /* add peer to global peer list, but look to see - * if someone already created it after we released - * the read lock */ - write_lock(g_lock); - old = mxlnd_find_peer_by_nid_locked(peer->mxp_nid); - if (old) { - /* we have a peer ref on old */ - if (old->mxp_conn) { - found = 1; - } else { - /* no conn */ - /* drop our ref taken above... */ - mxlnd_peer_decref(old); - /* and delete it */ - mxlnd_del_peer_locked(old); - } - } - - if (found == 0) { - cfs_list_add_tail(&peer->mxp_list, - &kmxlnd_data.kmx_peers[hash]); - atomic_inc(&kmxlnd_data.kmx_npeers); - } else { - tx->mxc_peer = old; - tx->mxc_conn = old->mxp_conn; - LASSERT(old->mxp_conn != NULL); - mxlnd_conn_addref(old->mxp_conn); - mxlnd_conn_decref(peer->mxp_conn); /* drop ref taken above.. */ - mxlnd_conn_decref(peer->mxp_conn); /* drop peer's ref */ - mxlnd_peer_decref(peer); - } - write_unlock(g_lock); - - mxlnd_queue_tx(tx); - } - } - mxlnd_thread_stop(id); - return 0; -} - -/* When calling this, we must not have the peer lock. */ -void -mxlnd_iconnect(kmx_peer_t *peer, u8 msg_type) -{ - mx_return_t mxret = MX_SUCCESS; - mx_request_t request; - kmx_conn_t *conn = peer->mxp_conn; - u64 match = ((u64) msg_type) << MXLND_MSG_OFFSET; - - /* NOTE we are holding a conn ref every time we call this function, - * we do not need to lock the peer before taking another ref */ - mxlnd_conn_addref(conn); /* hold until CONN_REQ or CONN_ACK completes */ - - LASSERT(msg_type == MXLND_MSG_ICON_REQ || msg_type == MXLND_MSG_ICON_ACK); - - if (peer->mxp_reconnect_time == 0) { - peer->mxp_reconnect_time = jiffies; - } - - if (peer->mxp_nic_id == 0ULL) { - int ret = 0; - - ret = mxlnd_ip2nic_id(LNET_NIDADDR(peer->mxp_nid), - &peer->mxp_nic_id, MXLND_LOOKUP_COUNT); - if (ret == 0) { - mx_nic_id_to_board_number(peer->mxp_nic_id, &peer->mxp_board); - } - if (peer->mxp_nic_id == 0ULL && conn->mxk_status == MXLND_CONN_WAIT) { - /* not mapped yet, return */ - spin_lock(&conn->mxk_lock); - mxlnd_set_conn_status(conn, MXLND_CONN_INIT); - spin_unlock(&conn->mxk_lock); - } - } - - if (cfs_time_after(jiffies, - peer->mxp_reconnect_time + MXLND_CONNECT_TIMEOUT) && - conn->mxk_status != MXLND_CONN_DISCONNECT) { - /* give up and notify LNET */ - CDEBUG(D_NET, "timeout trying to connect to %s\n", - libcfs_nid2str(peer->mxp_nid)); - mxlnd_conn_disconnect(conn, 0, 0); - mxlnd_conn_decref(conn); - return; - } - - mxret = mx_iconnect(kmxlnd_data.kmx_endpt, peer->mxp_nic_id, - peer->mxp_ep_id, MXLND_MSG_MAGIC, match, - (void *) peer, &request); - if (unlikely(mxret != MX_SUCCESS)) { - spin_lock(&conn->mxk_lock); - mxlnd_set_conn_status(conn, MXLND_CONN_FAIL); - spin_unlock(&conn->mxk_lock); - CNETERR("mx_iconnect() failed with %s (%d) to %s\n", - mx_strerror(mxret), mxret, libcfs_nid2str(peer->mxp_nid)); - mxlnd_conn_decref(conn); - } - mx_set_request_timeout(kmxlnd_data.kmx_endpt, request, - jiffies_to_msecs(MXLND_CONNECT_TIMEOUT)); - return; -} - -#define MXLND_STATS 0 - -int -mxlnd_check_sends(kmx_peer_t *peer) -{ - int ret = 0; - int found = 0; - mx_return_t mxret = MX_SUCCESS; - kmx_ctx_t *tx = NULL; - kmx_conn_t *conn = NULL; - u8 msg_type = 0; - int credit = 0; - int status = 0; - int ntx_posted = 0; - int credits = 0; -#if MXLND_STATS - static unsigned long last = 0; -#endif - - if (unlikely(peer == NULL)) { - LASSERT(peer != NULL); - return -1; - } - write_lock(&kmxlnd_data.kmx_global_lock); - conn = peer->mxp_conn; - /* NOTE take a ref for the duration of this function since it is - * called when there might not be any queued txs for this peer */ - if (conn) { - if (conn->mxk_status == MXLND_CONN_DISCONNECT) { - write_unlock(&kmxlnd_data.kmx_global_lock); - return -1; - } - mxlnd_conn_addref(conn); /* for duration of this function */ - } - write_unlock(&kmxlnd_data.kmx_global_lock); - - /* do not add another ref for this tx */ - - if (conn == NULL) { - /* we do not have any conns */ - CNETERR("peer %s has no conn\n", libcfs_nid2str(peer->mxp_nid)); - return -1; - } - -#if MXLND_STATS - if (cfs_time_after(jiffies, last)) { - last = jiffies + msecs_to_jiffies(MSEC_PER_SEC); - CDEBUG(D_NET, "status= %s credits= %d outstanding= %d ntx_msgs= %d " - "ntx_posted= %d ntx_data= %d data_posted= %d\n", - mxlnd_connstatus_to_str(conn->mxk_status), conn->mxk_credits, - conn->mxk_outstanding, conn->mxk_ntx_msgs, conn->mxk_ntx_posted, - conn->mxk_ntx_data, conn->mxk_data_posted); - } -#endif - - spin_lock(&conn->mxk_lock); - ntx_posted = conn->mxk_ntx_posted; - credits = conn->mxk_credits; - - LASSERT(ntx_posted <= *kmxlnd_tunables.kmx_peercredits); - LASSERT(ntx_posted >= 0); - - LASSERT(credits <= *kmxlnd_tunables.kmx_peercredits); - LASSERT(credits >= 0); - - /* check number of queued msgs, ignore data */ - if (conn->mxk_outstanding >= MXLND_CREDIT_HIGHWATER()) { - /* check if any txs queued that could return credits... */ - if (cfs_list_empty(&conn->mxk_tx_credit_queue) || - conn->mxk_ntx_msgs == 0) { - /* if not, send a NOOP */ - tx = mxlnd_get_idle_tx(); - if (likely(tx != NULL)) { - tx->mxc_peer = peer; - tx->mxc_conn = peer->mxp_conn; - mxlnd_conn_addref(conn); /* for this tx */ - mxlnd_init_tx_msg (tx, MXLND_MSG_NOOP, 0, peer->mxp_nid); - tx->mxc_match = mxlnd_create_match(tx, 0); - mxlnd_peer_queue_tx_locked(tx); - found = 1; - goto done_locked; - } - } - } - - /* if the peer is not ready, try to connect */ - if (unlikely(conn->mxk_status == MXLND_CONN_INIT || - conn->mxk_status == MXLND_CONN_FAIL)) { - CDEBUG(D_NET, "status=%s\n", mxlnd_connstatus_to_str(conn->mxk_status)); - mxlnd_set_conn_status(conn, MXLND_CONN_WAIT); - spin_unlock(&conn->mxk_lock); - mxlnd_iconnect(peer, (u8) MXLND_MSG_ICON_REQ); - goto done; - } - - while (!cfs_list_empty(&conn->mxk_tx_free_queue) || - !cfs_list_empty(&conn->mxk_tx_credit_queue)) { - /* We have something to send. If we have a queued tx that does not - * require a credit (free), choose it since its completion will - * return a credit (here or at the peer), complete a DATA or - * CONN_REQ or CONN_ACK. */ - cfs_list_t *tmp_tx = NULL; - if (!cfs_list_empty(&conn->mxk_tx_free_queue)) { - tmp_tx = &conn->mxk_tx_free_queue; - } else { - tmp_tx = &conn->mxk_tx_credit_queue; - } - tx = cfs_list_entry(tmp_tx->next, kmx_ctx_t, mxc_list); - - msg_type = tx->mxc_msg_type; - - /* don't try to send a rx */ - LASSERT(tx->mxc_type == MXLND_REQ_TX); - - /* ensure that it is a valid msg type */ - LASSERT(msg_type == MXLND_MSG_CONN_REQ || - msg_type == MXLND_MSG_CONN_ACK || - msg_type == MXLND_MSG_NOOP || - msg_type == MXLND_MSG_EAGER || - msg_type == MXLND_MSG_PUT_REQ || - msg_type == MXLND_MSG_PUT_ACK || - msg_type == MXLND_MSG_PUT_DATA || - msg_type == MXLND_MSG_GET_REQ || - msg_type == MXLND_MSG_GET_DATA); - LASSERT(tx->mxc_peer == peer); - LASSERT(tx->mxc_nid == peer->mxp_nid); - - credit = mxlnd_tx_requires_credit(tx); - if (credit) { - - if (conn->mxk_ntx_posted == *kmxlnd_tunables.kmx_peercredits) { - CDEBUG(D_NET, "%s: posted enough\n", - libcfs_nid2str(peer->mxp_nid)); - goto done_locked; - } - - if (conn->mxk_credits == 0) { - CDEBUG(D_NET, "%s: no credits\n", - libcfs_nid2str(peer->mxp_nid)); - goto done_locked; - } - - if (conn->mxk_credits == 1 && /* last credit reserved for */ - conn->mxk_outstanding == 0) { /* giving back credits */ - CDEBUG(D_NET, "%s: not using last credit\n", - libcfs_nid2str(peer->mxp_nid)); - goto done_locked; - } - } - - if (unlikely(conn->mxk_status != MXLND_CONN_READY)) { - if ( ! (msg_type == MXLND_MSG_CONN_REQ || - msg_type == MXLND_MSG_CONN_ACK)) { - CDEBUG(D_NET, "peer status is %s for tx 0x%llx (%s)\n", - mxlnd_connstatus_to_str(conn->mxk_status), - tx->mxc_cookie, - mxlnd_msgtype_to_str(tx->mxc_msg_type)); - if (conn->mxk_status == MXLND_CONN_DISCONNECT || - cfs_time_aftereq(jiffies, tx->mxc_deadline)) { - cfs_list_del_init(&tx->mxc_list); - tx->mxc_errno = -ECONNABORTED; - spin_unlock(&conn->mxk_lock); - mxlnd_put_idle_tx(tx); - mxlnd_conn_decref(conn); - goto done; - } - goto done_locked; - } - } - - cfs_list_del_init(&tx->mxc_list); - - /* handle credits, etc now while we have the lock to avoid races */ - if (credit) { - conn->mxk_credits--; - conn->mxk_ntx_posted++; - } - if (msg_type != MXLND_MSG_PUT_DATA && - msg_type != MXLND_MSG_GET_DATA) { - if (msg_type != MXLND_MSG_CONN_REQ && - msg_type != MXLND_MSG_CONN_ACK) { - conn->mxk_ntx_msgs--; - } - } - if (tx->mxc_incarnation == 0 && - conn->mxk_incarnation != 0) { - tx->mxc_incarnation = conn->mxk_incarnation; - } - - /* if this is a NOOP and (1) mxp_conn->mxk_outstanding < CREDIT_HIGHWATER - * or (2) there is a non-DATA msg that can return credits in the - * queue, then drop this duplicate NOOP */ - if (unlikely(msg_type == MXLND_MSG_NOOP)) { - if ((conn->mxk_outstanding < MXLND_CREDIT_HIGHWATER()) || - (conn->mxk_ntx_msgs >= 1)) { - conn->mxk_credits++; - conn->mxk_ntx_posted--; - spin_unlock(&conn->mxk_lock); - /* redundant NOOP */ - mxlnd_put_idle_tx(tx); - mxlnd_conn_decref(conn); - CDEBUG(D_NET, "%s: redundant noop\n", - libcfs_nid2str(peer->mxp_nid)); - found = 1; - goto done; - } - } - - found = 1; - if (likely((msg_type != MXLND_MSG_PUT_DATA) && - (msg_type != MXLND_MSG_GET_DATA))) { - mxlnd_pack_msg_locked(tx); - } - - mxret = MX_SUCCESS; - - status = conn->mxk_status; - spin_unlock(&conn->mxk_lock); - - if (likely((status == MXLND_CONN_READY) || - (msg_type == MXLND_MSG_CONN_REQ) || - (msg_type == MXLND_MSG_CONN_ACK))) { - ret = 0; - if (msg_type != MXLND_MSG_CONN_REQ && - msg_type != MXLND_MSG_CONN_ACK) { - /* add to the pending list */ - ret = mxlnd_q_pending_ctx(tx); - } else { - /* CONN_REQ/ACK */ - tx->mxc_state = MXLND_CTX_PENDING; - } - - if (ret == 0) { - if (likely(msg_type != MXLND_MSG_PUT_DATA && - msg_type != MXLND_MSG_GET_DATA)) { - /* send a msg style tx */ - LASSERT(tx->mxc_nseg == 1); - LASSERT(tx->mxc_pin_type == MX_PIN_PHYSICAL); - CDEBUG(D_NET, "sending %s 0x%llx\n", - mxlnd_msgtype_to_str(msg_type), - tx->mxc_cookie); - mxret = mx_kisend(kmxlnd_data.kmx_endpt, - &tx->mxc_seg, - tx->mxc_nseg, - tx->mxc_pin_type, - conn->mxk_epa, - tx->mxc_match, - (void *) tx, - &tx->mxc_mxreq); - } else { - /* send a DATA tx */ - spin_lock(&conn->mxk_lock); - conn->mxk_ntx_data--; - conn->mxk_data_posted++; - spin_unlock(&conn->mxk_lock); - CDEBUG(D_NET, "sending %s 0x%llx\n", - mxlnd_msgtype_to_str(msg_type), - tx->mxc_cookie); - mxret = mx_kisend(kmxlnd_data.kmx_endpt, - tx->mxc_seg_list, - tx->mxc_nseg, - tx->mxc_pin_type, - conn->mxk_epa, - tx->mxc_match, - (void *) tx, - &tx->mxc_mxreq); - } - } else { - /* ret != 0 */ - mxret = MX_CONNECTION_FAILED; - } - if (likely(mxret == MX_SUCCESS)) { - ret = 0; - } else { - CNETERR("mx_kisend() failed with %s (%d) " - "sending to %s\n", mx_strerror(mxret), (int) mxret, - libcfs_nid2str(peer->mxp_nid)); - /* NOTE mx_kisend() only fails if there are not enough - * resources. Do not change the connection status. */ - if (mxret == MX_NO_RESOURCES) { - tx->mxc_errno = -ENOMEM; - } else { - tx->mxc_errno = -ECONNABORTED; - } - if (credit) { - spin_lock(&conn->mxk_lock); - conn->mxk_ntx_posted--; - conn->mxk_credits++; - spin_unlock(&conn->mxk_lock); - } else if (msg_type == MXLND_MSG_PUT_DATA || - msg_type == MXLND_MSG_GET_DATA) { - spin_lock(&conn->mxk_lock); - conn->mxk_data_posted--; - spin_unlock(&conn->mxk_lock); - } - if (msg_type != MXLND_MSG_PUT_DATA && - msg_type != MXLND_MSG_GET_DATA && - msg_type != MXLND_MSG_CONN_REQ && - msg_type != MXLND_MSG_CONN_ACK) { - spin_lock(&conn->mxk_lock); - conn->mxk_outstanding += - tx->mxc_msg->mxm_credits; - spin_unlock(&conn->mxk_lock); - } - if (msg_type != MXLND_MSG_CONN_REQ && - msg_type != MXLND_MSG_CONN_ACK) { - /* remove from the pending list */ - mxlnd_deq_pending_ctx(tx); - } - mxlnd_put_idle_tx(tx); - mxlnd_conn_decref(conn); - } - } - spin_lock(&conn->mxk_lock); - } -done_locked: - spin_unlock(&conn->mxk_lock); -done: - mxlnd_conn_decref(conn); /* drop ref taken at start of function */ - return found; -} - - -/** - * mxlnd_handle_tx_completion - a tx completed, progress or complete the msg - * @ctx - the tx descriptor - * - * Determine which type of send request it was and start the next step, if needed, - * or, if done, signal completion to LNET. After we are done, put back on the - * idle tx list. - */ -void -mxlnd_handle_tx_completion(kmx_ctx_t *tx) -{ - int code = tx->mxc_status.code; - int failed = (code != MX_STATUS_SUCCESS || tx->mxc_errno != 0); - kmx_msg_t *msg = tx->mxc_msg; - kmx_peer_t *peer = tx->mxc_peer; - kmx_conn_t *conn = tx->mxc_conn; - u8 type = tx->mxc_msg_type; - int credit = mxlnd_tx_requires_credit(tx); - u64 cookie = tx->mxc_cookie; - - CDEBUG(D_NET, "entering %s (0x%llx):\n", - mxlnd_msgtype_to_str(tx->mxc_msg_type), cookie); - - LASSERT (peer != NULL); - LASSERT (conn != NULL); - - if (type != MXLND_MSG_PUT_DATA && type != MXLND_MSG_GET_DATA) { - LASSERT (type == msg->mxm_type); - } - - if (failed) { - if (tx->mxc_errno == 0) tx->mxc_errno = -EIO; - } else { - spin_lock(&conn->mxk_lock); - conn->mxk_last_tx = cfs_time_current(); /* jiffies */ - spin_unlock(&conn->mxk_lock); - } - - switch (type) { - - case MXLND_MSG_GET_DATA: - spin_lock(&conn->mxk_lock); - if (conn->mxk_incarnation == tx->mxc_incarnation) { - conn->mxk_outstanding++; - conn->mxk_data_posted--; - } - spin_unlock(&conn->mxk_lock); - break; - - case MXLND_MSG_PUT_DATA: - spin_lock(&conn->mxk_lock); - if (conn->mxk_incarnation == tx->mxc_incarnation) { - conn->mxk_data_posted--; - } - spin_unlock(&conn->mxk_lock); - break; - - case MXLND_MSG_NOOP: - case MXLND_MSG_PUT_REQ: - case MXLND_MSG_PUT_ACK: - case MXLND_MSG_GET_REQ: - case MXLND_MSG_EAGER: - break; - - case MXLND_MSG_CONN_ACK: - if (peer->mxp_incompatible) { - /* we sent our params, now close this conn */ - mxlnd_conn_disconnect(conn, 0, 1); - } - case MXLND_MSG_CONN_REQ: - if (failed) { - CNETERR("%s failed with %s (%d) (errno = %d) to %s\n", - type == MXLND_MSG_CONN_REQ ? "CONN_REQ" : "CONN_ACK", - mx_strstatus(code), code, tx->mxc_errno, - libcfs_nid2str(tx->mxc_nid)); - if (!peer->mxp_incompatible) { - spin_lock(&conn->mxk_lock); - if (code == MX_STATUS_BAD_SESSION) - mxlnd_set_conn_status(conn, - MXLND_CONN_INIT); - else - mxlnd_set_conn_status(conn, - MXLND_CONN_FAIL); - spin_unlock(&conn->mxk_lock); - } - } - break; - - default: - CNETERR("Unknown msg type of %d\n", type); - LBUG(); - } - - if (credit) { - spin_lock(&conn->mxk_lock); - if (conn->mxk_incarnation == tx->mxc_incarnation) { - conn->mxk_ntx_posted--; - } - spin_unlock(&conn->mxk_lock); - } - - mxlnd_put_idle_tx(tx); - mxlnd_conn_decref(conn); - - mxlnd_check_sends(peer); - - CDEBUG(D_NET, "leaving\n"); - return; -} - -/* Handle completion of MSG or DATA rx. - * CONN_REQ and CONN_ACK are handled elsewhere. */ -void -mxlnd_handle_rx_completion(kmx_ctx_t *rx) -{ - int ret = 0; - int repost = 1; - int credit = 1; - u32 nob = rx->mxc_status.xfer_length; - u64 bits = rx->mxc_status.match_info; - kmx_msg_t *msg = rx->mxc_msg; - kmx_peer_t *peer = rx->mxc_peer; - kmx_conn_t *conn = rx->mxc_conn; - u8 type = rx->mxc_msg_type; - u64 seq = bits; - lnet_msg_t *lntmsg[2]; - int result = 0; - int peer_ref = 0; - int conn_ref = 0; - - /* NOTE We may only know the peer's nid if it is a PUT_REQ, GET_REQ, - * failed GET reply */ - - /* NOTE peer may still be NULL if it is a new peer and - * conn may be NULL if this is a re-connect */ - if (likely(peer != NULL && conn != NULL)) { - /* we have a reference on the conn */ - conn_ref = 1; - } else if (peer != NULL && conn == NULL) { - /* we have a reference on the peer */ - peer_ref = 1; - } else if (peer == NULL && conn != NULL) { - /* fatal error */ - CERROR("rx 0x%llx from %s has conn but no peer\n", - bits, libcfs_nid2str(rx->mxc_nid)); - LBUG(); - } /* else peer and conn == NULL */ - - if (conn == NULL && peer != NULL) { - write_lock(&kmxlnd_data.kmx_global_lock); - conn = peer->mxp_conn; - if (conn) { - mxlnd_conn_addref(conn); /* conn takes ref... */ - mxlnd_peer_decref(peer); /* from peer */ - conn_ref = 1; - peer_ref = 0; - } - write_unlock(&kmxlnd_data.kmx_global_lock); - rx->mxc_conn = conn; - } - -#if MXLND_DEBUG - CDEBUG(D_NET, "receiving msg bits=0x%llx nob=%d peer=0x%p\n", bits, nob, peer); -#endif - - lntmsg[0] = NULL; - lntmsg[1] = NULL; - - if (rx->mxc_status.code != MX_STATUS_SUCCESS && - rx->mxc_status.code != MX_STATUS_TRUNCATED) { - CNETERR("rx from %s failed with %s (%d)\n", - libcfs_nid2str(rx->mxc_nid), - mx_strstatus(rx->mxc_status.code), - rx->mxc_status.code); - credit = 0; - goto cleanup; - } - - if (nob == 0) { - /* this may be a failed GET reply */ - if (type == MXLND_MSG_GET_DATA) { - /* get the error (52-59) bits from the match bits */ - ret = (u32) MXLND_ERROR_VAL(rx->mxc_status.match_info); - lntmsg[0] = rx->mxc_lntmsg[0]; - result = -ret; - goto cleanup; - } else { - /* we had a rx complete with 0 bytes (no hdr, nothing) */ - CNETERR("rx from %s returned with 0 bytes\n", - libcfs_nid2str(rx->mxc_nid)); - goto cleanup; - } - } - - /* NOTE PUT_DATA and GET_DATA do not have mxc_msg, do not call unpack() */ - if (type == MXLND_MSG_PUT_DATA) { - /* result = 0; */ - lntmsg[0] = rx->mxc_lntmsg[0]; - goto cleanup; - } else if (type == MXLND_MSG_GET_DATA) { - /* result = 0; */ - lntmsg[0] = rx->mxc_lntmsg[0]; - lntmsg[1] = rx->mxc_lntmsg[1]; - goto cleanup; - } - - ret = mxlnd_unpack_msg(msg, nob); - if (ret != 0) { - CNETERR("Error %d unpacking rx from %s\n", - ret, libcfs_nid2str(rx->mxc_nid)); - goto cleanup; - } - rx->mxc_nob = nob; - type = msg->mxm_type; - - if (rx->mxc_nid != msg->mxm_srcnid || - kmxlnd_data.kmx_ni->ni_nid != msg->mxm_dstnid) { - CNETERR("rx with mismatched NID (type %s) (my nid is " - "0x%llx and rx msg dst is 0x%llx)\n", - mxlnd_msgtype_to_str(type), kmxlnd_data.kmx_ni->ni_nid, - msg->mxm_dstnid); - goto cleanup; - } - - if ((conn != NULL && msg->mxm_srcstamp != conn->mxk_incarnation) || - msg->mxm_dststamp != kmxlnd_data.kmx_incarnation) { - CNETERR("Stale rx from %s with type %s " - "(mxm_srcstamp (%lld) != mxk_incarnation (%lld) " - "|| mxm_dststamp (%lld) != kmx_incarnation (%lld))\n", - libcfs_nid2str(rx->mxc_nid), mxlnd_msgtype_to_str(type), - msg->mxm_srcstamp, conn->mxk_incarnation, - msg->mxm_dststamp, kmxlnd_data.kmx_incarnation); - credit = 0; - goto cleanup; - } - - CDEBUG(D_NET, "Received %s with %d credits\n", - mxlnd_msgtype_to_str(type), msg->mxm_credits); - - LASSERT(peer != NULL && conn != NULL); - if (msg->mxm_credits != 0) { - spin_lock(&conn->mxk_lock); - if (msg->mxm_srcstamp == conn->mxk_incarnation) { - if ((conn->mxk_credits + msg->mxm_credits) > - *kmxlnd_tunables.kmx_peercredits) { - CNETERR("mxk_credits %d mxm_credits %d\n", - conn->mxk_credits, msg->mxm_credits); - } - conn->mxk_credits += msg->mxm_credits; - LASSERT(conn->mxk_credits >= 0); - LASSERT(conn->mxk_credits <= *kmxlnd_tunables.kmx_peercredits); - } - spin_unlock(&conn->mxk_lock); - } - - CDEBUG(D_NET, "switch %s for rx (0x%llx)\n", mxlnd_msgtype_to_str(type), seq); - switch (type) { - case MXLND_MSG_NOOP: - break; - - case MXLND_MSG_EAGER: - ret = lnet_parse(kmxlnd_data.kmx_ni, &msg->mxm_u.eager.mxem_hdr, - msg->mxm_srcnid, rx, 0); - repost = ret < 0; - break; - - case MXLND_MSG_PUT_REQ: - ret = lnet_parse(kmxlnd_data.kmx_ni, &msg->mxm_u.put_req.mxprm_hdr, - msg->mxm_srcnid, rx, 1); - repost = ret < 0; - break; - - case MXLND_MSG_PUT_ACK: { - u64 cookie = (u64) msg->mxm_u.put_ack.mxpam_dst_cookie; - if (cookie > MXLND_MAX_COOKIE) { - CNETERR("NAK for msg_type %d from %s\n", rx->mxc_msg_type, - libcfs_nid2str(rx->mxc_nid)); - result = -((u32) MXLND_ERROR_VAL(cookie)); - lntmsg[0] = rx->mxc_lntmsg[0]; - } else { - mxlnd_send_data(kmxlnd_data.kmx_ni, rx->mxc_lntmsg[0], - rx->mxc_peer, MXLND_MSG_PUT_DATA, - rx->mxc_msg->mxm_u.put_ack.mxpam_dst_cookie); - } - /* repost == 1 */ - break; - } - case MXLND_MSG_GET_REQ: - ret = lnet_parse(kmxlnd_data.kmx_ni, &msg->mxm_u.get_req.mxgrm_hdr, - msg->mxm_srcnid, rx, 1); - repost = ret < 0; - break; - - default: - CNETERR("Bad MXLND message type %x from %s\n", msg->mxm_type, - libcfs_nid2str(rx->mxc_nid)); - ret = -EPROTO; - break; - } - - if (ret < 0) { - CDEBUG(D_NET, "setting PEER_CONN_FAILED\n"); - spin_lock(&conn->mxk_lock); - mxlnd_set_conn_status(conn, MXLND_CONN_FAIL); - spin_unlock(&conn->mxk_lock); - } - -cleanup: - if (conn != NULL) { - spin_lock(&conn->mxk_lock); - conn->mxk_last_rx = cfs_time_current(); /* jiffies */ - spin_unlock(&conn->mxk_lock); - } - - if (repost) { - /* lnet_parse() failed, etc., repost now */ - mxlnd_put_idle_rx(rx); - if (conn != NULL && credit == 1) { - if (type == MXLND_MSG_PUT_DATA || - type == MXLND_MSG_EAGER || - type == MXLND_MSG_PUT_REQ || - type == MXLND_MSG_NOOP) { - spin_lock(&conn->mxk_lock); - conn->mxk_outstanding++; - spin_unlock(&conn->mxk_lock); - } - } - if (conn_ref) mxlnd_conn_decref(conn); - LASSERT(peer_ref == 0); - } - - if (type == MXLND_MSG_PUT_DATA || type == MXLND_MSG_GET_DATA) { - CDEBUG(D_NET, "leaving for rx (0x%llx)\n", bits); - } else { - CDEBUG(D_NET, "leaving for rx (0x%llx)\n", seq); - } - - if (lntmsg[0] != NULL) lnet_finalize(kmxlnd_data.kmx_ni, lntmsg[0], result); - if (lntmsg[1] != NULL) lnet_finalize(kmxlnd_data.kmx_ni, lntmsg[1], result); - - if (conn != NULL && credit == 1) mxlnd_check_sends(peer); - - return; -} - -void -mxlnd_handle_connect_msg(kmx_peer_t *peer, u8 msg_type, mx_status_t status) -{ - kmx_ctx_t *tx = NULL; - kmx_msg_t *txmsg = NULL; - kmx_conn_t *conn = peer->mxp_conn; - u64 nic_id = 0ULL; - u32 ep_id = 0; - u32 sid = 0; - u8 type = (msg_type == MXLND_MSG_ICON_REQ ? - MXLND_MSG_CONN_REQ : MXLND_MSG_CONN_ACK); - - /* a conn ref was taken when calling mx_iconnect(), - * hold it until CONN_REQ or CONN_ACK completes */ - - CDEBUG(D_NET, "entering\n"); - if (status.code != MX_STATUS_SUCCESS) { - int send_bye = (msg_type == MXLND_MSG_ICON_REQ ? 0 : 1); - - CNETERR("mx_iconnect() failed for %s with %s (%d) " - "to %s mxp_nid = 0x%llx mxp_nic_id = 0x%0llx mxp_ep_id = %d\n", - mxlnd_msgtype_to_str(msg_type), - mx_strstatus(status.code), status.code, - libcfs_nid2str(peer->mxp_nid), - peer->mxp_nid, - peer->mxp_nic_id, - peer->mxp_ep_id); - spin_lock(&conn->mxk_lock); - mxlnd_set_conn_status(conn, MXLND_CONN_FAIL); - spin_unlock(&conn->mxk_lock); - - if (cfs_time_after(jiffies, peer->mxp_reconnect_time + - MXLND_CONNECT_TIMEOUT)) { - CNETERR("timeout, calling conn_disconnect()\n"); - mxlnd_conn_disconnect(conn, 0, send_bye); - } - - mxlnd_conn_decref(conn); - return; - } - mx_decompose_endpoint_addr2(status.source, &nic_id, &ep_id, &sid); - write_lock(&kmxlnd_data.kmx_global_lock); - spin_lock(&conn->mxk_lock); - conn->mxk_epa = status.source; - mx_set_endpoint_addr_context(conn->mxk_epa, (void *) conn); - if (msg_type == MXLND_MSG_ICON_ACK && likely(!peer->mxp_incompatible)) { - mxlnd_set_conn_status(conn, MXLND_CONN_READY); - } - spin_unlock(&conn->mxk_lock); - write_unlock(&kmxlnd_data.kmx_global_lock); - - /* mx_iconnect() succeeded, reset delay to 0 */ - write_lock(&kmxlnd_data.kmx_global_lock); - peer->mxp_reconnect_time = 0; - peer->mxp_conn->mxk_sid = sid; - write_unlock(&kmxlnd_data.kmx_global_lock); - - /* marshal CONN_REQ or CONN_ACK msg */ - /* we are still using the conn ref from iconnect() - do not take another */ - tx = mxlnd_get_idle_tx(); - if (tx == NULL) { - CNETERR("Can't obtain %s tx for %s\n", - mxlnd_msgtype_to_str(type), - libcfs_nid2str(peer->mxp_nid)); - spin_lock(&conn->mxk_lock); - mxlnd_set_conn_status(conn, MXLND_CONN_FAIL); - spin_unlock(&conn->mxk_lock); - mxlnd_conn_decref(conn); - return; - } - - tx->mxc_peer = peer; - tx->mxc_conn = conn; - tx->mxc_deadline = jiffies + MXLND_CONNECT_TIMEOUT; - CDEBUG(D_NET, "sending %s\n", mxlnd_msgtype_to_str(type)); - mxlnd_init_tx_msg (tx, type, sizeof(kmx_connreq_msg_t), peer->mxp_nid); - txmsg = tx->mxc_msg; - txmsg->mxm_u.conn_req.mxcrm_queue_depth = *kmxlnd_tunables.kmx_peercredits; - txmsg->mxm_u.conn_req.mxcrm_eager_size = MXLND_MSG_SIZE; - tx->mxc_match = mxlnd_create_match(tx, 0); - - mxlnd_queue_tx(tx); - return; -} - -/** - * mxlnd_request_waitd - the MX request completion thread(s) - * @arg - thread id (as a void *) - * - * This thread waits for a MX completion and then completes the request. - * We will create one thread per CPU. - */ -int -mxlnd_request_waitd(void *arg) -{ - long id = (long) arg; - __u32 result = 0; - mx_return_t mxret = MX_SUCCESS; - mx_status_t status; - kmx_ctx_t *ctx = NULL; - enum kmx_req_state req_type = MXLND_REQ_TX; - kmx_peer_t *peer = NULL; - kmx_conn_t *conn = NULL; -#if MXLND_POLLING - int count = 0; -#endif - - memset(&status, 0, sizeof(status)); - - CDEBUG(D_NET, "%s starting\n", name); - - while (!(atomic_read(&kmxlnd_data.kmx_shutdown))) { - u8 msg_type = 0; - - mxret = MX_SUCCESS; - result = 0; -#if MXLND_POLLING - if (id == 0 && count++ < *kmxlnd_tunables.kmx_polling) { - mxret = mx_test_any(kmxlnd_data.kmx_endpt, 0ULL, 0ULL, - &status, &result); - } else { - count = 0; - mxret = mx_wait_any(kmxlnd_data.kmx_endpt, MXLND_WAIT_TIMEOUT, - 0ULL, 0ULL, &status, &result); - } -#else - mxret = mx_wait_any(kmxlnd_data.kmx_endpt, MXLND_WAIT_TIMEOUT, - 0ULL, 0ULL, &status, &result); -#endif - if (unlikely(atomic_read(&kmxlnd_data.kmx_shutdown))) - break; - - if (result != 1) { - /* nothing completed... */ - continue; - } - - CDEBUG(D_NET, "wait_any() returned with %s (%d) with " - "match_info 0x%llx and length %d\n", - mx_strstatus(status.code), status.code, - (u64) status.match_info, status.msg_length); - - if (status.code != MX_STATUS_SUCCESS) { - CNETERR("wait_any() failed with %s (%d) with " - "match_info 0x%llx and length %d\n", - mx_strstatus(status.code), status.code, - (u64) status.match_info, status.msg_length); - } - - msg_type = MXLND_MSG_TYPE(status.match_info); - - /* This may be a mx_iconnect() request completing, - * check the bit mask for CONN_REQ and CONN_ACK */ - if (msg_type == MXLND_MSG_ICON_REQ || - msg_type == MXLND_MSG_ICON_ACK) { - peer = (kmx_peer_t*) status.context; - mxlnd_handle_connect_msg(peer, msg_type, status); - continue; - } - - /* This must be a tx or rx */ - - /* NOTE: if this is a RX from the unexpected callback, it may - * have very little info. If we dropped it in unexpected_recv(), - * it will not have a context. If so, ignore it. */ - ctx = (kmx_ctx_t *) status.context; - if (ctx != NULL) { - - req_type = ctx->mxc_type; - conn = ctx->mxc_conn; /* this may be NULL */ - mxlnd_deq_pending_ctx(ctx); - - /* copy status to ctx->mxc_status */ - ctx->mxc_status = status; - - switch (req_type) { - case MXLND_REQ_TX: - mxlnd_handle_tx_completion(ctx); - break; - case MXLND_REQ_RX: - mxlnd_handle_rx_completion(ctx); - break; - default: - CNETERR("Unknown ctx type %d\n", req_type); - LBUG(); - break; - } - - /* conn is always set except for the first CONN_REQ rx - * from a new peer */ - if (status.code != MX_STATUS_SUCCESS && conn != NULL) { - mxlnd_conn_disconnect(conn, 1, 1); - } - } - CDEBUG(D_NET, "waitd() completed task\n"); - } - CDEBUG(D_NET, "%s stopping\n", name); - mxlnd_thread_stop(id); - return 0; -} - - -unsigned long -mxlnd_check_timeouts(unsigned long now) -{ - int i = 0; - int disconnect = 0; - unsigned long next = 0; /* jiffies */ - kmx_peer_t *peer = NULL; - kmx_conn_t *conn = NULL; - rwlock_t *g_lock = &kmxlnd_data.kmx_global_lock; - - read_lock(g_lock); - for (i = 0; i < MXLND_HASH_SIZE; i++) { - cfs_list_for_each_entry(peer, &kmxlnd_data.kmx_peers[i], - mxp_list) { - - if (unlikely(atomic_read(&kmxlnd_data.kmx_shutdown))) { - read_unlock(g_lock); - return next; - } - - conn = peer->mxp_conn; - if (conn) { - mxlnd_conn_addref(conn); - } else { - continue; - } - - spin_lock(&conn->mxk_lock); - - /* if nothing pending (timeout == 0) or - * if conn is already disconnected, - * skip this conn */ - if (conn->mxk_timeout == 0 || - conn->mxk_status == MXLND_CONN_DISCONNECT) { - spin_unlock(&conn->mxk_lock); - mxlnd_conn_decref(conn); - continue; - } - - /* we want to find the timeout that will occur first. - * if it is in the future, we will sleep until then. - * if it is in the past, then we will sleep one - * second and repeat the process. */ - if ((next == 0) || - (cfs_time_before(conn->mxk_timeout, next))) { - next = conn->mxk_timeout; - } - - disconnect = 0; - - if (cfs_time_aftereq(now, conn->mxk_timeout)) - disconnect = 1; - spin_unlock(&conn->mxk_lock); - - if (disconnect) - mxlnd_conn_disconnect(conn, 1, 1); - mxlnd_conn_decref(conn); - } - } - read_unlock(g_lock); - if (next == 0) - next = now + MXLND_COMM_TIMEOUT; - - return next; -} - -void -mxlnd_passive_connect(kmx_connparams_t *cp) -{ - int ret = 0; - int incompatible = 0; - u64 nic_id = 0ULL; - u32 ep_id = 0; - u32 sid = 0; - int conn_ref = 0; - kmx_msg_t *msg = &cp->mxr_msg; - kmx_peer_t *peer = cp->mxr_peer; - kmx_conn_t *conn = NULL; - rwlock_t *g_lock = &kmxlnd_data.kmx_global_lock; - - mx_decompose_endpoint_addr2(cp->mxr_epa, &nic_id, &ep_id, &sid); - - ret = mxlnd_unpack_msg(msg, cp->mxr_nob); - if (ret != 0) { - if (peer) { - CNETERR("Error %d unpacking CONN_REQ from %s\n", - ret, libcfs_nid2str(peer->mxp_nid)); - } else { - CNETERR("Error %d unpacking CONN_REQ from " - "unknown host with nic_id 0x%llx\n", ret, nic_id); - } - goto cleanup; - } - if (kmxlnd_data.kmx_ni->ni_nid != msg->mxm_dstnid) { - CNETERR("Can't accept %s: bad dst nid %s\n", - libcfs_nid2str(msg->mxm_srcnid), - libcfs_nid2str(msg->mxm_dstnid)); - goto cleanup; - } - if (msg->mxm_u.conn_req.mxcrm_queue_depth != *kmxlnd_tunables.kmx_peercredits) { - CNETERR("Can't accept %s: incompatible queue depth " - "%d (%d wanted)\n", - libcfs_nid2str(msg->mxm_srcnid), - msg->mxm_u.conn_req.mxcrm_queue_depth, - *kmxlnd_tunables.kmx_peercredits); - incompatible = 1; - } - if (msg->mxm_u.conn_req.mxcrm_eager_size != MXLND_MSG_SIZE) { - CNETERR("Can't accept %s: incompatible EAGER size " - "%d (%d wanted)\n", - libcfs_nid2str(msg->mxm_srcnid), - msg->mxm_u.conn_req.mxcrm_eager_size, - (int) MXLND_MSG_SIZE); - incompatible = 1; - } - - if (peer == NULL) { - peer = mxlnd_find_peer_by_nid(msg->mxm_srcnid, 0); /* adds peer ref */ - if (peer == NULL) { - int hash = 0; - u32 board = 0; - kmx_peer_t *existing_peer = NULL; - - hash = mxlnd_nid_to_hash(msg->mxm_srcnid); - - mx_nic_id_to_board_number(nic_id, &board); - - /* adds conn ref for peer and one for this function */ - ret = mxlnd_peer_alloc(&peer, msg->mxm_srcnid, - board, ep_id, 0ULL); - if (ret != 0) { - goto cleanup; - } - peer->mxp_conn->mxk_sid = sid; - LASSERT(peer->mxp_ep_id == ep_id); - write_lock(g_lock); - existing_peer = mxlnd_find_peer_by_nid_locked(msg->mxm_srcnid); - if (existing_peer) { - mxlnd_conn_decref(peer->mxp_conn); - mxlnd_peer_decref(peer); - peer = existing_peer; - mxlnd_conn_addref(peer->mxp_conn); - conn = peer->mxp_conn; - } else { - cfs_list_add_tail(&peer->mxp_list, - &kmxlnd_data.kmx_peers[hash]); - atomic_inc(&kmxlnd_data.kmx_npeers); - } - write_unlock(g_lock); - } else { - ret = mxlnd_conn_alloc(&conn, peer); /* adds 2nd ref */ - write_lock(g_lock); - mxlnd_peer_decref(peer); /* drop ref taken above */ - write_unlock(g_lock); - if (ret != 0) { - CNETERR("Cannot allocate mxp_conn\n"); - goto cleanup; - } - } - conn_ref = 1; /* peer/conn_alloc() added ref for this function */ - conn = peer->mxp_conn; - } else { /* unexpected handler found peer */ - kmx_conn_t *old_conn = peer->mxp_conn; - - if (sid != peer->mxp_conn->mxk_sid) { - /* do not call mx_disconnect() or send a BYE */ - mxlnd_conn_disconnect(old_conn, 0, 0); - - /* This allocs a conn, points peer->mxp_conn to this one. - * The old conn is still on the peer->mxp_conns list. - * As the pending requests complete, they will call - * conn_decref() which will eventually free it. */ - ret = mxlnd_conn_alloc(&conn, peer); - if (ret != 0) { - CNETERR("Cannot allocate peer->mxp_conn\n"); - goto cleanup; - } - /* conn_alloc() adds one ref for the peer and one - * for this function */ - conn_ref = 1; - - peer->mxp_conn->mxk_sid = sid; - } else { - /* same sid */ - conn = peer->mxp_conn; - } - } - write_lock(g_lock); - peer->mxp_incompatible = incompatible; - write_unlock(g_lock); - spin_lock(&conn->mxk_lock); - conn->mxk_incarnation = msg->mxm_srcstamp; - mxlnd_set_conn_status(conn, MXLND_CONN_WAIT); - spin_unlock(&conn->mxk_lock); - - /* handle_conn_ack() will create the CONN_ACK msg */ - mxlnd_iconnect(peer, (u8) MXLND_MSG_ICON_ACK); - -cleanup: - if (conn_ref) mxlnd_conn_decref(conn); - - mxlnd_connparams_free(cp); - return; -} - -void -mxlnd_check_conn_ack(kmx_connparams_t *cp) -{ - int ret = 0; - int incompatible = 0; - u64 nic_id = 0ULL; - u32 ep_id = 0; - u32 sid = 0; - kmx_msg_t *msg = &cp->mxr_msg; - kmx_peer_t *peer = cp->mxr_peer; - kmx_conn_t *conn = cp->mxr_conn; - - mx_decompose_endpoint_addr2(cp->mxr_epa, &nic_id, &ep_id, &sid); - - ret = mxlnd_unpack_msg(msg, cp->mxr_nob); - if (ret != 0) { - if (peer) { - CNETERR("Error %d unpacking CONN_ACK from %s\n", - ret, libcfs_nid2str(peer->mxp_nid)); - } else { - CNETERR("Error %d unpacking CONN_ACK from " - "unknown host with nic_id 0x%llx\n", ret, nic_id); - } - ret = -1; - incompatible = 1; - goto failed; - } - if (kmxlnd_data.kmx_ni->ni_nid != msg->mxm_dstnid) { - CNETERR("Can't accept CONN_ACK from %s: " - "bad dst nid %s\n", libcfs_nid2str(msg->mxm_srcnid), - libcfs_nid2str(msg->mxm_dstnid)); - ret = -1; - goto failed; - } - if (msg->mxm_u.conn_req.mxcrm_queue_depth != *kmxlnd_tunables.kmx_peercredits) { - CNETERR("Can't accept CONN_ACK from %s: " - "incompatible queue depth %d (%d wanted)\n", - libcfs_nid2str(msg->mxm_srcnid), - msg->mxm_u.conn_req.mxcrm_queue_depth, - *kmxlnd_tunables.kmx_peercredits); - incompatible = 1; - ret = -1; - goto failed; - } - if (msg->mxm_u.conn_req.mxcrm_eager_size != MXLND_MSG_SIZE) { - CNETERR("Can't accept CONN_ACK from %s: " - "incompatible EAGER size %d (%d wanted)\n", - libcfs_nid2str(msg->mxm_srcnid), - msg->mxm_u.conn_req.mxcrm_eager_size, - (int) MXLND_MSG_SIZE); - incompatible = 1; - ret = -1; - goto failed; - } - write_lock(&kmxlnd_data.kmx_global_lock); - peer->mxp_incompatible = incompatible; - write_unlock(&kmxlnd_data.kmx_global_lock); - spin_lock(&conn->mxk_lock); - conn->mxk_credits = *kmxlnd_tunables.kmx_peercredits; - conn->mxk_outstanding = 0; - conn->mxk_incarnation = msg->mxm_srcstamp; - conn->mxk_timeout = 0; - if (!incompatible) { - CDEBUG(D_NET, "setting peer %s CONN_READY\n", - libcfs_nid2str(msg->mxm_srcnid)); - mxlnd_set_conn_status(conn, MXLND_CONN_READY); - } - spin_unlock(&conn->mxk_lock); - - if (!incompatible) - mxlnd_check_sends(peer); - -failed: - if (ret < 0) { - spin_lock(&conn->mxk_lock); - mxlnd_set_conn_status(conn, MXLND_CONN_FAIL); - spin_unlock(&conn->mxk_lock); - } - - if (incompatible) mxlnd_conn_disconnect(conn, 0, 0); - - mxlnd_connparams_free(cp); - return; -} - -int -mxlnd_abort_msgs(void) -{ - int count = 0; - cfs_list_t *orphans = &kmxlnd_data.kmx_orphan_msgs; - spinlock_t *g_conn_lock = &kmxlnd_data.kmx_conn_lock; - - /* abort orphans */ - spin_lock(g_conn_lock); - while (!cfs_list_empty(orphans)) { - kmx_ctx_t *ctx = NULL; - kmx_conn_t *conn = NULL; - - ctx = cfs_list_entry(orphans->next, kmx_ctx_t, mxc_list); - cfs_list_del_init(&ctx->mxc_list); - spin_unlock(g_conn_lock); - - ctx->mxc_errno = -ECONNABORTED; - conn = ctx->mxc_conn; - CDEBUG(D_NET, "aborting %s %s %s\n", - mxlnd_msgtype_to_str(ctx->mxc_msg_type), - ctx->mxc_type == MXLND_REQ_TX ? "(TX) to" : "(RX) from", - libcfs_nid2str(ctx->mxc_nid)); - if (ctx->mxc_type == MXLND_REQ_TX) { - mxlnd_put_idle_tx(ctx); /* do not hold any locks */ - if (conn) mxlnd_conn_decref(conn); /* for this tx */ - } else { - ctx->mxc_state = MXLND_CTX_CANCELED; - mxlnd_handle_rx_completion(ctx); - } - - count++; - spin_lock(g_conn_lock); - } - spin_unlock(g_conn_lock); - - return count; -} - -int -mxlnd_free_conn_zombies(void) -{ - int count = 0; - cfs_list_t *zombies = &kmxlnd_data.kmx_conn_zombies; - spinlock_t *g_conn_lock = &kmxlnd_data.kmx_conn_lock; - rwlock_t *g_lock = &kmxlnd_data.kmx_global_lock; - - /* cleanup any zombies */ - spin_lock(g_conn_lock); - while (!cfs_list_empty(zombies)) { - kmx_conn_t *conn = NULL; - - conn = cfs_list_entry(zombies->next, kmx_conn_t, mxk_zombie); - cfs_list_del_init(&conn->mxk_zombie); - spin_unlock(g_conn_lock); - - write_lock(g_lock); - mxlnd_conn_free_locked(conn); - write_unlock(g_lock); - - count++; - spin_lock(g_conn_lock); - } - spin_unlock(g_conn_lock); - CDEBUG(D_NET, "%s: freed %d zombies\n", __func__, count); - return count; -} - -/** - * mxlnd_connd - handles incoming connection requests - * @arg - thread id (as a void *) - * - * This thread handles incoming connection requests - */ -int -mxlnd_connd(void *arg) -{ - long id = (long) arg; - - CDEBUG(D_NET, "connd starting\n"); - - while (!(atomic_read(&kmxlnd_data.kmx_shutdown))) { - int ret = 0; - kmx_connparams_t *cp = NULL; - spinlock_t *g_conn_lock = &kmxlnd_data.kmx_conn_lock; - cfs_list_t *conn_reqs = &kmxlnd_data.kmx_conn_reqs; - - ret = down_interruptible(&kmxlnd_data.kmx_conn_sem); - - if (atomic_read(&kmxlnd_data.kmx_shutdown)) - break; - - if (ret != 0) - continue; - - ret = mxlnd_abort_msgs(); - ret += mxlnd_free_conn_zombies(); - - spin_lock(g_conn_lock); - if (cfs_list_empty(conn_reqs)) { - if (ret == 0) - CNETERR("connd woke up but did not find a " - "kmx_connparams_t or zombie conn\n"); - spin_unlock(g_conn_lock); - continue; - } - cp = cfs_list_entry(conn_reqs->next, kmx_connparams_t, - mxr_list); - cfs_list_del_init(&cp->mxr_list); - spin_unlock(g_conn_lock); - - switch (MXLND_MSG_TYPE(cp->mxr_match)) { - case MXLND_MSG_CONN_REQ: - /* We have a connection request. Handle it. */ - mxlnd_passive_connect(cp); - break; - case MXLND_MSG_CONN_ACK: - /* The peer is ready for messages */ - mxlnd_check_conn_ack(cp); - break; - } - } - - mxlnd_free_conn_zombies(); - - CDEBUG(D_NET, "connd stopping\n"); - mxlnd_thread_stop(id); - return 0; -} - -/** - * mxlnd_timeoutd - enforces timeouts on messages - * @arg - thread id (as a void *) - * - * This thread queries each peer for its earliest timeout. If a peer has timed out, - * it calls mxlnd_conn_disconnect(). - * - * After checking for timeouts, try progressing sends (call check_sends()). - */ -int -mxlnd_timeoutd(void *arg) -{ - int i = 0; - long id = (long) arg; - unsigned long now = 0; - unsigned long next = 0; - unsigned long delay = msecs_to_jiffies(MSEC_PER_SEC); - kmx_peer_t *peer = NULL; - kmx_peer_t *temp = NULL; - kmx_conn_t *conn = NULL; - rwlock_t *g_lock = &kmxlnd_data.kmx_global_lock; - - CDEBUG(D_NET, "timeoutd starting\n"); - - while (!(atomic_read(&kmxlnd_data.kmx_shutdown))) { - - now = jiffies; - /* if the next timeout has not arrived, go back to sleep */ - if (cfs_time_after(now, next)) { - next = mxlnd_check_timeouts(now); - } - - /* try to progress peers' txs */ - write_lock(g_lock); - for (i = 0; i < MXLND_HASH_SIZE; i++) { - cfs_list_t *peers = &kmxlnd_data.kmx_peers[i]; - - /* NOTE we are safe against the removal of peer, but - * not against the removal of temp */ - cfs_list_for_each_entry_safe(peer, temp, peers, - mxp_list) { - if (atomic_read(&kmxlnd_data.kmx_shutdown)) - break; - mxlnd_peer_addref(peer); /* add ref... */ - conn = peer->mxp_conn; - if (conn && conn->mxk_status != MXLND_CONN_DISCONNECT) { - mxlnd_conn_addref(conn); /* take ref... */ - } else { - CDEBUG(D_NET, "ignoring %s\n", - libcfs_nid2str(peer->mxp_nid)); - mxlnd_peer_decref(peer); /* ...to here */ - continue; - } - - if ((conn->mxk_status == MXLND_CONN_READY || - conn->mxk_status == MXLND_CONN_FAIL) && - cfs_time_after(now, - conn->mxk_last_tx + - msecs_to_jiffies(MSEC_PER_SEC))) { - write_unlock(g_lock); - mxlnd_check_sends(peer); - write_lock(g_lock); - } - mxlnd_conn_decref(conn); /* until here */ - mxlnd_peer_decref(peer); /* ...to here */ - } - } - write_unlock(g_lock); - - mxlnd_sleep(delay); - } - CDEBUG(D_NET, "timeoutd stopping\n"); - mxlnd_thread_stop(id); - return 0; -} diff --git a/lnet/klnds/mxlnd/mxlnd_modparams.c b/lnet/klnds/mxlnd/mxlnd_modparams.c deleted file mode 100644 index 5da8d89..0000000 --- a/lnet/klnds/mxlnd/mxlnd_modparams.c +++ /dev/null @@ -1,253 +0,0 @@ -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf - * - * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, - * CA 95054 USA or visit www.sun.com if you need additional information or - * have any questions. - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (C) 2006 Myricom, Inc. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lnet/klnds/mxlnd/mxlnd.c - * - * Author: Eric Barton - * Author: Scott Atchley - */ - -#include "mxlnd.h" - -static int n_waitd = MXLND_N_SCHED; -CFS_MODULE_PARM(n_waitd, "i", int, 0444, - "# of completion daemons"); - -/* this was used to allocate global rxs which are no londer used */ -static int max_peers = MXLND_MAX_PEERS; -CFS_MODULE_PARM(max_peers, "i", int, 0444, - "Unused - was maximum number of peers that may connect"); - -static int cksum = MXLND_CKSUM; -CFS_MODULE_PARM(cksum, "i", int, 0644, - "set non-zero to enable message (not data payload) checksums"); - -static int ntx = MXLND_NTX; -CFS_MODULE_PARM(ntx, "i", int, 0444, - "# of total tx message descriptors"); - -/* this duplicates ntx */ -static int credits = MXLND_NTX; -CFS_MODULE_PARM(credits, "i", int, 0444, - "Unused - was # concurrent sends to all peers"); - -static int peercredits = MXLND_MSG_QUEUE_DEPTH; -CFS_MODULE_PARM(peercredits, "i", int, 0444, - "# concurrent sends to one peer"); - -static int board = MXLND_MX_BOARD; -CFS_MODULE_PARM(board, "i", int, 0444, - "index value of the Myrinet board (NIC)"); - -static int ep_id = MXLND_MX_EP_ID; -CFS_MODULE_PARM(ep_id, "i", int, 0444, "MX endpoint ID"); - -static char *ipif_name = "myri0"; -CFS_MODULE_PARM(ipif_name, "s", charp, 0444, - "IPoMX interface name"); - -static int polling = MXLND_POLLING; -CFS_MODULE_PARM(polling, "i", int, 0444, - "Use 0 to block (wait). A value > 0 will poll that many times before blocking"); - -static char *hosts = NULL; -CFS_MODULE_PARM(hosts, "s", charp, 0444, - "Unused - was IP-to-hostname resolution file"); - -kmx_tunables_t kmxlnd_tunables = { - .kmx_n_waitd = &n_waitd, - .kmx_max_peers = &max_peers, - .kmx_cksum = &cksum, - .kmx_ntx = &ntx, - .kmx_credits = &credits, - .kmx_peercredits = &peercredits, - .kmx_board = &board, - .kmx_ep_id = &ep_id, - .kmx_default_ipif = &ipif_name, - .kmx_polling = &polling -}; - -#if defined(CONFIG_SYSCTL) && !CFS_SYSFS_MODULE_PARM - -static char ipif_basename_space[32]; - -static struct ctl_table kmxlnd_ctl_table[] = { - { - INIT_CTL_NAME - .procname = "n_waitd", - .data = &n_waitd, - .maxlen = sizeof(int), - .mode = 0444, - .proc_handler = &proc_dointvec - }, - { - INIT_CTL_NAME - .procname = "max_peers", - .data = &max_peers, - .maxlen = sizeof(int), - .mode = 0444, - .proc_handler = &proc_dointvec - }, - { - INIT_CTL_NAME - .procname = "cksum", - .data = &cksum, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec - }, - { - INIT_CTL_NAME - .procname = "ntx", - .data = &ntx, - .maxlen = sizeof(int), - .mode = 0444, - .proc_handler = &proc_dointvec - }, - { - INIT_CTL_NAME - .procname = "credits", - .data = &credits, - .maxlen = sizeof(int), - .mode = 0444, - .proc_handler = &proc_dointvec - }, - { - INIT_CTL_NAME - .procname = "peercredits", - .data = &peercredits, - .maxlen = sizeof(int), - .mode = 0444, - .proc_handler = &proc_dointvec - }, - { - INIT_CTL_NAME - .procname = "board", - .data = &board, - .maxlen = sizeof(int), - .mode = 0444, - .proc_handler = &proc_dointvec - }, - { - INIT_CTL_NAME - .procname = "ep_id", - .data = &ep_id, - .maxlen = sizeof(int), - .mode = 0444, - .proc_handler = &proc_dointvec - }, - { - .procname = "ipif_name", - .data = ipif_basename_space, - .maxlen = sizeof(ipif_basename_space), - .mode = 0444, - .proc_handler = &proc_dostring - }, - { - INIT_CTL_NAME - .procname = "polling", - .data = &polling, - .maxlen = sizeof(int), - .mode = 0444, - .proc_handler = &proc_dointvec - }, - { 0 } -}; - -static struct ctl_table kmxlnd_top_ctl_table[] = { - { - INIT_CTL_NAME - .procname = "mxlnd", - .data = NULL, - .maxlen = 0, - .mode = 0555, - .child = kmxlnd_ctl_table - }, - { 0 } -}; - -void -kmxlnd_initstrtunable(char *space, char *str, int size) -{ - strncpy(space, str, size); - space[size-1] = 0; -} - -void -kmxlnd_sysctl_init (void) -{ - kmxlnd_initstrtunable(ipif_basename_space, ipif_name, - sizeof(ipif_basename_space)); - - kmxlnd_tunables.kib_sysctl = - register_sysctl_table(kmxlnd_top_ctl_table); - - if (kmxlnd_tunables.kib_sysctl == NULL) - CWARN("Can't setup /proc tunables\n"); -} - -void -kmxlnd_sysctl_fini (void) -{ - if (kmxlnd_tunables.kib_sysctl != NULL) - unregister_sysctl_table(kmxlnd_tunables.kib_sysctl); -} - -#else - -void -kmxlnd_sysctl_init (void) -{ -} - -void -kmxlnd_sysctl_fini (void) -{ -} - -#endif - -int -kmxlnd_tunables_init (void) -{ - kmxlnd_sysctl_init(); - return 0; -} - -void -kmxlnd_tunables_fini (void) -{ - kmxlnd_sysctl_fini(); -} diff --git a/lnet/klnds/qswlnd/Makefile.in b/lnet/klnds/qswlnd/Makefile.in deleted file mode 100644 index b623e02..0000000 --- a/lnet/klnds/qswlnd/Makefile.in +++ /dev/null @@ -1,6 +0,0 @@ -MODULES := kqswlnd -kqswlnd-objs := qswlnd.o qswlnd_cb.o qswlnd_modparams.o - -EXTRA_POST_CFLAGS := @QSWCPPFLAGS@ -I/usr/include - -@INCLUDE_RULES@ diff --git a/lnet/klnds/qswlnd/autoMakefile.am b/lnet/klnds/qswlnd/autoMakefile.am deleted file mode 100644 index 9a5c168..0000000 --- a/lnet/klnds/qswlnd/autoMakefile.am +++ /dev/null @@ -1,44 +0,0 @@ -# -# GPL HEADER START -# -# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License version 2 only, -# as published by the Free Software Foundation. -# -# This program is distributed in the hope that it will be useful, but -# WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# General Public License version 2 for more details (a copy is included -# in the LICENSE file that accompanied this code). -# -# You should have received a copy of the GNU General Public License -# version 2 along with this program; If not, see -# http://www.sun.com/software/products/lustre/docs/GPLv2.pdf -# -# Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, -# CA 95054 USA or visit www.sun.com if you need additional information or -# have any questions. -# -# GPL HEADER END -# - -# -# Copyright (c) 2001, 2010, Oracle and/or its affiliates. All rights reserved. -# Use is subject to license terms. -# - -# -# This file is part of Lustre, http://www.lustre.org/ -# Lustre is a trademark of Sun Microsystems, Inc. -# - -if MODULES -if BUILD_QSWLND -modulenet_DATA = kqswlnd$(KMODEXT) -endif -endif - -MOSTLYCLEANFILES = @MOSTLYCLEANFILES@ -EXTRA_DIST = $(kqswlnd-objs:%.o=%.c) qswlnd.h diff --git a/lnet/klnds/qswlnd/qswlnd.c b/lnet/klnds/qswlnd/qswlnd.c deleted file mode 100644 index fa8e8f4..0000000 --- a/lnet/klnds/qswlnd/qswlnd.c +++ /dev/null @@ -1,567 +0,0 @@ -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf - * - * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, - * CA 95054 USA or visit www.sun.com if you need additional information or - * have any questions. - * - * GPL HEADER END - */ -/* - * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2012, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lnet/klnds/qswlnd/qswlnd.c - * - * Author: Eric Barton - */ - -#include "qswlnd.h" - - -lnd_t the_kqswlnd = -{ - .lnd_type = QSWLND, - .lnd_startup = kqswnal_startup, - .lnd_shutdown = kqswnal_shutdown, - .lnd_ctl = kqswnal_ctl, - .lnd_send = kqswnal_send, - .lnd_recv = kqswnal_recv, -}; - -kqswnal_data_t kqswnal_data; - -int -kqswnal_get_tx_desc (struct libcfs_ioctl_data *data) -{ - unsigned long flags; - cfs_list_t *tmp; - kqswnal_tx_t *ktx; - lnet_hdr_t *hdr; - int index = data->ioc_count; - int rc = -ENOENT; - - spin_lock_irqsave(&kqswnal_data.kqn_idletxd_lock, flags); - - cfs_list_for_each (tmp, &kqswnal_data.kqn_activetxds) { - if (index-- != 0) - continue; - - ktx = cfs_list_entry (tmp, kqswnal_tx_t, ktx_list); - hdr = (lnet_hdr_t *)ktx->ktx_buffer; - - data->ioc_count = le32_to_cpu(hdr->payload_length); - data->ioc_nid = le64_to_cpu(hdr->dest_nid); - data->ioc_u64[0] = ktx->ktx_nid; - data->ioc_u32[0] = le32_to_cpu(hdr->type); - data->ioc_u32[1] = ktx->ktx_launcher; - data->ioc_flags = - (cfs_list_empty (&ktx->ktx_schedlist) ? 0 : 1) | - (ktx->ktx_state << 2); - rc = 0; - break; - } - - spin_unlock_irqrestore(&kqswnal_data.kqn_idletxd_lock, flags); - return (rc); -} - -int -kqswnal_ctl (lnet_ni_t *ni, unsigned int cmd, void *arg) -{ - struct libcfs_ioctl_data *data = arg; - - LASSERT (ni == kqswnal_data.kqn_ni); - - switch (cmd) { - case IOC_LIBCFS_GET_TXDESC: - return (kqswnal_get_tx_desc (data)); - - case IOC_LIBCFS_REGISTER_MYNID: - if (data->ioc_nid == ni->ni_nid) - return 0; - - LASSERT (LNET_NIDNET(data->ioc_nid) == LNET_NIDNET(ni->ni_nid)); - - CERROR("obsolete IOC_LIBCFS_REGISTER_MYNID for %s(%s)\n", - libcfs_nid2str(data->ioc_nid), - libcfs_nid2str(ni->ni_nid)); - return 0; - - default: - return (-EINVAL); - } -} - -void -kqswnal_shutdown(lnet_ni_t *ni) -{ - unsigned long flags; - kqswnal_tx_t *ktx; - kqswnal_rx_t *krx; - - CDEBUG (D_NET, "shutdown\n"); - LASSERT (ni->ni_data == &kqswnal_data); - LASSERT (ni == kqswnal_data.kqn_ni); - - switch (kqswnal_data.kqn_init) - { - default: - LASSERT (0); - - case KQN_INIT_ALL: - case KQN_INIT_DATA: - break; - } - - /**********************************************************************/ - /* Signal the start of shutdown... */ - spin_lock_irqsave(&kqswnal_data.kqn_idletxd_lock, flags); - kqswnal_data.kqn_shuttingdown = 1; - spin_unlock_irqrestore(&kqswnal_data.kqn_idletxd_lock, flags); - - /**********************************************************************/ - /* wait for sends that have allocated a tx desc to launch or give up */ - while (atomic_read (&kqswnal_data.kqn_pending_txs) != 0) { - CDEBUG(D_NET, "waiting for %d pending sends\n", - atomic_read (&kqswnal_data.kqn_pending_txs)); - cfs_pause(cfs_time_seconds(1)); - } - - /**********************************************************************/ - /* close elan comms */ - /* Shut down receivers first; rx callbacks might try sending... */ - if (kqswnal_data.kqn_eprx_small != NULL) - ep_free_rcvr (kqswnal_data.kqn_eprx_small); - - if (kqswnal_data.kqn_eprx_large != NULL) - ep_free_rcvr (kqswnal_data.kqn_eprx_large); - - /* NB ep_free_rcvr() returns only after we've freed off all receive - * buffers (see shutdown handling in kqswnal_requeue_rx()). This - * means we must have completed any messages we passed to - * lnet_parse() */ - - if (kqswnal_data.kqn_eptx != NULL) - ep_free_xmtr (kqswnal_data.kqn_eptx); - - /* NB ep_free_xmtr() returns only after all outstanding transmits - * have called their callback... */ - LASSERT(cfs_list_empty(&kqswnal_data.kqn_activetxds)); - - /**********************************************************************/ - /* flag threads to terminate, wake them and wait for them to die */ - kqswnal_data.kqn_shuttingdown = 2; - wake_up_all (&kqswnal_data.kqn_sched_waitq); - - while (atomic_read (&kqswnal_data.kqn_nthreads) != 0) { - CDEBUG(D_NET, "waiting for %d threads to terminate\n", - atomic_read (&kqswnal_data.kqn_nthreads)); - cfs_pause(cfs_time_seconds(1)); - } - - /**********************************************************************/ - /* No more threads. No more portals, router or comms callbacks! - * I control the horizontals and the verticals... - */ - - LASSERT (cfs_list_empty (&kqswnal_data.kqn_readyrxds)); - LASSERT (cfs_list_empty (&kqswnal_data.kqn_donetxds)); - LASSERT (cfs_list_empty (&kqswnal_data.kqn_delayedtxds)); - - /**********************************************************************/ - /* Unmap message buffers and free all descriptors and buffers - */ - - /* FTTB, we need to unmap any remaining mapped memory. When - * ep_dvma_release() get fixed (and releases any mappings in the - * region), we can delete all the code from here --------> */ - - for (ktx = kqswnal_data.kqn_txds; ktx != NULL; ktx = ktx->ktx_alloclist) { - /* If ktx has a buffer, it got mapped; unmap now. NB only - * the pre-mapped stuff is still mapped since all tx descs - * must be idle */ - - if (ktx->ktx_buffer != NULL) - ep_dvma_unload(kqswnal_data.kqn_ep, - kqswnal_data.kqn_ep_tx_nmh, - &ktx->ktx_ebuffer); - } - - for (krx = kqswnal_data.kqn_rxds; krx != NULL; krx = krx->krx_alloclist) { - /* If krx_kiov[0].kiov_page got allocated, it got mapped. - * NB subsequent pages get merged */ - - if (krx->krx_kiov[0].kiov_page != NULL) - ep_dvma_unload(kqswnal_data.kqn_ep, - kqswnal_data.kqn_ep_rx_nmh, - &krx->krx_elanbuffer); - } - /* <----------- to here */ - - if (kqswnal_data.kqn_ep_rx_nmh != NULL) - ep_dvma_release(kqswnal_data.kqn_ep, kqswnal_data.kqn_ep_rx_nmh); - - if (kqswnal_data.kqn_ep_tx_nmh != NULL) - ep_dvma_release(kqswnal_data.kqn_ep, kqswnal_data.kqn_ep_tx_nmh); - - while (kqswnal_data.kqn_txds != NULL) { - ktx = kqswnal_data.kqn_txds; - - if (ktx->ktx_buffer != NULL) - LIBCFS_FREE(ktx->ktx_buffer, KQSW_TX_BUFFER_SIZE); - - kqswnal_data.kqn_txds = ktx->ktx_alloclist; - LIBCFS_FREE(ktx, sizeof(*ktx)); - } - - while (kqswnal_data.kqn_rxds != NULL) { - int i; - - krx = kqswnal_data.kqn_rxds; - for (i = 0; i < krx->krx_npages; i++) - if (krx->krx_kiov[i].kiov_page != NULL) - __free_page (krx->krx_kiov[i].kiov_page); - - kqswnal_data.kqn_rxds = krx->krx_alloclist; - LIBCFS_FREE(krx, sizeof (*krx)); - } - - /* resets flags, pointers to NULL etc */ - memset(&kqswnal_data, 0, sizeof (kqswnal_data)); - - CDEBUG (D_MALLOC, "done kmem %d\n", atomic_read(&libcfs_kmemory)); - - module_put(THIS_MODULE); -} - -int -kqswnal_startup (lnet_ni_t *ni) -{ - EP_RAILMASK all_rails = EP_RAILMASK_ALL; - int rc; - int i; - kqswnal_rx_t *krx; - kqswnal_tx_t *ktx; - int elan_page_idx; - - LASSERT (ni->ni_lnd == &the_kqswlnd); - - /* Only 1 instance supported */ - if (kqswnal_data.kqn_init != KQN_INIT_NOTHING) { - CERROR ("Only 1 instance supported\n"); - return -EPERM; - } - - if (ni->ni_interfaces[0] != NULL) { - CERROR("Explicit interface config not supported\n"); - return -EPERM; - } - - if (*kqswnal_tunables.kqn_credits >= - *kqswnal_tunables.kqn_ntxmsgs) { - LCONSOLE_ERROR_MSG(0x12e, "Configuration error: please set " - "ntxmsgs(%d) > credits(%d)\n", - *kqswnal_tunables.kqn_ntxmsgs, - *kqswnal_tunables.kqn_credits); - } - - CDEBUG (D_MALLOC, "start kmem %d\n", atomic_read(&libcfs_kmemory)); - - /* ensure all pointers NULL etc */ - memset (&kqswnal_data, 0, sizeof (kqswnal_data)); - - kqswnal_data.kqn_ni = ni; - ni->ni_data = &kqswnal_data; - ni->ni_peertxcredits = *kqswnal_tunables.kqn_peercredits; - ni->ni_maxtxcredits = *kqswnal_tunables.kqn_credits; - - CFS_INIT_LIST_HEAD (&kqswnal_data.kqn_idletxds); - CFS_INIT_LIST_HEAD (&kqswnal_data.kqn_activetxds); - spin_lock_init(&kqswnal_data.kqn_idletxd_lock); - - CFS_INIT_LIST_HEAD (&kqswnal_data.kqn_delayedtxds); - CFS_INIT_LIST_HEAD (&kqswnal_data.kqn_donetxds); - CFS_INIT_LIST_HEAD (&kqswnal_data.kqn_readyrxds); - - spin_lock_init(&kqswnal_data.kqn_sched_lock); - init_waitqueue_head (&kqswnal_data.kqn_sched_waitq); - - /* pointers/lists/locks initialised */ - kqswnal_data.kqn_init = KQN_INIT_DATA; - try_module_get(THIS_MODULE); - - kqswnal_data.kqn_ep = ep_system(); - if (kqswnal_data.kqn_ep == NULL) { - CERROR("Can't initialise EKC\n"); - kqswnal_shutdown(ni); - return (-ENODEV); - } - - if (ep_waitfor_nodeid(kqswnal_data.kqn_ep) == ELAN_INVALID_NODE) { - CERROR("Can't get elan ID\n"); - kqswnal_shutdown(ni); - return (-ENODEV); - } - - kqswnal_data.kqn_nnodes = ep_numnodes (kqswnal_data.kqn_ep); - kqswnal_data.kqn_elanid = ep_nodeid (kqswnal_data.kqn_ep); - - ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), kqswnal_data.kqn_elanid); - - /**********************************************************************/ - /* Get the transmitter */ - - kqswnal_data.kqn_eptx = ep_alloc_xmtr (kqswnal_data.kqn_ep); - if (kqswnal_data.kqn_eptx == NULL) - { - CERROR ("Can't allocate transmitter\n"); - kqswnal_shutdown (ni); - return (-ENOMEM); - } - - /**********************************************************************/ - /* Get the receivers */ - - kqswnal_data.kqn_eprx_small = - ep_alloc_rcvr (kqswnal_data.kqn_ep, - EP_MSG_SVC_PORTALS_SMALL, - *kqswnal_tunables.kqn_ep_envelopes_small); - if (kqswnal_data.kqn_eprx_small == NULL) - { - CERROR ("Can't install small msg receiver\n"); - kqswnal_shutdown (ni); - return (-ENOMEM); - } - - kqswnal_data.kqn_eprx_large = - ep_alloc_rcvr (kqswnal_data.kqn_ep, - EP_MSG_SVC_PORTALS_LARGE, - *kqswnal_tunables.kqn_ep_envelopes_large); - if (kqswnal_data.kqn_eprx_large == NULL) - { - CERROR ("Can't install large msg receiver\n"); - kqswnal_shutdown (ni); - return (-ENOMEM); - } - - /**********************************************************************/ - /* Reserve Elan address space for transmit descriptors NB we may - * either send the contents of associated buffers immediately, or - * map them for the peer to suck/blow... */ - kqswnal_data.kqn_ep_tx_nmh = - ep_dvma_reserve(kqswnal_data.kqn_ep, - KQSW_NTXMSGPAGES*(*kqswnal_tunables.kqn_ntxmsgs), - EP_PERM_WRITE); - if (kqswnal_data.kqn_ep_tx_nmh == NULL) { - CERROR("Can't reserve tx dma space\n"); - kqswnal_shutdown(ni); - return (-ENOMEM); - } - - /**********************************************************************/ - /* Reserve Elan address space for receive buffers */ - kqswnal_data.kqn_ep_rx_nmh = - ep_dvma_reserve(kqswnal_data.kqn_ep, - KQSW_NRXMSGPAGES_SMALL * - (*kqswnal_tunables.kqn_nrxmsgs_small) + - KQSW_NRXMSGPAGES_LARGE * - (*kqswnal_tunables.kqn_nrxmsgs_large), - EP_PERM_WRITE); - if (kqswnal_data.kqn_ep_tx_nmh == NULL) { - CERROR("Can't reserve rx dma space\n"); - kqswnal_shutdown(ni); - return (-ENOMEM); - } - - /**********************************************************************/ - /* Allocate/Initialise transmit descriptors */ - - kqswnal_data.kqn_txds = NULL; - for (i = 0; i < (*kqswnal_tunables.kqn_ntxmsgs); i++) - { - int premapped_pages; - int basepage = i * KQSW_NTXMSGPAGES; - - LIBCFS_ALLOC (ktx, sizeof(*ktx)); - if (ktx == NULL) { - kqswnal_shutdown (ni); - return (-ENOMEM); - } - - memset(ktx, 0, sizeof(*ktx)); /* NULL pointers; zero flags */ - ktx->ktx_alloclist = kqswnal_data.kqn_txds; - kqswnal_data.kqn_txds = ktx; - - LIBCFS_ALLOC (ktx->ktx_buffer, KQSW_TX_BUFFER_SIZE); - if (ktx->ktx_buffer == NULL) - { - kqswnal_shutdown (ni); - return (-ENOMEM); - } - - /* Map pre-allocated buffer NOW, to save latency on transmit */ - premapped_pages = kqswnal_pages_spanned(ktx->ktx_buffer, - KQSW_TX_BUFFER_SIZE); - ep_dvma_load(kqswnal_data.kqn_ep, NULL, - ktx->ktx_buffer, KQSW_TX_BUFFER_SIZE, - kqswnal_data.kqn_ep_tx_nmh, basepage, - &all_rails, &ktx->ktx_ebuffer); - - ktx->ktx_basepage = basepage + premapped_pages; /* message mapping starts here */ - ktx->ktx_npages = KQSW_NTXMSGPAGES - premapped_pages; /* for this many pages */ - - CFS_INIT_LIST_HEAD (&ktx->ktx_schedlist); - - ktx->ktx_state = KTX_IDLE; - ktx->ktx_rail = -1; /* unset rail */ - - cfs_list_add_tail (&ktx->ktx_list, &kqswnal_data.kqn_idletxds); - } - - /**********************************************************************/ - /* Allocate/Initialise receive descriptors */ - kqswnal_data.kqn_rxds = NULL; - elan_page_idx = 0; - for (i = 0; i < *kqswnal_tunables.kqn_nrxmsgs_small + *kqswnal_tunables.kqn_nrxmsgs_large; i++) - { - EP_NMD elanbuffer; - int j; - - LIBCFS_ALLOC(krx, sizeof(*krx)); - if (krx == NULL) { - kqswnal_shutdown(ni); - return (-ENOMEM); - } - - memset(krx, 0, sizeof(*krx)); /* clear flags, null pointers etc */ - krx->krx_alloclist = kqswnal_data.kqn_rxds; - kqswnal_data.kqn_rxds = krx; - - if (i < *kqswnal_tunables.kqn_nrxmsgs_small) - { - krx->krx_npages = KQSW_NRXMSGPAGES_SMALL; - krx->krx_eprx = kqswnal_data.kqn_eprx_small; - } - else - { - krx->krx_npages = KQSW_NRXMSGPAGES_LARGE; - krx->krx_eprx = kqswnal_data.kqn_eprx_large; - } - - LASSERT (krx->krx_npages > 0); - for (j = 0; j < krx->krx_npages; j++) - { - struct page *page = alloc_page(GFP_KERNEL); - - if (page == NULL) { - kqswnal_shutdown (ni); - return (-ENOMEM); - } - - krx->krx_kiov[j] = (lnet_kiov_t) {.kiov_page = page, - .kiov_offset = 0, - .kiov_len = PAGE_SIZE}; - LASSERT(page_address(page) != NULL); - - ep_dvma_load(kqswnal_data.kqn_ep, NULL, - page_address(page), - PAGE_SIZE, kqswnal_data.kqn_ep_rx_nmh, - elan_page_idx, &all_rails, &elanbuffer); - - if (j == 0) { - krx->krx_elanbuffer = elanbuffer; - } else { - rc = ep_nmd_merge(&krx->krx_elanbuffer, - &krx->krx_elanbuffer, - &elanbuffer); - /* NB contiguous mapping */ - LASSERT(rc); - } - elan_page_idx++; - - } - } - LASSERT (elan_page_idx == - (*kqswnal_tunables.kqn_nrxmsgs_small * KQSW_NRXMSGPAGES_SMALL) + - (*kqswnal_tunables.kqn_nrxmsgs_large * KQSW_NRXMSGPAGES_LARGE)); - - /**********************************************************************/ - /* Queue receives, now that it's OK to run their completion callbacks */ - - for (krx = kqswnal_data.kqn_rxds; krx != NULL; krx = krx->krx_alloclist) { - /* NB this enqueue can allocate/sleep (attr == 0) */ - krx->krx_state = KRX_POSTED; - rc = ep_queue_receive(krx->krx_eprx, kqswnal_rxhandler, krx, - &krx->krx_elanbuffer, 0); - if (rc != EP_SUCCESS) { - CERROR ("failed ep_queue_receive %d\n", rc); - kqswnal_shutdown (ni); - return (-EIO); - } - } - - /**********************************************************************/ - /* Spawn scheduling threads */ - for (i = 0; i < num_online_cpus(); i++) { - rc = kqswnal_thread_start(kqswnal_scheduler, NULL, - "kqswnal_sched"); - if (rc != 0) - { - CERROR ("failed to spawn scheduling thread: %d\n", rc); - kqswnal_shutdown (ni); - return (-ESRCH); - } - } - - kqswnal_data.kqn_init = KQN_INIT_ALL; - return (0); -} - -void __exit -kqswnal_finalise (void) -{ - lnet_unregister_lnd(&the_kqswlnd); - kqswnal_tunables_fini(); -} - -static int __init -kqswnal_initialise (void) -{ - int rc = kqswnal_tunables_init(); - - if (rc != 0) - return rc; - - lnet_register_lnd(&the_kqswlnd); - return (0); -} - -MODULE_AUTHOR("Sun Microsystems, Inc. "); -MODULE_DESCRIPTION("Kernel Quadrics/Elan LND v1.01"); -MODULE_LICENSE("GPL"); - -module_init (kqswnal_initialise); -module_exit (kqswnal_finalise); diff --git a/lnet/klnds/qswlnd/qswlnd.h b/lnet/klnds/qswlnd/qswlnd.h deleted file mode 100644 index cea5d40..0000000 --- a/lnet/klnds/qswlnd/qswlnd.h +++ /dev/null @@ -1,355 +0,0 @@ -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf - * - * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, - * CA 95054 USA or visit www.sun.com if you need additional information or - * have any questions. - * - * GPL HEADER END - */ -/* - * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2012, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lnet/klnds/qswlnd/qswlnd.h - * - * Basic library routines. - */ - -#ifndef _QSWNAL_H -#define _QSWNAL_H - -#include -#undef printf /* nasty QSW #define */ -#include - -#include - -#include -#include -#include -#include -#include -#include /* wait_on_buffer */ -#include -#include -#include - -#include - -#include -#include -#include -#include -#include - -#define DEBUG_SUBSYSTEM S_LND - -#include -#include -#include - -/* fixed constants */ -#define KQSW_SMALLMSG (4<<10) /* small/large ep receiver breakpoint */ -#define KQSW_RESCHED 100 /* # busy loops that forces scheduler to yield */ - -#define KQSW_CKSUM 0 /* enable checksumming (protocol incompatible) */ - -/* - * derived constants - */ - -#define KQSW_TX_BUFFER_SIZE (offsetof(kqswnal_msg_t, \ - kqm_u.immediate.kqim_payload[*kqswnal_tunables.kqn_tx_maxcontig])) -/* The pre-allocated tx buffer (hdr + small payload) */ - -#define KQSW_NTXMSGPAGES (btopr(KQSW_TX_BUFFER_SIZE) + 1 + btopr(LNET_MAX_PAYLOAD) + 1) -/* Reserve elan address space for pre-allocated and pre-mapped transmit - * buffer and a full payload too. Extra pages allow for page alignment */ - -#define KQSW_NRXMSGPAGES_SMALL (btopr(KQSW_SMALLMSG)) -/* receive hdr/payload always contiguous and page aligned */ -#define KQSW_NRXMSGBYTES_SMALL (KQSW_NRXMSGPAGES_SMALL * PAGE_SIZE) - -#define KQSW_NRXMSGPAGES_LARGE (btopr(sizeof(lnet_msg_t) + LNET_MAX_PAYLOAD)) -/* receive hdr/payload always contiguous and page aligned */ -#define KQSW_NRXMSGBYTES_LARGE (KQSW_NRXMSGPAGES_LARGE * PAGE_SIZE) -/* biggest complete packet we can receive (or transmit) */ - -/* Wire messages */ -/* Remote memory descriptor */ -typedef struct -{ - __u32 kqrmd_nfrag; /* # frags */ - EP_NMD kqrmd_frag[0]; /* actual frags */ -} kqswnal_remotemd_t; - -/* Immediate data */ -typedef struct -{ - lnet_hdr_t kqim_hdr; /* LNET header */ - char kqim_payload[0]; /* piggy-backed payload */ -} WIRE_ATTR kqswnal_immediate_msg_t; - -/* RDMA request */ -typedef struct -{ - lnet_hdr_t kqrm_hdr; /* LNET header */ - kqswnal_remotemd_t kqrm_rmd; /* peer's buffer */ -} WIRE_ATTR kqswnal_rdma_msg_t; - -typedef struct -{ - __u32 kqm_magic; /* I'm a qswlnd message */ - __u16 kqm_version; /* this is my version number */ - __u16 kqm_type; /* msg type */ -#if KQSW_CKSUM - __u32 kqm_cksum; /* crc32 checksum */ - __u32 kqm_nob; /* original msg length */ -#endif - union { - kqswnal_immediate_msg_t immediate; - kqswnal_rdma_msg_t rdma; - } WIRE_ATTR kqm_u; -} WIRE_ATTR kqswnal_msg_t; - -#if KQSW_CKSUM /* enable checksums ? */ -# include -static inline __u32 kqswnal_csum(__u32 crc, unsigned char const *p, size_t len) -{ -#if 1 - return crc32_le(crc, p, len); -#else - while (len-- > 0) - crc = ((crc + 0x100) & ~0xff) | ((crc + *p++) & 0xff) ; - return crc; -#endif -} -# define QSWLND_PROTO_VERSION 0xbeef -#else -# define QSWLND_PROTO_VERSION 1 -#endif - -#define QSWLND_MSG_IMMEDIATE 0 -#define QSWLND_MSG_RDMA 1 - -typedef union { - EP_STATUSBLK ep_statusblk; - struct { - __u32 status; - __u32 magic; - __u32 version; - union { - struct { - __u32 len; - __u32 cksum; - } WIRE_ATTR get; - } WIRE_ATTR u; - } WIRE_ATTR msg; -} kqswnal_rpc_reply_t; - -typedef struct kqswnal_rx -{ - cfs_list_t krx_list; /* enqueue -> thread */ - struct kqswnal_rx *krx_alloclist;/* stack in kqn_rxds */ - EP_RCVR *krx_eprx; /* port to post receives to */ - EP_RXD *krx_rxd; /* receive descriptor (for repost) */ - EP_NMD krx_elanbuffer;/* contiguous Elan buffer */ - int krx_npages; /* # pages in receive buffer */ - int krx_nob; /* Number Of Bytes received into buffer */ - int krx_rpc_reply_needed:1; /* peer waiting for EKC RPC reply */ - int krx_state; /* what this RX is doing */ - atomic_t krx_refcount; /* how to tell when rpc is done */ -#if KQSW_CKSUM - __u32 krx_cksum; /* checksum */ -#endif - kqswnal_rpc_reply_t krx_rpc_reply; /* rpc reply status block */ - lnet_kiov_t krx_kiov[KQSW_NRXMSGPAGES_LARGE];/* buffer frags */ -} kqswnal_rx_t; - -#define KRX_POSTED 1 /* receiving */ -#define KRX_PARSE 2 /* ready to be parsed */ -#define KRX_COMPLETING 3 /* waiting to be completed */ - - -typedef struct kqswnal_tx -{ - cfs_list_t ktx_list; /* enqueue idle/active */ - cfs_list_t ktx_schedlist; /* enqueue on scheduler */ - struct kqswnal_tx *ktx_alloclist; /* stack in kqn_txds */ - unsigned int ktx_state:7; /* What I'm doing */ - unsigned int ktx_firsttmpfrag:1; /* ktx_frags[0] is in my ebuffer ? 0 : 1 */ - __u32 ktx_basepage; /* page offset in reserved elan tx vaddrs for mapping pages */ - int ktx_npages; /* pages reserved for mapping messages */ - int ktx_nmappedpages; /* # pages mapped for current message */ - int ktx_port; /* destination ep port */ - lnet_nid_t ktx_nid; /* destination node */ - void *ktx_args[3]; /* completion passthru */ - char *ktx_buffer; /* pre-allocated contiguous buffer for hdr + small payloads */ - cfs_time_t ktx_launchtime; /* when (in jiffies) the - * transmit was launched */ - int ktx_status; /* completion status */ -#if KQSW_CKSUM - __u32 ktx_cksum; /* optimized GET payload checksum */ -#endif - /* debug/info fields */ - pid_t ktx_launcher; /* pid of launching process */ - - int ktx_nfrag; /* # message frags */ - int ktx_rail; /* preferred rail */ - EP_NMD ktx_ebuffer; /* elan mapping of ktx_buffer */ - EP_NMD ktx_frags[EP_MAXFRAG];/* elan mapping of msg frags */ -} kqswnal_tx_t; - -#define KTX_IDLE 0 /* on kqn_idletxds */ -#define KTX_SENDING 1 /* normal send */ -#define KTX_GETTING 2 /* sending optimised get */ -#define KTX_PUTTING 3 /* sending optimised put */ -#define KTX_RDMA_FETCH 4 /* handling optimised put */ -#define KTX_RDMA_STORE 5 /* handling optimised get */ - -typedef struct -{ - int *kqn_tx_maxcontig; /* maximum payload to defrag */ - int *kqn_ntxmsgs; /* # normal tx msgs */ - int *kqn_credits; /* # concurrent sends */ - int *kqn_peercredits; /* # concurrent sends to 1 peer */ - int *kqn_nrxmsgs_large; /* # 'large' rx msgs */ - int *kqn_ep_envelopes_large; /* # 'large' rx ep envelopes */ - int *kqn_nrxmsgs_small; /* # 'small' rx msgs */ - int *kqn_ep_envelopes_small; /* # 'small' rx ep envelopes */ - int *kqn_optimized_puts; /* optimized PUTs? */ - int *kqn_optimized_gets; /* optimized GETs? */ -#if KQSW_CKSUM - int *kqn_inject_csum_error; /* # csum errors to inject */ -#endif - -#if defined(CONFIG_SYSCTL) && !CFS_SYSFS_MODULE_PARM - struct ctl_table_header *kqn_sysctl; /* sysctl interface */ -#endif -} kqswnal_tunables_t; - -typedef struct -{ - char kqn_init; /* what's been initialised */ - char kqn_shuttingdown;/* I'm trying to shut down */ - atomic_t kqn_nthreads; /* # threads running */ - lnet_ni_t *kqn_ni; /* _the_ instance of me */ - - kqswnal_rx_t *kqn_rxds; /* stack of all the receive descriptors */ - kqswnal_tx_t *kqn_txds; /* stack of all the transmit descriptors */ - - cfs_list_t kqn_idletxds; /* transmit descriptors free to use */ - cfs_list_t kqn_activetxds; /* transmit descriptors being used */ - spinlock_t kqn_idletxd_lock; /* serialise idle txd access */ - atomic_t kqn_pending_txs; /* # transmits being prepped */ - - spinlock_t kqn_sched_lock; /* serialise packet schedulers */ - wait_queue_head_t kqn_sched_waitq;/* scheduler blocks here */ - - cfs_list_t kqn_readyrxds; /* rxds full of data */ - cfs_list_t kqn_donetxds; /* completed transmits */ - cfs_list_t kqn_delayedtxds;/* delayed transmits */ - - EP_SYS *kqn_ep; /* elan system */ - EP_NMH *kqn_ep_tx_nmh; /* elan reserved tx vaddrs */ - EP_NMH *kqn_ep_rx_nmh; /* elan reserved rx vaddrs */ - EP_XMTR *kqn_eptx; /* elan transmitter */ - EP_RCVR *kqn_eprx_small; /* elan receiver (small messages) */ - EP_RCVR *kqn_eprx_large; /* elan receiver (large messages) */ - - int kqn_nnodes; /* this cluster's size */ - int kqn_elanid; /* this nodes's elan ID */ - - EP_STATUSBLK kqn_rpc_success;/* preset RPC reply status blocks */ - EP_STATUSBLK kqn_rpc_failed; - EP_STATUSBLK kqn_rpc_version;/* reply to future version query */ - EP_STATUSBLK kqn_rpc_magic; /* reply to future version query */ -} kqswnal_data_t; - -/* kqn_init state */ -#define KQN_INIT_NOTHING 0 /* MUST BE ZERO so zeroed state is initialised OK */ -#define KQN_INIT_DATA 1 -#define KQN_INIT_ALL 2 - -extern kqswnal_tunables_t kqswnal_tunables; -extern kqswnal_data_t kqswnal_data; - -extern int kqswnal_thread_start (int (*fn)(void *arg), void *arg); -extern void kqswnal_rxhandler(EP_RXD *rxd); -extern int kqswnal_scheduler (void *); -extern void kqswnal_rx_done (kqswnal_rx_t *krx); - -static inline lnet_nid_t -kqswnal_elanid2nid (int elanid) -{ - return LNET_MKNID(LNET_NIDNET(kqswnal_data.kqn_ni->ni_nid), elanid); -} - -static inline int -kqswnal_nid2elanid (lnet_nid_t nid) -{ - __u32 elanid = LNET_NIDADDR(nid); - - /* not in this cluster? */ - return (elanid >= kqswnal_data.kqn_nnodes) ? -1 : elanid; -} - -static inline lnet_nid_t -kqswnal_rx_nid(kqswnal_rx_t *krx) -{ - return (kqswnal_elanid2nid(ep_rxd_node(krx->krx_rxd))); -} - -static inline int -kqswnal_pages_spanned (void *base, int nob) -{ - unsigned long first_page = ((unsigned long)base) >> PAGE_SHIFT; - unsigned long last_page = (((unsigned long)base) + (nob - 1)) >> PAGE_SHIFT; - - LASSERT (last_page >= first_page); /* can't wrap address space */ - return (last_page - first_page + 1); -} - -static inline void kqswnal_rx_decref (kqswnal_rx_t *krx) -{ - LASSERT (atomic_read (&krx->krx_refcount) > 0); - if (atomic_dec_and_test (&krx->krx_refcount)) - kqswnal_rx_done(krx); -} - -int kqswnal_startup (lnet_ni_t *ni); -void kqswnal_shutdown (lnet_ni_t *ni); -int kqswnal_ctl (lnet_ni_t *ni, unsigned int cmd, void *arg); -int kqswnal_send (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg); -int kqswnal_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, - int delayed, unsigned int niov, - struct iovec *iov, lnet_kiov_t *kiov, - unsigned int offset, unsigned int mlen, unsigned int rlen); - -int kqswnal_tunables_init(void); -void kqswnal_tunables_fini(void); - -#endif /* _QSWNAL_H */ diff --git a/lnet/klnds/qswlnd/qswlnd_cb.c b/lnet/klnds/qswlnd/qswlnd_cb.c deleted file mode 100644 index 99eb1cc..0000000 --- a/lnet/klnds/qswlnd/qswlnd_cb.c +++ /dev/null @@ -1,1778 +0,0 @@ -/* - * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. - * - * Copyright (c) 2012, Intel Corporation. - * - * Author: Eric Barton - * - * This file is part of Portals, http://www.lustre.org - * - * Portals is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Portals is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Portals; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ - -#include "qswlnd.h" - -void -kqswnal_notify_peer_down(kqswnal_tx_t *ktx) -{ - time_t then; - - then = cfs_time_current_sec() - - cfs_duration_sec(cfs_time_current() - - ktx->ktx_launchtime); - - lnet_notify(kqswnal_data.kqn_ni, ktx->ktx_nid, 0, then); -} - -void -kqswnal_unmap_tx (kqswnal_tx_t *ktx) -{ - int i; - - ktx->ktx_rail = -1; /* unset rail */ - - if (ktx->ktx_nmappedpages == 0) - return; - - CDEBUG(D_NET, "%p unloading %d frags starting at %d\n", - ktx, ktx->ktx_nfrag, ktx->ktx_firsttmpfrag); - - for (i = ktx->ktx_firsttmpfrag; i < ktx->ktx_nfrag; i++) - ep_dvma_unload(kqswnal_data.kqn_ep, - kqswnal_data.kqn_ep_tx_nmh, - &ktx->ktx_frags[i]); - - ktx->ktx_nmappedpages = 0; -} - -int -kqswnal_map_tx_kiov (kqswnal_tx_t *ktx, int offset, int nob, - unsigned int niov, lnet_kiov_t *kiov) -{ - int nfrags = ktx->ktx_nfrag; - int nmapped = ktx->ktx_nmappedpages; - int maxmapped = ktx->ktx_npages; - __u32 basepage = ktx->ktx_basepage + nmapped; - char *ptr; - - EP_RAILMASK railmask; - int rail; - - if (ktx->ktx_rail < 0) - ktx->ktx_rail = ep_xmtr_prefrail(kqswnal_data.kqn_eptx, - EP_RAILMASK_ALL, - kqswnal_nid2elanid(ktx->ktx_nid)); - rail = ktx->ktx_rail; - if (rail < 0) { - CERROR("No rails available for %s\n", libcfs_nid2str(ktx->ktx_nid)); - return (-ENETDOWN); - } - railmask = 1 << rail; - - LASSERT (nmapped <= maxmapped); - LASSERT (nfrags >= ktx->ktx_firsttmpfrag); - LASSERT (nfrags <= EP_MAXFRAG); - LASSERT (niov > 0); - LASSERT (nob > 0); - - /* skip complete frags before 'offset' */ - while (offset >= kiov->kiov_len) { - offset -= kiov->kiov_len; - kiov++; - niov--; - LASSERT (niov > 0); - } - - do { - int fraglen = kiov->kiov_len - offset; - - /* each page frag is contained in one page */ - LASSERT (kiov->kiov_offset + kiov->kiov_len <= PAGE_SIZE); - - if (fraglen > nob) - fraglen = nob; - - nmapped++; - if (nmapped > maxmapped) { - CERROR("Can't map message in %d pages (max %d)\n", - nmapped, maxmapped); - return (-EMSGSIZE); - } - - if (nfrags == EP_MAXFRAG) { - CERROR("Message too fragmented in Elan VM (max %d frags)\n", - EP_MAXFRAG); - return (-EMSGSIZE); - } - - /* XXX this is really crap, but we'll have to kmap until - * EKC has a page (rather than vaddr) mapping interface */ - - ptr = ((char *)kmap (kiov->kiov_page)) + kiov->kiov_offset + offset; - - CDEBUG(D_NET, - "%p[%d] loading %p for %d, page %d, %d total\n", - ktx, nfrags, ptr, fraglen, basepage, nmapped); - - ep_dvma_load(kqswnal_data.kqn_ep, NULL, - ptr, fraglen, - kqswnal_data.kqn_ep_tx_nmh, basepage, - &railmask, &ktx->ktx_frags[nfrags]); - - if (nfrags == ktx->ktx_firsttmpfrag || - !ep_nmd_merge(&ktx->ktx_frags[nfrags - 1], - &ktx->ktx_frags[nfrags - 1], - &ktx->ktx_frags[nfrags])) { - /* new frag if this is the first or can't merge */ - nfrags++; - } - - kunmap (kiov->kiov_page); - - /* keep in loop for failure case */ - ktx->ktx_nmappedpages = nmapped; - - basepage++; - kiov++; - niov--; - nob -= fraglen; - offset = 0; - - /* iov must not run out before end of data */ - LASSERT (nob == 0 || niov > 0); - - } while (nob > 0); - - ktx->ktx_nfrag = nfrags; - CDEBUG (D_NET, "%p got %d frags over %d pages\n", - ktx, ktx->ktx_nfrag, ktx->ktx_nmappedpages); - - return (0); -} - -#if KQSW_CKSUM -__u32 -kqswnal_csum_kiov (__u32 csum, int offset, int nob, - unsigned int niov, lnet_kiov_t *kiov) -{ - char *ptr; - - if (nob == 0) - return csum; - - LASSERT (niov > 0); - LASSERT (nob > 0); - - /* skip complete frags before 'offset' */ - while (offset >= kiov->kiov_len) { - offset -= kiov->kiov_len; - kiov++; - niov--; - LASSERT (niov > 0); - } - - do { - int fraglen = kiov->kiov_len - offset; - - /* each page frag is contained in one page */ - LASSERT (kiov->kiov_offset + kiov->kiov_len <= PAGE_SIZE); - - if (fraglen > nob) - fraglen = nob; - - ptr = ((char *)kmap (kiov->kiov_page)) + kiov->kiov_offset + offset; - - csum = kqswnal_csum(csum, ptr, fraglen); - - kunmap (kiov->kiov_page); - - kiov++; - niov--; - nob -= fraglen; - offset = 0; - - /* iov must not run out before end of data */ - LASSERT (nob == 0 || niov > 0); - - } while (nob > 0); - - return csum; -} -#endif - -int -kqswnal_map_tx_iov (kqswnal_tx_t *ktx, int offset, int nob, - unsigned int niov, struct iovec *iov) -{ - int nfrags = ktx->ktx_nfrag; - int nmapped = ktx->ktx_nmappedpages; - int maxmapped = ktx->ktx_npages; - __u32 basepage = ktx->ktx_basepage + nmapped; - - EP_RAILMASK railmask; - int rail; - - if (ktx->ktx_rail < 0) - ktx->ktx_rail = ep_xmtr_prefrail(kqswnal_data.kqn_eptx, - EP_RAILMASK_ALL, - kqswnal_nid2elanid(ktx->ktx_nid)); - rail = ktx->ktx_rail; - if (rail < 0) { - CERROR("No rails available for %s\n", libcfs_nid2str(ktx->ktx_nid)); - return (-ENETDOWN); - } - railmask = 1 << rail; - - LASSERT (nmapped <= maxmapped); - LASSERT (nfrags >= ktx->ktx_firsttmpfrag); - LASSERT (nfrags <= EP_MAXFRAG); - LASSERT (niov > 0); - LASSERT (nob > 0); - - /* skip complete frags before offset */ - while (offset >= iov->iov_len) { - offset -= iov->iov_len; - iov++; - niov--; - LASSERT (niov > 0); - } - - do { - int fraglen = iov->iov_len - offset; - long npages; - - if (fraglen > nob) - fraglen = nob; - npages = kqswnal_pages_spanned (iov->iov_base, fraglen); - - nmapped += npages; - if (nmapped > maxmapped) { - CERROR("Can't map message in %d pages (max %d)\n", - nmapped, maxmapped); - return (-EMSGSIZE); - } - - if (nfrags == EP_MAXFRAG) { - CERROR("Message too fragmented in Elan VM (max %d frags)\n", - EP_MAXFRAG); - return (-EMSGSIZE); - } - - CDEBUG(D_NET, - "%p[%d] loading %p for %d, pages %d for %ld, %d total\n", - ktx, nfrags, iov->iov_base + offset, fraglen, - basepage, npages, nmapped); - - ep_dvma_load(kqswnal_data.kqn_ep, NULL, - iov->iov_base + offset, fraglen, - kqswnal_data.kqn_ep_tx_nmh, basepage, - &railmask, &ktx->ktx_frags[nfrags]); - - if (nfrags == ktx->ktx_firsttmpfrag || - !ep_nmd_merge(&ktx->ktx_frags[nfrags - 1], - &ktx->ktx_frags[nfrags - 1], - &ktx->ktx_frags[nfrags])) { - /* new frag if this is the first or can't merge */ - nfrags++; - } - - /* keep in loop for failure case */ - ktx->ktx_nmappedpages = nmapped; - - basepage += npages; - iov++; - niov--; - nob -= fraglen; - offset = 0; - - /* iov must not run out before end of data */ - LASSERT (nob == 0 || niov > 0); - - } while (nob > 0); - - ktx->ktx_nfrag = nfrags; - CDEBUG (D_NET, "%p got %d frags over %d pages\n", - ktx, ktx->ktx_nfrag, ktx->ktx_nmappedpages); - - return (0); -} - -#if KQSW_CKSUM -__u32 -kqswnal_csum_iov (__u32 csum, int offset, int nob, - unsigned int niov, struct iovec *iov) -{ - if (nob == 0) - return csum; - - LASSERT (niov > 0); - LASSERT (nob > 0); - - /* skip complete frags before offset */ - while (offset >= iov->iov_len) { - offset -= iov->iov_len; - iov++; - niov--; - LASSERT (niov > 0); - } - - do { - int fraglen = iov->iov_len - offset; - - if (fraglen > nob) - fraglen = nob; - - csum = kqswnal_csum(csum, iov->iov_base + offset, fraglen); - - iov++; - niov--; - nob -= fraglen; - offset = 0; - - /* iov must not run out before end of data */ - LASSERT (nob == 0 || niov > 0); - - } while (nob > 0); - - return csum; -} -#endif - -void -kqswnal_put_idle_tx (kqswnal_tx_t *ktx) -{ - unsigned long flags; - - kqswnal_unmap_tx(ktx); /* release temporary mappings */ - ktx->ktx_state = KTX_IDLE; - - spin_lock_irqsave(&kqswnal_data.kqn_idletxd_lock, flags); - - cfs_list_del(&ktx->ktx_list); /* take off active list */ - cfs_list_add(&ktx->ktx_list, &kqswnal_data.kqn_idletxds); - - spin_unlock_irqrestore(&kqswnal_data.kqn_idletxd_lock, flags); -} - -kqswnal_tx_t * -kqswnal_get_idle_tx (void) -{ - unsigned long flags; - kqswnal_tx_t *ktx; - - spin_lock_irqsave(&kqswnal_data.kqn_idletxd_lock, flags); - - if (kqswnal_data.kqn_shuttingdown || - cfs_list_empty(&kqswnal_data.kqn_idletxds)) { - spin_unlock_irqrestore(&kqswnal_data.kqn_idletxd_lock, flags); - - return NULL; - } - - ktx = cfs_list_entry (kqswnal_data.kqn_idletxds.next, kqswnal_tx_t, - ktx_list); - cfs_list_del (&ktx->ktx_list); - - cfs_list_add (&ktx->ktx_list, &kqswnal_data.kqn_activetxds); - ktx->ktx_launcher = current->pid; - atomic_inc(&kqswnal_data.kqn_pending_txs); - - spin_unlock_irqrestore(&kqswnal_data.kqn_idletxd_lock, flags); - - /* Idle descs can't have any mapped (as opposed to pre-mapped) pages */ - LASSERT (ktx->ktx_nmappedpages == 0); - return (ktx); -} - -void -kqswnal_tx_done_in_thread_context (kqswnal_tx_t *ktx) -{ - lnet_msg_t *lnetmsg0 = NULL; - lnet_msg_t *lnetmsg1 = NULL; - int status0 = 0; - int status1 = 0; - kqswnal_rx_t *krx; - - LASSERT (!in_interrupt()); - - if (ktx->ktx_status == -EHOSTDOWN) - kqswnal_notify_peer_down(ktx); - - switch (ktx->ktx_state) { - case KTX_RDMA_FETCH: /* optimized PUT/REPLY handled */ - krx = (kqswnal_rx_t *)ktx->ktx_args[0]; - lnetmsg0 = (lnet_msg_t *)ktx->ktx_args[1]; - status0 = ktx->ktx_status; -#if KQSW_CKSUM - if (status0 == 0) { /* RDMA succeeded */ - kqswnal_msg_t *msg; - __u32 csum; - - msg = (kqswnal_msg_t *) - page_address(krx->krx_kiov[0].kiov_page); - - csum = (lnetmsg0->msg_kiov != NULL) ? - kqswnal_csum_kiov(krx->krx_cksum, - lnetmsg0->msg_offset, - lnetmsg0->msg_wanted, - lnetmsg0->msg_niov, - lnetmsg0->msg_kiov) : - kqswnal_csum_iov(krx->krx_cksum, - lnetmsg0->msg_offset, - lnetmsg0->msg_wanted, - lnetmsg0->msg_niov, - lnetmsg0->msg_iov); - - /* Can only check csum if I got it all */ - if (lnetmsg0->msg_wanted == lnetmsg0->msg_len && - csum != msg->kqm_cksum) { - ktx->ktx_status = -EIO; - krx->krx_rpc_reply.msg.status = -EIO; - CERROR("RDMA checksum failed %u(%u) from %s\n", - csum, msg->kqm_cksum, - libcfs_nid2str(kqswnal_rx_nid(krx))); - } - } -#endif - LASSERT (krx->krx_state == KRX_COMPLETING); - kqswnal_rx_decref (krx); - break; - - case KTX_RDMA_STORE: /* optimized GET handled */ - case KTX_PUTTING: /* optimized PUT sent */ - case KTX_SENDING: /* normal send */ - lnetmsg0 = (lnet_msg_t *)ktx->ktx_args[1]; - status0 = ktx->ktx_status; - break; - - case KTX_GETTING: /* optimized GET sent & payload received */ - /* Complete the GET with success since we can't avoid - * delivering a REPLY event; we committed to it when we - * launched the GET */ - lnetmsg0 = (lnet_msg_t *)ktx->ktx_args[1]; - status0 = 0; - lnetmsg1 = (lnet_msg_t *)ktx->ktx_args[2]; - status1 = ktx->ktx_status; -#if KQSW_CKSUM - if (status1 == 0) { /* RDMA succeeded */ - lnet_msg_t *lnetmsg0 = (lnet_msg_t *)ktx->ktx_args[1]; - lnet_libmd_t *md = lnetmsg0->msg_md; - __u32 csum; - - csum = ((md->md_options & LNET_MD_KIOV) != 0) ? - kqswnal_csum_kiov(~0, 0, - md->md_length, - md->md_niov, - md->md_iov.kiov) : - kqswnal_csum_iov(~0, 0, - md->md_length, - md->md_niov, - md->md_iov.iov); - - if (csum != ktx->ktx_cksum) { - CERROR("RDMA checksum failed %u(%u) from %s\n", - csum, ktx->ktx_cksum, - libcfs_nid2str(ktx->ktx_nid)); - status1 = -EIO; - } - } -#endif - break; - - default: - LASSERT (0); - } - - kqswnal_put_idle_tx (ktx); - - lnet_finalize (kqswnal_data.kqn_ni, lnetmsg0, status0); - if (lnetmsg1 != NULL) - lnet_finalize (kqswnal_data.kqn_ni, lnetmsg1, status1); -} - -void -kqswnal_tx_done (kqswnal_tx_t *ktx, int status) -{ - unsigned long flags; - - ktx->ktx_status = status; - - if (!in_interrupt()) { - kqswnal_tx_done_in_thread_context(ktx); - return; - } - - /* Complete the send in thread context */ - spin_lock_irqsave(&kqswnal_data.kqn_sched_lock, flags); - - cfs_list_add_tail(&ktx->ktx_schedlist, - &kqswnal_data.kqn_donetxds); - wake_up(&kqswnal_data.kqn_sched_waitq); - - spin_unlock_irqrestore(&kqswnal_data.kqn_sched_lock, flags); -} - -static void -kqswnal_txhandler(EP_TXD *txd, void *arg, int status) -{ - kqswnal_tx_t *ktx = (kqswnal_tx_t *)arg; - kqswnal_rpc_reply_t *reply; - - LASSERT (txd != NULL); - LASSERT (ktx != NULL); - - CDEBUG(D_NET, "txd %p, arg %p status %d\n", txd, arg, status); - - if (status != EP_SUCCESS) { - - CNETERR("Tx completion to %s failed: %d\n", - libcfs_nid2str(ktx->ktx_nid), status); - - status = -EHOSTDOWN; - - } else switch (ktx->ktx_state) { - - case KTX_GETTING: - case KTX_PUTTING: - /* RPC complete! */ - reply = (kqswnal_rpc_reply_t *)ep_txd_statusblk(txd); - if (reply->msg.magic == 0) { /* "old" peer */ - status = reply->msg.status; - break; - } - - if (reply->msg.magic != LNET_PROTO_QSW_MAGIC) { - if (reply->msg.magic != swab32(LNET_PROTO_QSW_MAGIC)) { - CERROR("%s unexpected rpc reply magic %08x\n", - libcfs_nid2str(ktx->ktx_nid), - reply->msg.magic); - status = -EPROTO; - break; - } - - __swab32s(&reply->msg.status); - __swab32s(&reply->msg.version); - - if (ktx->ktx_state == KTX_GETTING) { - __swab32s(&reply->msg.u.get.len); - __swab32s(&reply->msg.u.get.cksum); - } - } - - status = reply->msg.status; - if (status != 0) { - CERROR("%s RPC status %08x\n", - libcfs_nid2str(ktx->ktx_nid), status); - break; - } - - if (ktx->ktx_state == KTX_GETTING) { - lnet_set_reply_msg_len(kqswnal_data.kqn_ni, - (lnet_msg_t *)ktx->ktx_args[2], - reply->msg.u.get.len); -#if KQSW_CKSUM - ktx->ktx_cksum = reply->msg.u.get.cksum; -#endif - } - break; - - case KTX_SENDING: - status = 0; - break; - - default: - LBUG(); - break; - } - - kqswnal_tx_done(ktx, status); -} - -int -kqswnal_launch (kqswnal_tx_t *ktx) -{ - /* Don't block for transmit descriptor if we're in interrupt context */ - int attr = in_interrupt() ? (EP_NO_SLEEP | EP_NO_ALLOC) : 0; - int dest = kqswnal_nid2elanid (ktx->ktx_nid); - unsigned long flags; - int rc; - - ktx->ktx_launchtime = cfs_time_current(); - - if (kqswnal_data.kqn_shuttingdown) - return (-ESHUTDOWN); - - LASSERT (dest >= 0); /* must be a peer */ - - if (ktx->ktx_nmappedpages != 0) - attr = EP_SET_PREFRAIL(attr, ktx->ktx_rail); - - switch (ktx->ktx_state) { - case KTX_GETTING: - case KTX_PUTTING: - if (the_lnet.ln_testprotocompat != 0) { - kqswnal_msg_t *msg = (kqswnal_msg_t *)ktx->ktx_buffer; - - /* single-shot proto test: - * Future version queries will use an RPC, so I'll - * co-opt one of the existing ones */ - LNET_LOCK(); - if ((the_lnet.ln_testprotocompat & 1) != 0) { - msg->kqm_version++; - the_lnet.ln_testprotocompat &= ~1; - } - if ((the_lnet.ln_testprotocompat & 2) != 0) { - msg->kqm_magic = LNET_PROTO_MAGIC; - the_lnet.ln_testprotocompat &= ~2; - } - LNET_UNLOCK(); - } - - /* NB ktx_frag[0] is the GET/PUT hdr + kqswnal_remotemd_t. - * The other frags are the payload, awaiting RDMA */ - rc = ep_transmit_rpc(kqswnal_data.kqn_eptx, dest, - ktx->ktx_port, attr, - kqswnal_txhandler, ktx, - NULL, ktx->ktx_frags, 1); - break; - - case KTX_SENDING: - rc = ep_transmit_message(kqswnal_data.kqn_eptx, dest, - ktx->ktx_port, attr, - kqswnal_txhandler, ktx, - NULL, ktx->ktx_frags, ktx->ktx_nfrag); - break; - - default: - LBUG(); - rc = -EINVAL; /* no compiler warning please */ - break; - } - - switch (rc) { - case EP_SUCCESS: /* success */ - return (0); - - case EP_ENOMEM: /* can't allocate ep txd => queue for later */ - spin_lock_irqsave(&kqswnal_data.kqn_sched_lock, flags); - - cfs_list_add_tail(&ktx->ktx_schedlist, - &kqswnal_data.kqn_delayedtxds); - wake_up(&kqswnal_data.kqn_sched_waitq); - - spin_unlock_irqrestore(&kqswnal_data.kqn_sched_lock, - flags); - return (0); - - default: /* fatal error */ - CNETERR ("Tx to %s failed: %d\n", - libcfs_nid2str(ktx->ktx_nid), rc); - kqswnal_notify_peer_down(ktx); - return (-EHOSTUNREACH); - } -} - -#if 0 -static char * -hdr_type_string (lnet_hdr_t *hdr) -{ - switch (hdr->type) { - case LNET_MSG_ACK: - return ("ACK"); - case LNET_MSG_PUT: - return ("PUT"); - case LNET_MSG_GET: - return ("GET"); - case LNET_MSG_REPLY: - return ("REPLY"); - default: - return (""); - } -} - -static void -kqswnal_cerror_hdr(lnet_hdr_t * hdr) -{ - char *type_str = hdr_type_string (hdr); - - CERROR("P3 Header at %p of type %s length %d\n", hdr, type_str, - le32_to_cpu(hdr->payload_length)); - CERROR(" From nid/pid "LPU64"/%u\n", le64_to_cpu(hdr->src_nid), - le32_to_cpu(hdr->src_pid)); - CERROR(" To nid/pid "LPU64"/%u\n", le64_to_cpu(hdr->dest_nid), - le32_to_cpu(hdr->dest_pid)); - - switch (le32_to_cpu(hdr->type)) { - case LNET_MSG_PUT: - CERROR(" Ptl index %d, ack md "LPX64"."LPX64", " - "match bits "LPX64"\n", - le32_to_cpu(hdr->msg.put.ptl_index), - hdr->msg.put.ack_wmd.wh_interface_cookie, - hdr->msg.put.ack_wmd.wh_object_cookie, - le64_to_cpu(hdr->msg.put.match_bits)); - CERROR(" offset %d, hdr data "LPX64"\n", - le32_to_cpu(hdr->msg.put.offset), - hdr->msg.put.hdr_data); - break; - - case LNET_MSG_GET: - CERROR(" Ptl index %d, return md "LPX64"."LPX64", " - "match bits "LPX64"\n", - le32_to_cpu(hdr->msg.get.ptl_index), - hdr->msg.get.return_wmd.wh_interface_cookie, - hdr->msg.get.return_wmd.wh_object_cookie, - hdr->msg.get.match_bits); - CERROR(" Length %d, src offset %d\n", - le32_to_cpu(hdr->msg.get.sink_length), - le32_to_cpu(hdr->msg.get.src_offset)); - break; - - case LNET_MSG_ACK: - CERROR(" dst md "LPX64"."LPX64", manipulated length %d\n", - hdr->msg.ack.dst_wmd.wh_interface_cookie, - hdr->msg.ack.dst_wmd.wh_object_cookie, - le32_to_cpu(hdr->msg.ack.mlength)); - break; - - case LNET_MSG_REPLY: - CERROR(" dst md "LPX64"."LPX64"\n", - hdr->msg.reply.dst_wmd.wh_interface_cookie, - hdr->msg.reply.dst_wmd.wh_object_cookie); - } - -} /* end of print_hdr() */ -#endif - -int -kqswnal_check_rdma (int nlfrag, EP_NMD *lfrag, - int nrfrag, EP_NMD *rfrag) -{ - int i; - - if (nlfrag != nrfrag) { - CERROR("Can't cope with unequal # frags: %d local %d remote\n", - nlfrag, nrfrag); - return (-EINVAL); - } - - for (i = 0; i < nlfrag; i++) - if (lfrag[i].nmd_len != rfrag[i].nmd_len) { - CERROR("Can't cope with unequal frags %d(%d):" - " %d local %d remote\n", - i, nlfrag, lfrag[i].nmd_len, rfrag[i].nmd_len); - return (-EINVAL); - } - - return (0); -} - -kqswnal_remotemd_t * -kqswnal_get_portalscompat_rmd (kqswnal_rx_t *krx) -{ - /* Check that the RMD sent after the "raw" LNET header in a - * portals-compatible QSWLND message is OK */ - char *buffer = (char *)page_address(krx->krx_kiov[0].kiov_page); - kqswnal_remotemd_t *rmd = (kqswnal_remotemd_t *)(buffer + sizeof(lnet_hdr_t)); - - /* Note RDMA addresses are sent in native endian-ness in the "old" - * portals protocol so no swabbing... */ - - if (buffer + krx->krx_nob < (char *)(rmd + 1)) { - /* msg too small to discover rmd size */ - CERROR ("Incoming message [%d] too small for RMD (%d needed)\n", - krx->krx_nob, (int)(((char *)(rmd + 1)) - buffer)); - return (NULL); - } - - if (buffer + krx->krx_nob < (char *)&rmd->kqrmd_frag[rmd->kqrmd_nfrag]) { - /* rmd doesn't fit in the incoming message */ - CERROR ("Incoming message [%d] too small for RMD[%d] (%d needed)\n", - krx->krx_nob, rmd->kqrmd_nfrag, - (int)(((char *)&rmd->kqrmd_frag[rmd->kqrmd_nfrag]) - buffer)); - return (NULL); - } - - return (rmd); -} - -void -kqswnal_rdma_store_complete (EP_RXD *rxd) -{ - int status = ep_rxd_status(rxd); - kqswnal_tx_t *ktx = (kqswnal_tx_t *)ep_rxd_arg(rxd); - kqswnal_rx_t *krx = (kqswnal_rx_t *)ktx->ktx_args[0]; - - CDEBUG((status == EP_SUCCESS) ? D_NET : D_ERROR, - "rxd %p, ktx %p, status %d\n", rxd, ktx, status); - - LASSERT (ktx->ktx_state == KTX_RDMA_STORE); - LASSERT (krx->krx_rxd == rxd); - LASSERT (krx->krx_rpc_reply_needed); - - krx->krx_rpc_reply_needed = 0; - kqswnal_rx_decref (krx); - - /* free ktx & finalize() its lnet_msg_t */ - kqswnal_tx_done(ktx, (status == EP_SUCCESS) ? 0 : -ECONNABORTED); -} - -void -kqswnal_rdma_fetch_complete (EP_RXD *rxd) -{ - /* Completed fetching the PUT/REPLY data */ - int status = ep_rxd_status(rxd); - kqswnal_tx_t *ktx = (kqswnal_tx_t *)ep_rxd_arg(rxd); - kqswnal_rx_t *krx = (kqswnal_rx_t *)ktx->ktx_args[0]; - - CDEBUG((status == EP_SUCCESS) ? D_NET : D_ERROR, - "rxd %p, ktx %p, status %d\n", rxd, ktx, status); - - LASSERT (ktx->ktx_state == KTX_RDMA_FETCH); - LASSERT (krx->krx_rxd == rxd); - /* RPC completes with failure by default */ - LASSERT (krx->krx_rpc_reply_needed); - LASSERT (krx->krx_rpc_reply.msg.status != 0); - - if (status == EP_SUCCESS) { - krx->krx_rpc_reply.msg.status = 0; - status = 0; - } else { - /* Abandon RPC since get failed */ - krx->krx_rpc_reply_needed = 0; - status = -ECONNABORTED; - } - - /* krx gets decref'd in kqswnal_tx_done_in_thread_context() */ - LASSERT (krx->krx_state == KRX_PARSE); - krx->krx_state = KRX_COMPLETING; - - /* free ktx & finalize() its lnet_msg_t */ - kqswnal_tx_done(ktx, status); -} - -int -kqswnal_rdma (kqswnal_rx_t *krx, lnet_msg_t *lntmsg, - int type, kqswnal_remotemd_t *rmd, - unsigned int niov, struct iovec *iov, lnet_kiov_t *kiov, - unsigned int offset, unsigned int len) -{ - kqswnal_tx_t *ktx; - int eprc; - int rc; - - /* Not both mapped and paged payload */ - LASSERT (iov == NULL || kiov == NULL); - /* RPC completes with failure by default */ - LASSERT (krx->krx_rpc_reply_needed); - LASSERT (krx->krx_rpc_reply.msg.status != 0); - - if (len == 0) { - /* data got truncated to nothing. */ - lnet_finalize(kqswnal_data.kqn_ni, lntmsg, 0); - /* Let kqswnal_rx_done() complete the RPC with success */ - krx->krx_rpc_reply.msg.status = 0; - return (0); - } - - /* NB I'm using 'ktx' just to map the local RDMA buffers; I'm not - actually sending a portals message with it */ - ktx = kqswnal_get_idle_tx(); - if (ktx == NULL) { - CERROR ("Can't get txd for RDMA with %s\n", - libcfs_nid2str(kqswnal_rx_nid(krx))); - return (-ENOMEM); - } - - ktx->ktx_state = type; - ktx->ktx_nid = kqswnal_rx_nid(krx); - ktx->ktx_args[0] = krx; - ktx->ktx_args[1] = lntmsg; - - LASSERT (atomic_read(&krx->krx_refcount) > 0); - /* Take an extra ref for the completion callback */ - atomic_inc(&krx->krx_refcount); - - /* Map on the rail the RPC prefers */ - ktx->ktx_rail = ep_rcvr_prefrail(krx->krx_eprx, - ep_rxd_railmask(krx->krx_rxd)); - - /* Start mapping at offset 0 (we're not mapping any headers) */ - ktx->ktx_nfrag = ktx->ktx_firsttmpfrag = 0; - - if (kiov != NULL) - rc = kqswnal_map_tx_kiov(ktx, offset, len, niov, kiov); - else - rc = kqswnal_map_tx_iov(ktx, offset, len, niov, iov); - - if (rc != 0) { - CERROR ("Can't map local RDMA data: %d\n", rc); - goto out; - } - - rc = kqswnal_check_rdma (ktx->ktx_nfrag, ktx->ktx_frags, - rmd->kqrmd_nfrag, rmd->kqrmd_frag); - if (rc != 0) { - CERROR ("Incompatible RDMA descriptors\n"); - goto out; - } - - switch (type) { - default: - LBUG(); - - case KTX_RDMA_STORE: - krx->krx_rpc_reply.msg.status = 0; - krx->krx_rpc_reply.msg.magic = LNET_PROTO_QSW_MAGIC; - krx->krx_rpc_reply.msg.version = QSWLND_PROTO_VERSION; - krx->krx_rpc_reply.msg.u.get.len = len; -#if KQSW_CKSUM - krx->krx_rpc_reply.msg.u.get.cksum = (kiov != NULL) ? - kqswnal_csum_kiov(~0, offset, len, niov, kiov) : - kqswnal_csum_iov(~0, offset, len, niov, iov); - if (*kqswnal_tunables.kqn_inject_csum_error == 4) { - krx->krx_rpc_reply.msg.u.get.cksum++; - *kqswnal_tunables.kqn_inject_csum_error = 0; - } -#endif - eprc = ep_complete_rpc(krx->krx_rxd, - kqswnal_rdma_store_complete, ktx, - &krx->krx_rpc_reply.ep_statusblk, - ktx->ktx_frags, rmd->kqrmd_frag, - rmd->kqrmd_nfrag); - if (eprc != EP_SUCCESS) { - CERROR("can't complete RPC: %d\n", eprc); - /* don't re-attempt RPC completion */ - krx->krx_rpc_reply_needed = 0; - rc = -ECONNABORTED; - } - break; - - case KTX_RDMA_FETCH: - eprc = ep_rpc_get (krx->krx_rxd, - kqswnal_rdma_fetch_complete, ktx, - rmd->kqrmd_frag, ktx->ktx_frags, ktx->ktx_nfrag); - if (eprc != EP_SUCCESS) { - CERROR("ep_rpc_get failed: %d\n", eprc); - /* Don't attempt RPC completion: - * EKC nuked it when the get failed */ - krx->krx_rpc_reply_needed = 0; - rc = -ECONNABORTED; - } - break; - } - - out: - if (rc != 0) { - kqswnal_rx_decref(krx); /* drop callback's ref */ - kqswnal_put_idle_tx (ktx); - } - - atomic_dec(&kqswnal_data.kqn_pending_txs); - return (rc); -} - -int -kqswnal_send (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg) -{ - lnet_hdr_t *hdr = &lntmsg->msg_hdr; - int type = lntmsg->msg_type; - lnet_process_id_t target = lntmsg->msg_target; - int target_is_router = lntmsg->msg_target_is_router; - int routing = lntmsg->msg_routing; - unsigned int payload_niov = lntmsg->msg_niov; - struct iovec *payload_iov = lntmsg->msg_iov; - lnet_kiov_t *payload_kiov = lntmsg->msg_kiov; - unsigned int payload_offset = lntmsg->msg_offset; - unsigned int payload_nob = lntmsg->msg_len; - int nob; - kqswnal_tx_t *ktx; - int rc; - - /* NB 1. hdr is in network byte order */ - /* 2. 'private' depends on the message type */ - - CDEBUG(D_NET, "sending %u bytes in %d frags to %s\n", - payload_nob, payload_niov, libcfs_id2str(target)); - - LASSERT (payload_nob == 0 || payload_niov > 0); - LASSERT (payload_niov <= LNET_MAX_IOV); - - /* It must be OK to kmap() if required */ - LASSERT (payload_kiov == NULL || !in_interrupt ()); - /* payload is either all vaddrs or all pages */ - LASSERT (!(payload_kiov != NULL && payload_iov != NULL)); - - if (kqswnal_nid2elanid (target.nid) < 0) { - CERROR("%s not in my cluster\n", libcfs_nid2str(target.nid)); - return -EIO; - } - - /* I may not block for a transmit descriptor if I might block the - * router, receiver, or an interrupt handler. */ - ktx = kqswnal_get_idle_tx(); - if (ktx == NULL) { - CERROR ("Can't get txd for msg type %d for %s\n", - type, libcfs_nid2str(target.nid)); - return (-ENOMEM); - } - - ktx->ktx_state = KTX_SENDING; - ktx->ktx_nid = target.nid; - ktx->ktx_args[0] = private; - ktx->ktx_args[1] = lntmsg; - ktx->ktx_args[2] = NULL; /* set when a GET commits to REPLY */ - - /* The first frag will be the pre-mapped buffer. */ - ktx->ktx_nfrag = ktx->ktx_firsttmpfrag = 1; - - if ((!target_is_router && /* target.nid is final dest */ - !routing && /* I'm the source */ - type == LNET_MSG_GET && /* optimize GET? */ - *kqswnal_tunables.kqn_optimized_gets != 0 && - lntmsg->msg_md->md_length >= - *kqswnal_tunables.kqn_optimized_gets) || - ((type == LNET_MSG_PUT || /* optimize PUT? */ - type == LNET_MSG_REPLY) && /* optimize REPLY? */ - *kqswnal_tunables.kqn_optimized_puts != 0 && - payload_nob >= *kqswnal_tunables.kqn_optimized_puts)) { - lnet_libmd_t *md = lntmsg->msg_md; - kqswnal_msg_t *msg = (kqswnal_msg_t *)ktx->ktx_buffer; - lnet_hdr_t *mhdr; - kqswnal_remotemd_t *rmd; - - /* Optimised path: I send over the Elan vaddrs of the local - * buffers, and my peer DMAs directly to/from them. - * - * First I set up ktx as if it was going to send this - * payload, (it needs to map it anyway). This fills - * ktx_frags[1] and onward with the network addresses - * of the buffer frags. */ - - /* Send an RDMA message */ - msg->kqm_magic = LNET_PROTO_QSW_MAGIC; - msg->kqm_version = QSWLND_PROTO_VERSION; - msg->kqm_type = QSWLND_MSG_RDMA; - - mhdr = &msg->kqm_u.rdma.kqrm_hdr; - rmd = &msg->kqm_u.rdma.kqrm_rmd; - - *mhdr = *hdr; - nob = (((char *)rmd) - ktx->ktx_buffer); - - if (type == LNET_MSG_GET) { - if ((md->md_options & LNET_MD_KIOV) != 0) - rc = kqswnal_map_tx_kiov (ktx, 0, md->md_length, - md->md_niov, md->md_iov.kiov); - else - rc = kqswnal_map_tx_iov (ktx, 0, md->md_length, - md->md_niov, md->md_iov.iov); - ktx->ktx_state = KTX_GETTING; - } else { - if (payload_kiov != NULL) - rc = kqswnal_map_tx_kiov(ktx, 0, payload_nob, - payload_niov, payload_kiov); - else - rc = kqswnal_map_tx_iov(ktx, 0, payload_nob, - payload_niov, payload_iov); - ktx->ktx_state = KTX_PUTTING; - } - - if (rc != 0) - goto out; - - rmd->kqrmd_nfrag = ktx->ktx_nfrag - 1; - nob += offsetof(kqswnal_remotemd_t, - kqrmd_frag[rmd->kqrmd_nfrag]); - LASSERT (nob <= KQSW_TX_BUFFER_SIZE); - - memcpy(&rmd->kqrmd_frag[0], &ktx->ktx_frags[1], - rmd->kqrmd_nfrag * sizeof(EP_NMD)); - - ep_nmd_subset(&ktx->ktx_frags[0], &ktx->ktx_ebuffer, 0, nob); -#if KQSW_CKSUM - msg->kqm_nob = nob + payload_nob; - msg->kqm_cksum = 0; - msg->kqm_cksum = kqswnal_csum(~0, (char *)msg, nob); -#endif - if (type == LNET_MSG_GET) { - /* Allocate reply message now while I'm in thread context */ - ktx->ktx_args[2] = lnet_create_reply_msg ( - kqswnal_data.kqn_ni, lntmsg); - if (ktx->ktx_args[2] == NULL) - goto out; - - /* NB finalizing the REPLY message is my - * responsibility now, whatever happens. */ -#if KQSW_CKSUM - if (*kqswnal_tunables.kqn_inject_csum_error == 3) { - msg->kqm_cksum++; - *kqswnal_tunables.kqn_inject_csum_error = 0; - } - - } else if (payload_kiov != NULL) { - /* must checksum payload after header so receiver can - * compute partial header cksum before swab. Sadly - * this causes 2 rounds of kmap */ - msg->kqm_cksum = - kqswnal_csum_kiov(msg->kqm_cksum, 0, payload_nob, - payload_niov, payload_kiov); - if (*kqswnal_tunables.kqn_inject_csum_error == 2) { - msg->kqm_cksum++; - *kqswnal_tunables.kqn_inject_csum_error = 0; - } - } else { - msg->kqm_cksum = - kqswnal_csum_iov(msg->kqm_cksum, 0, payload_nob, - payload_niov, payload_iov); - if (*kqswnal_tunables.kqn_inject_csum_error == 2) { - msg->kqm_cksum++; - *kqswnal_tunables.kqn_inject_csum_error = 0; - } -#endif - } - - } else if (payload_nob <= *kqswnal_tunables.kqn_tx_maxcontig) { - lnet_hdr_t *mhdr; - char *payload; - kqswnal_msg_t *msg = (kqswnal_msg_t *)ktx->ktx_buffer; - - /* single frag copied into the pre-mapped buffer */ - msg->kqm_magic = LNET_PROTO_QSW_MAGIC; - msg->kqm_version = QSWLND_PROTO_VERSION; - msg->kqm_type = QSWLND_MSG_IMMEDIATE; - - mhdr = &msg->kqm_u.immediate.kqim_hdr; - payload = msg->kqm_u.immediate.kqim_payload; - - *mhdr = *hdr; - nob = (payload - ktx->ktx_buffer) + payload_nob; - - ep_nmd_subset(&ktx->ktx_frags[0], &ktx->ktx_ebuffer, 0, nob); - - if (payload_kiov != NULL) - lnet_copy_kiov2flat(KQSW_TX_BUFFER_SIZE, payload, 0, - payload_niov, payload_kiov, - payload_offset, payload_nob); - else - lnet_copy_iov2flat(KQSW_TX_BUFFER_SIZE, payload, 0, - payload_niov, payload_iov, - payload_offset, payload_nob); -#if KQSW_CKSUM - msg->kqm_nob = nob; - msg->kqm_cksum = 0; - msg->kqm_cksum = kqswnal_csum(~0, (char *)msg, nob); - if (*kqswnal_tunables.kqn_inject_csum_error == 1) { - msg->kqm_cksum++; - *kqswnal_tunables.kqn_inject_csum_error = 0; - } -#endif - } else { - lnet_hdr_t *mhdr; - kqswnal_msg_t *msg = (kqswnal_msg_t *)ktx->ktx_buffer; - - /* multiple frags: first is hdr in pre-mapped buffer */ - msg->kqm_magic = LNET_PROTO_QSW_MAGIC; - msg->kqm_version = QSWLND_PROTO_VERSION; - msg->kqm_type = QSWLND_MSG_IMMEDIATE; - - mhdr = &msg->kqm_u.immediate.kqim_hdr; - nob = offsetof(kqswnal_msg_t, kqm_u.immediate.kqim_payload); - - *mhdr = *hdr; - - ep_nmd_subset(&ktx->ktx_frags[0], &ktx->ktx_ebuffer, 0, nob); - - if (payload_kiov != NULL) - rc = kqswnal_map_tx_kiov (ktx, payload_offset, payload_nob, - payload_niov, payload_kiov); - else - rc = kqswnal_map_tx_iov (ktx, payload_offset, payload_nob, - payload_niov, payload_iov); - if (rc != 0) - goto out; - -#if KQSW_CKSUM - msg->kqm_nob = nob + payload_nob; - msg->kqm_cksum = 0; - msg->kqm_cksum = kqswnal_csum(~0, (char *)msg, nob); - - msg->kqm_cksum = (payload_kiov != NULL) ? - kqswnal_csum_kiov(msg->kqm_cksum, - payload_offset, payload_nob, - payload_niov, payload_kiov) : - kqswnal_csum_iov(msg->kqm_cksum, - payload_offset, payload_nob, - payload_niov, payload_iov); - - if (*kqswnal_tunables.kqn_inject_csum_error == 1) { - msg->kqm_cksum++; - *kqswnal_tunables.kqn_inject_csum_error = 0; - } -#endif - nob += payload_nob; - } - - ktx->ktx_port = (nob <= KQSW_SMALLMSG) ? - EP_MSG_SVC_PORTALS_SMALL : EP_MSG_SVC_PORTALS_LARGE; - - rc = kqswnal_launch (ktx); - - out: - CDEBUG_LIMIT(rc == 0? D_NET :D_NETERROR, "%s %d bytes to %s%s: rc %d\n", - routing ? (rc == 0 ? "Routed" : "Failed to route") : - (rc == 0 ? "Sent" : "Failed to send"), - nob, libcfs_nid2str(target.nid), - target_is_router ? "(router)" : "", rc); - - if (rc != 0) { - lnet_msg_t *repmsg = (lnet_msg_t *)ktx->ktx_args[2]; - int state = ktx->ktx_state; - - kqswnal_put_idle_tx (ktx); - - if (state == KTX_GETTING && repmsg != NULL) { - /* We committed to reply, but there was a problem - * launching the GET. We can't avoid delivering a - * REPLY event since we committed above, so we - * pretend the GET succeeded but the REPLY - * failed. */ - rc = 0; - lnet_finalize (kqswnal_data.kqn_ni, lntmsg, 0); - lnet_finalize (kqswnal_data.kqn_ni, repmsg, -EIO); - } - - } - - atomic_dec(&kqswnal_data.kqn_pending_txs); - return (rc == 0 ? 0 : -EIO); -} - -void -kqswnal_requeue_rx (kqswnal_rx_t *krx) -{ - LASSERT (atomic_read(&krx->krx_refcount) == 0); - LASSERT (!krx->krx_rpc_reply_needed); - - krx->krx_state = KRX_POSTED; - - if (kqswnal_data.kqn_shuttingdown) { - /* free EKC rxd on shutdown */ - ep_complete_receive(krx->krx_rxd); - } else { - /* repost receive */ - ep_requeue_receive(krx->krx_rxd, - kqswnal_rxhandler, krx, - &krx->krx_elanbuffer, 0); - } -} - -void -kqswnal_rpc_complete (EP_RXD *rxd) -{ - int status = ep_rxd_status(rxd); - kqswnal_rx_t *krx = (kqswnal_rx_t *)ep_rxd_arg(rxd); - - CDEBUG((status == EP_SUCCESS) ? D_NET : D_ERROR, - "rxd %p, krx %p, status %d\n", rxd, krx, status); - - LASSERT (krx->krx_rxd == rxd); - LASSERT (krx->krx_rpc_reply_needed); - - krx->krx_rpc_reply_needed = 0; - kqswnal_requeue_rx (krx); -} - -void -kqswnal_rx_done (kqswnal_rx_t *krx) -{ - int rc; - - LASSERT (atomic_read(&krx->krx_refcount) == 0); - - if (krx->krx_rpc_reply_needed) { - /* We've not completed the peer's RPC yet... */ - krx->krx_rpc_reply.msg.magic = LNET_PROTO_QSW_MAGIC; - krx->krx_rpc_reply.msg.version = QSWLND_PROTO_VERSION; - - LASSERT (!in_interrupt()); - - rc = ep_complete_rpc(krx->krx_rxd, - kqswnal_rpc_complete, krx, - &krx->krx_rpc_reply.ep_statusblk, - NULL, NULL, 0); - if (rc == EP_SUCCESS) - return; - - CERROR("can't complete RPC: %d\n", rc); - krx->krx_rpc_reply_needed = 0; - } - - kqswnal_requeue_rx(krx); -} - -void -kqswnal_parse (kqswnal_rx_t *krx) -{ - lnet_ni_t *ni = kqswnal_data.kqn_ni; - kqswnal_msg_t *msg = (kqswnal_msg_t *)page_address(krx->krx_kiov[0].kiov_page); - lnet_nid_t fromnid = kqswnal_rx_nid(krx); - int swab; - int n; - int i; - int nob; - int rc; - - LASSERT (atomic_read(&krx->krx_refcount) == 1); - - if (krx->krx_nob < offsetof(kqswnal_msg_t, kqm_u)) { - CERROR("Short message %d received from %s\n", - krx->krx_nob, libcfs_nid2str(fromnid)); - goto done; - } - - swab = msg->kqm_magic == __swab32(LNET_PROTO_QSW_MAGIC); - - if (swab || msg->kqm_magic == LNET_PROTO_QSW_MAGIC) { -#if KQSW_CKSUM - __u32 csum0; - __u32 csum1; - - /* csum byte array before swab */ - csum1 = msg->kqm_cksum; - msg->kqm_cksum = 0; - csum0 = kqswnal_csum_kiov(~0, 0, krx->krx_nob, - krx->krx_npages, krx->krx_kiov); - msg->kqm_cksum = csum1; -#endif - - if (swab) { - __swab16s(&msg->kqm_version); - __swab16s(&msg->kqm_type); -#if KQSW_CKSUM - __swab32s(&msg->kqm_cksum); - __swab32s(&msg->kqm_nob); -#endif - } - - if (msg->kqm_version != QSWLND_PROTO_VERSION) { - /* Future protocol version compatibility support! - * The next qswlnd-specific protocol rev will first - * send an RPC to check version. - * 1.4.6 and 1.4.7.early reply with a status - * block containing its current version. - * Later versions send a failure (-ve) status + - * magic/version */ - - if (!krx->krx_rpc_reply_needed) { - CERROR("Unexpected version %d from %s\n", - msg->kqm_version, libcfs_nid2str(fromnid)); - goto done; - } - - LASSERT (krx->krx_rpc_reply.msg.status == -EPROTO); - goto done; - } - - switch (msg->kqm_type) { - default: - CERROR("Bad request type %x from %s\n", - msg->kqm_type, libcfs_nid2str(fromnid)); - goto done; - - case QSWLND_MSG_IMMEDIATE: - if (krx->krx_rpc_reply_needed) { - /* Should have been a simple message */ - CERROR("IMMEDIATE sent as RPC from %s\n", - libcfs_nid2str(fromnid)); - goto done; - } - - nob = offsetof(kqswnal_msg_t, kqm_u.immediate.kqim_payload); - if (krx->krx_nob < nob) { - CERROR("Short IMMEDIATE %d(%d) from %s\n", - krx->krx_nob, nob, libcfs_nid2str(fromnid)); - goto done; - } - -#if KQSW_CKSUM - if (csum0 != msg->kqm_cksum) { - CERROR("Bad IMMEDIATE checksum %08x(%08x) from %s\n", - csum0, msg->kqm_cksum, libcfs_nid2str(fromnid)); - CERROR("nob %d (%d)\n", krx->krx_nob, msg->kqm_nob); - goto done; - } -#endif - rc = lnet_parse(ni, &msg->kqm_u.immediate.kqim_hdr, - fromnid, krx, 0); - if (rc < 0) - goto done; - return; - - case QSWLND_MSG_RDMA: - if (!krx->krx_rpc_reply_needed) { - /* Should have been a simple message */ - CERROR("RDMA sent as simple message from %s\n", - libcfs_nid2str(fromnid)); - goto done; - } - - nob = offsetof(kqswnal_msg_t, - kqm_u.rdma.kqrm_rmd.kqrmd_frag[0]); - if (krx->krx_nob < nob) { - CERROR("Short RDMA message %d(%d) from %s\n", - krx->krx_nob, nob, libcfs_nid2str(fromnid)); - goto done; - } - - if (swab) - __swab32s(&msg->kqm_u.rdma.kqrm_rmd.kqrmd_nfrag); - - n = msg->kqm_u.rdma.kqrm_rmd.kqrmd_nfrag; - nob = offsetof(kqswnal_msg_t, - kqm_u.rdma.kqrm_rmd.kqrmd_frag[n]); - - if (krx->krx_nob < nob) { - CERROR("short RDMA message %d(%d) from %s\n", - krx->krx_nob, nob, libcfs_nid2str(fromnid)); - goto done; - } - - if (swab) { - for (i = 0; i < n; i++) { - EP_NMD *nmd = &msg->kqm_u.rdma.kqrm_rmd.kqrmd_frag[i]; - - __swab32s(&nmd->nmd_addr); - __swab32s(&nmd->nmd_len); - __swab32s(&nmd->nmd_attr); - } - } - -#if KQSW_CKSUM - krx->krx_cksum = csum0; /* stash checksum so far */ -#endif - rc = lnet_parse(ni, &msg->kqm_u.rdma.kqrm_hdr, - fromnid, krx, 1); - if (rc < 0) - goto done; - return; - } - /* Not Reached */ - } - - if (msg->kqm_magic == LNET_PROTO_MAGIC || - msg->kqm_magic == __swab32(LNET_PROTO_MAGIC)) { - /* Future protocol version compatibility support! - * When LNET unifies protocols over all LNDs, the first thing a - * peer will send will be a version query RPC. - * 1.4.6 and 1.4.7.early reply with a status block containing - * LNET_PROTO_QSW_MAGIC.. - * Later versions send a failure (-ve) status + - * magic/version */ - - if (!krx->krx_rpc_reply_needed) { - CERROR("Unexpected magic %08x from %s\n", - msg->kqm_magic, libcfs_nid2str(fromnid)); - goto done; - } - - LASSERT (krx->krx_rpc_reply.msg.status == -EPROTO); - goto done; - } - - CERROR("Unrecognised magic %08x from %s\n", - msg->kqm_magic, libcfs_nid2str(fromnid)); - done: - kqswnal_rx_decref(krx); -} - -/* Receive Interrupt Handler: posts to schedulers */ -void -kqswnal_rxhandler(EP_RXD *rxd) -{ - unsigned long flags; - int nob = ep_rxd_len (rxd); - int status = ep_rxd_status (rxd); - kqswnal_rx_t *krx = (kqswnal_rx_t *)ep_rxd_arg (rxd); - CDEBUG(D_NET, "kqswnal_rxhandler: rxd %p, krx %p, nob %d, status %d\n", - rxd, krx, nob, status); - - LASSERT (krx != NULL); - LASSERT (krx->krx_state == KRX_POSTED); - - krx->krx_state = KRX_PARSE; - krx->krx_rxd = rxd; - krx->krx_nob = nob; - - /* RPC reply iff rpc request received without error */ - krx->krx_rpc_reply_needed = ep_rxd_isrpc(rxd) && - (status == EP_SUCCESS || - status == EP_MSG_TOO_BIG); - - /* Default to failure if an RPC reply is requested but not handled */ - krx->krx_rpc_reply.msg.status = -EPROTO; - atomic_set (&krx->krx_refcount, 1); - - if (status != EP_SUCCESS) { - /* receives complete with failure when receiver is removed */ - if (status == EP_SHUTDOWN) - LASSERT (kqswnal_data.kqn_shuttingdown); - else - CERROR("receive status failed with status %d nob %d\n", - ep_rxd_status(rxd), nob); - kqswnal_rx_decref(krx); - return; - } - - if (!in_interrupt()) { - kqswnal_parse(krx); - return; - } - - spin_lock_irqsave(&kqswnal_data.kqn_sched_lock, flags); - - cfs_list_add_tail(&krx->krx_list, &kqswnal_data.kqn_readyrxds); - wake_up(&kqswnal_data.kqn_sched_waitq); - - spin_unlock_irqrestore(&kqswnal_data.kqn_sched_lock, flags); -} - -int -kqswnal_recv (lnet_ni_t *ni, - void *private, - lnet_msg_t *lntmsg, - int delayed, - unsigned int niov, - struct iovec *iov, - lnet_kiov_t *kiov, - unsigned int offset, - unsigned int mlen, - unsigned int rlen) -{ - kqswnal_rx_t *krx = (kqswnal_rx_t *)private; - lnet_nid_t fromnid; - kqswnal_msg_t *msg; - lnet_hdr_t *hdr; - kqswnal_remotemd_t *rmd; - int msg_offset; - int rc; - - LASSERT (!in_interrupt ()); /* OK to map */ - /* Either all pages or all vaddrs */ - LASSERT (!(kiov != NULL && iov != NULL)); - - fromnid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), ep_rxd_node(krx->krx_rxd)); - msg = (kqswnal_msg_t *)page_address(krx->krx_kiov[0].kiov_page); - - if (krx->krx_rpc_reply_needed) { - /* optimized (rdma) request sent as RPC */ - - LASSERT (msg->kqm_type == QSWLND_MSG_RDMA); - hdr = &msg->kqm_u.rdma.kqrm_hdr; - rmd = &msg->kqm_u.rdma.kqrm_rmd; - - /* NB header is still in wire byte order */ - - switch (le32_to_cpu(hdr->type)) { - case LNET_MSG_PUT: - case LNET_MSG_REPLY: - /* This is an optimized PUT/REPLY */ - rc = kqswnal_rdma(krx, lntmsg, - KTX_RDMA_FETCH, rmd, - niov, iov, kiov, offset, mlen); - break; - - case LNET_MSG_GET: -#if KQSW_CKSUM - if (krx->krx_cksum != msg->kqm_cksum) { - CERROR("Bad GET checksum %08x(%08x) from %s\n", - krx->krx_cksum, msg->kqm_cksum, - libcfs_nid2str(fromnid)); - rc = -EIO; - break; - } -#endif - if (lntmsg == NULL) { - /* No buffer match: my decref will - * complete the RPC with failure */ - rc = 0; - } else { - /* Matched something! */ - rc = kqswnal_rdma(krx, lntmsg, - KTX_RDMA_STORE, rmd, - lntmsg->msg_niov, - lntmsg->msg_iov, - lntmsg->msg_kiov, - lntmsg->msg_offset, - lntmsg->msg_len); - } - break; - - default: - CERROR("Bad RPC type %d\n", - le32_to_cpu(hdr->type)); - rc = -EPROTO; - break; - } - - kqswnal_rx_decref(krx); - return rc; - } - - LASSERT (msg->kqm_type == QSWLND_MSG_IMMEDIATE); - msg_offset = offsetof(kqswnal_msg_t, kqm_u.immediate.kqim_payload); - - if (krx->krx_nob < msg_offset + rlen) { - CERROR("Bad message size from %s: have %d, need %d + %d\n", - libcfs_nid2str(fromnid), krx->krx_nob, - msg_offset, rlen); - kqswnal_rx_decref(krx); - return -EPROTO; - } - - if (kiov != NULL) - lnet_copy_kiov2kiov(niov, kiov, offset, - krx->krx_npages, krx->krx_kiov, - msg_offset, mlen); - else - lnet_copy_kiov2iov(niov, iov, offset, - krx->krx_npages, krx->krx_kiov, - msg_offset, mlen); - - lnet_finalize(ni, lntmsg, 0); - kqswnal_rx_decref(krx); - return 0; -} - -int -kqswnal_thread_start(int (*fn)(void *arg), void *arg, char *name) -{ - struct task_struct *task = cfs_thread_run(fn, arg, name); - - if (IS_ERR(task)) - return PTR_ERR(task); - - atomic_inc(&kqswnal_data.kqn_nthreads); - return 0; -} - -void -kqswnal_thread_fini (void) -{ - atomic_dec (&kqswnal_data.kqn_nthreads); -} - -int -kqswnal_scheduler (void *arg) -{ - kqswnal_rx_t *krx; - kqswnal_tx_t *ktx; - unsigned long flags; - int rc; - int counter = 0; - int did_something; - - cfs_block_allsigs (); - - spin_lock_irqsave(&kqswnal_data.kqn_sched_lock, flags); - - for (;;) - { - did_something = 0; - - if (!cfs_list_empty (&kqswnal_data.kqn_readyrxds)) - { - krx = cfs_list_entry(kqswnal_data.kqn_readyrxds.next, - kqswnal_rx_t, krx_list); - cfs_list_del (&krx->krx_list); - spin_unlock_irqrestore(&kqswnal_data.kqn_sched_lock, - flags); - - LASSERT (krx->krx_state == KRX_PARSE); - kqswnal_parse (krx); - - did_something = 1; - spin_lock_irqsave(&kqswnal_data.kqn_sched_lock, - flags); - } - - if (!cfs_list_empty (&kqswnal_data.kqn_donetxds)) - { - ktx = cfs_list_entry(kqswnal_data.kqn_donetxds.next, - kqswnal_tx_t, ktx_schedlist); - cfs_list_del_init (&ktx->ktx_schedlist); - spin_unlock_irqrestore(&kqswnal_data.kqn_sched_lock, - flags); - - kqswnal_tx_done_in_thread_context(ktx); - - did_something = 1; - spin_lock_irqsave(&kqswnal_data.kqn_sched_lock, - flags); - } - - if (!cfs_list_empty (&kqswnal_data.kqn_delayedtxds)) - { - ktx = cfs_list_entry(kqswnal_data.kqn_delayedtxds.next, - kqswnal_tx_t, ktx_schedlist); - cfs_list_del_init (&ktx->ktx_schedlist); - spin_unlock_irqrestore(&kqswnal_data.kqn_sched_lock, - flags); - - rc = kqswnal_launch (ktx); - if (rc != 0) { - CERROR("Failed delayed transmit to %s: %d\n", - libcfs_nid2str(ktx->ktx_nid), rc); - kqswnal_tx_done (ktx, rc); - } - atomic_dec (&kqswnal_data.kqn_pending_txs); - - did_something = 1; - spin_lock_irqsave(&kqswnal_data.kqn_sched_lock, - flags); - } - - /* nothing to do or hogging CPU */ - if (!did_something || counter++ == KQSW_RESCHED) { - spin_unlock_irqrestore(&kqswnal_data.kqn_sched_lock, - flags); - - counter = 0; - - if (!did_something) { - if (kqswnal_data.kqn_shuttingdown == 2) { - /* We only exit in stage 2 of shutdown - * when there's nothing left to do */ - break; - } - rc = wait_event_interruptible_exclusive ( - kqswnal_data.kqn_sched_waitq, - kqswnal_data.kqn_shuttingdown == 2 || - !cfs_list_empty(&kqswnal_data. \ - kqn_readyrxds) || - !cfs_list_empty(&kqswnal_data. \ - kqn_donetxds) || - !cfs_list_empty(&kqswnal_data. \ - kqn_delayedtxds)); - LASSERT (rc == 0); - } else if (need_resched()) - schedule (); - - spin_lock_irqsave(&kqswnal_data.kqn_sched_lock, - flags); - } - } - - kqswnal_thread_fini (); - return 0; -} diff --git a/lnet/klnds/qswlnd/qswlnd_modparams.c b/lnet/klnds/qswlnd/qswlnd_modparams.c deleted file mode 100644 index f3dcfbf..0000000 --- a/lnet/klnds/qswlnd/qswlnd_modparams.c +++ /dev/null @@ -1,223 +0,0 @@ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * - * Author: Eric Barton - * - * This file is part of Portals, http://www.lustre.org - * - * Portals is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Portals is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Portals; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - * - */ - -#include "qswlnd.h" - -static int tx_maxcontig = (1<<10); -CFS_MODULE_PARM(tx_maxcontig, "i", int, 0444, - "maximum payload to de-fragment"); - -static int ntxmsgs = 512; -CFS_MODULE_PARM(ntxmsgs, "i", int, 0444, - "# tx msg buffers"); - -static int credits = 128; -CFS_MODULE_PARM(credits, "i", int, 0444, - "# concurrent sends"); - -static int peer_credits = 8; -CFS_MODULE_PARM(peer_credits, "i", int, 0444, - "# per-peer concurrent sends"); - -static int nrxmsgs_large = 64; -CFS_MODULE_PARM(nrxmsgs_large, "i", int, 0444, - "# 'large' rx msg buffers"); - -static int ep_envelopes_large = 256; -CFS_MODULE_PARM(ep_envelopes_large, "i", int, 0444, - "# 'large' rx msg envelope buffers"); - -static int nrxmsgs_small = 256; -CFS_MODULE_PARM(nrxmsgs_small, "i", int, 0444, - "# 'small' rx msg buffers"); - -static int ep_envelopes_small = 2048; -CFS_MODULE_PARM(ep_envelopes_small, "i", int, 0444, - "# 'small' rx msg envelope buffers"); - -static int optimized_puts = (32<<10); -CFS_MODULE_PARM(optimized_puts, "i", int, 0644, - "zero-copy puts >= this size"); - -static int optimized_gets = 2048; -CFS_MODULE_PARM(optimized_gets, "i", int, 0644, - "zero-copy gets >= this size"); - -#if KQSW_CKSUM -static int inject_csum_error = 0; -CFS_MODULE_PARM(inject_csum_error, "i", int, 0644, - "test checksumming"); -#endif - -kqswnal_tunables_t kqswnal_tunables = { - .kqn_tx_maxcontig = &tx_maxcontig, - .kqn_ntxmsgs = &ntxmsgs, - .kqn_credits = &credits, - .kqn_peercredits = &peer_credits, - .kqn_nrxmsgs_large = &nrxmsgs_large, - .kqn_ep_envelopes_large = &ep_envelopes_large, - .kqn_nrxmsgs_small = &nrxmsgs_small, - .kqn_ep_envelopes_small = &ep_envelopes_small, - .kqn_optimized_puts = &optimized_puts, - .kqn_optimized_gets = &optimized_gets, -#if KQSW_CKSUM - .kqn_inject_csum_error = &inject_csum_error, -#endif -}; - -#if defined(CONFIG_SYSCTL) && !CFS_SYSFS_MODULE_PARM - -static struct ctl_table kqswnal_ctl_table[] = { - { - INIT_CTL_NAME - .procname = "tx_maxcontig", - .data = &tx_maxcontig, - .maxlen = sizeof (int), - .mode = 0444, - .proc_handler = &proc_dointvec - }, - { - INIT_CTL_NAME - .procname = "ntxmsgs", - .data = &ntxmsgs, - .maxlen = sizeof (int), - .mode = 0444, - .proc_handler = &proc_dointvec - }, - { - INIT_CTL_NAME - .procname = "credits", - .data = &credits, - .maxlen = sizeof (int), - .mode = 0444, - .proc_handler = &proc_dointvec - }, - { - INIT_CTL_NAME - .procname = "peer_credits", - .data = &peer_credits, - .maxlen = sizeof (int), - .mode = 0444, - .proc_handler = &proc_dointvec - }, - { - INIT_CTL_NAME - .procname = "nrxmsgs_large", - .data = &nrxmsgs_large, - .maxlen = sizeof (int), - .mode = 0444, - .proc_handler = &proc_dointvec - }, - { - INIT_CTL_NAME - .procname = "ep_envelopes_large", - .data = &ep_envelopes_large, - .maxlen = sizeof (int), - .mode = 0444, - .proc_handler = &proc_dointvec - }, - { - INIT_CTL_NAME - .procname = "nrxmsgs_small", - .data = &nrxmsgs_small, - .maxlen = sizeof (int), - .mode = 0444, - .proc_handler = &proc_dointvec - }, - { - INIT_CTL_NAME - .procname = "ep_envelopes_small", - .data = &ep_envelopes_small, - .maxlen = sizeof (int), - .mode = 0444, - .proc_handler = &proc_dointvec - }, - { - INIT_CTL_NAME - .procname = "optimized_puts", - .data = &optimized_puts, - .maxlen = sizeof (int), - .mode = 0644, - .proc_handler = &proc_dointvec - }, - { - INIT_CTL_NAME - .procname = "optimized_gets", - .data = &optimized_gets, - .maxlen = sizeof (int), - .mode = 0644, - .proc_handler = &proc_dointvec - }, -#if KQSW_CKSUM - { - INIT_CTL_NAME - .procname = "inject_csum_error", - .data = &inject_csum_error, - .maxlen = sizeof (int), - .mode = 0644, - .proc_handler = &proc_dointvec - }, -#endif - { 0 } -}; - -static struct ctl_table kqswnal_top_ctl_table[] = { - { - INIT_CTL_NAME - .procname = "qswnal", - .data = NULL, - .maxlen = 0, - .mode = 0555, - .child = kqswnal_ctl_table - }, - { 0 } -}; - -int -kqswnal_tunables_init () -{ - kqswnal_tunables.kqn_sysctl = - register_sysctl_table(kqswnal_top_ctl_table); - - if (kqswnal_tunables.kqn_sysctl == NULL) - CWARN("Can't setup /proc tunables\n"); - - return 0; -} - -void kqswnal_tunables_fini() -{ - if (kqswnal_tunables.kqn_sysctl != NULL) - unregister_sysctl_table(kqswnal_tunables.kqn_sysctl); -} -#else -int -kqswnal_tunables_init () -{ - return 0; -} - -void -kqswnal_tunables_fini () -{ -} -#endif diff --git a/lnet/klnds/ralnd/Makefile.in b/lnet/klnds/ralnd/Makefile.in deleted file mode 100644 index e1f5e82..0000000 --- a/lnet/klnds/ralnd/Makefile.in +++ /dev/null @@ -1,6 +0,0 @@ -MODULES := kralnd -kralnd-objs := ralnd.o ralnd_cb.o ralnd_modparams.o - -EXTRA_POST_CFLAGS := @RACPPFLAGS@ - -@INCLUDE_RULES@ diff --git a/lnet/klnds/ralnd/autoMakefile.am b/lnet/klnds/ralnd/autoMakefile.am deleted file mode 100644 index 0d79f3a..0000000 --- a/lnet/klnds/ralnd/autoMakefile.am +++ /dev/null @@ -1,44 +0,0 @@ -# -# GPL HEADER START -# -# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License version 2 only, -# as published by the Free Software Foundation. -# -# This program is distributed in the hope that it will be useful, but -# WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# General Public License version 2 for more details (a copy is included -# in the LICENSE file that accompanied this code). -# -# You should have received a copy of the GNU General Public License -# version 2 along with this program; If not, see -# http://www.sun.com/software/products/lustre/docs/GPLv2.pdf -# -# Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, -# CA 95054 USA or visit www.sun.com if you need additional information or -# have any questions. -# -# GPL HEADER END -# - -# -# Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. -# Use is subject to license terms. -# - -# -# This file is part of Lustre, http://www.lustre.org/ -# Lustre is a trademark of Sun Microsystems, Inc. -# - -if MODULES -if BUILD_RALND -modulenet_DATA = kralnd$(KMODEXT) -endif -endif - -MOSTLYCLEANFILES = @MOSTLYCLEANFILES@ -EXTRA_DIST = $(kralnd-objs:%.o=%.c) ralnd.h diff --git a/lnet/klnds/ralnd/ralnd.c b/lnet/klnds/ralnd/ralnd.c deleted file mode 100644 index a4b06ca..0000000 --- a/lnet/klnds/ralnd/ralnd.c +++ /dev/null @@ -1,1744 +0,0 @@ -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf - * - * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, - * CA 95054 USA or visit www.sun.com if you need additional information or - * have any questions. - * - * GPL HEADER END - */ -/* - * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2012, 2014, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lnet/klnds/ralnd/ralnd.c - * - * Author: Eric Barton - */ -#include "ralnd.h" - -static int kranal_devids[RANAL_MAXDEVS] = {RAPK_MAIN_DEVICE_ID, - RAPK_EXPANSION_DEVICE_ID}; - -lnd_t the_kralnd = { - .lnd_type = RALND, - .lnd_startup = kranal_startup, - .lnd_shutdown = kranal_shutdown, - .lnd_ctl = kranal_ctl, - .lnd_send = kranal_send, - .lnd_recv = kranal_recv, - .lnd_eager_recv = kranal_eager_recv, - .lnd_accept = kranal_accept, -}; - -kra_data_t kranal_data; - -void -kranal_pack_connreq(kra_connreq_t *connreq, kra_conn_t *conn, lnet_nid_t dstnid) -{ - RAP_RETURN rrc; - - memset(connreq, 0, sizeof(*connreq)); - - connreq->racr_magic = RANAL_MSG_MAGIC; - connreq->racr_version = RANAL_MSG_VERSION; - - if (conn == NULL) /* prepping a "stub" reply */ - return; - - connreq->racr_devid = conn->rac_device->rad_id; - connreq->racr_srcnid = kranal_data.kra_ni->ni_nid; - connreq->racr_dstnid = dstnid; - connreq->racr_peerstamp = kranal_data.kra_peerstamp; - connreq->racr_connstamp = conn->rac_my_connstamp; - connreq->racr_timeout = conn->rac_timeout; - - rrc = RapkGetRiParams(conn->rac_rihandle, &connreq->racr_riparams); - LASSERT(rrc == RAP_SUCCESS); -} - -int -kranal_recv_connreq(struct socket *sock, kra_connreq_t *connreq, int active) -{ - int timeout = active ? *kranal_tunables.kra_timeout : - lnet_acceptor_timeout(); - int swab; - int rc; - - /* return 0 on success, -ve on error, +ve to tell the peer I'm "old" */ - - rc = libcfs_sock_read(sock, &connreq->racr_magic, - sizeof(connreq->racr_magic), timeout); - if (rc != 0) { - CERROR("Read(magic) failed(1): %d\n", rc); - return -EIO; - } - - if (connreq->racr_magic != RANAL_MSG_MAGIC && - connreq->racr_magic != __swab32(RANAL_MSG_MAGIC)) { - /* Unexpected magic! */ - if (!active && - (connreq->racr_magic == LNET_PROTO_MAGIC || - connreq->racr_magic == __swab32(LNET_PROTO_MAGIC))) { - /* future protocol version compatibility! - * When LNET unifies protocols over all LNDs, the first - * thing sent will be a version query. +ve rc means I - * reply with my current magic/version */ - return EPROTO; - } - - CERROR("Unexpected magic %08x (%s)\n", - connreq->racr_magic, active ? "active" : "passive"); - return -EPROTO; - } - - swab = (connreq->racr_magic == __swab32(RANAL_MSG_MAGIC)); - - rc = libcfs_sock_read(sock, &connreq->racr_version, - sizeof(connreq->racr_version), timeout); - if (rc != 0) { - CERROR("Read(version) failed: %d\n", rc); - return -EIO; - } - - if (swab) - __swab16s(&connreq->racr_version); - - if (connreq->racr_version != RANAL_MSG_VERSION) { - if (active) { - CERROR("Unexpected version %d\n", connreq->racr_version); - return -EPROTO; - } - /* If this is a future version of the ralnd protocol, and I'm - * passive (accepted the connection), tell my peer I'm "old" - * (+ve rc) */ - return EPROTO; - } - - rc = libcfs_sock_read(sock, &connreq->racr_devid, - sizeof(connreq->racr_version) - - offsetof(kra_connreq_t, racr_devid), - timeout); - if (rc != 0) { - CERROR("Read(body) failed: %d\n", rc); - return -EIO; - } - - if (swab) { - __swab32s(&connreq->racr_magic); - __swab16s(&connreq->racr_version); - __swab16s(&connreq->racr_devid); - __swab64s(&connreq->racr_srcnid); - __swab64s(&connreq->racr_dstnid); - __swab64s(&connreq->racr_peerstamp); - __swab64s(&connreq->racr_connstamp); - __swab32s(&connreq->racr_timeout); - - __swab32s(&connreq->racr_riparams.HostId); - __swab32s(&connreq->racr_riparams.FmaDomainHndl); - __swab32s(&connreq->racr_riparams.PTag); - __swab32s(&connreq->racr_riparams.CompletionCookie); - } - - if (connreq->racr_srcnid == LNET_NID_ANY || - connreq->racr_dstnid == LNET_NID_ANY) { - CERROR("Received LNET_NID_ANY\n"); - return -EPROTO; - } - - if (connreq->racr_timeout < RANAL_MIN_TIMEOUT) { - CERROR("Received timeout %d < MIN %d\n", - connreq->racr_timeout, RANAL_MIN_TIMEOUT); - return -EPROTO; - } - - return 0; -} - -int -kranal_close_stale_conns_locked (kra_peer_t *peer, kra_conn_t *newconn) -{ - kra_conn_t *conn; - cfs_list_t *ctmp; - cfs_list_t *cnxt; - int loopback; - int count = 0; - - loopback = peer->rap_nid == kranal_data.kra_ni->ni_nid; - - cfs_list_for_each_safe (ctmp, cnxt, &peer->rap_conns) { - conn = cfs_list_entry(ctmp, kra_conn_t, rac_list); - - if (conn == newconn) - continue; - - if (conn->rac_peerstamp != newconn->rac_peerstamp) { - CDEBUG(D_NET, "Closing stale conn nid: %s " - " peerstamp:"LPX64"("LPX64")\n", - libcfs_nid2str(peer->rap_nid), - conn->rac_peerstamp, newconn->rac_peerstamp); - LASSERT (conn->rac_peerstamp < newconn->rac_peerstamp); - count++; - kranal_close_conn_locked(conn, -ESTALE); - continue; - } - - if (conn->rac_device != newconn->rac_device) - continue; - - if (loopback && - newconn->rac_my_connstamp == conn->rac_peer_connstamp && - newconn->rac_peer_connstamp == conn->rac_my_connstamp) - continue; - - LASSERT (conn->rac_peer_connstamp < newconn->rac_peer_connstamp); - - CDEBUG(D_NET, "Closing stale conn nid: %s" - " connstamp:"LPX64"("LPX64")\n", - libcfs_nid2str(peer->rap_nid), - conn->rac_peer_connstamp, newconn->rac_peer_connstamp); - - count++; - kranal_close_conn_locked(conn, -ESTALE); - } - - return count; -} - -int -kranal_conn_isdup_locked(kra_peer_t *peer, kra_conn_t *newconn) -{ - kra_conn_t *conn; - cfs_list_t *tmp; - int loopback; - - loopback = peer->rap_nid == kranal_data.kra_ni->ni_nid; - - cfs_list_for_each(tmp, &peer->rap_conns) { - conn = cfs_list_entry(tmp, kra_conn_t, rac_list); - - /* 'newconn' is from an earlier version of 'peer'!!! */ - if (newconn->rac_peerstamp < conn->rac_peerstamp) - return 1; - - /* 'conn' is from an earlier version of 'peer': it will be - * removed when we cull stale conns later on... */ - if (newconn->rac_peerstamp > conn->rac_peerstamp) - continue; - - /* Different devices are OK */ - if (conn->rac_device != newconn->rac_device) - continue; - - /* It's me connecting to myself */ - if (loopback && - newconn->rac_my_connstamp == conn->rac_peer_connstamp && - newconn->rac_peer_connstamp == conn->rac_my_connstamp) - continue; - - /* 'newconn' is an earlier connection from 'peer'!!! */ - if (newconn->rac_peer_connstamp < conn->rac_peer_connstamp) - return 2; - - /* 'conn' is an earlier connection from 'peer': it will be - * removed when we cull stale conns later on... */ - if (newconn->rac_peer_connstamp > conn->rac_peer_connstamp) - continue; - - /* 'newconn' has the SAME connection stamp; 'peer' isn't - * playing the game... */ - return 3; - } - - return 0; -} - -void -kranal_set_conn_uniqueness (kra_conn_t *conn) -{ - unsigned long flags; - - write_lock_irqsave(&kranal_data.kra_global_lock, flags); - - conn->rac_my_connstamp = kranal_data.kra_connstamp++; - - do { /* allocate a unique cqid */ - conn->rac_cqid = kranal_data.kra_next_cqid++; - } while (kranal_cqid2conn_locked(conn->rac_cqid) != NULL); - - write_unlock_irqrestore(&kranal_data.kra_global_lock, flags); -} - -int -kranal_create_conn(kra_conn_t **connp, kra_device_t *dev) -{ - kra_conn_t *conn; - RAP_RETURN rrc; - - LASSERT (!in_interrupt()); - LIBCFS_ALLOC(conn, sizeof(*conn)); - - if (conn == NULL) - return -ENOMEM; - - memset(conn, 0, sizeof(*conn)); - atomic_set(&conn->rac_refcount, 1); - CFS_INIT_LIST_HEAD(&conn->rac_list); - CFS_INIT_LIST_HEAD(&conn->rac_hashlist); - CFS_INIT_LIST_HEAD(&conn->rac_schedlist); - CFS_INIT_LIST_HEAD(&conn->rac_fmaq); - CFS_INIT_LIST_HEAD(&conn->rac_rdmaq); - CFS_INIT_LIST_HEAD(&conn->rac_replyq); - spin_lock_init(&conn->rac_lock); - - kranal_set_conn_uniqueness(conn); - - conn->rac_device = dev; - conn->rac_timeout = MAX(*kranal_tunables.kra_timeout, RANAL_MIN_TIMEOUT); - kranal_update_reaper_timeout(conn->rac_timeout); - - rrc = RapkCreateRi(dev->rad_handle, conn->rac_cqid, - &conn->rac_rihandle); - if (rrc != RAP_SUCCESS) { - CERROR("RapkCreateRi failed: %d\n", rrc); - LIBCFS_FREE(conn, sizeof(*conn)); - return -ENETDOWN; - } - - atomic_inc(&kranal_data.kra_nconns); - *connp = conn; - return 0; -} - -void -kranal_destroy_conn(kra_conn_t *conn) -{ - RAP_RETURN rrc; - - LASSERT (!in_interrupt()); - LASSERT (!conn->rac_scheduled); - LASSERT (cfs_list_empty(&conn->rac_list)); - LASSERT (cfs_list_empty(&conn->rac_hashlist)); - LASSERT (cfs_list_empty(&conn->rac_schedlist)); - LASSERT (atomic_read(&conn->rac_refcount) == 0); - LASSERT (cfs_list_empty(&conn->rac_fmaq)); - LASSERT (cfs_list_empty(&conn->rac_rdmaq)); - LASSERT (cfs_list_empty(&conn->rac_replyq)); - - rrc = RapkDestroyRi(conn->rac_device->rad_handle, - conn->rac_rihandle); - LASSERT (rrc == RAP_SUCCESS); - - if (conn->rac_peer != NULL) - kranal_peer_decref(conn->rac_peer); - - LIBCFS_FREE(conn, sizeof(*conn)); - atomic_dec(&kranal_data.kra_nconns); -} - -void -kranal_terminate_conn_locked (kra_conn_t *conn) -{ - LASSERT (!in_interrupt()); - LASSERT (conn->rac_state == RANAL_CONN_CLOSING); - LASSERT (!cfs_list_empty(&conn->rac_hashlist)); - LASSERT (cfs_list_empty(&conn->rac_list)); - - /* Remove from conn hash table: no new callbacks */ - cfs_list_del_init(&conn->rac_hashlist); - kranal_conn_decref(conn); - - conn->rac_state = RANAL_CONN_CLOSED; - - /* schedule to clear out all uncompleted comms in context of dev's - * scheduler */ - kranal_schedule_conn(conn); -} - -void -kranal_close_conn_locked (kra_conn_t *conn, int error) -{ - kra_peer_t *peer = conn->rac_peer; - - CDEBUG_LIMIT(error == 0 ? D_NET : D_NETERROR, - "closing conn to %s: error %d\n", - libcfs_nid2str(peer->rap_nid), error); - - LASSERT (!in_interrupt()); - LASSERT (conn->rac_state == RANAL_CONN_ESTABLISHED); - LASSERT (!cfs_list_empty(&conn->rac_hashlist)); - LASSERT (!cfs_list_empty(&conn->rac_list)); - - cfs_list_del_init(&conn->rac_list); - - if (cfs_list_empty(&peer->rap_conns) && - peer->rap_persistence == 0) { - /* Non-persistent peer with no more conns... */ - kranal_unlink_peer_locked(peer); - } - - /* Reset RX timeout to ensure we wait for an incoming CLOSE for the - * full timeout. If we get a CLOSE we know the peer has stopped all - * RDMA. Otherwise if we wait for the full timeout we can also be sure - * all RDMA has stopped. */ - conn->rac_last_rx = jiffies; - smp_mb(); - - conn->rac_state = RANAL_CONN_CLOSING; - kranal_schedule_conn(conn); /* schedule sending CLOSE */ - - kranal_conn_decref(conn); /* lose peer's ref */ -} - -void -kranal_close_conn (kra_conn_t *conn, int error) -{ - unsigned long flags; - - - write_lock_irqsave(&kranal_data.kra_global_lock, flags); - - if (conn->rac_state == RANAL_CONN_ESTABLISHED) - kranal_close_conn_locked(conn, error); - - write_unlock_irqrestore(&kranal_data.kra_global_lock, flags); -} - -int -kranal_set_conn_params(kra_conn_t *conn, kra_connreq_t *connreq, - __u32 peer_ip, int peer_port) -{ - kra_device_t *dev = conn->rac_device; - unsigned long flags; - RAP_RETURN rrc; - - /* CAVEAT EMPTOR: we're really overloading rac_last_tx + rac_keepalive - * to do RapkCompleteSync() timekeeping (see kibnal_scheduler). */ - conn->rac_last_tx = jiffies; - conn->rac_keepalive = 0; - - rrc = RapkSetRiParams(conn->rac_rihandle, &connreq->racr_riparams); - if (rrc != RAP_SUCCESS) { - CERROR("Error setting riparams from %u.%u.%u.%u/%d: %d\n", - HIPQUAD(peer_ip), peer_port, rrc); - return -ECONNABORTED; - } - - /* Schedule conn on rad_new_conns */ - kranal_conn_addref(conn); - spin_lock_irqsave(&dev->rad_lock, flags); - cfs_list_add_tail(&conn->rac_schedlist, &dev->rad_new_conns); - wake_up(&dev->rad_waitq); - spin_unlock_irqrestore(&dev->rad_lock, flags); - - rrc = RapkWaitToConnect(conn->rac_rihandle); - if (rrc != RAP_SUCCESS) { - CERROR("Error waiting to connect to %u.%u.%u.%u/%d: %d\n", - HIPQUAD(peer_ip), peer_port, rrc); - return -ECONNABORTED; - } - - /* Scheduler doesn't touch conn apart from to deschedule and decref it - * after RapkCompleteSync() return success, so conn is all mine */ - - conn->rac_peerstamp = connreq->racr_peerstamp; - conn->rac_peer_connstamp = connreq->racr_connstamp; - conn->rac_keepalive = RANAL_TIMEOUT2KEEPALIVE(connreq->racr_timeout); - kranal_update_reaper_timeout(conn->rac_keepalive); - return 0; -} - -int -kranal_passive_conn_handshake (struct socket *sock, lnet_nid_t *src_nidp, - lnet_nid_t *dst_nidp, kra_conn_t **connp) -{ - __u32 peer_ip; - unsigned int peer_port; - kra_connreq_t rx_connreq; - kra_connreq_t tx_connreq; - kra_conn_t *conn; - kra_device_t *dev; - int rc; - int i; - - rc = libcfs_sock_getaddr(sock, 1, &peer_ip, &peer_port); - if (rc != 0) { - CERROR("Can't get peer's IP: %d\n", rc); - return rc; - } - - rc = kranal_recv_connreq(sock, &rx_connreq, 0); - - if (rc < 0) { - CERROR("Can't rx connreq from %u.%u.%u.%u/%d: %d\n", - HIPQUAD(peer_ip), peer_port, rc); - return rc; - } - - if (rc > 0) { - /* Request from "new" peer: send reply with my MAGIC/VERSION to - * tell her I'm old... */ - kranal_pack_connreq(&tx_connreq, NULL, LNET_NID_ANY); - - rc = libcfs_sock_write(sock, &tx_connreq, sizeof(tx_connreq), - lnet_acceptor_timeout()); - if (rc != 0) - CERROR("Can't tx stub connreq to %u.%u.%u.%u/%d: %d\n", - HIPQUAD(peer_ip), peer_port, rc); - - return -EPROTO; - } - - for (i = 0;;i++) { - if (i == kranal_data.kra_ndevs) { - CERROR("Can't match dev %d from %u.%u.%u.%u/%d\n", - rx_connreq.racr_devid, HIPQUAD(peer_ip), peer_port); - return -ENODEV; - } - dev = &kranal_data.kra_devices[i]; - if (dev->rad_id == rx_connreq.racr_devid) - break; - } - - rc = kranal_create_conn(&conn, dev); - if (rc != 0) - return rc; - - kranal_pack_connreq(&tx_connreq, conn, rx_connreq.racr_srcnid); - - rc = libcfs_sock_write(sock, &tx_connreq, sizeof(tx_connreq), - lnet_acceptor_timeout()); - if (rc != 0) { - CERROR("Can't tx connreq to %u.%u.%u.%u/%d: %d\n", - HIPQUAD(peer_ip), peer_port, rc); - kranal_conn_decref(conn); - return rc; - } - - rc = kranal_set_conn_params(conn, &rx_connreq, peer_ip, peer_port); - if (rc != 0) { - kranal_conn_decref(conn); - return rc; - } - - *connp = conn; - *src_nidp = rx_connreq.racr_srcnid; - *dst_nidp = rx_connreq.racr_dstnid; - return 0; -} - -int -kranal_active_conn_handshake(kra_peer_t *peer, - lnet_nid_t *dst_nidp, kra_conn_t **connp) -{ - kra_connreq_t connreq; - kra_conn_t *conn; - kra_device_t *dev; - struct socket *sock; - int rc; - unsigned int idx; - - /* spread connections over all devices using both peer NIDs to ensure - * all nids use all devices */ - idx = peer->rap_nid + kranal_data.kra_ni->ni_nid; - dev = &kranal_data.kra_devices[idx % kranal_data.kra_ndevs]; - - rc = kranal_create_conn(&conn, dev); - if (rc != 0) - return rc; - - kranal_pack_connreq(&connreq, conn, peer->rap_nid); - - if (the_lnet.ln_testprotocompat != 0) { - /* single-shot proto test */ - LNET_LOCK(); - if ((the_lnet.ln_testprotocompat & 1) != 0) { - connreq.racr_version++; - the_lnet.ln_testprotocompat &= ~1; - } - if ((the_lnet.ln_testprotocompat & 2) != 0) { - connreq.racr_magic = LNET_PROTO_MAGIC; - the_lnet.ln_testprotocompat &= ~2; - } - LNET_UNLOCK(); - } - - rc = lnet_connect(&sock, peer->rap_nid, - 0, peer->rap_ip, peer->rap_port); - if (rc != 0) - goto failed_0; - - /* CAVEAT EMPTOR: the passive side receives with a SHORT rx timeout - * immediately after accepting a connection, so we connect and then - * send immediately. */ - - rc = libcfs_sock_write(sock, &connreq, sizeof(connreq), - lnet_acceptor_timeout()); - if (rc != 0) { - CERROR("Can't tx connreq to %u.%u.%u.%u/%d: %d\n", - HIPQUAD(peer->rap_ip), peer->rap_port, rc); - goto failed_2; - } - - rc = kranal_recv_connreq(sock, &connreq, 1); - if (rc != 0) { - CERROR("Can't rx connreq from %u.%u.%u.%u/%d: %d\n", - HIPQUAD(peer->rap_ip), peer->rap_port, rc); - goto failed_2; - } - - libcfs_sock_release(sock); - rc = -EPROTO; - - if (connreq.racr_srcnid != peer->rap_nid) { - CERROR("Unexpected srcnid from %u.%u.%u.%u/%d: " - "received %s expected %s\n", - HIPQUAD(peer->rap_ip), peer->rap_port, - libcfs_nid2str(connreq.racr_srcnid), - libcfs_nid2str(peer->rap_nid)); - goto failed_1; - } - - if (connreq.racr_devid != dev->rad_id) { - CERROR("Unexpected device id from %u.%u.%u.%u/%d: " - "received %d expected %d\n", - HIPQUAD(peer->rap_ip), peer->rap_port, - connreq.racr_devid, dev->rad_id); - goto failed_1; - } - - rc = kranal_set_conn_params(conn, &connreq, - peer->rap_ip, peer->rap_port); - if (rc != 0) - goto failed_1; - - *connp = conn; - *dst_nidp = connreq.racr_dstnid; - return 0; - - failed_2: - libcfs_sock_release(sock); - failed_1: - lnet_connect_console_error(rc, peer->rap_nid, - peer->rap_ip, peer->rap_port); - failed_0: - kranal_conn_decref(conn); - return rc; -} - -int -kranal_conn_handshake (struct socket *sock, kra_peer_t *peer) -{ - kra_peer_t *peer2; - kra_tx_t *tx; - lnet_nid_t peer_nid; - lnet_nid_t dst_nid; - unsigned long flags; - kra_conn_t *conn; - int rc; - int nstale; - int new_peer = 0; - - if (sock == NULL) { - /* active: connd wants to connect to 'peer' */ - LASSERT (peer != NULL); - LASSERT (peer->rap_connecting); - - rc = kranal_active_conn_handshake(peer, &dst_nid, &conn); - if (rc != 0) - return rc; - - write_lock_irqsave(&kranal_data.kra_global_lock, flags); - - if (!kranal_peer_active(peer)) { - /* raced with peer getting unlinked */ - write_unlock_irqrestore(&kranal_data. \ - kra_global_lock, - flags); - kranal_conn_decref(conn); - return -ESTALE; - } - - peer_nid = peer->rap_nid; - } else { - /* passive: listener accepted 'sock' */ - LASSERT (peer == NULL); - - rc = kranal_passive_conn_handshake(sock, &peer_nid, - &dst_nid, &conn); - if (rc != 0) - return rc; - - /* assume this is a new peer */ - rc = kranal_create_peer(&peer, peer_nid); - if (rc != 0) { - CERROR("Can't create conn for %s\n", - libcfs_nid2str(peer_nid)); - kranal_conn_decref(conn); - return -ENOMEM; - } - - write_lock_irqsave(&kranal_data.kra_global_lock, flags); - - peer2 = kranal_find_peer_locked(peer_nid); - if (peer2 == NULL) { - new_peer = 1; - } else { - /* peer_nid already in the peer table */ - kranal_peer_decref(peer); - peer = peer2; - } - } - - LASSERT ((!new_peer) != (!kranal_peer_active(peer))); - - /* Refuse connection if peer thinks we are a different NID. We check - * this while holding the global lock, to synch with connection - * destruction on NID change. */ - if (kranal_data.kra_ni->ni_nid != dst_nid) { - write_unlock_irqrestore(&kranal_data.kra_global_lock, - flags); - - CERROR("Stale/bad connection with %s: dst_nid %s, expected %s\n", - libcfs_nid2str(peer_nid), libcfs_nid2str(dst_nid), - libcfs_nid2str(kranal_data.kra_ni->ni_nid)); - rc = -ESTALE; - goto failed; - } - - /* Refuse to duplicate an existing connection (both sides might try to - * connect at once). NB we return success! We _are_ connected so we - * _don't_ have any blocked txs to complete with failure. */ - rc = kranal_conn_isdup_locked(peer, conn); - if (rc != 0) { - LASSERT (!cfs_list_empty(&peer->rap_conns)); - LASSERT (cfs_list_empty(&peer->rap_tx_queue)); - write_unlock_irqrestore(&kranal_data.kra_global_lock, - flags); - CWARN("Not creating duplicate connection to %s: %d\n", - libcfs_nid2str(peer_nid), rc); - rc = 0; - goto failed; - } - - if (new_peer) { - /* peer table takes my ref on the new peer */ - cfs_list_add_tail(&peer->rap_list, - kranal_nid2peerlist(peer_nid)); - } - - /* initialise timestamps before reaper looks at them */ - conn->rac_last_tx = conn->rac_last_rx = jiffies; - - kranal_peer_addref(peer); /* +1 ref for conn */ - conn->rac_peer = peer; - cfs_list_add_tail(&conn->rac_list, &peer->rap_conns); - - kranal_conn_addref(conn); /* +1 ref for conn table */ - cfs_list_add_tail(&conn->rac_hashlist, - kranal_cqid2connlist(conn->rac_cqid)); - - /* Schedule all packets blocking for a connection */ - while (!cfs_list_empty(&peer->rap_tx_queue)) { - tx = cfs_list_entry(peer->rap_tx_queue.next, - kra_tx_t, tx_list); - - cfs_list_del(&tx->tx_list); - kranal_post_fma(conn, tx); - } - - nstale = kranal_close_stale_conns_locked(peer, conn); - - write_unlock_irqrestore(&kranal_data.kra_global_lock, flags); - - /* CAVEAT EMPTOR: passive peer can disappear NOW */ - - if (nstale != 0) - CWARN("Closed %d stale conns to %s\n", nstale, - libcfs_nid2str(peer_nid)); - - CWARN("New connection to %s on devid[%d] = %d\n", - libcfs_nid2str(peer_nid), - conn->rac_device->rad_idx, conn->rac_device->rad_id); - - /* Ensure conn gets checked. Transmits may have been queued and an - * FMA event may have happened before it got in the cq hash table */ - kranal_schedule_conn(conn); - return 0; - - failed: - if (new_peer) - kranal_peer_decref(peer); - kranal_conn_decref(conn); - return rc; -} - -void -kranal_connect (kra_peer_t *peer) -{ - kra_tx_t *tx; - unsigned long flags; - cfs_list_t zombies; - int rc; - - LASSERT (peer->rap_connecting); - - CDEBUG(D_NET, "About to handshake %s\n", - libcfs_nid2str(peer->rap_nid)); - - rc = kranal_conn_handshake(NULL, peer); - - CDEBUG(D_NET, "Done handshake %s:%d \n", - libcfs_nid2str(peer->rap_nid), rc); - - write_lock_irqsave(&kranal_data.kra_global_lock, flags); - - LASSERT (peer->rap_connecting); - peer->rap_connecting = 0; - - if (rc == 0) { - /* kranal_conn_handshake() queues blocked txs immediately on - * success to avoid messages jumping the queue */ - LASSERT (cfs_list_empty(&peer->rap_tx_queue)); - - peer->rap_reconnect_interval = 0; /* OK to reconnect at any time */ - - write_unlock_irqrestore(&kranal_data.kra_global_lock, - flags); - return; - } - - peer->rap_reconnect_interval *= 2; - peer->rap_reconnect_interval = - MAX(peer->rap_reconnect_interval, - *kranal_tunables.kra_min_reconnect_interval); - peer->rap_reconnect_interval = - MIN(peer->rap_reconnect_interval, - *kranal_tunables.kra_max_reconnect_interval); - - peer->rap_reconnect_time = jiffies + - msecs_to_jiffies(peer->rap_reconnect_interval * MSEC_PER_SEC); - - /* Grab all blocked packets while we have the global lock */ - cfs_list_add(&zombies, &peer->rap_tx_queue); - cfs_list_del_init(&peer->rap_tx_queue); - - write_unlock_irqrestore(&kranal_data.kra_global_lock, flags); - - if (cfs_list_empty(&zombies)) - return; - - CNETERR("Dropping packets for %s: connection failed\n", - libcfs_nid2str(peer->rap_nid)); - - do { - tx = cfs_list_entry(zombies.next, kra_tx_t, tx_list); - - cfs_list_del(&tx->tx_list); - kranal_tx_done(tx, -EHOSTUNREACH); - - } while (!cfs_list_empty(&zombies)); -} - -void -kranal_free_acceptsock (kra_acceptsock_t *ras) -{ - libcfs_sock_release(ras->ras_sock); - LIBCFS_FREE(ras, sizeof(*ras)); -} - -int -kranal_accept (lnet_ni_t *ni, struct socket *sock) -{ - kra_acceptsock_t *ras; - int rc; - __u32 peer_ip; - int peer_port; - unsigned long flags; - - rc = libcfs_sock_getaddr(sock, 1, &peer_ip, &peer_port); - LASSERT (rc == 0); /* we succeeded before */ - - LIBCFS_ALLOC(ras, sizeof(*ras)); - if (ras == NULL) { - CERROR("ENOMEM allocating connection request from " - "%u.%u.%u.%u\n", HIPQUAD(peer_ip)); - return -ENOMEM; - } - - ras->ras_sock = sock; - - spin_lock_irqsave(&kranal_data.kra_connd_lock, flags); - - cfs_list_add_tail(&ras->ras_list, &kranal_data.kra_connd_acceptq); - wake_up(&kranal_data.kra_connd_waitq); - - spin_unlock_irqrestore(&kranal_data.kra_connd_lock, flags); - return 0; -} - -int -kranal_create_peer (kra_peer_t **peerp, lnet_nid_t nid) -{ - kra_peer_t *peer; - unsigned long flags; - - LASSERT (nid != LNET_NID_ANY); - - LIBCFS_ALLOC(peer, sizeof(*peer)); - if (peer == NULL) - return -ENOMEM; - - memset(peer, 0, sizeof(*peer)); /* zero flags etc */ - - peer->rap_nid = nid; - atomic_set(&peer->rap_refcount, 1); /* 1 ref for caller */ - - CFS_INIT_LIST_HEAD(&peer->rap_list); - CFS_INIT_LIST_HEAD(&peer->rap_connd_list); - CFS_INIT_LIST_HEAD(&peer->rap_conns); - CFS_INIT_LIST_HEAD(&peer->rap_tx_queue); - - peer->rap_reconnect_interval = 0; /* OK to connect at any time */ - - write_lock_irqsave(&kranal_data.kra_global_lock, flags); - - if (kranal_data.kra_nonewpeers) { - /* shutdown has started already */ - write_unlock_irqrestore(&kranal_data.kra_global_lock, - flags); - - LIBCFS_FREE(peer, sizeof(*peer)); - CERROR("Can't create peer: network shutdown\n"); - return -ESHUTDOWN; - } - - atomic_inc(&kranal_data.kra_npeers); - - write_unlock_irqrestore(&kranal_data.kra_global_lock, flags); - - *peerp = peer; - return 0; -} - -void -kranal_destroy_peer (kra_peer_t *peer) -{ - CDEBUG(D_NET, "peer %s %p deleted\n", - libcfs_nid2str(peer->rap_nid), peer); - - LASSERT (atomic_read(&peer->rap_refcount) == 0); - LASSERT (peer->rap_persistence == 0); - LASSERT (!kranal_peer_active(peer)); - LASSERT (!peer->rap_connecting); - LASSERT (cfs_list_empty(&peer->rap_conns)); - LASSERT (cfs_list_empty(&peer->rap_tx_queue)); - LASSERT (cfs_list_empty(&peer->rap_connd_list)); - - LIBCFS_FREE(peer, sizeof(*peer)); - - /* NB a peer's connections keep a reference on their peer until - * they are destroyed, so we can be assured that _all_ state to do - * with this peer has been cleaned up when its refcount drops to - * zero. */ - atomic_dec(&kranal_data.kra_npeers); -} - -kra_peer_t * -kranal_find_peer_locked (lnet_nid_t nid) -{ - cfs_list_t *peer_list = kranal_nid2peerlist(nid); - cfs_list_t *tmp; - kra_peer_t *peer; - - cfs_list_for_each (tmp, peer_list) { - - peer = cfs_list_entry(tmp, kra_peer_t, rap_list); - - LASSERT (peer->rap_persistence > 0 || /* persistent peer */ - !cfs_list_empty(&peer->rap_conns)); /* active conn */ - - if (peer->rap_nid != nid) - continue; - - CDEBUG(D_NET, "got peer [%p] -> %s (%d)\n", - peer, libcfs_nid2str(nid), - atomic_read(&peer->rap_refcount)); - return peer; - } - return NULL; -} - -kra_peer_t * -kranal_find_peer (lnet_nid_t nid) -{ - kra_peer_t *peer; - - read_lock(&kranal_data.kra_global_lock); - peer = kranal_find_peer_locked(nid); - if (peer != NULL) /* +1 ref for caller? */ - kranal_peer_addref(peer); - read_unlock(&kranal_data.kra_global_lock); - - return peer; -} - -void -kranal_unlink_peer_locked (kra_peer_t *peer) -{ - LASSERT (peer->rap_persistence == 0); - LASSERT (cfs_list_empty(&peer->rap_conns)); - - LASSERT (kranal_peer_active(peer)); - cfs_list_del_init(&peer->rap_list); - - /* lose peerlist's ref */ - kranal_peer_decref(peer); -} - -int -kranal_get_peer_info (int index, lnet_nid_t *nidp, __u32 *ipp, int *portp, - int *persistencep) -{ - kra_peer_t *peer; - cfs_list_t *ptmp; - int i; - - read_lock(&kranal_data.kra_global_lock); - - for (i = 0; i < kranal_data.kra_peer_hash_size; i++) { - - cfs_list_for_each(ptmp, &kranal_data.kra_peers[i]) { - - peer = cfs_list_entry(ptmp, kra_peer_t, rap_list); - LASSERT (peer->rap_persistence > 0 || - !cfs_list_empty(&peer->rap_conns)); - - if (index-- > 0) - continue; - - *nidp = peer->rap_nid; - *ipp = peer->rap_ip; - *portp = peer->rap_port; - *persistencep = peer->rap_persistence; - - read_unlock(&kranal_data.kra_global_lock); - return 0; - } - } - - read_unlock(&kranal_data.kra_global_lock); - return -ENOENT; -} - -int -kranal_add_persistent_peer (lnet_nid_t nid, __u32 ip, int port) -{ - unsigned long flags; - kra_peer_t *peer; - kra_peer_t *peer2; - int rc; - - if (nid == LNET_NID_ANY) - return -EINVAL; - - rc = kranal_create_peer(&peer, nid); - if (rc != 0) - return rc; - - write_lock_irqsave(&kranal_data.kra_global_lock, flags); - - peer2 = kranal_find_peer_locked(nid); - if (peer2 != NULL) { - kranal_peer_decref(peer); - peer = peer2; - } else { - /* peer table takes existing ref on peer */ - cfs_list_add_tail(&peer->rap_list, - kranal_nid2peerlist(nid)); - } - - peer->rap_ip = ip; - peer->rap_port = port; - peer->rap_persistence++; - - write_unlock_irqrestore(&kranal_data.kra_global_lock, flags); - return 0; -} - -void -kranal_del_peer_locked (kra_peer_t *peer) -{ - cfs_list_t *ctmp; - cfs_list_t *cnxt; - kra_conn_t *conn; - - peer->rap_persistence = 0; - - if (cfs_list_empty(&peer->rap_conns)) { - kranal_unlink_peer_locked(peer); - } else { - cfs_list_for_each_safe(ctmp, cnxt, &peer->rap_conns) { - conn = cfs_list_entry(ctmp, kra_conn_t, rac_list); - - kranal_close_conn_locked(conn, 0); - } - /* peer unlinks itself when last conn is closed */ - } -} - -int -kranal_del_peer (lnet_nid_t nid) -{ - unsigned long flags; - cfs_list_t *ptmp; - cfs_list_t *pnxt; - kra_peer_t *peer; - int lo; - int hi; - int i; - int rc = -ENOENT; - - write_lock_irqsave(&kranal_data.kra_global_lock, flags); - - if (nid != LNET_NID_ANY) - lo = hi = kranal_nid2peerlist(nid) - kranal_data.kra_peers; - else { - lo = 0; - hi = kranal_data.kra_peer_hash_size - 1; - } - - for (i = lo; i <= hi; i++) { - cfs_list_for_each_safe (ptmp, pnxt, &kranal_data.kra_peers[i]) { - peer = cfs_list_entry(ptmp, kra_peer_t, rap_list); - LASSERT (peer->rap_persistence > 0 || - !cfs_list_empty(&peer->rap_conns)); - - if (!(nid == LNET_NID_ANY || peer->rap_nid == nid)) - continue; - - kranal_del_peer_locked(peer); - rc = 0; /* matched something */ - } - } - - write_unlock_irqrestore(&kranal_data.kra_global_lock, flags); - - return rc; -} - -kra_conn_t * -kranal_get_conn_by_idx (int index) -{ - kra_peer_t *peer; - cfs_list_t *ptmp; - kra_conn_t *conn; - cfs_list_t *ctmp; - int i; - - read_lock(&kranal_data.kra_global_lock); - - for (i = 0; i < kranal_data.kra_peer_hash_size; i++) { - cfs_list_for_each (ptmp, &kranal_data.kra_peers[i]) { - - peer = cfs_list_entry(ptmp, kra_peer_t, rap_list); - LASSERT (peer->rap_persistence > 0 || - !cfs_list_empty(&peer->rap_conns)); - - cfs_list_for_each (ctmp, &peer->rap_conns) { - if (index-- > 0) - continue; - - conn = cfs_list_entry(ctmp, kra_conn_t, - rac_list); - CDEBUG(D_NET, "++conn[%p] -> %s (%d)\n", conn, - libcfs_nid2str(conn->rac_peer->rap_nid), - atomic_read(&conn->rac_refcount)); - atomic_inc(&conn->rac_refcount); - read_unlock(&kranal_data.kra_global_lock); - return conn; - } - } - } - - read_unlock(&kranal_data.kra_global_lock); - return NULL; -} - -int -kranal_close_peer_conns_locked (kra_peer_t *peer, int why) -{ - kra_conn_t *conn; - cfs_list_t *ctmp; - cfs_list_t *cnxt; - int count = 0; - - cfs_list_for_each_safe (ctmp, cnxt, &peer->rap_conns) { - conn = cfs_list_entry(ctmp, kra_conn_t, rac_list); - - count++; - kranal_close_conn_locked(conn, why); - } - - return count; -} - -int -kranal_close_matching_conns (lnet_nid_t nid) -{ - unsigned long flags; - kra_peer_t *peer; - cfs_list_t *ptmp; - cfs_list_t *pnxt; - int lo; - int hi; - int i; - int count = 0; - - write_lock_irqsave(&kranal_data.kra_global_lock, flags); - - if (nid != LNET_NID_ANY) - lo = hi = kranal_nid2peerlist(nid) - kranal_data.kra_peers; - else { - lo = 0; - hi = kranal_data.kra_peer_hash_size - 1; - } - - for (i = lo; i <= hi; i++) { - cfs_list_for_each_safe (ptmp, pnxt, &kranal_data.kra_peers[i]) { - - peer = cfs_list_entry(ptmp, kra_peer_t, rap_list); - LASSERT (peer->rap_persistence > 0 || - !cfs_list_empty(&peer->rap_conns)); - - if (!(nid == LNET_NID_ANY || nid == peer->rap_nid)) - continue; - - count += kranal_close_peer_conns_locked(peer, 0); - } - } - - write_unlock_irqrestore(&kranal_data.kra_global_lock, flags); - - /* wildcards always succeed */ - if (nid == LNET_NID_ANY) - return 0; - - return (count == 0) ? -ENOENT : 0; -} - -int -kranal_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg) -{ - struct libcfs_ioctl_data *data = arg; - int rc = -EINVAL; - - LASSERT (ni == kranal_data.kra_ni); - - switch(cmd) { - case IOC_LIBCFS_GET_PEER: { - lnet_nid_t nid = 0; - __u32 ip = 0; - int port = 0; - int share_count = 0; - - rc = kranal_get_peer_info(data->ioc_count, - &nid, &ip, &port, &share_count); - data->ioc_nid = nid; - data->ioc_count = share_count; - data->ioc_u32[0] = ip; - data->ioc_u32[1] = port; - break; - } - case IOC_LIBCFS_ADD_PEER: { - rc = kranal_add_persistent_peer(data->ioc_nid, - data->ioc_u32[0], /* IP */ - data->ioc_u32[1]); /* port */ - break; - } - case IOC_LIBCFS_DEL_PEER: { - rc = kranal_del_peer(data->ioc_nid); - break; - } - case IOC_LIBCFS_GET_CONN: { - kra_conn_t *conn = kranal_get_conn_by_idx(data->ioc_count); - - if (conn == NULL) - rc = -ENOENT; - else { - rc = 0; - data->ioc_nid = conn->rac_peer->rap_nid; - data->ioc_u32[0] = conn->rac_device->rad_id; - kranal_conn_decref(conn); - } - break; - } - case IOC_LIBCFS_CLOSE_CONNECTION: { - rc = kranal_close_matching_conns(data->ioc_nid); - break; - } - case IOC_LIBCFS_REGISTER_MYNID: { - /* Ignore if this is a noop */ - if (data->ioc_nid == ni->ni_nid) { - rc = 0; - } else { - CERROR("obsolete IOC_LIBCFS_REGISTER_MYNID: %s(%s)\n", - libcfs_nid2str(data->ioc_nid), - libcfs_nid2str(ni->ni_nid)); - rc = -EINVAL; - } - break; - } - } - - return rc; -} - -void -kranal_free_txdescs(cfs_list_t *freelist) -{ - kra_tx_t *tx; - - while (!cfs_list_empty(freelist)) { - tx = cfs_list_entry(freelist->next, kra_tx_t, tx_list); - - cfs_list_del(&tx->tx_list); - LIBCFS_FREE(tx->tx_phys, LNET_MAX_IOV * sizeof(*tx->tx_phys)); - LIBCFS_FREE(tx, sizeof(*tx)); - } -} - -int -kranal_alloc_txdescs(cfs_list_t *freelist, int n) -{ - int i; - kra_tx_t *tx; - - LASSERT (freelist == &kranal_data.kra_idle_txs); - LASSERT (cfs_list_empty(freelist)); - - for (i = 0; i < n; i++) { - - LIBCFS_ALLOC(tx, sizeof(*tx)); - if (tx == NULL) { - CERROR("Can't allocate tx[%d]\n", i); - kranal_free_txdescs(freelist); - return -ENOMEM; - } - - LIBCFS_ALLOC(tx->tx_phys, - LNET_MAX_IOV * sizeof(*tx->tx_phys)); - if (tx->tx_phys == NULL) { - CERROR("Can't allocate tx[%d]->tx_phys\n", i); - - LIBCFS_FREE(tx, sizeof(*tx)); - kranal_free_txdescs(freelist); - return -ENOMEM; - } - - tx->tx_buftype = RANAL_BUF_NONE; - tx->tx_msg.ram_type = RANAL_MSG_NONE; - - cfs_list_add(&tx->tx_list, freelist); - } - - return 0; -} - -int -kranal_device_init(int id, kra_device_t *dev) -{ - int total_ntx = *kranal_tunables.kra_ntx; - RAP_RETURN rrc; - - dev->rad_id = id; - rrc = RapkGetDeviceByIndex(id, kranal_device_callback, - &dev->rad_handle); - if (rrc != RAP_SUCCESS) { - CERROR("Can't get Rapidarray Device %d: %d\n", id, rrc); - goto failed_0; - } - - rrc = RapkReserveRdma(dev->rad_handle, total_ntx); - if (rrc != RAP_SUCCESS) { - CERROR("Can't reserve %d RDMA descriptors" - " for device %d: %d\n", total_ntx, id, rrc); - goto failed_1; - } - - rrc = RapkCreateCQ(dev->rad_handle, total_ntx, RAP_CQTYPE_SEND, - &dev->rad_rdma_cqh); - if (rrc != RAP_SUCCESS) { - CERROR("Can't create rdma cq size %d for device %d: %d\n", - total_ntx, id, rrc); - goto failed_1; - } - - rrc = RapkCreateCQ(dev->rad_handle, - *kranal_tunables.kra_fma_cq_size, - RAP_CQTYPE_RECV, &dev->rad_fma_cqh); - if (rrc != RAP_SUCCESS) { - CERROR("Can't create fma cq size %d for device %d: %d\n", - *kranal_tunables.kra_fma_cq_size, id, rrc); - goto failed_2; - } - - return 0; - - failed_2: - RapkDestroyCQ(dev->rad_handle, dev->rad_rdma_cqh); - failed_1: - RapkReleaseDevice(dev->rad_handle); - failed_0: - return -ENODEV; -} - -void -kranal_device_fini(kra_device_t *dev) -{ - LASSERT (cfs_list_empty(&dev->rad_ready_conns)); - LASSERT (cfs_list_empty(&dev->rad_new_conns)); - LASSERT (dev->rad_nphysmap == 0); - LASSERT (dev->rad_nppphysmap == 0); - LASSERT (dev->rad_nvirtmap == 0); - LASSERT (dev->rad_nobvirtmap == 0); - - LASSERT(dev->rad_scheduler == NULL); - RapkDestroyCQ(dev->rad_handle, dev->rad_fma_cqh); - RapkDestroyCQ(dev->rad_handle, dev->rad_rdma_cqh); - RapkReleaseDevice(dev->rad_handle); -} - -void -kranal_shutdown (lnet_ni_t *ni) -{ - int i; - unsigned long flags; - - CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n", - atomic_read(&libcfs_kmemory)); - - LASSERT (ni == kranal_data.kra_ni); - LASSERT (ni->ni_data == &kranal_data); - - switch (kranal_data.kra_init) { - default: - CERROR("Unexpected state %d\n", kranal_data.kra_init); - LBUG(); - - case RANAL_INIT_ALL: - /* Prevent new peers from being created */ - write_lock_irqsave(&kranal_data.kra_global_lock, flags); - kranal_data.kra_nonewpeers = 1; - write_unlock_irqrestore(&kranal_data.kra_global_lock, - flags); - - /* Remove all existing peers from the peer table */ - kranal_del_peer(LNET_NID_ANY); - - /* Wait for pending conn reqs to be handled */ - i = 2; - spin_lock_irqsave(&kranal_data.kra_connd_lock, flags); - while (!cfs_list_empty(&kranal_data.kra_connd_acceptq)) { - spin_unlock_irqrestore(&kranal_data.kra_connd_lock, - flags); - i++; - CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* 2**n */ - "waiting for conn reqs to clean up\n"); - cfs_pause(cfs_time_seconds(1)); - - spin_lock_irqsave(&kranal_data.kra_connd_lock, - flags); - } - spin_unlock_irqrestore(&kranal_data.kra_connd_lock, flags); - - /* Wait for all peers to be freed */ - i = 2; - while (atomic_read(&kranal_data.kra_npeers) != 0) { - i++; - CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* 2**n */ - "waiting for %d peers to close down\n", - atomic_read(&kranal_data.kra_npeers)); - cfs_pause(cfs_time_seconds(1)); - } - /* fall through */ - - case RANAL_INIT_DATA: - break; - } - - /* Peer state all cleaned up BEFORE setting shutdown, so threads don't - * have to worry about shutdown races. NB connections may be created - * while there are still active connds, but these will be temporary - * since peer creation always fails after the listener has started to - * shut down. */ - LASSERT (atomic_read(&kranal_data.kra_npeers) == 0); - - /* Flag threads to terminate */ - kranal_data.kra_shutdown = 1; - - for (i = 0; i < kranal_data.kra_ndevs; i++) { - kra_device_t *dev = &kranal_data.kra_devices[i]; - - spin_lock_irqsave(&dev->rad_lock, flags); - wake_up(&dev->rad_waitq); - spin_unlock_irqrestore(&dev->rad_lock, flags); - } - - spin_lock_irqsave(&kranal_data.kra_reaper_lock, flags); - wake_up_all(&kranal_data.kra_reaper_waitq); - spin_unlock_irqrestore(&kranal_data.kra_reaper_lock, flags); - - LASSERT (cfs_list_empty(&kranal_data.kra_connd_peers)); - spin_lock_irqsave(&kranal_data.kra_connd_lock, flags); - wake_up_all(&kranal_data.kra_connd_waitq); - spin_unlock_irqrestore(&kranal_data.kra_connd_lock, flags); - - /* Wait for threads to exit */ - i = 2; - while (atomic_read(&kranal_data.kra_nthreads) != 0) { - i++; - CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */ - "Waiting for %d threads to terminate\n", - atomic_read(&kranal_data.kra_nthreads)); - cfs_pause(cfs_time_seconds(1)); - } - - LASSERT (atomic_read(&kranal_data.kra_npeers) == 0); - if (kranal_data.kra_peers != NULL) { - for (i = 0; i < kranal_data.kra_peer_hash_size; i++) - LASSERT (cfs_list_empty(&kranal_data.kra_peers[i])); - - LIBCFS_FREE(kranal_data.kra_peers, - sizeof (cfs_list_t) * - kranal_data.kra_peer_hash_size); - } - - LASSERT (atomic_read(&kranal_data.kra_nconns) == 0); - if (kranal_data.kra_conns != NULL) { - for (i = 0; i < kranal_data.kra_conn_hash_size; i++) - LASSERT (cfs_list_empty(&kranal_data.kra_conns[i])); - - LIBCFS_FREE(kranal_data.kra_conns, - sizeof (cfs_list_t) * - kranal_data.kra_conn_hash_size); - } - - for (i = 0; i < kranal_data.kra_ndevs; i++) - kranal_device_fini(&kranal_data.kra_devices[i]); - - kranal_free_txdescs(&kranal_data.kra_idle_txs); - - CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n", - atomic_read(&libcfs_kmemory)); - - kranal_data.kra_init = RANAL_INIT_NOTHING; - module_put(THIS_MODULE); -} - -int -kranal_startup (lnet_ni_t *ni) -{ - struct timeval tv; - int pkmem = atomic_read(&libcfs_kmemory); - int rc; - int i; - kra_device_t *dev; - char name[16]; - - LASSERT (ni->ni_lnd == &the_kralnd); - - /* Only 1 instance supported */ - if (kranal_data.kra_init != RANAL_INIT_NOTHING) { - CERROR ("Only 1 instance supported\n"); - return -EPERM; - } - - if (lnet_set_ip_niaddr(ni) != 0) { - CERROR ("Can't determine my NID\n"); - return -EPERM; - } - - if (*kranal_tunables.kra_credits > *kranal_tunables.kra_ntx) { - CERROR ("Can't set credits(%d) > ntx(%d)\n", - *kranal_tunables.kra_credits, - *kranal_tunables.kra_ntx); - return -EINVAL; - } - - memset(&kranal_data, 0, sizeof(kranal_data)); /* zero pointers, flags etc */ - - ni->ni_maxtxcredits = *kranal_tunables.kra_credits; - ni->ni_peertxcredits = *kranal_tunables.kra_peercredits; - - ni->ni_data = &kranal_data; - kranal_data.kra_ni = ni; - - /* CAVEAT EMPTOR: Every 'Fma' message includes the sender's NID and - * a unique (for all time) connstamp so we can uniquely identify - * the sender. The connstamp is an incrementing counter - * initialised with seconds + microseconds at startup time. So we - * rely on NOT creating connections more frequently on average than - * 1MHz to ensure we don't use old connstamps when we reboot. */ - do_gettimeofday(&tv); - kranal_data.kra_connstamp = - kranal_data.kra_peerstamp = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec; - - rwlock_init(&kranal_data.kra_global_lock); - - for (i = 0; i < RANAL_MAXDEVS; i++ ) { - kra_device_t *dev = &kranal_data.kra_devices[i]; - - dev->rad_idx = i; - CFS_INIT_LIST_HEAD(&dev->rad_ready_conns); - CFS_INIT_LIST_HEAD(&dev->rad_new_conns); - init_waitqueue_head(&dev->rad_waitq); - spin_lock_init(&dev->rad_lock); - } - - kranal_data.kra_new_min_timeout = MAX_SCHEDULE_TIMEOUT; - init_waitqueue_head(&kranal_data.kra_reaper_waitq); - spin_lock_init(&kranal_data.kra_reaper_lock); - - CFS_INIT_LIST_HEAD(&kranal_data.kra_connd_acceptq); - CFS_INIT_LIST_HEAD(&kranal_data.kra_connd_peers); - init_waitqueue_head(&kranal_data.kra_connd_waitq); - spin_lock_init(&kranal_data.kra_connd_lock); - - CFS_INIT_LIST_HEAD(&kranal_data.kra_idle_txs); - spin_lock_init(&kranal_data.kra_tx_lock); - - /* OK to call kranal_api_shutdown() to cleanup now */ - kranal_data.kra_init = RANAL_INIT_DATA; - try_module_get(THIS_MODULE); - - kranal_data.kra_peer_hash_size = RANAL_PEER_HASH_SIZE; - LIBCFS_ALLOC(kranal_data.kra_peers, - sizeof(cfs_list_t) * - kranal_data.kra_peer_hash_size); - if (kranal_data.kra_peers == NULL) - goto failed; - - for (i = 0; i < kranal_data.kra_peer_hash_size; i++) - CFS_INIT_LIST_HEAD(&kranal_data.kra_peers[i]); - - kranal_data.kra_conn_hash_size = RANAL_PEER_HASH_SIZE; - LIBCFS_ALLOC(kranal_data.kra_conns, - sizeof(cfs_list_t) * - kranal_data.kra_conn_hash_size); - if (kranal_data.kra_conns == NULL) - goto failed; - - for (i = 0; i < kranal_data.kra_conn_hash_size; i++) - CFS_INIT_LIST_HEAD(&kranal_data.kra_conns[i]); - - rc = kranal_alloc_txdescs(&kranal_data.kra_idle_txs, - *kranal_tunables.kra_ntx); - if (rc != 0) - goto failed; - - rc = kranal_thread_start(kranal_reaper, NULL, "kranal_reaper"); - if (rc != 0) { - CERROR("Can't spawn ranal reaper: %d\n", rc); - goto failed; - } - - for (i = 0; i < *kranal_tunables.kra_n_connd; i++) { - snprintf(name, sizeof(name), "kranal_connd_%02ld", i); - rc = kranal_thread_start(kranal_connd, - (void *)(unsigned long)i, name); - if (rc != 0) { - CERROR("Can't spawn ranal connd[%d]: %d\n", - i, rc); - goto failed; - } - } - - LASSERT (kranal_data.kra_ndevs == 0); - - /* Use all available RapidArray devices */ - for (i = 0; i < RANAL_MAXDEVS; i++) { - dev = &kranal_data.kra_devices[kranal_data.kra_ndevs]; - - rc = kranal_device_init(kranal_devids[i], dev); - if (rc == 0) - kranal_data.kra_ndevs++; - } - - if (kranal_data.kra_ndevs == 0) { - CERROR("Can't initialise any RapidArray devices\n"); - goto failed; - } - - for (i = 0; i < kranal_data.kra_ndevs; i++) { - dev = &kranal_data.kra_devices[i]; - snprintf(name, sizeof(name), "kranal_sd_%02d", dev->rad_idx); - rc = kranal_thread_start(kranal_scheduler, dev, name); - if (rc != 0) { - CERROR("Can't spawn ranal scheduler[%d]: %d\n", - i, rc); - goto failed; - } - } - - /* flag everything initialised */ - kranal_data.kra_init = RANAL_INIT_ALL; - /*****************************************************/ - - CDEBUG(D_MALLOC, "initial kmem %d\n", pkmem); - return 0; - - failed: - kranal_shutdown(ni); - return -ENETDOWN; -} - -void __exit -kranal_module_fini (void) -{ - lnet_unregister_lnd(&the_kralnd); - kranal_tunables_fini(); -} - -int __init -kranal_module_init (void) -{ - int rc; - - rc = kranal_tunables_init(); - if (rc != 0) - return rc; - - lnet_register_lnd(&the_kralnd); - - return 0; -} - -MODULE_AUTHOR("Sun Microsystems, Inc. "); -MODULE_DESCRIPTION("Kernel RapidArray LND v0.01"); -MODULE_LICENSE("GPL"); - -module_init(kranal_module_init); -module_exit(kranal_module_fini); diff --git a/lnet/klnds/ralnd/ralnd.h b/lnet/klnds/ralnd/ralnd.h deleted file mode 100644 index fb7aa20..0000000 --- a/lnet/klnds/ralnd/ralnd.h +++ /dev/null @@ -1,464 +0,0 @@ -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf - * - * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, - * CA 95054 USA or visit www.sun.com if you need additional information or - * have any questions. - * - * GPL HEADER END - */ -/* - * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2012, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lnet/klnds/ralnd/ralnd.h - * - * Author: Eric Barton - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#define DEBUG_SUBSYSTEM S_LND - -#include -#include -#include - -#include - -/* tunables determined at compile time */ -#define RANAL_RESCHED 100 /* # scheduler loops before reschedule */ - -#define RANAL_PEER_HASH_SIZE 101 /* # peer lists */ -#define RANAL_CONN_HASH_SIZE 101 /* # conn lists */ - -#define RANAL_MIN_TIMEOUT 5 /* minimum timeout interval (seconds) */ -#define RANAL_TIMEOUT2KEEPALIVE(t) (((t)+1)/2) /* timeout -> keepalive interval */ - -/* fixed constants */ -#define RANAL_MAXDEVS 2 /* max # devices RapidArray supports */ -#define RANAL_FMA_MAX_PREFIX 232 /* max bytes in FMA "Prefix" we can use */ -#define RANAL_FMA_MAX_DATA ((7<<10)-256) /* Max FMA MSG is 7K including prefix */ - - -typedef struct -{ - int *kra_n_connd; /* # connection daemons */ - int *kra_min_reconnect_interval; /* first failed connection retry... */ - int *kra_max_reconnect_interval; /* ...exponentially increasing to this */ - int *kra_ntx; /* # tx descs */ - int *kra_credits; /* # concurrent sends */ - int *kra_peercredits; /* # concurrent sends to 1 peer */ - int *kra_fma_cq_size; /* # entries in receive CQ */ - int *kra_timeout; /* comms timeout (seconds) */ - int *kra_max_immediate; /* immediate payload breakpoint */ - -#if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM - struct ctl_table_header *kra_sysctl; /* sysctl interface */ -#endif -} kra_tunables_t; - -typedef struct -{ - RAP_PVOID rad_handle; /* device handle */ - RAP_PVOID rad_fma_cqh; /* FMA completion queue handle */ - RAP_PVOID rad_rdma_cqh; /* rdma completion queue handle */ - int rad_id; /* device id */ - int rad_idx; /* index in kra_devices */ - int rad_ready; /* set by device callback */ - cfs_list_t rad_ready_conns;/* connections ready to tx/rx */ - cfs_list_t rad_new_conns; /* new connections to complete */ - wait_queue_head_t rad_waitq; /* scheduler waits here */ - spinlock_t rad_lock; /* serialise */ - void *rad_scheduler; /* scheduling thread */ - unsigned int rad_nphysmap; /* # phys mappings */ - unsigned int rad_nppphysmap;/* # phys pages mapped */ - unsigned int rad_nvirtmap; /* # virt mappings */ - unsigned long rad_nobvirtmap;/* # virt bytes mapped */ -} kra_device_t; - -typedef struct -{ - int kra_init; /* initialisation state */ - int kra_shutdown; /* shut down? */ - atomic_t kra_nthreads; /* # live threads */ - lnet_ni_t *kra_ni; /* _the_ nal instance */ - - kra_device_t kra_devices[RANAL_MAXDEVS]; /* device/ptag/cq */ - int kra_ndevs; /* # devices */ - - rwlock_t kra_global_lock; /* stabilize peer/conn ops */ - - cfs_list_t *kra_peers; /* hash table of all my known peers */ - int kra_peer_hash_size; /* size of kra_peers */ - atomic_t kra_npeers; /* # peers extant */ - int kra_nonewpeers; /* prevent new peers */ - - cfs_list_t *kra_conns; /* conns hashed by cqid */ - int kra_conn_hash_size; /* size of kra_conns */ - __u64 kra_peerstamp; /* when I started up */ - __u64 kra_connstamp; /* conn stamp generator */ - int kra_next_cqid; /* cqid generator */ - atomic_t kra_nconns; /* # connections extant */ - - long kra_new_min_timeout; /* minimum timeout on any new conn */ - wait_queue_head_t kra_reaper_waitq; /* reaper sleeps here */ - spinlock_t kra_reaper_lock; /* serialise */ - - cfs_list_t kra_connd_peers; /* peers waiting for a connection */ - cfs_list_t kra_connd_acceptq; /* accepted sockets to handshake */ - wait_queue_head_t kra_connd_waitq; /* connection daemons sleep here */ - spinlock_t kra_connd_lock; /* serialise */ - - cfs_list_t kra_idle_txs; /* idle tx descriptors */ - __u64 kra_next_tx_cookie; /* RDMA completion cookie */ - spinlock_t kra_tx_lock; /* serialise */ -} kra_data_t; - -#define RANAL_INIT_NOTHING 0 -#define RANAL_INIT_DATA 1 -#define RANAL_INIT_ALL 2 - -typedef struct kra_acceptsock /* accepted socket queued for connd */ -{ - cfs_list_t ras_list; /* queue for attention */ - struct socket *ras_sock; /* the accepted socket */ -} kra_acceptsock_t; - -/************************************************************************ - * Wire message structs. These are sent in sender's byte order - * (i.e. receiver checks magic and flips if required). - */ - -typedef struct kra_connreq /* connection request/response */ -{ /* (sent via socket) */ - __u32 racr_magic; /* I'm an ranal connreq */ - __u16 racr_version; /* this is my version number */ - __u16 racr_devid; /* sender's device ID */ - __u64 racr_srcnid; /* sender's NID */ - __u64 racr_dstnid; /* who sender expects to listen */ - __u64 racr_peerstamp; /* sender's instance stamp */ - __u64 racr_connstamp; /* sender's connection stamp */ - __u32 racr_timeout; /* sender's timeout */ - RAP_RI_PARAMETERS racr_riparams; /* sender's endpoint info */ -} kra_connreq_t; - -typedef struct -{ - RAP_MEM_KEY rard_key; - RAP_PVOID64 rard_addr; - RAP_UINT32 rard_nob; -} kra_rdma_desc_t; - -typedef struct -{ - lnet_hdr_t raim_hdr; /* portals header */ - /* Portals payload is in FMA "Message Data" */ -} kra_immediate_msg_t; - -typedef struct -{ - lnet_hdr_t raprm_hdr; /* portals header */ - __u64 raprm_cookie; /* opaque completion cookie */ -} kra_putreq_msg_t; - -typedef struct -{ - __u64 rapam_src_cookie; /* reflected completion cookie */ - __u64 rapam_dst_cookie; /* opaque completion cookie */ - kra_rdma_desc_t rapam_desc; /* sender's sink buffer */ -} kra_putack_msg_t; - -typedef struct -{ - lnet_hdr_t ragm_hdr; /* portals header */ - __u64 ragm_cookie; /* opaque completion cookie */ - kra_rdma_desc_t ragm_desc; /* sender's sink buffer */ -} kra_get_msg_t; - -typedef struct -{ - __u64 racm_cookie; /* reflected completion cookie */ -} kra_completion_msg_t; - -typedef struct /* NB must fit in FMA "Prefix" */ -{ - __u32 ram_magic; /* I'm an ranal message */ - __u16 ram_version; /* this is my version number */ - __u16 ram_type; /* msg type */ - __u64 ram_srcnid; /* sender's NID */ - __u64 ram_connstamp; /* sender's connection stamp */ - union { - kra_immediate_msg_t immediate; - kra_putreq_msg_t putreq; - kra_putack_msg_t putack; - kra_get_msg_t get; - kra_completion_msg_t completion; - } ram_u; - __u32 ram_seq; /* incrementing sequence number */ -} kra_msg_t; - -#define RANAL_MSG_MAGIC LNET_PROTO_RA_MAGIC /* unique magic */ -#define RANAL_MSG_VERSION 1 /* current protocol version */ - -#define RANAL_MSG_FENCE 0x80 /* fence RDMA */ - -#define RANAL_MSG_NONE 0x00 /* illegal message */ -#define RANAL_MSG_NOOP 0x01 /* empty ram_u (keepalive) */ -#define RANAL_MSG_IMMEDIATE 0x02 /* ram_u.immediate */ -#define RANAL_MSG_PUT_REQ 0x03 /* ram_u.putreq (src->sink) */ -#define RANAL_MSG_PUT_NAK 0x04 /* ram_u.completion (no PUT match: sink->src) */ -#define RANAL_MSG_PUT_ACK 0x05 /* ram_u.putack (PUT matched: sink->src) */ -#define RANAL_MSG_PUT_DONE 0x86 /* ram_u.completion (src->sink) */ -#define RANAL_MSG_GET_REQ 0x07 /* ram_u.get (sink->src) */ -#define RANAL_MSG_GET_NAK 0x08 /* ram_u.completion (no GET match: src->sink) */ -#define RANAL_MSG_GET_DONE 0x89 /* ram_u.completion (src->sink) */ -#define RANAL_MSG_CLOSE 0x8a /* empty ram_u */ - -/***********************************************************************/ - -typedef struct kra_tx /* message descriptor */ -{ - cfs_list_t tx_list; /* queue on idle_txs/rac_sendq/rac_waitq */ - struct kra_conn *tx_conn; /* owning conn */ - lnet_msg_t *tx_lntmsg[2]; /* ptl msgs to finalize on completion */ - unsigned long tx_qtime; /* when tx started to wait for something (jiffies) */ - int tx_nob; /* # bytes of payload */ - int tx_buftype; /* payload buffer type */ - void *tx_buffer; /* source/sink buffer */ - int tx_phys_offset; /* first page offset (if phys) */ - int tx_phys_npages; /* # physical pages */ - RAP_PHYS_REGION *tx_phys; /* page descriptors */ - RAP_MEM_KEY tx_map_key; /* mapping key */ - RAP_RDMA_DESCRIPTOR tx_rdma_desc; /* rdma descriptor */ - __u64 tx_cookie; /* identify this tx to peer */ - kra_msg_t tx_msg; /* FMA message buffer */ -} kra_tx_t; - -#define RANAL_BUF_NONE 0 /* buffer type not set */ -#define RANAL_BUF_IMMEDIATE 1 /* immediate data */ -#define RANAL_BUF_PHYS_UNMAPPED 2 /* physical: not mapped yet */ -#define RANAL_BUF_PHYS_MAPPED 3 /* physical: mapped already */ -#define RANAL_BUF_VIRT_UNMAPPED 4 /* virtual: not mapped yet */ -#define RANAL_BUF_VIRT_MAPPED 5 /* virtual: mapped already */ - -typedef struct kra_conn -{ - struct kra_peer *rac_peer; /* owning peer */ - cfs_list_t rac_list; /* stash on peer's conn list */ - cfs_list_t rac_hashlist; /* stash in connection hash table */ - cfs_list_t rac_schedlist; /* schedule (on rad_???_conns) for attention */ - cfs_list_t rac_fmaq; /* txs queued for FMA */ - cfs_list_t rac_rdmaq; /* txs awaiting RDMA completion */ - cfs_list_t rac_replyq; /* txs awaiting replies */ - __u64 rac_peerstamp; /* peer's unique stamp */ - __u64 rac_peer_connstamp;/* peer's unique connection stamp */ - __u64 rac_my_connstamp; /* my unique connection stamp */ - unsigned long rac_last_tx; /* when I last sent an FMA message (jiffies) */ - unsigned long rac_last_rx; /* when I last received an FMA messages (jiffies) */ - long rac_keepalive; /* keepalive interval (seconds) */ - long rac_timeout; /* infer peer death if no rx for this many seconds */ - __u32 rac_cqid; /* my completion callback id (non-unique) */ - __u32 rac_tx_seq; /* tx msg sequence number */ - __u32 rac_rx_seq; /* rx msg sequence number */ - atomic_t rac_refcount; /* # users */ - unsigned int rac_close_sent; /* I've sent CLOSE */ - unsigned int rac_close_recvd; /* I've received CLOSE */ - unsigned int rac_state; /* connection state */ - unsigned int rac_scheduled; /* being attented to */ - spinlock_t rac_lock; /* serialise */ - kra_device_t *rac_device; /* which device */ - RAP_PVOID rac_rihandle; /* RA endpoint */ - kra_msg_t *rac_rxmsg; /* incoming message (FMA prefix) */ - kra_msg_t rac_msg; /* keepalive/CLOSE message buffer */ -} kra_conn_t; - -#define RANAL_CONN_ESTABLISHED 0 -#define RANAL_CONN_CLOSING 1 -#define RANAL_CONN_CLOSED 2 - -typedef struct kra_peer { - cfs_list_t rap_list; /* stash on global peer list */ - cfs_list_t rap_connd_list; /* schedule on kra_connd_peers */ - cfs_list_t rap_conns; /* all active connections */ - cfs_list_t rap_tx_queue; /* msgs waiting for a conn */ - lnet_nid_t rap_nid; /* who's on the other end(s) */ - __u32 rap_ip; /* IP address of peer */ - int rap_port; /* port on which peer listens */ - atomic_t rap_refcount; /* # users */ - int rap_persistence; /* "known" peer refs */ - int rap_connecting; /* connection forming */ - unsigned long rap_reconnect_time; /* get_seconds() when reconnect OK */ - unsigned long rap_reconnect_interval; /* exponential backoff */ -} kra_peer_t; - -extern kra_data_t kranal_data; -extern kra_tunables_t kranal_tunables; - -extern void kranal_destroy_peer(kra_peer_t *peer); -extern void kranal_destroy_conn(kra_conn_t *conn); - -static inline void -kranal_peer_addref(kra_peer_t *peer) -{ - CDEBUG(D_NET, "%p->%s\n", peer, libcfs_nid2str(peer->rap_nid)); - LASSERT(atomic_read(&peer->rap_refcount) > 0); - atomic_inc(&peer->rap_refcount); -} - -static inline void -kranal_peer_decref(kra_peer_t *peer) -{ - CDEBUG(D_NET, "%p->%s\n", peer, libcfs_nid2str(peer->rap_nid)); - LASSERT(atomic_read(&peer->rap_refcount) > 0); - if (atomic_dec_and_test(&peer->rap_refcount)) - kranal_destroy_peer(peer); -} - -static inline cfs_list_t * -kranal_nid2peerlist (lnet_nid_t nid) -{ - unsigned int hash = ((unsigned int)nid) % kranal_data.kra_peer_hash_size; - - return (&kranal_data.kra_peers[hash]); -} - -static inline int -kranal_peer_active(kra_peer_t *peer) -{ - /* Am I in the peer hash table? */ - return (!cfs_list_empty(&peer->rap_list)); -} - -static inline void -kranal_conn_addref(kra_conn_t *conn) -{ - CDEBUG(D_NET, "%p->%s\n", conn, - libcfs_nid2str(conn->rac_peer->rap_nid)); - LASSERT(atomic_read(&conn->rac_refcount) > 0); - atomic_inc(&conn->rac_refcount); -} - -static inline void -kranal_conn_decref(kra_conn_t *conn) -{ - CDEBUG(D_NET, "%p->%s\n", conn, - libcfs_nid2str(conn->rac_peer->rap_nid)); - LASSERT(atomic_read(&conn->rac_refcount) > 0); - if (atomic_dec_and_test(&conn->rac_refcount)) - kranal_destroy_conn(conn); -} - -static inline cfs_list_t * -kranal_cqid2connlist (__u32 cqid) -{ - unsigned int hash = cqid % kranal_data.kra_conn_hash_size; - - return (&kranal_data.kra_conns [hash]); -} - -static inline kra_conn_t * -kranal_cqid2conn_locked (__u32 cqid) -{ - cfs_list_t *conns = kranal_cqid2connlist(cqid); - cfs_list_t *tmp; - kra_conn_t *conn; - - cfs_list_for_each(tmp, conns) { - conn = cfs_list_entry(tmp, kra_conn_t, rac_hashlist); - - if (conn->rac_cqid == cqid) - return conn; - } - - return NULL; -} - -static inline int -kranal_tx_mapped (kra_tx_t *tx) -{ - return (tx->tx_buftype == RANAL_BUF_VIRT_MAPPED || - tx->tx_buftype == RANAL_BUF_PHYS_MAPPED); -} - -int kranal_startup (lnet_ni_t *ni); -void kranal_shutdown (lnet_ni_t *ni); -int kranal_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg); -int kranal_send (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg); -int kranal_eager_recv(lnet_ni_t *ni, void *private, - lnet_msg_t *lntmsg, void **new_private); -int kranal_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, - int delayed, unsigned int niov, - struct iovec *iov, lnet_kiov_t *kiov, - unsigned int offset, unsigned int mlen, unsigned int rlen); -int kranal_accept(lnet_ni_t *ni, struct socket *sock); - -extern void kranal_free_acceptsock (kra_acceptsock_t *ras); -extern int kranal_listener_procint (struct ctl_table *table, - int write, struct file *filp, - void *buffer, size_t *lenp); -extern void kranal_update_reaper_timeout (long timeout); -extern void kranal_tx_done (kra_tx_t *tx, int completion); -extern void kranal_unlink_peer_locked (kra_peer_t *peer); -extern void kranal_schedule_conn (kra_conn_t *conn); -extern int kranal_create_peer (kra_peer_t **peerp, lnet_nid_t nid); -extern int kranal_add_persistent_peer (lnet_nid_t nid, __u32 ip, int port); -extern kra_peer_t *kranal_find_peer_locked (lnet_nid_t nid); -extern void kranal_post_fma (kra_conn_t *conn, kra_tx_t *tx); -extern int kranal_del_peer (lnet_nid_t nid); -extern void kranal_device_callback (RAP_INT32 devid, RAP_PVOID arg); -extern int kranal_thread_start(int(*fn)(void *arg), void *arg, char *name); -extern int kranal_connd (void *arg); -extern int kranal_reaper (void *arg); -extern int kranal_scheduler (void *arg); -extern void kranal_close_conn_locked (kra_conn_t *conn, int error); -extern void kranal_close_conn (kra_conn_t *conn, int error); -extern void kranal_terminate_conn_locked (kra_conn_t *conn); -extern void kranal_connect (kra_peer_t *peer); -extern int kranal_conn_handshake (struct socket *sock, kra_peer_t *peer); -extern int kranal_tunables_init(void); -extern void kranal_tunables_fini(void); -extern void kranal_init_msg(kra_msg_t *msg, int type); diff --git a/lnet/klnds/ralnd/ralnd_cb.c b/lnet/klnds/ralnd/ralnd_cb.c deleted file mode 100644 index f53be8d..0000000 --- a/lnet/klnds/ralnd/ralnd_cb.c +++ /dev/null @@ -1,2078 +0,0 @@ -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf - * - * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, - * CA 95054 USA or visit www.sun.com if you need additional information or - * have any questions. - * - * GPL HEADER END - */ -/* - * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2012, 2014, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lnet/klnds/ralnd/ralnd_cb.c - * - * Author: Eric Barton - */ - -#include -#include "ralnd.h" - -void -kranal_device_callback(RAP_INT32 devid, RAP_PVOID arg) -{ - kra_device_t *dev; - int i; - unsigned long flags; - - CDEBUG(D_NET, "callback for device %d\n", devid); - - for (i = 0; i < kranal_data.kra_ndevs; i++) { - - dev = &kranal_data.kra_devices[i]; - if (dev->rad_id != devid) - continue; - - spin_lock_irqsave(&dev->rad_lock, flags); - - if (!dev->rad_ready) { - dev->rad_ready = 1; - wake_up(&dev->rad_waitq); - } - - spin_unlock_irqrestore(&dev->rad_lock, flags); - return; - } - - CWARN("callback for unknown device %d\n", devid); -} - -void -kranal_schedule_conn(kra_conn_t *conn) -{ - kra_device_t *dev = conn->rac_device; - unsigned long flags; - - spin_lock_irqsave(&dev->rad_lock, flags); - - if (!conn->rac_scheduled) { - kranal_conn_addref(conn); /* +1 ref for scheduler */ - conn->rac_scheduled = 1; - cfs_list_add_tail(&conn->rac_schedlist, &dev->rad_ready_conns); - wake_up(&dev->rad_waitq); - } - - spin_unlock_irqrestore(&dev->rad_lock, flags); -} - -kra_tx_t * -kranal_get_idle_tx (void) -{ - unsigned long flags; - kra_tx_t *tx; - - spin_lock_irqsave(&kranal_data.kra_tx_lock, flags); - - if (cfs_list_empty(&kranal_data.kra_idle_txs)) { - spin_unlock_irqrestore(&kranal_data.kra_tx_lock, flags); - return NULL; - } - - tx = cfs_list_entry(kranal_data.kra_idle_txs.next, kra_tx_t, tx_list); - cfs_list_del(&tx->tx_list); - - /* Allocate a new completion cookie. It might not be needed, but we've - * got a lock right now... */ - tx->tx_cookie = kranal_data.kra_next_tx_cookie++; - - spin_unlock_irqrestore(&kranal_data.kra_tx_lock, flags); - - LASSERT (tx->tx_buftype == RANAL_BUF_NONE); - LASSERT (tx->tx_msg.ram_type == RANAL_MSG_NONE); - LASSERT (tx->tx_conn == NULL); - LASSERT (tx->tx_lntmsg[0] == NULL); - LASSERT (tx->tx_lntmsg[1] == NULL); - - return tx; -} - -void -kranal_init_msg(kra_msg_t *msg, int type) -{ - msg->ram_magic = RANAL_MSG_MAGIC; - msg->ram_version = RANAL_MSG_VERSION; - msg->ram_type = type; - msg->ram_srcnid = kranal_data.kra_ni->ni_nid; - /* ram_connstamp gets set when FMA is sent */ -} - -kra_tx_t * -kranal_new_tx_msg (int type) -{ - kra_tx_t *tx = kranal_get_idle_tx(); - - if (tx != NULL) - kranal_init_msg(&tx->tx_msg, type); - - return tx; -} - -int -kranal_setup_immediate_buffer (kra_tx_t *tx, - unsigned int niov, struct iovec *iov, - int offset, int nob) - -{ - /* For now this is almost identical to kranal_setup_virt_buffer, but we - * could "flatten" the payload into a single contiguous buffer ready - * for sending direct over an FMA if we ever needed to. */ - - LASSERT (tx->tx_buftype == RANAL_BUF_NONE); - LASSERT (nob >= 0); - - if (nob == 0) { - tx->tx_buffer = NULL; - } else { - LASSERT (niov > 0); - - while (offset >= iov->iov_len) { - offset -= iov->iov_len; - niov--; - iov++; - LASSERT (niov > 0); - } - - if (nob > iov->iov_len - offset) { - CERROR("Can't handle multiple vaddr fragments\n"); - return -EMSGSIZE; - } - - tx->tx_buffer = (void *)(((unsigned long)iov->iov_base) + offset); - } - - tx->tx_buftype = RANAL_BUF_IMMEDIATE; - tx->tx_nob = nob; - return 0; -} - -int -kranal_setup_virt_buffer (kra_tx_t *tx, - unsigned int niov, struct iovec *iov, - int offset, int nob) - -{ - LASSERT (nob > 0); - LASSERT (niov > 0); - LASSERT (tx->tx_buftype == RANAL_BUF_NONE); - - while (offset >= iov->iov_len) { - offset -= iov->iov_len; - niov--; - iov++; - LASSERT (niov > 0); - } - - if (nob > iov->iov_len - offset) { - CERROR("Can't handle multiple vaddr fragments\n"); - return -EMSGSIZE; - } - - tx->tx_buftype = RANAL_BUF_VIRT_UNMAPPED; - tx->tx_nob = nob; - tx->tx_buffer = (void *)(((unsigned long)iov->iov_base) + offset); - return 0; -} - -int -kranal_setup_phys_buffer (kra_tx_t *tx, int nkiov, lnet_kiov_t *kiov, - int offset, int nob) -{ - RAP_PHYS_REGION *phys = tx->tx_phys; - int resid; - - CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob); - - LASSERT (nob > 0); - LASSERT (nkiov > 0); - LASSERT (tx->tx_buftype == RANAL_BUF_NONE); - - while (offset >= kiov->kiov_len) { - offset -= kiov->kiov_len; - nkiov--; - kiov++; - LASSERT (nkiov > 0); - } - - tx->tx_buftype = RANAL_BUF_PHYS_UNMAPPED; - tx->tx_nob = nob; - tx->tx_buffer = (void *)((unsigned long)(kiov->kiov_offset + offset)); - - phys->Address = page_to_phys(kiov->kiov_page); - phys++; - - resid = nob - (kiov->kiov_len - offset); - while (resid > 0) { - kiov++; - nkiov--; - LASSERT (nkiov > 0); - - if (kiov->kiov_offset != 0 || - ((resid > PAGE_SIZE) && - kiov->kiov_len < PAGE_SIZE)) { - /* Can't have gaps */ - CERROR("Can't make payload contiguous in I/O VM:" - "page %d, offset %d, len %d \n", - (int)(phys - tx->tx_phys), - kiov->kiov_offset, kiov->kiov_len); - return -EINVAL; - } - - if ((phys - tx->tx_phys) == LNET_MAX_IOV) { - CERROR ("payload too big (%d)\n", (int)(phys - tx->tx_phys)); - return -EMSGSIZE; - } - - phys->Address = page_to_phys(kiov->kiov_page); - phys++; - - resid -= PAGE_SIZE; - } - - tx->tx_phys_npages = phys - tx->tx_phys; - return 0; -} - -static inline int -kranal_setup_rdma_buffer (kra_tx_t *tx, unsigned int niov, - struct iovec *iov, lnet_kiov_t *kiov, - int offset, int nob) -{ - LASSERT ((iov == NULL) != (kiov == NULL)); - - if (kiov != NULL) - return kranal_setup_phys_buffer(tx, niov, kiov, offset, nob); - - return kranal_setup_virt_buffer(tx, niov, iov, offset, nob); -} - -int -kranal_map_buffer (kra_tx_t *tx) -{ - kra_conn_t *conn = tx->tx_conn; - kra_device_t *dev = conn->rac_device; - RAP_RETURN rrc; - - LASSERT (current == dev->rad_scheduler); - - switch (tx->tx_buftype) { - default: - LBUG(); - - case RANAL_BUF_NONE: - case RANAL_BUF_IMMEDIATE: - case RANAL_BUF_PHYS_MAPPED: - case RANAL_BUF_VIRT_MAPPED: - return 0; - - case RANAL_BUF_PHYS_UNMAPPED: - rrc = RapkRegisterPhys(dev->rad_handle, - tx->tx_phys, tx->tx_phys_npages, - &tx->tx_map_key); - if (rrc != RAP_SUCCESS) { - CERROR ("Can't map %d pages: dev %d " - "phys %u pp %u, virt %u nob %lu\n", - tx->tx_phys_npages, dev->rad_id, - dev->rad_nphysmap, dev->rad_nppphysmap, - dev->rad_nvirtmap, dev->rad_nobvirtmap); - return -ENOMEM; /* assume insufficient resources */ - } - - dev->rad_nphysmap++; - dev->rad_nppphysmap += tx->tx_phys_npages; - - tx->tx_buftype = RANAL_BUF_PHYS_MAPPED; - return 0; - - case RANAL_BUF_VIRT_UNMAPPED: - rrc = RapkRegisterMemory(dev->rad_handle, - tx->tx_buffer, tx->tx_nob, - &tx->tx_map_key); - if (rrc != RAP_SUCCESS) { - CERROR ("Can't map %d bytes: dev %d " - "phys %u pp %u, virt %u nob %lu\n", - tx->tx_nob, dev->rad_id, - dev->rad_nphysmap, dev->rad_nppphysmap, - dev->rad_nvirtmap, dev->rad_nobvirtmap); - return -ENOMEM; /* assume insufficient resources */ - } - - dev->rad_nvirtmap++; - dev->rad_nobvirtmap += tx->tx_nob; - - tx->tx_buftype = RANAL_BUF_VIRT_MAPPED; - return 0; - } -} - -void -kranal_unmap_buffer (kra_tx_t *tx) -{ - kra_device_t *dev; - RAP_RETURN rrc; - - switch (tx->tx_buftype) { - default: - LBUG(); - - case RANAL_BUF_NONE: - case RANAL_BUF_IMMEDIATE: - case RANAL_BUF_PHYS_UNMAPPED: - case RANAL_BUF_VIRT_UNMAPPED: - break; - - case RANAL_BUF_PHYS_MAPPED: - LASSERT (tx->tx_conn != NULL); - dev = tx->tx_conn->rac_device; - LASSERT (current == dev->rad_scheduler); - rrc = RapkDeregisterMemory(dev->rad_handle, NULL, - &tx->tx_map_key); - LASSERT (rrc == RAP_SUCCESS); - - dev->rad_nphysmap--; - dev->rad_nppphysmap -= tx->tx_phys_npages; - - tx->tx_buftype = RANAL_BUF_PHYS_UNMAPPED; - break; - - case RANAL_BUF_VIRT_MAPPED: - LASSERT (tx->tx_conn != NULL); - dev = tx->tx_conn->rac_device; - LASSERT (current == dev->rad_scheduler); - rrc = RapkDeregisterMemory(dev->rad_handle, tx->tx_buffer, - &tx->tx_map_key); - LASSERT (rrc == RAP_SUCCESS); - - dev->rad_nvirtmap--; - dev->rad_nobvirtmap -= tx->tx_nob; - - tx->tx_buftype = RANAL_BUF_VIRT_UNMAPPED; - break; - } -} - -void -kranal_tx_done (kra_tx_t *tx, int completion) -{ - lnet_msg_t *lnetmsg[2]; - unsigned long flags; - int i; - - LASSERT (!in_interrupt()); - - kranal_unmap_buffer(tx); - - lnetmsg[0] = tx->tx_lntmsg[0]; tx->tx_lntmsg[0] = NULL; - lnetmsg[1] = tx->tx_lntmsg[1]; tx->tx_lntmsg[1] = NULL; - - tx->tx_buftype = RANAL_BUF_NONE; - tx->tx_msg.ram_type = RANAL_MSG_NONE; - tx->tx_conn = NULL; - - spin_lock_irqsave(&kranal_data.kra_tx_lock, flags); - - cfs_list_add_tail(&tx->tx_list, &kranal_data.kra_idle_txs); - - spin_unlock_irqrestore(&kranal_data.kra_tx_lock, flags); - - /* finalize AFTER freeing lnet msgs */ - for (i = 0; i < 2; i++) { - if (lnetmsg[i] == NULL) - continue; - - lnet_finalize(kranal_data.kra_ni, lnetmsg[i], completion); - } -} - -kra_conn_t * -kranal_find_conn_locked (kra_peer_t *peer) -{ - cfs_list_t *tmp; - - /* just return the first connection */ - cfs_list_for_each (tmp, &peer->rap_conns) { - return cfs_list_entry(tmp, kra_conn_t, rac_list); - } - - return NULL; -} - -void -kranal_post_fma (kra_conn_t *conn, kra_tx_t *tx) -{ - unsigned long flags; - - tx->tx_conn = conn; - - spin_lock_irqsave(&conn->rac_lock, flags); - cfs_list_add_tail(&tx->tx_list, &conn->rac_fmaq); - tx->tx_qtime = jiffies; - spin_unlock_irqrestore(&conn->rac_lock, flags); - - kranal_schedule_conn(conn); -} - -void -kranal_launch_tx (kra_tx_t *tx, lnet_nid_t nid) -{ - unsigned long flags; - kra_peer_t *peer; - kra_conn_t *conn; - int rc; - int retry; - rwlock_t *g_lock = &kranal_data.kra_global_lock; - - /* If I get here, I've committed to send, so I complete the tx with - * failure on any problems */ - - LASSERT (tx->tx_conn == NULL); /* only set when assigned a conn */ - - for (retry = 0; ; retry = 1) { - - read_lock(g_lock); - - peer = kranal_find_peer_locked(nid); - if (peer != NULL) { - conn = kranal_find_conn_locked(peer); - if (conn != NULL) { - kranal_post_fma(conn, tx); - read_unlock(g_lock); - return; - } - } - - /* Making connections; I'll need a write lock... */ - read_unlock(g_lock); - write_lock_irqsave(g_lock, flags); - - peer = kranal_find_peer_locked(nid); - if (peer != NULL) - break; - - write_unlock_irqrestore(g_lock, flags); - - if (retry) { - CERROR("Can't find peer %s\n", libcfs_nid2str(nid)); - kranal_tx_done(tx, -EHOSTUNREACH); - return; - } - - rc = kranal_add_persistent_peer(nid, LNET_NIDADDR(nid), - lnet_acceptor_port()); - if (rc != 0) { - CERROR("Can't add peer %s: %d\n", - libcfs_nid2str(nid), rc); - kranal_tx_done(tx, rc); - return; - } - } - - conn = kranal_find_conn_locked(peer); - if (conn != NULL) { - /* Connection exists; queue message on it */ - kranal_post_fma(conn, tx); - write_unlock_irqrestore(g_lock, flags); - return; - } - - LASSERT (peer->rap_persistence > 0); - - if (!peer->rap_connecting) { - LASSERT (cfs_list_empty(&peer->rap_tx_queue)); - - if (!(peer->rap_reconnect_interval == 0 || /* first attempt */ - cfs_time_aftereq(jiffies, peer->rap_reconnect_time))) { - write_unlock_irqrestore(g_lock, flags); - kranal_tx_done(tx, -EHOSTUNREACH); - return; - } - - peer->rap_connecting = 1; - kranal_peer_addref(peer); /* extra ref for connd */ - - spin_lock(&kranal_data.kra_connd_lock); - - cfs_list_add_tail(&peer->rap_connd_list, - &kranal_data.kra_connd_peers); - wake_up(&kranal_data.kra_connd_waitq); - - spin_unlock(&kranal_data.kra_connd_lock); - } - - /* A connection is being established; queue the message... */ - cfs_list_add_tail(&tx->tx_list, &peer->rap_tx_queue); - - write_unlock_irqrestore(g_lock, flags); -} - -void -kranal_rdma(kra_tx_t *tx, int type, - kra_rdma_desc_t *sink, int nob, __u64 cookie) -{ - kra_conn_t *conn = tx->tx_conn; - RAP_RETURN rrc; - unsigned long flags; - - LASSERT (kranal_tx_mapped(tx)); - LASSERT (nob <= sink->rard_nob); - LASSERT (nob <= tx->tx_nob); - - /* No actual race with scheduler sending CLOSE (I'm she!) */ - LASSERT (current == conn->rac_device->rad_scheduler); - - memset(&tx->tx_rdma_desc, 0, sizeof(tx->tx_rdma_desc)); - tx->tx_rdma_desc.SrcPtr.AddressBits = (__u64)((unsigned long)tx->tx_buffer); - tx->tx_rdma_desc.SrcKey = tx->tx_map_key; - tx->tx_rdma_desc.DstPtr = sink->rard_addr; - tx->tx_rdma_desc.DstKey = sink->rard_key; - tx->tx_rdma_desc.Length = nob; - tx->tx_rdma_desc.AppPtr = tx; - - /* prep final completion message */ - kranal_init_msg(&tx->tx_msg, type); - tx->tx_msg.ram_u.completion.racm_cookie = cookie; - - if (nob == 0) { /* Immediate completion */ - kranal_post_fma(conn, tx); - return; - } - - LASSERT (!conn->rac_close_sent); /* Don't lie (CLOSE == RDMA idle) */ - - rrc = RapkPostRdma(conn->rac_rihandle, &tx->tx_rdma_desc); - LASSERT (rrc == RAP_SUCCESS); - - spin_lock_irqsave(&conn->rac_lock, flags); - cfs_list_add_tail(&tx->tx_list, &conn->rac_rdmaq); - tx->tx_qtime = jiffies; - spin_unlock_irqrestore(&conn->rac_lock, flags); -} - -int -kranal_consume_rxmsg (kra_conn_t *conn, void *buffer, int nob) -{ - __u32 nob_received = nob; - RAP_RETURN rrc; - - LASSERT (conn->rac_rxmsg != NULL); - CDEBUG(D_NET, "Consuming %p\n", conn); - - rrc = RapkFmaCopyOut(conn->rac_rihandle, buffer, - &nob_received, sizeof(kra_msg_t)); - LASSERT (rrc == RAP_SUCCESS); - - conn->rac_rxmsg = NULL; - - if (nob_received < nob) { - CWARN("Incomplete immediate msg from %s: expected %d, got %d\n", - libcfs_nid2str(conn->rac_peer->rap_nid), - nob, nob_received); - return -EPROTO; - } - - return 0; -} - -int -kranal_send (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg) -{ - lnet_hdr_t *hdr = &lntmsg->msg_hdr; - int type = lntmsg->msg_type; - lnet_process_id_t target = lntmsg->msg_target; - int target_is_router = lntmsg->msg_target_is_router; - int routing = lntmsg->msg_routing; - unsigned int niov = lntmsg->msg_niov; - struct iovec *iov = lntmsg->msg_iov; - lnet_kiov_t *kiov = lntmsg->msg_kiov; - unsigned int offset = lntmsg->msg_offset; - unsigned int nob = lntmsg->msg_len; - kra_tx_t *tx; - int rc; - - /* NB 'private' is different depending on what we're sending.... */ - - CDEBUG(D_NET, "sending %d bytes in %d frags to %s\n", - nob, niov, libcfs_id2str(target)); - - LASSERT (nob == 0 || niov > 0); - LASSERT (niov <= LNET_MAX_IOV); - - LASSERT (!in_interrupt()); - /* payload is either all vaddrs or all pages */ - LASSERT (!(kiov != NULL && iov != NULL)); - - if (routing) { - CERROR ("Can't route\n"); - return -EIO; - } - - switch(type) { - default: - LBUG(); - - case LNET_MSG_ACK: - LASSERT (nob == 0); - break; - - case LNET_MSG_GET: - LASSERT (niov == 0); - LASSERT (nob == 0); - /* We have to consider the eventual sink buffer rather than any - * payload passed here (there isn't any, and strictly, looking - * inside lntmsg is a layering violation). We send a simple - * IMMEDIATE GET if the sink buffer is mapped already and small - * enough for FMA */ - - if (routing || target_is_router) - break; /* send IMMEDIATE */ - - if ((lntmsg->msg_md->md_options & LNET_MD_KIOV) == 0 && - lntmsg->msg_md->md_length <= RANAL_FMA_MAX_DATA && - lntmsg->msg_md->md_length <= *kranal_tunables.kra_max_immediate) - break; /* send IMMEDIATE */ - - tx = kranal_new_tx_msg(RANAL_MSG_GET_REQ); - if (tx == NULL) - return -ENOMEM; - - if ((lntmsg->msg_md->md_options & LNET_MD_KIOV) == 0) - rc = kranal_setup_virt_buffer(tx, lntmsg->msg_md->md_niov, - lntmsg->msg_md->md_iov.iov, - 0, lntmsg->msg_md->md_length); - else - rc = kranal_setup_phys_buffer(tx, lntmsg->msg_md->md_niov, - lntmsg->msg_md->md_iov.kiov, - 0, lntmsg->msg_md->md_length); - if (rc != 0) { - kranal_tx_done(tx, rc); - return -EIO; - } - - tx->tx_lntmsg[1] = lnet_create_reply_msg(ni, lntmsg); - if (tx->tx_lntmsg[1] == NULL) { - CERROR("Can't create reply for GET to %s\n", - libcfs_nid2str(target.nid)); - kranal_tx_done(tx, rc); - return -EIO; - } - - tx->tx_lntmsg[0] = lntmsg; - tx->tx_msg.ram_u.get.ragm_hdr = *hdr; - /* rest of tx_msg is setup just before it is sent */ - kranal_launch_tx(tx, target.nid); - return 0; - - case LNET_MSG_REPLY: - case LNET_MSG_PUT: - if (kiov == NULL && /* not paged */ - nob <= RANAL_FMA_MAX_DATA && /* small enough */ - nob <= *kranal_tunables.kra_max_immediate) - break; /* send IMMEDIATE */ - - tx = kranal_new_tx_msg(RANAL_MSG_PUT_REQ); - if (tx == NULL) - return -ENOMEM; - - rc = kranal_setup_rdma_buffer(tx, niov, iov, kiov, offset, nob); - if (rc != 0) { - kranal_tx_done(tx, rc); - return -EIO; - } - - tx->tx_lntmsg[0] = lntmsg; - tx->tx_msg.ram_u.putreq.raprm_hdr = *hdr; - /* rest of tx_msg is setup just before it is sent */ - kranal_launch_tx(tx, target.nid); - return 0; - } - - /* send IMMEDIATE */ - - LASSERT (kiov == NULL); - LASSERT (nob <= RANAL_FMA_MAX_DATA); - - tx = kranal_new_tx_msg(RANAL_MSG_IMMEDIATE); - if (tx == NULL) - return -ENOMEM; - - rc = kranal_setup_immediate_buffer(tx, niov, iov, offset, nob); - if (rc != 0) { - kranal_tx_done(tx, rc); - return -EIO; - } - - tx->tx_msg.ram_u.immediate.raim_hdr = *hdr; - tx->tx_lntmsg[0] = lntmsg; - kranal_launch_tx(tx, target.nid); - return 0; -} - -void -kranal_reply(lnet_ni_t *ni, kra_conn_t *conn, lnet_msg_t *lntmsg) -{ - kra_msg_t *rxmsg = conn->rac_rxmsg; - unsigned int niov = lntmsg->msg_niov; - struct iovec *iov = lntmsg->msg_iov; - lnet_kiov_t *kiov = lntmsg->msg_kiov; - unsigned int offset = lntmsg->msg_offset; - unsigned int nob = lntmsg->msg_len; - kra_tx_t *tx; - int rc; - - tx = kranal_get_idle_tx(); - if (tx == NULL) - goto failed_0; - - rc = kranal_setup_rdma_buffer(tx, niov, iov, kiov, offset, nob); - if (rc != 0) - goto failed_1; - - tx->tx_conn = conn; - - rc = kranal_map_buffer(tx); - if (rc != 0) - goto failed_1; - - tx->tx_lntmsg[0] = lntmsg; - - kranal_rdma(tx, RANAL_MSG_GET_DONE, - &rxmsg->ram_u.get.ragm_desc, nob, - rxmsg->ram_u.get.ragm_cookie); - return; - - failed_1: - kranal_tx_done(tx, -EIO); - failed_0: - lnet_finalize(ni, lntmsg, -EIO); -} - -int -kranal_eager_recv (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, - void **new_private) -{ - kra_conn_t *conn = (kra_conn_t *)private; - - LCONSOLE_ERROR_MSG(0x12b, "Dropping message from %s: no buffers free.\n", - libcfs_nid2str(conn->rac_peer->rap_nid)); - - return -EDEADLK; -} - -int -kranal_recv (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, - int delayed, unsigned int niov, - struct iovec *iov, lnet_kiov_t *kiov, - unsigned int offset, unsigned int mlen, unsigned int rlen) -{ - kra_conn_t *conn = private; - kra_msg_t *rxmsg = conn->rac_rxmsg; - kra_tx_t *tx; - void *buffer; - int rc; - - LASSERT (mlen <= rlen); - LASSERT (!in_interrupt()); - /* Either all pages or all vaddrs */ - LASSERT (!(kiov != NULL && iov != NULL)); - - CDEBUG(D_NET, "conn %p, rxmsg %p, lntmsg %p\n", conn, rxmsg, lntmsg); - - switch(rxmsg->ram_type) { - default: - LBUG(); - - case RANAL_MSG_IMMEDIATE: - if (mlen == 0) { - buffer = NULL; - } else if (kiov != NULL) { - CERROR("Can't recv immediate into paged buffer\n"); - return -EIO; - } else { - LASSERT (niov > 0); - while (offset >= iov->iov_len) { - offset -= iov->iov_len; - iov++; - niov--; - LASSERT (niov > 0); - } - if (mlen > iov->iov_len - offset) { - CERROR("Can't handle immediate frags\n"); - return -EIO; - } - buffer = ((char *)iov->iov_base) + offset; - } - rc = kranal_consume_rxmsg(conn, buffer, mlen); - lnet_finalize(ni, lntmsg, (rc == 0) ? 0 : -EIO); - return 0; - - case RANAL_MSG_PUT_REQ: - tx = kranal_new_tx_msg(RANAL_MSG_PUT_ACK); - if (tx == NULL) { - kranal_consume_rxmsg(conn, NULL, 0); - return -ENOMEM; - } - - rc = kranal_setup_rdma_buffer(tx, niov, iov, kiov, offset, mlen); - if (rc != 0) { - kranal_tx_done(tx, rc); - kranal_consume_rxmsg(conn, NULL, 0); - return -EIO; - } - - tx->tx_conn = conn; - rc = kranal_map_buffer(tx); - if (rc != 0) { - kranal_tx_done(tx, rc); - kranal_consume_rxmsg(conn, NULL, 0); - return -EIO; - } - - tx->tx_msg.ram_u.putack.rapam_src_cookie = - conn->rac_rxmsg->ram_u.putreq.raprm_cookie; - tx->tx_msg.ram_u.putack.rapam_dst_cookie = tx->tx_cookie; - tx->tx_msg.ram_u.putack.rapam_desc.rard_key = tx->tx_map_key; - tx->tx_msg.ram_u.putack.rapam_desc.rard_addr.AddressBits = - (__u64)((unsigned long)tx->tx_buffer); - tx->tx_msg.ram_u.putack.rapam_desc.rard_nob = mlen; - - tx->tx_lntmsg[0] = lntmsg; /* finalize this on RDMA_DONE */ - - kranal_post_fma(conn, tx); - kranal_consume_rxmsg(conn, NULL, 0); - return 0; - - case RANAL_MSG_GET_REQ: - if (lntmsg != NULL) { - /* Matched! */ - kranal_reply(ni, conn, lntmsg); - } else { - /* No match */ - tx = kranal_new_tx_msg(RANAL_MSG_GET_NAK); - if (tx != NULL) { - tx->tx_msg.ram_u.completion.racm_cookie = - rxmsg->ram_u.get.ragm_cookie; - kranal_post_fma(conn, tx); - } - } - kranal_consume_rxmsg(conn, NULL, 0); - return 0; - } -} - -int -kranal_thread_start(int(*fn)(void *arg), void *arg, char *name) -{ - struct task_struct *task = cfs_thread_run(fn, arg, name); - - if (!IS_ERR(task)) - atomic_inc(&kranal_data.kra_nthreads); - return PTR_ERR(task); -} - -void -kranal_thread_fini (void) -{ - atomic_dec(&kranal_data.kra_nthreads); -} - -int -kranal_check_conn_timeouts (kra_conn_t *conn) -{ - kra_tx_t *tx; - cfs_list_t *ttmp; - unsigned long flags; - long timeout; - unsigned long now = jiffies; - - LASSERT (conn->rac_state == RANAL_CONN_ESTABLISHED || - conn->rac_state == RANAL_CONN_CLOSING); - - if (!conn->rac_close_sent && - cfs_time_aftereq(now, conn->rac_last_tx + - msecs_to_jiffies(conn->rac_keepalive * - MSEC_PER_SEC))) { - /* not sent in a while; schedule conn so scheduler sends a keepalive */ - CDEBUG(D_NET, "Scheduling keepalive %p->%s\n", - conn, libcfs_nid2str(conn->rac_peer->rap_nid)); - kranal_schedule_conn(conn); - } - - timeout = msecs_to_jiffies(conn->rac_timeout * MSEC_PER_SEC); - - if (!conn->rac_close_recvd && - cfs_time_aftereq(now, conn->rac_last_rx + timeout)) { - CERROR("%s received from %s within %lu seconds\n", - (conn->rac_state == RANAL_CONN_ESTABLISHED) ? - "Nothing" : "CLOSE not", - libcfs_nid2str(conn->rac_peer->rap_nid), - jiffies_to_msecs(now - conn->rac_last_rx)/MSEC_PER_SEC); - return -ETIMEDOUT; - } - - if (conn->rac_state != RANAL_CONN_ESTABLISHED) - return 0; - - /* Check the conn's queues are moving. These are "belt+braces" checks, - * in case of hardware/software errors that make this conn seem - * responsive even though it isn't progressing its message queues. */ - - spin_lock_irqsave(&conn->rac_lock, flags); - - cfs_list_for_each (ttmp, &conn->rac_fmaq) { - tx = cfs_list_entry(ttmp, kra_tx_t, tx_list); - - if (cfs_time_aftereq(now, tx->tx_qtime + timeout)) { - spin_unlock_irqrestore(&conn->rac_lock, flags); - CERROR("tx on fmaq for %s blocked %lu seconds\n", - libcfs_nid2str(conn->rac_peer->rap_nid), - jiffies_to_msecs(now-tx->tx_qtime)/MSEC_PER_SEC); - return -ETIMEDOUT; - } - } - - cfs_list_for_each (ttmp, &conn->rac_rdmaq) { - tx = cfs_list_entry(ttmp, kra_tx_t, tx_list); - - if (cfs_time_aftereq(now, tx->tx_qtime + timeout)) { - spin_unlock_irqrestore(&conn->rac_lock, flags); - CERROR("tx on rdmaq for %s blocked %lu seconds\n", - libcfs_nid2str(conn->rac_peer->rap_nid), - jiffies_to_msecs(now-tx->tx_qtime)/MSEC_PER_SEC); - return -ETIMEDOUT; - } - } - - cfs_list_for_each (ttmp, &conn->rac_replyq) { - tx = cfs_list_entry(ttmp, kra_tx_t, tx_list); - - if (cfs_time_aftereq(now, tx->tx_qtime + timeout)) { - spin_unlock_irqrestore(&conn->rac_lock, flags); - CERROR("tx on replyq for %s blocked %lu seconds\n", - libcfs_nid2str(conn->rac_peer->rap_nid), - jiffies_to_msecs(now-tx->tx_qtime)/MSEC_PER_SEC); - return -ETIMEDOUT; - } - } - - spin_unlock_irqrestore(&conn->rac_lock, flags); - return 0; -} - -void -kranal_reaper_check (int idx, unsigned long *min_timeoutp) -{ - cfs_list_t *conns = &kranal_data.kra_conns[idx]; - cfs_list_t *ctmp; - kra_conn_t *conn; - unsigned long flags; - int rc; - - again: - /* NB. We expect to check all the conns and not find any problems, so - * we just use a shared lock while we take a look... */ - read_lock(&kranal_data.kra_global_lock); - - cfs_list_for_each (ctmp, conns) { - conn = cfs_list_entry(ctmp, kra_conn_t, rac_hashlist); - - if (conn->rac_timeout < *min_timeoutp ) - *min_timeoutp = conn->rac_timeout; - if (conn->rac_keepalive < *min_timeoutp ) - *min_timeoutp = conn->rac_keepalive; - - rc = kranal_check_conn_timeouts(conn); - if (rc == 0) - continue; - - kranal_conn_addref(conn); - read_unlock(&kranal_data.kra_global_lock); - - CERROR("Conn to %s, cqid %d timed out\n", - libcfs_nid2str(conn->rac_peer->rap_nid), - conn->rac_cqid); - - write_lock_irqsave(&kranal_data.kra_global_lock, flags); - - switch (conn->rac_state) { - default: - LBUG(); - - case RANAL_CONN_ESTABLISHED: - kranal_close_conn_locked(conn, -ETIMEDOUT); - break; - - case RANAL_CONN_CLOSING: - kranal_terminate_conn_locked(conn); - break; - } - - write_unlock_irqrestore(&kranal_data.kra_global_lock, - flags); - - kranal_conn_decref(conn); - - /* start again now I've dropped the lock */ - goto again; - } - - read_unlock(&kranal_data.kra_global_lock); -} - -int -kranal_connd (void *arg) -{ - long id = (long)arg; - wait_queue_t wait; - unsigned long flags; - kra_peer_t *peer; - kra_acceptsock_t *ras; - int did_something; - - cfs_block_allsigs(); - - init_waitqueue_entry_current(&wait); - - spin_lock_irqsave(&kranal_data.kra_connd_lock, flags); - - while (!kranal_data.kra_shutdown) { - did_something = 0; - - if (!cfs_list_empty(&kranal_data.kra_connd_acceptq)) { - ras = cfs_list_entry(kranal_data.kra_connd_acceptq.next, - kra_acceptsock_t, ras_list); - cfs_list_del(&ras->ras_list); - - spin_unlock_irqrestore(&kranal_data.kra_connd_lock, - flags); - - CDEBUG(D_NET,"About to handshake someone\n"); - - kranal_conn_handshake(ras->ras_sock, NULL); - kranal_free_acceptsock(ras); - - CDEBUG(D_NET,"Finished handshaking someone\n"); - - spin_lock_irqsave(&kranal_data.kra_connd_lock, - flags); - did_something = 1; - } - - if (!cfs_list_empty(&kranal_data.kra_connd_peers)) { - peer = cfs_list_entry(kranal_data.kra_connd_peers.next, - kra_peer_t, rap_connd_list); - - cfs_list_del_init(&peer->rap_connd_list); - spin_unlock_irqrestore(&kranal_data.kra_connd_lock, - flags); - - kranal_connect(peer); - kranal_peer_decref(peer); - - spin_lock_irqsave(&kranal_data.kra_connd_lock, - flags); - did_something = 1; - } - - if (did_something) - continue; - - set_current_state(TASK_INTERRUPTIBLE); - add_wait_queue_exclusive(&kranal_data.kra_connd_waitq, &wait); - - spin_unlock_irqrestore(&kranal_data.kra_connd_lock, flags); - - waitq_wait(&wait, TASK_INTERRUPTIBLE); - - set_current_state(TASK_RUNNING); - remove_wait_queue(&kranal_data.kra_connd_waitq, &wait); - - spin_lock_irqsave(&kranal_data.kra_connd_lock, flags); - } - - spin_unlock_irqrestore(&kranal_data.kra_connd_lock, flags); - - kranal_thread_fini(); - return 0; -} - -void -kranal_update_reaper_timeout(long timeout) -{ - unsigned long flags; - - LASSERT (timeout > 0); - - spin_lock_irqsave(&kranal_data.kra_reaper_lock, flags); - - if (timeout < kranal_data.kra_new_min_timeout) - kranal_data.kra_new_min_timeout = timeout; - - spin_unlock_irqrestore(&kranal_data.kra_reaper_lock, flags); -} - -int -kranal_reaper (void *arg) -{ - wait_queue_t wait; - unsigned long flags; - long timeout; - int i; - int conn_entries = kranal_data.kra_conn_hash_size; - int conn_index = 0; - int base_index = conn_entries - 1; - unsigned long next_check_time = jiffies; - long next_min_timeout = MAX_SCHEDULE_TIMEOUT; - long current_min_timeout = 1; - - cfs_block_allsigs(); - - init_waitqueue_entry_current(&wait); - - spin_lock_irqsave(&kranal_data.kra_reaper_lock, flags); - - while (!kranal_data.kra_shutdown) { - /* I wake up every 'p' seconds to check for timeouts on some - * more peers. I try to check every connection 'n' times - * within the global minimum of all keepalive and timeout - * intervals, to ensure I attend to every connection within - * (n+1)/n times its timeout intervals. */ - const int p = 1; - const int n = 3; - unsigned long min_timeout; - int chunk; - - /* careful with the jiffy wrap... */ - timeout = (long)(next_check_time - jiffies); - if (timeout > 0) { - set_current_state(TASK_INTERRUPTIBLE); - add_wait_queue(&kranal_data.kra_reaper_waitq, &wait); - - spin_unlock_irqrestore(&kranal_data.kra_reaper_lock, - flags); - - waitq_timedwait(&wait, TASK_INTERRUPTIBLE, - timeout); - - spin_lock_irqsave(&kranal_data.kra_reaper_lock, - flags); - - set_current_state(TASK_RUNNING); - remove_wait_queue(&kranal_data.kra_reaper_waitq, &wait); - continue; - } - - if (kranal_data.kra_new_min_timeout != - MAX_SCHEDULE_TIMEOUT) { - /* new min timeout set: restart min timeout scan */ - next_min_timeout = MAX_SCHEDULE_TIMEOUT; - base_index = conn_index - 1; - if (base_index < 0) - base_index = conn_entries - 1; - - if (kranal_data.kra_new_min_timeout < - current_min_timeout) { - current_min_timeout = - kranal_data.kra_new_min_timeout; - CDEBUG(D_NET, "Set new min timeout %ld\n", - current_min_timeout); - } - - kranal_data.kra_new_min_timeout = - MAX_SCHEDULE_TIMEOUT; - } - min_timeout = current_min_timeout; - - spin_unlock_irqrestore(&kranal_data.kra_reaper_lock, flags); - - LASSERT (min_timeout > 0); - - /* Compute how many table entries to check now so I get round - * the whole table fast enough given that I do this at fixed - * intervals of 'p' seconds) */ - chunk = conn_entries; - if (min_timeout > n * p) - chunk = (chunk * n * p) / min_timeout; - if (chunk == 0) - chunk = 1; - - for (i = 0; i < chunk; i++) { - kranal_reaper_check(conn_index, - &next_min_timeout); - conn_index = (conn_index + 1) % conn_entries; - } - - next_check_time += msecs_to_jiffies(p * MSEC_PER_SEC); - - spin_lock_irqsave(&kranal_data.kra_reaper_lock, flags); - - if (((conn_index - chunk <= base_index && - base_index < conn_index) || - (conn_index - conn_entries - chunk <= base_index && - base_index < conn_index - conn_entries))) { - - /* Scanned all conns: set current_min_timeout... */ - if (current_min_timeout != next_min_timeout) { - current_min_timeout = next_min_timeout; - CDEBUG(D_NET, "Set new min timeout %ld\n", - current_min_timeout); - } - - /* ...and restart min timeout scan */ - next_min_timeout = MAX_SCHEDULE_TIMEOUT; - base_index = conn_index - 1; - if (base_index < 0) - base_index = conn_entries - 1; - } - } - - kranal_thread_fini(); - return 0; -} - -void -kranal_check_rdma_cq (kra_device_t *dev) -{ - kra_conn_t *conn; - kra_tx_t *tx; - RAP_RETURN rrc; - unsigned long flags; - RAP_RDMA_DESCRIPTOR *desc; - __u32 cqid; - __u32 event_type; - - for (;;) { - rrc = RapkCQDone(dev->rad_rdma_cqh, &cqid, &event_type); - if (rrc == RAP_NOT_DONE) { - CDEBUG(D_NET, "RDMA CQ %d empty\n", dev->rad_id); - return; - } - - LASSERT (rrc == RAP_SUCCESS); - LASSERT ((event_type & RAPK_CQ_EVENT_OVERRUN) == 0); - - read_lock(&kranal_data.kra_global_lock); - - conn = kranal_cqid2conn_locked(cqid); - if (conn == NULL) { - /* Conn was destroyed? */ - CDEBUG(D_NET, "RDMA CQID lookup %d failed\n", cqid); - read_unlock(&kranal_data.kra_global_lock); - continue; - } - - rrc = RapkRdmaDone(conn->rac_rihandle, &desc); - LASSERT (rrc == RAP_SUCCESS); - - CDEBUG(D_NET, "Completed %p\n", - cfs_list_entry(conn->rac_rdmaq.next, kra_tx_t, tx_list)); - - spin_lock_irqsave(&conn->rac_lock, flags); - - LASSERT (!cfs_list_empty(&conn->rac_rdmaq)); - tx = cfs_list_entry(conn->rac_rdmaq.next, kra_tx_t, tx_list); - cfs_list_del(&tx->tx_list); - - LASSERT(desc->AppPtr == (void *)tx); - LASSERT(tx->tx_msg.ram_type == RANAL_MSG_PUT_DONE || - tx->tx_msg.ram_type == RANAL_MSG_GET_DONE); - - cfs_list_add_tail(&tx->tx_list, &conn->rac_fmaq); - tx->tx_qtime = jiffies; - - spin_unlock_irqrestore(&conn->rac_lock, flags); - - /* Get conn's fmaq processed, now I've just put something - * there */ - kranal_schedule_conn(conn); - - read_unlock(&kranal_data.kra_global_lock); - } -} - -void -kranal_check_fma_cq (kra_device_t *dev) -{ - kra_conn_t *conn; - RAP_RETURN rrc; - __u32 cqid; - __u32 event_type; - cfs_list_t *conns; - cfs_list_t *tmp; - int i; - - for (;;) { - rrc = RapkCQDone(dev->rad_fma_cqh, &cqid, &event_type); - if (rrc == RAP_NOT_DONE) { - CDEBUG(D_NET, "FMA CQ %d empty\n", dev->rad_id); - return; - } - - LASSERT (rrc == RAP_SUCCESS); - - if ((event_type & RAPK_CQ_EVENT_OVERRUN) == 0) { - - read_lock(&kranal_data.kra_global_lock); - - conn = kranal_cqid2conn_locked(cqid); - if (conn == NULL) { - CDEBUG(D_NET, "FMA CQID lookup %d failed\n", - cqid); - } else { - CDEBUG(D_NET, "FMA completed: %p CQID %d\n", - conn, cqid); - kranal_schedule_conn(conn); - } - - read_unlock(&kranal_data.kra_global_lock); - continue; - } - - /* FMA CQ has overflowed: check ALL conns */ - CWARN("FMA CQ overflow: scheduling ALL conns on device %d\n", - dev->rad_id); - - for (i = 0; i < kranal_data.kra_conn_hash_size; i++) { - - read_lock(&kranal_data.kra_global_lock); - - conns = &kranal_data.kra_conns[i]; - - cfs_list_for_each (tmp, conns) { - conn = cfs_list_entry(tmp, kra_conn_t, - rac_hashlist); - - if (conn->rac_device == dev) - kranal_schedule_conn(conn); - } - - /* don't block write lockers for too long... */ - read_unlock(&kranal_data.kra_global_lock); - } - } -} - -int -kranal_sendmsg(kra_conn_t *conn, kra_msg_t *msg, - void *immediate, int immediatenob) -{ - int sync = (msg->ram_type & RANAL_MSG_FENCE) != 0; - RAP_RETURN rrc; - - CDEBUG(D_NET,"%p sending msg %p %02x%s [%p for %d]\n", - conn, msg, msg->ram_type, sync ? "(sync)" : "", - immediate, immediatenob); - - LASSERT (sizeof(*msg) <= RANAL_FMA_MAX_PREFIX); - LASSERT ((msg->ram_type == RANAL_MSG_IMMEDIATE) ? - immediatenob <= RANAL_FMA_MAX_DATA : - immediatenob == 0); - - msg->ram_connstamp = conn->rac_my_connstamp; - msg->ram_seq = conn->rac_tx_seq; - - if (sync) - rrc = RapkFmaSyncSend(conn->rac_rihandle, - immediate, immediatenob, - msg, sizeof(*msg)); - else - rrc = RapkFmaSend(conn->rac_rihandle, - immediate, immediatenob, - msg, sizeof(*msg)); - - switch (rrc) { - default: - LBUG(); - - case RAP_SUCCESS: - conn->rac_last_tx = jiffies; - conn->rac_tx_seq++; - return 0; - - case RAP_NOT_DONE: - if (cfs_time_aftereq(jiffies, - conn->rac_last_tx + - msecs_to_jiffies(conn->rac_keepalive * - MSEC_PER_SEC))) - CWARN("EAGAIN sending %02x (idle %lu secs)\n", - msg->ram_type, - jiffies_to_msecs(jiffies - conn->rac_last_tx) / - MSEC_PER_SEC); - return -EAGAIN; - } -} - -void -kranal_process_fmaq (kra_conn_t *conn) -{ - unsigned long flags; - int more_to_do; - kra_tx_t *tx; - int rc; - int expect_reply; - - /* NB 1. kranal_sendmsg() may fail if I'm out of credits right now. - * However I will be rescheduled by an FMA completion event - * when I eventually get some. - * NB 2. Sampling rac_state here races with setting it elsewhere. - * But it doesn't matter if I try to send a "real" message just - * as I start closing because I'll get scheduled to send the - * close anyway. */ - - /* Not racing with incoming message processing! */ - LASSERT (current == conn->rac_device->rad_scheduler); - - if (conn->rac_state != RANAL_CONN_ESTABLISHED) { - if (!cfs_list_empty(&conn->rac_rdmaq)) { - /* RDMAs in progress */ - LASSERT (!conn->rac_close_sent); - - if (cfs_time_aftereq(jiffies, - conn->rac_last_tx + - msecs_to_jiffies(conn->rac_keepalive * - MSEC_PER_SEC))) { - CDEBUG(D_NET, "sending NOOP (rdma in progress)\n"); - kranal_init_msg(&conn->rac_msg, RANAL_MSG_NOOP); - kranal_sendmsg(conn, &conn->rac_msg, NULL, 0); - } - return; - } - - if (conn->rac_close_sent) - return; - - CWARN("sending CLOSE to %s\n", - libcfs_nid2str(conn->rac_peer->rap_nid)); - kranal_init_msg(&conn->rac_msg, RANAL_MSG_CLOSE); - rc = kranal_sendmsg(conn, &conn->rac_msg, NULL, 0); - if (rc != 0) - return; - - conn->rac_close_sent = 1; - if (!conn->rac_close_recvd) - return; - - write_lock_irqsave(&kranal_data.kra_global_lock, flags); - - if (conn->rac_state == RANAL_CONN_CLOSING) - kranal_terminate_conn_locked(conn); - - write_unlock_irqrestore(&kranal_data.kra_global_lock, - flags); - return; - } - - spin_lock_irqsave(&conn->rac_lock, flags); - - if (cfs_list_empty(&conn->rac_fmaq)) { - - spin_unlock_irqrestore(&conn->rac_lock, flags); - - if (cfs_time_aftereq(jiffies, - conn->rac_last_tx + - msecs_to_jiffies(conn->rac_keepalive * - MSEC_PER_SEC))) { - CDEBUG(D_NET, "sending NOOP -> %s (%p idle %lu(%ld))\n", - libcfs_nid2str(conn->rac_peer->rap_nid), conn, - jiffies_to_msecs(jiffies - conn->rac_last_tx) / - MSEC_PER_SEC, - conn->rac_keepalive); - kranal_init_msg(&conn->rac_msg, RANAL_MSG_NOOP); - kranal_sendmsg(conn, &conn->rac_msg, NULL, 0); - } - return; - } - - tx = cfs_list_entry(conn->rac_fmaq.next, kra_tx_t, tx_list); - cfs_list_del(&tx->tx_list); - more_to_do = !cfs_list_empty(&conn->rac_fmaq); - - spin_unlock_irqrestore(&conn->rac_lock, flags); - - expect_reply = 0; - CDEBUG(D_NET, "sending regular msg: %p, type %02x, cookie "LPX64"\n", - tx, tx->tx_msg.ram_type, tx->tx_cookie); - switch (tx->tx_msg.ram_type) { - default: - LBUG(); - - case RANAL_MSG_IMMEDIATE: - rc = kranal_sendmsg(conn, &tx->tx_msg, - tx->tx_buffer, tx->tx_nob); - break; - - case RANAL_MSG_PUT_NAK: - case RANAL_MSG_PUT_DONE: - case RANAL_MSG_GET_NAK: - case RANAL_MSG_GET_DONE: - rc = kranal_sendmsg(conn, &tx->tx_msg, NULL, 0); - break; - - case RANAL_MSG_PUT_REQ: - rc = kranal_map_buffer(tx); - LASSERT (rc != -EAGAIN); - if (rc != 0) - break; - - tx->tx_msg.ram_u.putreq.raprm_cookie = tx->tx_cookie; - rc = kranal_sendmsg(conn, &tx->tx_msg, NULL, 0); - expect_reply = 1; - break; - - case RANAL_MSG_PUT_ACK: - rc = kranal_sendmsg(conn, &tx->tx_msg, NULL, 0); - expect_reply = 1; - break; - - case RANAL_MSG_GET_REQ: - rc = kranal_map_buffer(tx); - LASSERT (rc != -EAGAIN); - if (rc != 0) - break; - - tx->tx_msg.ram_u.get.ragm_cookie = tx->tx_cookie; - tx->tx_msg.ram_u.get.ragm_desc.rard_key = tx->tx_map_key; - tx->tx_msg.ram_u.get.ragm_desc.rard_addr.AddressBits = - (__u64)((unsigned long)tx->tx_buffer); - tx->tx_msg.ram_u.get.ragm_desc.rard_nob = tx->tx_nob; - rc = kranal_sendmsg(conn, &tx->tx_msg, NULL, 0); - expect_reply = 1; - break; - } - - if (rc == -EAGAIN) { - /* I need credits to send this. Replace tx at the head of the - * fmaq and I'll get rescheduled when credits appear */ - CDEBUG(D_NET, "EAGAIN on %p\n", conn); - spin_lock_irqsave(&conn->rac_lock, flags); - cfs_list_add(&tx->tx_list, &conn->rac_fmaq); - spin_unlock_irqrestore(&conn->rac_lock, flags); - return; - } - - if (!expect_reply || rc != 0) { - kranal_tx_done(tx, rc); - } else { - /* LASSERT(current) above ensures this doesn't race with reply - * processing */ - spin_lock_irqsave(&conn->rac_lock, flags); - cfs_list_add_tail(&tx->tx_list, &conn->rac_replyq); - tx->tx_qtime = jiffies; - spin_unlock_irqrestore(&conn->rac_lock, flags); - } - - if (more_to_do) { - CDEBUG(D_NET, "Rescheduling %p (more to do)\n", conn); - kranal_schedule_conn(conn); - } -} - -static inline void -kranal_swab_rdma_desc (kra_rdma_desc_t *d) -{ - __swab64s(&d->rard_key.Key); - __swab16s(&d->rard_key.Cookie); - __swab16s(&d->rard_key.MdHandle); - __swab32s(&d->rard_key.Flags); - __swab64s(&d->rard_addr.AddressBits); - __swab32s(&d->rard_nob); -} - -kra_tx_t * -kranal_match_reply(kra_conn_t *conn, int type, __u64 cookie) -{ - cfs_list_t *ttmp; - kra_tx_t *tx; - unsigned long flags; - - spin_lock_irqsave(&conn->rac_lock, flags); - - cfs_list_for_each(ttmp, &conn->rac_replyq) { - tx = cfs_list_entry(ttmp, kra_tx_t, tx_list); - - CDEBUG(D_NET,"Checking %p %02x/"LPX64"\n", - tx, tx->tx_msg.ram_type, tx->tx_cookie); - - if (tx->tx_cookie != cookie) - continue; - - if (tx->tx_msg.ram_type != type) { - spin_unlock_irqrestore(&conn->rac_lock, flags); - CWARN("Unexpected type %x (%x expected) " - "matched reply from %s\n", - tx->tx_msg.ram_type, type, - libcfs_nid2str(conn->rac_peer->rap_nid)); - return NULL; - } - - cfs_list_del(&tx->tx_list); - spin_unlock_irqrestore(&conn->rac_lock, flags); - return tx; - } - - spin_unlock_irqrestore(&conn->rac_lock, flags); - CWARN("Unmatched reply %02x/"LPX64" from %s\n", - type, cookie, libcfs_nid2str(conn->rac_peer->rap_nid)); - return NULL; -} - -void -kranal_check_fma_rx (kra_conn_t *conn) -{ - unsigned long flags; - __u32 seq; - kra_tx_t *tx; - kra_msg_t *msg; - void *prefix; - RAP_RETURN rrc = RapkFmaGetPrefix(conn->rac_rihandle, &prefix); - kra_peer_t *peer = conn->rac_peer; - int rc = 0; - int repost = 1; - - if (rrc == RAP_NOT_DONE) - return; - - CDEBUG(D_NET, "RX on %p\n", conn); - - LASSERT (rrc == RAP_SUCCESS); - conn->rac_last_rx = jiffies; - seq = conn->rac_rx_seq++; - msg = (kra_msg_t *)prefix; - - /* stash message for portals callbacks they'll NULL - * rac_rxmsg if they consume it */ - LASSERT (conn->rac_rxmsg == NULL); - conn->rac_rxmsg = msg; - - if (msg->ram_magic != RANAL_MSG_MAGIC) { - if (__swab32(msg->ram_magic) != RANAL_MSG_MAGIC) { - CERROR("Unexpected magic %08x from %s\n", - msg->ram_magic, libcfs_nid2str(peer->rap_nid)); - rc = -EPROTO; - goto out; - } - - __swab32s(&msg->ram_magic); - __swab16s(&msg->ram_version); - __swab16s(&msg->ram_type); - __swab64s(&msg->ram_srcnid); - __swab64s(&msg->ram_connstamp); - __swab32s(&msg->ram_seq); - - /* NB message type checked below; NOT here... */ - switch (msg->ram_type) { - case RANAL_MSG_PUT_ACK: - kranal_swab_rdma_desc(&msg->ram_u.putack.rapam_desc); - break; - - case RANAL_MSG_GET_REQ: - kranal_swab_rdma_desc(&msg->ram_u.get.ragm_desc); - break; - - default: - break; - } - } - - if (msg->ram_version != RANAL_MSG_VERSION) { - CERROR("Unexpected protocol version %d from %s\n", - msg->ram_version, libcfs_nid2str(peer->rap_nid)); - rc = -EPROTO; - goto out; - } - - if (msg->ram_srcnid != peer->rap_nid) { - CERROR("Unexpected peer %s from %s\n", - libcfs_nid2str(msg->ram_srcnid), - libcfs_nid2str(peer->rap_nid)); - rc = -EPROTO; - goto out; - } - - if (msg->ram_connstamp != conn->rac_peer_connstamp) { - CERROR("Unexpected connstamp "LPX64"("LPX64 - " expected) from %s\n", - msg->ram_connstamp, conn->rac_peer_connstamp, - libcfs_nid2str(peer->rap_nid)); - rc = -EPROTO; - goto out; - } - - if (msg->ram_seq != seq) { - CERROR("Unexpected sequence number %d(%d expected) from %s\n", - msg->ram_seq, seq, libcfs_nid2str(peer->rap_nid)); - rc = -EPROTO; - goto out; - } - - if ((msg->ram_type & RANAL_MSG_FENCE) != 0) { - /* This message signals RDMA completion... */ - rrc = RapkFmaSyncWait(conn->rac_rihandle); - if (rrc != RAP_SUCCESS) { - CERROR("RapkFmaSyncWait failed: %d\n", rrc); - rc = -ENETDOWN; - goto out; - } - } - - if (conn->rac_close_recvd) { - CERROR("Unexpected message %d after CLOSE from %s\n", - msg->ram_type, libcfs_nid2str(conn->rac_peer->rap_nid)); - rc = -EPROTO; - goto out; - } - - if (msg->ram_type == RANAL_MSG_CLOSE) { - CWARN("RX CLOSE from %s\n", libcfs_nid2str(conn->rac_peer->rap_nid)); - conn->rac_close_recvd = 1; - write_lock_irqsave(&kranal_data.kra_global_lock, flags); - - if (conn->rac_state == RANAL_CONN_ESTABLISHED) - kranal_close_conn_locked(conn, 0); - else if (conn->rac_state == RANAL_CONN_CLOSING && - conn->rac_close_sent) - kranal_terminate_conn_locked(conn); - - write_unlock_irqrestore(&kranal_data.kra_global_lock, - flags); - goto out; - } - - if (conn->rac_state != RANAL_CONN_ESTABLISHED) - goto out; - - switch (msg->ram_type) { - case RANAL_MSG_NOOP: - /* Nothing to do; just a keepalive */ - CDEBUG(D_NET, "RX NOOP on %p\n", conn); - break; - - case RANAL_MSG_IMMEDIATE: - CDEBUG(D_NET, "RX IMMEDIATE on %p\n", conn); - rc = lnet_parse(kranal_data.kra_ni, &msg->ram_u.immediate.raim_hdr, - msg->ram_srcnid, conn, 0); - repost = rc < 0; - break; - - case RANAL_MSG_PUT_REQ: - CDEBUG(D_NET, "RX PUT_REQ on %p\n", conn); - rc = lnet_parse(kranal_data.kra_ni, &msg->ram_u.putreq.raprm_hdr, - msg->ram_srcnid, conn, 1); - repost = rc < 0; - break; - - case RANAL_MSG_PUT_NAK: - CDEBUG(D_NET, "RX PUT_NAK on %p\n", conn); - tx = kranal_match_reply(conn, RANAL_MSG_PUT_REQ, - msg->ram_u.completion.racm_cookie); - if (tx == NULL) - break; - - LASSERT (tx->tx_buftype == RANAL_BUF_PHYS_MAPPED || - tx->tx_buftype == RANAL_BUF_VIRT_MAPPED); - kranal_tx_done(tx, -ENOENT); /* no match */ - break; - - case RANAL_MSG_PUT_ACK: - CDEBUG(D_NET, "RX PUT_ACK on %p\n", conn); - tx = kranal_match_reply(conn, RANAL_MSG_PUT_REQ, - msg->ram_u.putack.rapam_src_cookie); - if (tx == NULL) - break; - - kranal_rdma(tx, RANAL_MSG_PUT_DONE, - &msg->ram_u.putack.rapam_desc, - msg->ram_u.putack.rapam_desc.rard_nob, - msg->ram_u.putack.rapam_dst_cookie); - break; - - case RANAL_MSG_PUT_DONE: - CDEBUG(D_NET, "RX PUT_DONE on %p\n", conn); - tx = kranal_match_reply(conn, RANAL_MSG_PUT_ACK, - msg->ram_u.completion.racm_cookie); - if (tx == NULL) - break; - - LASSERT (tx->tx_buftype == RANAL_BUF_PHYS_MAPPED || - tx->tx_buftype == RANAL_BUF_VIRT_MAPPED); - kranal_tx_done(tx, 0); - break; - - case RANAL_MSG_GET_REQ: - CDEBUG(D_NET, "RX GET_REQ on %p\n", conn); - rc = lnet_parse(kranal_data.kra_ni, &msg->ram_u.get.ragm_hdr, - msg->ram_srcnid, conn, 1); - repost = rc < 0; - break; - - case RANAL_MSG_GET_NAK: - CDEBUG(D_NET, "RX GET_NAK on %p\n", conn); - tx = kranal_match_reply(conn, RANAL_MSG_GET_REQ, - msg->ram_u.completion.racm_cookie); - if (tx == NULL) - break; - - LASSERT (tx->tx_buftype == RANAL_BUF_PHYS_MAPPED || - tx->tx_buftype == RANAL_BUF_VIRT_MAPPED); - kranal_tx_done(tx, -ENOENT); /* no match */ - break; - - case RANAL_MSG_GET_DONE: - CDEBUG(D_NET, "RX GET_DONE on %p\n", conn); - tx = kranal_match_reply(conn, RANAL_MSG_GET_REQ, - msg->ram_u.completion.racm_cookie); - if (tx == NULL) - break; - - LASSERT (tx->tx_buftype == RANAL_BUF_PHYS_MAPPED || - tx->tx_buftype == RANAL_BUF_VIRT_MAPPED); -#if 0 - /* completion message should send rdma length if we ever allow - * GET truncation */ - lnet_set_reply_msg_len(kranal_data.kra_ni, tx->tx_lntmsg[1], ???); -#endif - kranal_tx_done(tx, 0); - break; - } - - out: - if (rc < 0) /* protocol/comms error */ - kranal_close_conn (conn, rc); - - if (repost && conn->rac_rxmsg != NULL) - kranal_consume_rxmsg(conn, NULL, 0); - - /* check again later */ - kranal_schedule_conn(conn); -} - -void -kranal_complete_closed_conn (kra_conn_t *conn) -{ - kra_tx_t *tx; - int nfma; - int nreplies; - - LASSERT (conn->rac_state == RANAL_CONN_CLOSED); - LASSERT (cfs_list_empty(&conn->rac_list)); - LASSERT (cfs_list_empty(&conn->rac_hashlist)); - - for (nfma = 0; !cfs_list_empty(&conn->rac_fmaq); nfma++) { - tx = cfs_list_entry(conn->rac_fmaq.next, kra_tx_t, tx_list); - - cfs_list_del(&tx->tx_list); - kranal_tx_done(tx, -ECONNABORTED); - } - - LASSERT (cfs_list_empty(&conn->rac_rdmaq)); - - for (nreplies = 0; !cfs_list_empty(&conn->rac_replyq); nreplies++) { - tx = cfs_list_entry(conn->rac_replyq.next, kra_tx_t, tx_list); - - cfs_list_del(&tx->tx_list); - kranal_tx_done(tx, -ECONNABORTED); - } - - CWARN("Closed conn %p -> %s: nmsg %d nreplies %d\n", - conn, libcfs_nid2str(conn->rac_peer->rap_nid), nfma, nreplies); -} - -int kranal_process_new_conn (kra_conn_t *conn) -{ - RAP_RETURN rrc; - - rrc = RapkCompleteSync(conn->rac_rihandle, 1); - if (rrc == RAP_SUCCESS) - return 0; - - LASSERT (rrc == RAP_NOT_DONE); - if (!cfs_time_aftereq(jiffies, conn->rac_last_tx + - msecs_to_jiffies(conn->rac_timeout*MSEC_PER_SEC))) - return -EAGAIN; - - /* Too late */ - rrc = RapkCompleteSync(conn->rac_rihandle, 0); - LASSERT (rrc == RAP_SUCCESS); - return -ETIMEDOUT; -} - -int -kranal_scheduler (void *arg) -{ - kra_device_t *dev = (kra_device_t *)arg; - wait_queue_t wait; - kra_conn_t *conn; - unsigned long flags; - unsigned long deadline; - unsigned long soonest; - int nsoonest; - long timeout; - cfs_list_t *tmp; - cfs_list_t *nxt; - int rc; - int dropped_lock; - int busy_loops = 0; - - cfs_block_allsigs(); - - dev->rad_scheduler = current; - init_waitqueue_entry_current(&wait); - - spin_lock_irqsave(&dev->rad_lock, flags); - - while (!kranal_data.kra_shutdown) { - /* Safe: kra_shutdown only set when quiescent */ - - if (busy_loops++ >= RANAL_RESCHED) { - spin_unlock_irqrestore(&dev->rad_lock, flags); - - cond_resched(); - busy_loops = 0; - - spin_lock_irqsave(&dev->rad_lock, flags); - } - - dropped_lock = 0; - - if (dev->rad_ready) { - /* Device callback fired since I last checked it */ - dev->rad_ready = 0; - spin_unlock_irqrestore(&dev->rad_lock, flags); - dropped_lock = 1; - - kranal_check_rdma_cq(dev); - kranal_check_fma_cq(dev); - - spin_lock_irqsave(&dev->rad_lock, flags); - } - - cfs_list_for_each_safe(tmp, nxt, &dev->rad_ready_conns) { - conn = cfs_list_entry(tmp, kra_conn_t, rac_schedlist); - - cfs_list_del_init(&conn->rac_schedlist); - LASSERT (conn->rac_scheduled); - conn->rac_scheduled = 0; - spin_unlock_irqrestore(&dev->rad_lock, flags); - dropped_lock = 1; - - kranal_check_fma_rx(conn); - kranal_process_fmaq(conn); - - if (conn->rac_state == RANAL_CONN_CLOSED) - kranal_complete_closed_conn(conn); - - kranal_conn_decref(conn); - spin_lock_irqsave(&dev->rad_lock, flags); - } - - nsoonest = 0; - soonest = jiffies; - - cfs_list_for_each_safe(tmp, nxt, &dev->rad_new_conns) { - conn = cfs_list_entry(tmp, kra_conn_t, rac_schedlist); - - deadline = conn->rac_last_tx + conn->rac_keepalive; - if (cfs_time_aftereq(jiffies, deadline)) { - /* Time to process this new conn */ - spin_unlock_irqrestore(&dev->rad_lock, - flags); - dropped_lock = 1; - - rc = kranal_process_new_conn(conn); - if (rc != -EAGAIN) { - /* All done with this conn */ - spin_lock_irqsave(&dev->rad_lock, - flags); - cfs_list_del_init(&conn->rac_schedlist); - spin_unlock_irqrestore(&dev-> \ - rad_lock, - flags); - - kranal_conn_decref(conn); - spin_lock_irqsave(&dev->rad_lock, - flags); - continue; - } - - /* retry with exponential backoff until HZ */ - if (conn->rac_keepalive == 0) - conn->rac_keepalive = 1; - else if (conn->rac_keepalive <= - msecs_to_jiffies(MSEC_PER_SEC)) - conn->rac_keepalive *= 2; - else - conn->rac_keepalive += - msecs_to_jiffies(MSEC_PER_SEC); - - deadline = conn->rac_last_tx + conn->rac_keepalive; - spin_lock_irqsave(&dev->rad_lock, flags); - } - - /* Does this conn need attention soonest? */ - if (nsoonest++ == 0 || - !cfs_time_aftereq(deadline, soonest)) - soonest = deadline; - } - - if (dropped_lock) /* may sleep iff I didn't drop the lock */ - continue; - - set_current_state(TASK_INTERRUPTIBLE); - add_wait_queue_exclusive(&dev->rad_waitq, &wait); - spin_unlock_irqrestore(&dev->rad_lock, flags); - - if (nsoonest == 0) { - busy_loops = 0; - waitq_wait(&wait, TASK_INTERRUPTIBLE); - } else { - timeout = (long)(soonest - jiffies); - if (timeout > 0) { - busy_loops = 0; - waitq_timedwait(&wait, - TASK_INTERRUPTIBLE, - timeout); - } - } - - remove_wait_queue(&dev->rad_waitq, &wait); - set_current_state(TASK_RUNNING); - spin_lock_irqsave(&dev->rad_lock, flags); - } - - spin_unlock_irqrestore(&dev->rad_lock, flags); - - dev->rad_scheduler = NULL; - kranal_thread_fini(); - return 0; -} diff --git a/lnet/klnds/ralnd/ralnd_modparams.c b/lnet/klnds/ralnd/ralnd_modparams.c deleted file mode 100644 index 2d5e64f..0000000 --- a/lnet/klnds/ralnd/ralnd_modparams.c +++ /dev/null @@ -1,210 +0,0 @@ -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf - * - * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, - * CA 95054 USA or visit www.sun.com if you need additional information or - * have any questions. - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lnet/klnds/ralnd/ralnd_modparams.c - * - * Author: Eric Barton - */ - -#include "ralnd.h" - -static int n_connd = 4; -CFS_MODULE_PARM(n_connd, "i", int, 0444, - "# of connection daemons"); - -static int min_reconnect_interval = 1; -CFS_MODULE_PARM(min_reconnect_interval, "i", int, 0644, - "minimum connection retry interval (seconds)"); - -static int max_reconnect_interval = 60; -CFS_MODULE_PARM(max_reconnect_interval, "i", int, 0644, - "maximum connection retry interval (seconds)"); - -static int ntx = 256; -CFS_MODULE_PARM(ntx, "i", int, 0444, - "# of transmit descriptors"); - -static int credits = 128; -CFS_MODULE_PARM(credits, "i", int, 0444, - "# concurrent sends"); - -static int peer_credits = 32; -CFS_MODULE_PARM(peer_credits, "i", int, 0444, - "# concurrent sends to 1 peer"); - -static int fma_cq_size = 8192; -CFS_MODULE_PARM(fma_cq_size, "i", int, 0444, - "size of the completion queue"); - -static int timeout = 30; -CFS_MODULE_PARM(timeout, "i", int, 0644, - "communications timeout (seconds)"); - -static int max_immediate = (2<<10); -CFS_MODULE_PARM(max_immediate, "i", int, 0644, - "immediate/RDMA breakpoint"); - -kra_tunables_t kranal_tunables = { - .kra_n_connd = &n_connd, - .kra_min_reconnect_interval = &min_reconnect_interval, - .kra_max_reconnect_interval = &max_reconnect_interval, - .kra_ntx = &ntx, - .kra_credits = &credits, - .kra_peercredits = &peer_credits, - .kra_fma_cq_size = &fma_cq_size, - .kra_timeout = &timeout, - .kra_max_immediate = &max_immediate, -}; - -#if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM - -static struct ctl_table kranal_ctl_table[] = { - { - INIT_CTL_NAME - .procname = "n_connd", - .data = &n_connd, - .maxlen = sizeof(int), - .mode = 0444, - .proc_handler = &proc_dointvec - }, - { - INIT_CTL_NAME - .procname = "min_reconnect_interval", - .data = &min_reconnect_interval, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec - }, - { - INIT_CTL_NAME - .procname = "max_reconnect_interval", - .data = &max_reconnect_interval, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec - }, - { - INIT_CTL_NAME - .procname = "ntx", - .data = &ntx, - .maxlen = sizeof(int), - .mode = 0444, - .proc_handler = &proc_dointvec - }, - { - INIT_CTL_NAME - .procname = "credits", - .data = &credits, - .maxlen = sizeof(int), - .mode = 0444, - .proc_handler = &proc_dointvec - }, - { - INIT_CTL_NAME - .procname = "peer_credits", - .data = &peer_credits, - .maxlen = sizeof(int), - .mode = 0444, - .proc_handler = &proc_dointvec - }, - { - INIT_CTL_NAME - .procname = "fma_cq_size", - .data = &fma_cq_size, - .maxlen = sizeof(int), - .mode = 0444, - .proc_handler = &proc_dointvec - }, - { - INIT_CTL_NAME - .procname = "timeout", - .data = &timeout, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec - }, - { - INIT_CTL_NAME - .procname = "max_immediate", - .data = &max_immediate, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec - }, - { 0 } -}; - -static struct ctl_table kranal_top_ctl_table[] = { - { - INIT_CTL_NAME - .procname = "ranal", - .data = NULL, - .maxlen = 0, - .mode = 0555, - .child = kranal_ctl_table - }, - { 0 } -}; - -int -kranal_tunables_init () -{ - kranal_tunables.kra_sysctl = - register_sysctl_table(kranal_top_ctl_table); - - if (kranal_tunables.kra_sysctl == NULL) - CWARN("Can't setup /proc tunables\n"); - - return 0; -} - -void kranal_tunables_fini() -{ - if (kranal_tunables.kra_sysctl != NULL) - unregister_sysctl_table(kranal_tunables.kra_sysctl); -} - -#else - -int -kranal_tunables_init () -{ - return 0; -} - -void -kranal_tunables_fini () -{ -} - -#endif diff --git a/lnet/lnet/config.c b/lnet/lnet/config.c index 4eb953b..a40f17b 100644 --- a/lnet/lnet/config.c +++ b/lnet/lnet/config.c @@ -1245,83 +1245,3 @@ lnet_parse_ip2nets (char **networksp, char *ip2nets) return 0; } - -int -lnet_set_ip_niaddr (lnet_ni_t *ni) -{ - __u32 net = LNET_NIDNET(ni->ni_nid); - char **names; - int n; - __u32 ip; - __u32 netmask; - int up; - int i; - int rc; - - /* Convenience for LNDs that use the IP address of a local interface as - * the local address part of their NID */ - - if (ni->ni_interfaces[0] != NULL) { - - CLASSERT (LNET_MAX_INTERFACES > 1); - - if (ni->ni_interfaces[1] != NULL) { - CERROR("Net %s doesn't support multiple interfaces\n", - libcfs_net2str(net)); - return -EPERM; - } - - rc = libcfs_ipif_query(ni->ni_interfaces[0], - &up, &ip, &netmask); - if (rc != 0) { - CERROR("Net %s can't query interface %s: %d\n", - libcfs_net2str(net), ni->ni_interfaces[0], rc); - return -EPERM; - } - - if (!up) { - CERROR("Net %s can't use interface %s: it's down\n", - libcfs_net2str(net), ni->ni_interfaces[0]); - return -ENETDOWN; - } - - ni->ni_nid = LNET_MKNID(net, ip); - return 0; - } - - n = libcfs_ipif_enumerate(&names); - if (n <= 0) { - CERROR("Net %s can't enumerate interfaces: %d\n", - libcfs_net2str(net), n); - return 0; - } - - for (i = 0; i < n; i++) { - if (!strcmp(names[i], "lo")) /* skip the loopback IF */ - continue; - - rc = libcfs_ipif_query(names[i], &up, &ip, &netmask); - - if (rc != 0) { - CWARN("Net %s can't query interface %s: %d\n", - libcfs_net2str(net), names[i], rc); - continue; - } - - if (!up) { - CWARN("Net %s ignoring interface %s (down)\n", - libcfs_net2str(net), names[i]); - continue; - } - - libcfs_ipif_free_enumeration(names, n); - ni->ni_nid = LNET_MKNID(net, ip); - return 0; - } - - CERROR("Net %s can't find any interfaces\n", libcfs_net2str(net)); - libcfs_ipif_free_enumeration(names, n); - return -ENOENT; -} -EXPORT_SYMBOL(lnet_set_ip_niaddr); - diff --git a/lnet/utils/debug.c b/lnet/utils/debug.c index 1954a31..3f11c75 100644 --- a/lnet/utils/debug.c +++ b/lnet/utils/debug.c @@ -774,13 +774,9 @@ static struct mod_paths { } mod_paths[] = { { "libcfs", "libcfs/libcfs" }, { "lnet", "lnet/lnet" }, - { "kmxlnd", "lnet/klnds/mxlnd" }, { "ko2iblnd", "lnet/klnds/o2iblnd" }, { "kgnilnd", "lnet/klnds/gnilnd"}, - { "kqswlnd", "lnet/klnds/qswlnd" }, - { "kralnd", "lnet/klnds/ralnd" }, { "ksocklnd", "lnet/klnds/socklnd" }, - { "ktdilnd", "lnet/klnds/tdilnd" }, { "obdclass", "lustre/obdclass" }, { "llog_test", "lustre/obdclass" }, { "ptlrpc_gss", "lustre/ptlrpc/gss" }, diff --git a/lnet/utils/portals.c b/lnet/utils/portals.c index f1e34f0..c8e67c8 100644 --- a/lnet/utils/portals.c +++ b/lnet/utils/portals.c @@ -574,8 +574,7 @@ jt_ptl_print_peers (int argc, char **argv) int index; int rc; - if (!g_net_is_compatible (argv[0], SOCKLND, RALND, MXLND, - O2IBLND, GNILND, 0)) + if (!g_net_is_compatible (argv[0], SOCKLND, O2IBLND, GNILND, 0)) return -1; for (index = 0;;index++) { @@ -591,7 +590,7 @@ jt_ptl_print_peers (int argc, char **argv) id.nid = data.ioc_nid; id.pid = data.ioc_u32[4]; printf ("%-20s [%d]%s->%s:%d #%d\n", - libcfs_id2str(id), + libcfs_id2str(id), data.ioc_count, /* persistence */ /* my ip */ ptl_ipaddr_2_str(data.ioc_u32[2], buffer[0], @@ -601,14 +600,6 @@ jt_ptl_print_peers (int argc, char **argv) sizeof(buffer[1]), 1), data.ioc_u32[1], /* peer port */ data.ioc_u32[3]); /* conn_count */ - } else if (g_net_is_compatible(NULL, RALND, 0)) { - printf ("%-20s [%d]@%s:%d\n", - libcfs_nid2str(data.ioc_nid), /* peer nid */ - data.ioc_count, /* peer persistence */ - /* peer ip */ - ptl_ipaddr_2_str(data.ioc_u32[0], buffer[1], - sizeof(buffer[1]), 1), - data.ioc_u32[1]); /* peer port */ } else if (g_net_is_compatible(NULL, GNILND, 0)) { int disconn = data.ioc_flags >> 16; char *state; @@ -656,8 +647,7 @@ jt_ptl_add_peer (int argc, char **argv) int port = 0; int rc; - if (!g_net_is_compatible (argv[0], SOCKLND, RALND, - GNILND, 0)) + if (!g_net_is_compatible(argv[0], SOCKLND, GNILND, 0)) return -1; if (argc != 4) { @@ -707,8 +697,7 @@ jt_ptl_del_peer (int argc, char **argv) __u32 ip = 0; int rc; - if (!g_net_is_compatible (argv[0], SOCKLND, RALND, MXLND, - O2IBLND, GNILND, 0)) + if (!g_net_is_compatible(argv[0], SOCKLND, O2IBLND, GNILND, 0)) return -1; if (g_net_is_compatible(NULL, SOCKLND, 0)) { @@ -762,8 +751,7 @@ jt_ptl_print_connections (int argc, char **argv) int index; int rc; - if (!g_net_is_compatible (argv[0], SOCKLND, RALND, MXLND, O2IBLND, - GNILND, 0)) + if (!g_net_is_compatible(argv[0], SOCKLND, O2IBLND, GNILND, 0)) return -1; for (index = 0; ; index++) { @@ -795,10 +783,6 @@ jt_ptl_print_connections (int argc, char **argv) data.ioc_count, /* tx buffer size */ data.ioc_u32[5], /* rx buffer size */ data.ioc_flags ? "nagle" : "nonagle"); - } else if (g_net_is_compatible (NULL, RALND, 0)) { - printf ("%-20s [%d]\n", - libcfs_nid2str(data.ioc_nid), - data.ioc_u32[0] /* device id */); } else if (g_net_is_compatible (NULL, O2IBLND, 0)) { printf ("%s mtu %d\n", libcfs_nid2str(data.ioc_nid), @@ -836,8 +820,7 @@ int jt_ptl_disconnect(int argc, char **argv) return 0; } - if (!g_net_is_compatible (NULL, SOCKLND, RALND, MXLND, O2IBLND, - GNILND, 0)) + if (!g_net_is_compatible(NULL, SOCKLND, O2IBLND, GNILND, 0)) return 0; if (argc >= 2 && @@ -902,49 +885,6 @@ int jt_ptl_push_connection (int argc, char **argv) return 0; } -int -jt_ptl_print_active_txs (int argc, char **argv) -{ - struct libcfs_ioctl_data data; - int index; - int rc; - - if (!g_net_is_compatible (argv[0], QSWLND, 0)) - return -1; - - for (index = 0;;index++) { - LIBCFS_IOC_INIT(data); - data.ioc_net = g_net; - data.ioc_count = index; - - rc = l_ioctl(LNET_DEV_ID, IOC_LIBCFS_GET_TXDESC, &data); - if (rc != 0) - break; - - printf ("type %u payload %6d to %s via %s by pid %6d: " - "%s, %s, state %d\n", - data.ioc_u32[0], - data.ioc_count, - libcfs_nid2str(data.ioc_nid), - libcfs_nid2str(data.ioc_u64[0]), - data.ioc_u32[1], - (data.ioc_flags & 1) ? "delayed" : "immediate", - (data.ioc_flags & 2) ? "nblk" : "normal", - data.ioc_flags >> 2); - } - - if (index == 0) { - if (errno == ENOENT) { - printf ("\n"); - } else { - fprintf(stderr, "Error getting active transmits list: " - "%s: check dmesg.\n", - strerror(errno)); - } - } - return 0; -} - int jt_ptl_ping(int argc, char **argv) { int rc; diff --git a/lustre/doc/lctl.8 b/lustre/doc/lctl.8 index ed79e6a..c3a3379 100644 --- a/lustre/doc/lctl.8 +++ b/lustre/doc/lctl.8 @@ -84,9 +84,6 @@ type. Print all the connected remote NIDs for a given .B network type. -.TP -.BI active_tx -This command should print active transmits, and it is only used for elan network type. .TP .BI route_list Print the complete routing table. diff --git a/lustre/utils/lctl.c b/lustre/utils/lctl.c index 6de68b8..c8351d8 100644 --- a/lustre/utils/lctl.c +++ b/lustre/utils/lctl.c @@ -102,8 +102,6 @@ command_t cmdlist[] = { {"conn_list", jt_ptl_print_connections, 0, "print all the connected remote nid\n" "usage: conn_list"}, - {"active_tx", jt_ptl_print_active_txs, 0, "print active transmits\n" - "usage: active_tx"}, {"route_list", jt_ptl_print_routes, 0, "print the portals routing table, same as show_route\n" "usage: route_list"},