From: Maxim Patlasov
Date: Mon, 30 Aug 2010 11:12:56 +0000 (+0400)
Subject: b=19700 remove obsolete IB LNDs from HEAD
X-Git-Tag: 2.0.51.0~12
X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=d2b8a0efaa4b5faea675bd4bd4bfe1f80dad4011

b=19700 remove obsolete IB LNDs from HEAD
---

diff --git a/lnet/autoconf/lustre-lnet.m4 b/lnet/autoconf/lustre-lnet.m4
index 1dee10b..d941cd6 100644
--- a/lnet/autoconf/lustre-lnet.m4
+++ b/lnet/autoconf/lustre-lnet.m4
@@ -461,313 +461,6 @@ AC_SUBST(O2IBLND)
 ])
 
 #
-# LN_CONFIG_OPENIB
-#
-# check for OpenIB in the kernel
-AC_DEFUN([LN_CONFIG_OPENIB],[
-AC_MSG_CHECKING([whether to enable OpenIB support])
-# set default
-OPENIBPATH="$LINUX/drivers/infiniband"
-AC_ARG_WITH([openib],
-        AC_HELP_STRING([--with-openib=path],
-                       [build openiblnd against path]),
-        [
-                case $with_openib in
-                yes)    ENABLEOPENIB=2
-                        ;;
-                no)     ENABLEOPENIB=0
-                        ;;
-                *)      OPENIBPATH="$with_openib"
-                        ENABLEOPENIB=3
-                        ;;
-                esac
-        ],[
-                ENABLEOPENIB=1
-        ])
-if test $ENABLEOPENIB -eq 0; then
-        AC_MSG_RESULT([disabled])
-elif test ! \( -f ${OPENIBPATH}/include/ts_ib_core.h -a \
-               -f ${OPENIBPATH}/include/ts_ib_cm.h -a \
-               -f ${OPENIBPATH}/include/ts_ib_sa_client.h \); then
-        AC_MSG_RESULT([no])
-        case $ENABLEOPENIB in
-        1) ;;
-        2) AC_MSG_ERROR([kernel OpenIB headers not present]);;
-        3) AC_MSG_ERROR([bad --with-openib path]);;
-        *) AC_MSG_ERROR([internal error]);;
-        esac
-else
-        case $ENABLEOPENIB in
-        1|2)    OPENIBCPPFLAGS="-I$OPENIBPATH/include -DIN_TREE_BUILD";;
-        3)      OPENIBCPPFLAGS="-I$OPENIBPATH/include";;
-        *)      AC_MSG_RESULT([no])
-                AC_MSG_ERROR([internal error]);;
-        esac
-        OPENIBCPPFLAGS="$OPENIBCPPFLAGS -DIB_NTXRXPARAMS=4"
-        EXTRA_KCFLAGS_save="$EXTRA_KCFLAGS"
-        EXTRA_KCFLAGS="$EXTRA_KCFLAGS $OPENIBCPPFLAGS"
-        LB_LINUX_TRY_COMPILE([
-                #include
-                #include
-                #include
-        ],[
-                struct ib_device_properties dev_props;
-                struct ib_cm_active_param cm_active_params;
-                tTS_IB_CLIENT_QUERY_TID tid;
-                int enum1 = IB_QP_ATTRIBUTE_STATE;
-                int enum2 = IB_ACCESS_LOCAL_WRITE;
-                int enum3 = IB_CQ_CALLBACK_INTERRUPT;
-                int enum4 = IB_CQ_PROVIDER_REARM;
-                return 0;
-        ],[
-                AC_MSG_RESULT([yes])
-                OPENIBLND="openiblnd"
-        ],[
-                AC_MSG_RESULT([no])
-                case $ENABLEOPENIB in
-                1) ;;
-                2) AC_MSG_ERROR([can't compile with kernel OpenIB headers]);;
-                3) AC_MSG_ERROR([can't compile with OpenIB headers under $OPENIBPATH]);;
-                *) AC_MSG_ERROR([internal error]);;
-                esac
-                OPENIBLND=""
-                OPENIBCPPFLAGS=""
-        ])
-        EXTRA_KCFLAGS="$EXTRA_KCFLAGS_save"
-fi
-AC_SUBST(OPENIBCPPFLAGS)
-AC_SUBST(OPENIBLND)
-])
-
-#
-# LN_CONFIG_CIBLND
-#
-AC_DEFUN([LN_CONFIG_CIB],[
-AC_MSG_CHECKING([whether to enable Cisco/TopSpin IB support])
-# set default
-CIBPATH=""
-CIBLND=""
-AC_ARG_WITH([cib],
-        AC_HELP_STRING([--with-cib=path],
-                       [build ciblnd against path]),
-        [
-                case $with_cib in
-                no)     AC_MSG_RESULT([no]);;
-                *)      CIBPATH="$with_cib"
-                        if test -d "$CIBPATH"; then
-                                AC_MSG_RESULT([yes])
-                        else
-                                AC_MSG_RESULT([no])
-                                AC_MSG_ERROR([No directory $CIBPATH])
-                        fi;;
-                esac
-        ],[
-                AC_MSG_RESULT([no])
-        ])
-if test -n "$CIBPATH"; then
-        CIBCPPFLAGS="-I${CIBPATH}/ib/ts_api_ng/include -I${CIBPATH}/all/kernel_services/include -DUSING_TSAPI"
-        CIBCPPFLAGS="$CIBCPPFLAGS -DIB_NTXRXPARAMS=3"
-        EXTRA_KCFLAGS_save="$EXTRA_KCFLAGS"
-        EXTRA_KCFLAGS="$EXTRA_KCFLAGS $CIBCPPFLAGS"
-        LB_LINUX_TRY_COMPILE([
-                #include
-                #include
-                #include
-        ],[
-                struct ib_device_properties dev_props;
-                struct ib_cm_active_param cm_active_params;
-                tTS_IB_CLIENT_QUERY_TID tid;
-                int enum1 = TS_IB_QP_ATTRIBUTE_STATE;
-                int enum2 = TS_IB_ACCESS_LOCAL_WRITE;
- int enum3 = TS_IB_CQ_CALLBACK_INTERRUPT; - int enum4 = TS_IB_CQ_PROVIDER_REARM; - return 0; - ],[ - CIBLND="ciblnd" - ],[ - AC_MSG_ERROR([can't compile ciblnd with given path]) - CIBCPPFLAGS="" - ]) - EXTRA_KCFLAGS="$EXTRA_KCFLAGS_save" -fi -AC_SUBST(CIBCPPFLAGS) -AC_SUBST(CIBLND) -]) - -# -# LN_CONFIG_IIB -# -# check for infinicon infiniband support -# -AC_DEFUN([LN_CONFIG_IIB],[ -AC_MSG_CHECKING([whether to enable Infinicon support]) -# set default -IIBPATH="/usr/include" -AC_ARG_WITH([iib], - AC_HELP_STRING([--with-iib=path], - [build iiblnd against path]), - [ - case $with_iib in - yes) ENABLEIIB=2 - ;; - no) ENABLEIIB=0 - ;; - *) IIBPATH="${with_iib}/include" - ENABLEIIB=3 - ;; - esac - ],[ - ENABLEIIB=1 - ]) -if test $ENABLEIIB -eq 0; then - AC_MSG_RESULT([disabled]) -elif test ! \( -f ${IIBPATH}/linux/iba/ibt.h \); then - AC_MSG_RESULT([no]) - case $ENABLEIIB in - 1) ;; - 2) AC_MSG_ERROR([default Infinicon headers not present]);; - 3) AC_MSG_ERROR([bad --with-iib path]);; - *) AC_MSG_ERROR([internal error]);; - esac -else - IIBCPPFLAGS="-I$IIBPATH" - if test $IIBPATH != "/usr/include"; then - # we need /usr/include come what may - IIBCPPFLAGS="$IIBCPPFLAGS -I/usr/include" - fi - EXTRA_KCFLAGS_save="$EXTRA_KCFLAGS" - EXTRA_KCFLAGS="$EXTRA_KCFLAGS $IIBCPPFLAGS" - LB_LINUX_TRY_COMPILE([ - #include - ],[ - IBT_INTERFACE_UNION interfaces; - FSTATUS rc; - - rc = IbtGetInterfaceByVersion(IBT_INTERFACE_VERSION_2, - &interfaces); - - return rc == FSUCCESS ? 0 : 1; - ],[ - AC_MSG_RESULT([yes]) - IIBLND="iiblnd" - ],[ - AC_MSG_RESULT([no]) - case $ENABLEIIB in - 1) ;; - 2) AC_MSG_ERROR([can't compile with default Infinicon headers]);; - 3) AC_MSG_ERROR([can't compile with Infinicon headers under $IIBPATH]);; - *) AC_MSG_ERROR([internal error]);; - esac - IIBLND="" - IIBCPPFLAGS="" - ]) - EXTRA_KCFLAGS="$EXTRA_KCFLAGS_save" -fi -AC_SUBST(IIBCPPFLAGS) -AC_SUBST(IIBLND) -]) - -# -# LN_CONFIG_VIB -# -# check for Voltaire infiniband support -# -AC_DEFUN([LN_CONFIG_VIB], -[AC_MSG_CHECKING([whether to enable Voltaire IB support]) -VIBPATH="" -AC_ARG_WITH([vib], - AC_HELP_STRING([--with-vib=path], - [build viblnd against path]), - [ - case $with_vib in - no) AC_MSG_RESULT([no]);; - *) VIBPATH="${with_vib}/src/nvigor/ib-code" - if test -d "$with_vib" -a -d "$VIBPATH"; then - AC_MSG_RESULT([yes]) - else - AC_MSG_RESULT([no]) - AC_MSG_ERROR([No directory $VIBPATH]) - fi;; - esac - ],[ - AC_MSG_RESULT([no]) - ]) -if test -z "$VIBPATH"; then - VIBLND="" -else - VIBCPPFLAGS="-I${VIBPATH}/include -I${VIBPATH}/cm" - EXTRA_KCFLAGS_save="$EXTRA_KCFLAGS" - EXTRA_KCFLAGS="$EXTRA_KCFLAGS $VIBCPPFLAGS" - LB_LINUX_TRY_COMPILE([ - #include - #include - #ifdef __BIG_ENDIAN - # define CPU_BE 1 - # define CPU_LE 0 - #endif - #ifdef __LITTLE_ENDIAN - # define CPU_BE 0 - # define CPU_LE 1 - #endif - #include - #include - #include - ],[ - vv_hca_h_t kib_hca; - vv_return_t vvrc; - cm_cep_handle_t cep; - ibat_arp_data_t arp_data; - ibat_stat_t ibatrc; - - vvrc = vv_hca_open("ANY_HCA", NULL, &kib_hca); - cep = cm_create_cep(cm_cep_transp_rc); - ibatrc = ibat_get_ib_data((uint32_t)0, (uint32_t)0, - ibat_paths_primary, &arp_data, - (ibat_get_ib_data_reply_fn_t)NULL, - NULL, 0); - return 0; - ],[ - VIBLND="viblnd" - ],[ - AC_MSG_ERROR([can't compile viblnd with given path]) - ]) - EXTRA_KCFLAGS="$EXTRA_KCFLAGS_save" -fi -if test -n "$VIBLND"; then - EXTRA_KCFLAGS_save="$EXTRA_KCFLAGS" - EXTRA_KCFLAGS="$EXTRA_KCFLAGS $VIBCPPFLAGS" - AC_MSG_CHECKING([if Voltaire still uses void * sg addresses]) - LB_LINUX_TRY_COMPILE([ - 
#include - #include - #ifdef __BIG_ENDIAN - # define CPU_BE 1 - # define CPU_LE 0 - #endif - #ifdef __LITTLE_ENDIAN - # define CPU_BE 0 - # define CPU_LE 1 - #endif - #include - #include - #include - ],[ - vv_scatgat_t sg; - - return &sg.v_address[3] == NULL; - ],[ - AC_MSG_RESULT([yes]) - VIBCPPFLAGS="$VIBCPPFLAGS -DIBNAL_VOIDSTAR_SGADDR=1" - ],[ - AC_MSG_RESULT([no]) - ]) - EXTRA_KCFLAGS="$EXTRA_KCFLAGS_save" -fi -AC_SUBST(VIBCPPFLAGS) -AC_SUBST(VIBLND) -]) - -# # LN_CONFIG_RALND # # check whether to use the RapidArray lnd @@ -849,10 +542,6 @@ AC_DEFUN([LN_PROG_LINUX], LN_CONFIG_AFFINITY LN_CONFIG_BACKOFF LN_CONFIG_QUADRICS -LN_CONFIG_OPENIB -LN_CONFIG_CIB -LN_CONFIG_VIB -LN_CONFIG_IIB LN_CONFIG_O2IB LN_CONFIG_RALND LN_CONFIG_PTLLND @@ -999,10 +688,6 @@ AC_DEFUN([LN_CONDITIONALS], [AM_CONDITIONAL(BUILD_QSWLND, test x$QSWLND = "xqswlnd") AM_CONDITIONAL(BUILD_MXLND, test x$MXLND = "xmxlnd") AM_CONDITIONAL(BUILD_O2IBLND, test x$O2IBLND = "xo2iblnd") -AM_CONDITIONAL(BUILD_OPENIBLND, test x$OPENIBLND = "xopeniblnd") -AM_CONDITIONAL(BUILD_CIBLND, test x$CIBLND = "xciblnd") -AM_CONDITIONAL(BUILD_IIBLND, test x$IIBLND = "xiiblnd") -AM_CONDITIONAL(BUILD_VIBLND, test x$VIBLND = "xviblnd") AM_CONDITIONAL(BUILD_RALND, test x$RALND = "xralnd") AM_CONDITIONAL(BUILD_PTLLND, test x$PTLLND = "xptllnd") AM_CONDITIONAL(BUILD_UPTLLND, test x$UPTLLND = "xptllnd") @@ -1028,16 +713,8 @@ lnet/klnds/Makefile lnet/klnds/autoMakefile lnet/klnds/mxlnd/autoMakefile lnet/klnds/mxlnd/Makefile -lnet/klnds/openiblnd/Makefile -lnet/klnds/openiblnd/autoMakefile lnet/klnds/o2iblnd/Makefile lnet/klnds/o2iblnd/autoMakefile -lnet/klnds/ciblnd/Makefile -lnet/klnds/ciblnd/autoMakefile -lnet/klnds/iiblnd/Makefile -lnet/klnds/iiblnd/autoMakefile -lnet/klnds/viblnd/Makefile -lnet/klnds/viblnd/autoMakefile lnet/klnds/qswlnd/Makefile lnet/klnds/qswlnd/autoMakefile lnet/klnds/ralnd/Makefile diff --git a/lnet/include/lnet/lib-types.h b/lnet/include/lnet/lib-types.h index 57454bc..91169bd 100644 --- a/lnet/include/lnet/lib-types.h +++ b/lnet/include/lnet/lib-types.h @@ -153,9 +153,6 @@ typedef struct { /* PROTO MAGIC for LNDs */ #define LNET_PROTO_IB_MAGIC 0x0be91b91 -#define LNET_PROTO_OPENIB_MAGIC LNET_PROTO_IB_MAGIC -#define LNET_PROTO_IIB_MAGIC LNET_PROTO_IB_MAGIC -#define LNET_PROTO_VIB_MAGIC LNET_PROTO_IB_MAGIC #define LNET_PROTO_RA_MAGIC 0x0be91b92 #define LNET_PROTO_QSW_MAGIC 0x0be91b93 #define LNET_PROTO_TCP_MAGIC 0xeebc0ded diff --git a/lnet/include/lnet/lnet-sysctl.h b/lnet/include/lnet/lnet-sysctl.h index dfa5dc4..f403505 100644 --- a/lnet/include/lnet/lnet-sysctl.h +++ b/lnet/include/lnet/lnet-sysctl.h @@ -42,25 +42,19 @@ #ifndef HAVE_SYSCTL_UNNUMBERED #define CTL_KRANAL 201 -#define CTL_KIBNAL 203 -#define CTL_IIBBLND 204 #define CTL_O2IBLND 205 #define CTL_PTLLND 206 #define CTL_QSWNAL 207 #define CTL_SOCKLND 208 -#define CTL_VIBLND 209 #define CTL_GNILND 210 #else #define CTL_KRANAL CTL_UNNUMBERED -#define CTL_KIBNAL CTL_UNNUMBERED -#define CTL_IIBLND CTL_UNNUMBERED #define CTL_O2IBLND CTL_UNNUMBERED #define CTL_PTLLND CTL_UNNUMBERED #define CTL_QSWNAL CTL_UNNUMBERED #define CTL_SOCKLND CTL_UNNUMBERED -#define CTL_VIBLND CTL_UNNUMBERED #define CTL_GNILND CTL_UNNUMBERED #endif /* sysctl id */ diff --git a/lnet/klnds/Makefile.in b/lnet/klnds/Makefile.in index 0e8bb60..f0586ae 100644 --- a/lnet/klnds/Makefile.in +++ b/lnet/klnds/Makefile.in @@ -1,10 +1,6 @@ @BUILD_MXLND_TRUE@subdir-m += mxlnd @BUILD_RALND_TRUE@subdir-m += ralnd @BUILD_O2IBLND_TRUE@subdir-m += o2iblnd -@BUILD_OPENIBLND_TRUE@subdir-m += openiblnd 
-@BUILD_CIBLND_TRUE@subdir-m += ciblnd -@BUILD_IIBLND_TRUE@subdir-m += iiblnd -@BUILD_VIBLND_TRUE@subdir-m += viblnd @BUILD_QSWLND_TRUE@subdir-m += qswlnd @BUILD_PTLLND_TRUE@subdir-m += ptllnd subdir-m += socklnd diff --git a/lnet/klnds/autoMakefile.am b/lnet/klnds/autoMakefile.am index cdd1820..57d709c 100644 --- a/lnet/klnds/autoMakefile.am +++ b/lnet/klnds/autoMakefile.am @@ -34,4 +34,4 @@ # Lustre is a trademark of Sun Microsystems, Inc. # -SUBDIRS = socklnd qswlnd mxlnd openiblnd iiblnd viblnd ralnd ptllnd ciblnd o2iblnd +SUBDIRS = socklnd qswlnd mxlnd ralnd ptllnd o2iblnd diff --git a/lnet/klnds/ciblnd/.gitignore b/lnet/klnds/ciblnd/.gitignore deleted file mode 100644 index b5d0279..0000000 --- a/lnet/klnds/ciblnd/.gitignore +++ /dev/null @@ -1,10 +0,0 @@ -/.deps -/Makefile -/.*.cmd -/autoMakefile.in -/autoMakefile -/*.ko -/*.mod.c -/.*.flags -/.tmp_versions -/.depend diff --git a/lnet/klnds/ciblnd/Makefile.in b/lnet/klnds/ciblnd/Makefile.in deleted file mode 100644 index 55311ad..0000000 --- a/lnet/klnds/ciblnd/Makefile.in +++ /dev/null @@ -1,8 +0,0 @@ -MODULES := kciblnd -kciblnd-objs := ciblnd.o ciblnd_cb.o ciblnd_modparams.o - -default: all - -EXTRA_POST_CFLAGS := @CIBCPPFLAGS@ -I@LUSTRE@/../lnet/klnds/openiblnd - -@INCLUDE_RULES@ diff --git a/lnet/klnds/ciblnd/autoMakefile.am b/lnet/klnds/ciblnd/autoMakefile.am deleted file mode 100644 index 0068dd7..0000000 --- a/lnet/klnds/ciblnd/autoMakefile.am +++ /dev/null @@ -1,44 +0,0 @@ -# -# GPL HEADER START -# -# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License version 2 only, -# as published by the Free Software Foundation. -# -# This program is distributed in the hope that it will be useful, but -# WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# General Public License version 2 for more details (a copy is included -# in the LICENSE file that accompanied this code). -# -# You should have received a copy of the GNU General Public License -# version 2 along with this program; If not, see -# http://www.sun.com/software/products/lustre/docs/GPLv2.pdf -# -# Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, -# CA 95054 USA or visit www.sun.com if you need additional information or -# have any questions. -# -# GPL HEADER END -# - -# -# Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. -# Use is subject to license terms. -# - -# -# This file is part of Lustre, http://www.lustre.org/ -# Lustre is a trademark of Sun Microsystems, Inc. -# - -if MODULES -if BUILD_CIBLND -modulenet_DATA = kciblnd$(KMODEXT) -endif -endif - -MOSTLYCLEANFILES = @MOSTLYCLEANFILES@ -DIST_SOURCES = $(kciblnd-objs:%.o=%.c) diff --git a/lnet/klnds/ciblnd/ciblnd.c b/lnet/klnds/ciblnd/ciblnd.c deleted file mode 100644 index 1d7dc58..0000000 --- a/lnet/klnds/ciblnd/ciblnd.c +++ /dev/null @@ -1,37 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. 
- * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf - * - * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, - * CA 95054 USA or visit www.sun.com if you need additional information or - * have any questions. - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - */ - -#include "openiblnd.c" diff --git a/lnet/klnds/ciblnd/ciblnd_cb.c b/lnet/klnds/ciblnd/ciblnd_cb.c deleted file mode 100644 index 1be9433..0000000 --- a/lnet/klnds/ciblnd/ciblnd_cb.c +++ /dev/null @@ -1,37 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf - * - * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, - * CA 95054 USA or visit www.sun.com if you need additional information or - * have any questions. - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - */ - -#include "openiblnd_cb.c" diff --git a/lnet/klnds/ciblnd/ciblnd_modparams.c b/lnet/klnds/ciblnd/ciblnd_modparams.c deleted file mode 100644 index 41a33f8..0000000 --- a/lnet/klnds/ciblnd/ciblnd_modparams.c +++ /dev/null @@ -1,37 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). 
- * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf - * - * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, - * CA 95054 USA or visit www.sun.com if you need additional information or - * have any questions. - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - */ - -#include "openiblnd_modparams.c" diff --git a/lnet/klnds/iiblnd/.gitignore b/lnet/klnds/iiblnd/.gitignore deleted file mode 100644 index b5d0279..0000000 --- a/lnet/klnds/iiblnd/.gitignore +++ /dev/null @@ -1,10 +0,0 @@ -/.deps -/Makefile -/.*.cmd -/autoMakefile.in -/autoMakefile -/*.ko -/*.mod.c -/.*.flags -/.tmp_versions -/.depend diff --git a/lnet/klnds/iiblnd/Makefile.in b/lnet/klnds/iiblnd/Makefile.in deleted file mode 100644 index 7ee9b64..0000000 --- a/lnet/klnds/iiblnd/Makefile.in +++ /dev/null @@ -1,6 +0,0 @@ -MODULES := kiiblnd -kiiblnd-objs := iiblnd.o iiblnd_cb.o iiblnd_modparams.o - -EXTRA_POST_CFLAGS := @IIBCPPFLAGS@ - -@INCLUDE_RULES@ diff --git a/lnet/klnds/iiblnd/autoMakefile.am b/lnet/klnds/iiblnd/autoMakefile.am deleted file mode 100644 index 8ada02e..0000000 --- a/lnet/klnds/iiblnd/autoMakefile.am +++ /dev/null @@ -1,43 +0,0 @@ -# GPL HEADER START -# -# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License version 2 only, -# as published by the Free Software Foundation. -# -# This program is distributed in the hope that it will be useful, but -# WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# General Public License version 2 for more details (a copy is included -# in the LICENSE file that accompanied this code). -# -# You should have received a copy of the GNU General Public License -# version 2 along with this program; If not, see -# http://www.sun.com/software/products/lustre/docs/GPLv2.pdf -# -# Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, -# CA 95054 USA or visit www.sun.com if you need additional information or -# have any questions. -# -# GPL HEADER END -# - -# -# Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. -# Use is subject to license terms. -# - -# -# This file is part of Lustre, http://www.lustre.org/ -# Lustre is a trademark of Sun Microsystems, Inc. -# - -if MODULES -if BUILD_IIBLND -modulenet_DATA = kiiblnd$(KMODEXT) -endif -endif - -MOSTLYCLEANFILES = @MOSTLYCLEANFILES@ -DIST_SOURCES = $(kiiblnd-objs:%.o=%.c) iiblnd.h diff --git a/lnet/klnds/iiblnd/iiblnd.c b/lnet/klnds/iiblnd/iiblnd.c deleted file mode 100644 index d5995d5..0000000 --- a/lnet/klnds/iiblnd/iiblnd.c +++ /dev/null @@ -1,2164 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. 
- * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf - * - * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, - * CA 95054 USA or visit www.sun.com if you need additional information or - * have any questions. - * - * GPL HEADER END - */ -/* - * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lnet/klnds/iiblnd/iiblnd.c - * - * Author: Eric Barton - */ - -#include "iiblnd.h" - -lnd_t the_kiblnd = { - .lnd_type = IIBLND, - .lnd_startup = kibnal_startup, - .lnd_shutdown = kibnal_shutdown, - .lnd_ctl = kibnal_ctl, - .lnd_send = kibnal_send, - .lnd_recv = kibnal_recv, - .lnd_eager_recv = kibnal_eager_recv, -}; - -kib_data_t kibnal_data; - -__u32 -kibnal_cksum (void *ptr, int nob) -{ - char *c = ptr; - __u32 sum = 0; - - while (nob-- > 0) - sum = ((sum << 1) | (sum >> 31)) + *c++; - - /* ensure I don't return 0 (== no checksum) */ - return (sum == 0) ? 1 : sum; -} - -void -kibnal_init_msg(kib_msg_t *msg, int type, int body_nob) -{ - msg->ibm_type = type; - msg->ibm_nob = offsetof(kib_msg_t, ibm_u) + body_nob; -} - -void -kibnal_pack_msg(kib_msg_t *msg, __u32 version, int credits, - lnet_nid_t dstnid, __u64 dststamp, __u64 seq) -{ - /* CAVEAT EMPTOR! all message fields not set here should have been - * initialised previously. */ - msg->ibm_magic = IBNAL_MSG_MAGIC; - msg->ibm_version = version; - /* ibm_type */ - msg->ibm_credits = credits; - /* ibm_nob */ - msg->ibm_cksum = 0; - msg->ibm_srcnid = kibnal_data.kib_ni->ni_nid; - msg->ibm_srcstamp = kibnal_data.kib_incarnation; - msg->ibm_dstnid = dstnid; - msg->ibm_dststamp = dststamp; - msg->ibm_seq = seq; - - if (*kibnal_tunables.kib_cksum) { - /* NB ibm_cksum zero while computing cksum */ - msg->ibm_cksum = kibnal_cksum(msg, msg->ibm_nob); - } -} - -void -kibnal_pack_connmsg(kib_msg_t *msg, __u32 version, int nob, - int type, lnet_nid_t dstnid, __u64 dststamp) -{ - LASSERT (nob >= offsetof(kib_msg_t, ibm_u) + sizeof(kib_connparams_t)); - - memset(msg, 0, nob); - kibnal_init_msg(msg, type, sizeof(kib_connparams_t)); - - msg->ibm_u.connparams.ibcp_queue_depth = IBNAL_MSG_QUEUE_SIZE; - msg->ibm_u.connparams.ibcp_max_msg_size = IBNAL_MSG_SIZE; - msg->ibm_u.connparams.ibcp_max_frags = IBNAL_MAX_RDMA_FRAGS; - - kibnal_pack_msg(msg, version, 0, dstnid, dststamp, 0); -} - -int -kibnal_unpack_msg(kib_msg_t *msg, __u32 expected_version, int nob) -{ - const int hdr_size = offsetof(kib_msg_t, ibm_u); - __u32 msg_cksum; - __u32 msg_version; - int flip; - int msg_nob; -#if !IBNAL_USE_FMR - int i; - int n; -#endif - /* 6 bytes are enough to have received magic + version */ - if (nob < 6) { - CERROR("Short message: %d\n", nob); - return -EPROTO; - } - - /* Future protocol version compatibility support! - * If the iiblnd-specific protocol changes, or when LNET unifies - * protocols over all LNDs, the initial connection will negotiate a - * protocol version. 
If I find this, I avoid any console errors. If - * my is doing connection establishment, the reject will tell the peer - * which version I'm running. */ - - if (msg->ibm_magic == IBNAL_MSG_MAGIC) { - flip = 0; - } else if (msg->ibm_magic == __swab32(IBNAL_MSG_MAGIC)) { - flip = 1; - } else { - if (msg->ibm_magic == LNET_PROTO_MAGIC || - msg->ibm_magic == __swab32(LNET_PROTO_MAGIC)) - return -EPROTO; - - /* Completely out to lunch */ - CERROR("Bad magic: %08x\n", msg->ibm_magic); - return -EPROTO; - } - - msg_version = flip ? __swab16(msg->ibm_version) : msg->ibm_version; - if (expected_version == 0) { - if (msg_version != IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD && - msg_version != IBNAL_MSG_VERSION) - return -EPROTO; - } else if (msg_version != expected_version) { - CERROR("Bad version: %x(%x expected)\n", - msg_version, expected_version); - return -EPROTO; - } - - if (nob < hdr_size) { - CERROR("Short message: %d\n", nob); - return -EPROTO; - } - - msg_nob = flip ? __swab32(msg->ibm_nob) : msg->ibm_nob; - if (msg_nob > nob) { - CERROR("Short message: got %d, wanted %d\n", nob, msg_nob); - return -EPROTO; - } - - /* checksum must be computed with ibm_cksum zero and BEFORE anything - * gets flipped */ - msg_cksum = flip ? __swab32(msg->ibm_cksum) : msg->ibm_cksum; - msg->ibm_cksum = 0; - if (msg_cksum != 0 && - msg_cksum != kibnal_cksum(msg, msg_nob)) { - CERROR("Bad checksum\n"); - return -EPROTO; - } - msg->ibm_cksum = msg_cksum; - - if (flip) { - /* leave magic unflipped as a clue to peer endianness */ - msg->ibm_version = msg_version; - CLASSERT (sizeof(msg->ibm_type) == 1); - CLASSERT (sizeof(msg->ibm_credits) == 1); - msg->ibm_nob = msg_nob; - __swab64s(&msg->ibm_srcnid); - __swab64s(&msg->ibm_srcstamp); - __swab64s(&msg->ibm_dstnid); - __swab64s(&msg->ibm_dststamp); - __swab64s(&msg->ibm_seq); - } - - if (msg->ibm_srcnid == LNET_NID_ANY) { - CERROR("Bad src nid: %s\n", libcfs_nid2str(msg->ibm_srcnid)); - return -EPROTO; - } - - switch (msg->ibm_type) { - default: - CERROR("Unknown message type %x\n", msg->ibm_type); - return -EPROTO; - - case IBNAL_MSG_NOOP: - break; - - case IBNAL_MSG_IMMEDIATE: - if (msg_nob < offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[0])) { - CERROR("Short IMMEDIATE: %d(%d)\n", msg_nob, - (int)offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[0])); - return -EPROTO; - } - break; - - case IBNAL_MSG_PUT_REQ: - if (msg_nob < hdr_size + sizeof(msg->ibm_u.putreq)) { - CERROR("Short PUT_REQ: %d(%d)\n", msg_nob, - (int)(hdr_size + sizeof(msg->ibm_u.putreq))); - return -EPROTO; - } - break; - - case IBNAL_MSG_PUT_ACK: - if (msg_nob < hdr_size + sizeof(msg->ibm_u.putack)) { - CERROR("Short PUT_ACK: %d(%d)\n", msg_nob, - (int)(hdr_size + sizeof(msg->ibm_u.putack))); - return -EPROTO; - } -#if IBNAL_USE_FMR - if (flip) { - __swab64s(&msg->ibm_u.putack.ibpam_rd.rd_addr); - __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_nob); - __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_key); - } -#else - if (flip) { - __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_key); - __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_nfrag); - } - - n = msg->ibm_u.putack.ibpam_rd.rd_nfrag; - if (n <= 0 || n > IBNAL_MAX_RDMA_FRAGS) { - CERROR("Bad PUT_ACK nfrags: %d, should be 0 < n <= %d\n", - n, IBNAL_MAX_RDMA_FRAGS); - return -EPROTO; - } - - if (msg_nob < offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[n])) { - CERROR("Short PUT_ACK: %d(%d)\n", msg_nob, - (int)offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[n])); - return -EPROTO; - } - - if (flip) { - for (i = 0; i < n; i++) { - 
__swab32s(&msg->ibm_u.putack.ibpam_rd.rd_frags[i].rf_nob); - __swab64s(&msg->ibm_u.putack.ibpam_rd.rd_frags[i].rf_addr); - } - } -#endif - break; - - case IBNAL_MSG_GET_REQ: - if (msg_nob < hdr_size + sizeof(msg->ibm_u.get)) { - CERROR("Short GET_REQ: %d(%d)\n", msg_nob, - (int)(hdr_size + sizeof(msg->ibm_u.get))); - return -EPROTO; - } -#if IBNAL_USE_FMR - if (flip) { - __swab64s(&msg->ibm_u.get.ibgm_rd.rd_addr); - __swab32s(&msg->ibm_u.get.ibgm_rd.rd_nob); - __swab32s(&msg->ibm_u.get.ibgm_rd.rd_key); - } -#else - if (flip) { - __swab32s(&msg->ibm_u.get.ibgm_rd.rd_key); - __swab32s(&msg->ibm_u.get.ibgm_rd.rd_nfrag); - } - - n = msg->ibm_u.get.ibgm_rd.rd_nfrag; - if (n <= 0 || n > IBNAL_MAX_RDMA_FRAGS) { - CERROR("Bad GET_REQ nfrags: %d, should be 0 < n <= %d\n", - n, IBNAL_MAX_RDMA_FRAGS); - return -EPROTO; - } - - if (msg_nob < offsetof(kib_msg_t, ibm_u.get.ibgm_rd.rd_frags[n])) { - CERROR("Short GET_REQ: %d(%d)\n", msg_nob, - (int)offsetof(kib_msg_t, ibm_u.get.ibgm_rd.rd_frags[n])); - return -EPROTO; - } - - if (flip) - for (i = 0; i < msg->ibm_u.get.ibgm_rd.rd_nfrag; i++) { - __swab32s(&msg->ibm_u.get.ibgm_rd.rd_frags[i].rf_nob); - __swab64s(&msg->ibm_u.get.ibgm_rd.rd_frags[i].rf_addr); - } -#endif - break; - - case IBNAL_MSG_PUT_NAK: - case IBNAL_MSG_PUT_DONE: - case IBNAL_MSG_GET_DONE: - if (msg_nob < hdr_size + sizeof(msg->ibm_u.completion)) { - CERROR("Short RDMA completion: %d(%d)\n", msg_nob, - (int)(hdr_size + sizeof(msg->ibm_u.completion))); - return -EPROTO; - } - if (flip) - __swab32s(&msg->ibm_u.completion.ibcm_status); - break; - - case IBNAL_MSG_CONNREQ: - case IBNAL_MSG_CONNACK: - if (msg_nob < hdr_size + sizeof(msg->ibm_u.connparams)) { - CERROR("Short connreq/ack: %d(%d)\n", msg_nob, - (int)(hdr_size + sizeof(msg->ibm_u.connparams))); - return -EPROTO; - } - if (flip) { - __swab32s(&msg->ibm_u.connparams.ibcp_queue_depth); - __swab32s(&msg->ibm_u.connparams.ibcp_max_msg_size); - __swab32s(&msg->ibm_u.connparams.ibcp_max_frags); - } - break; - } - return 0; -} - -IB_HANDLE -kibnal_create_cep(lnet_nid_t nid) -{ - FSTATUS frc; - __u32 u32val; - IB_HANDLE cep; - - cep = iba_cm_create_cep(CM_RC_TYPE); - if (cep == NULL) { - CERROR ("Can't create CEP for %s\n", - (nid == LNET_NID_ANY) ? "listener" : - libcfs_nid2str(nid)); - return NULL; - } - - if (nid == LNET_NID_ANY) { - u32val = 1; - frc = iba_cm_modify_cep(cep, CM_FLAG_ASYNC_ACCEPT, - (char *)&u32val, sizeof(u32val), 0); - if (frc != FSUCCESS) { - CERROR("Can't set async_accept: %d\n", frc); - goto failed; - } - - u32val = 0; /* sets system max */ - frc = iba_cm_modify_cep(cep, CM_FLAG_LISTEN_BACKLOG, - (char *)&u32val, sizeof(u32val), 0); - if (frc != FSUCCESS) { - CERROR("Can't set listen backlog: %d\n", frc); - goto failed; - } - } - - u32val = 1; - frc = iba_cm_modify_cep(cep, CM_FLAG_TIMEWAIT_CALLBACK, - (char *)&u32val, sizeof(u32val), 0); - if (frc != FSUCCESS) { - CERROR("Can't set timewait_callback for %s: %d\n", - (nid == LNET_NID_ANY) ? 
"listener" : - libcfs_nid2str(nid), frc); - goto failed; - } - - return cep; - - failed: - iba_cm_destroy_cep(cep); - return NULL; -} - -#define IBNAL_CHECK_ADVERT 1 -#if IBNAL_CHECK_ADVERT -void -kibnal_service_query_done (void *arg, QUERY *qry, - QUERY_RESULT_VALUES *qry_result) -{ - int *rcp = arg; - FSTATUS frc = qry_result->Status; - SERVICE_RECORD_RESULTS *svc_rslt; - IB_SERVICE_RECORD *svc; - lnet_nid_t nid; - - if (frc != FSUCCESS || qry_result->ResultDataSize == 0) { - CERROR("Error checking advert: status %d data size %d\n", - frc, qry_result->ResultDataSize); - *rcp = -EIO; - goto out; - } - - svc_rslt = (SERVICE_RECORD_RESULTS *)qry_result->QueryResult; - - if (svc_rslt->NumServiceRecords < 1) { - CERROR("Check advert: %d records\n", - svc_rslt->NumServiceRecords); - *rcp = -ENOENT; - goto out; - } - - svc = &svc_rslt->ServiceRecords[0]; - nid = le64_to_cpu(*kibnal_service_nid_field(svc)); - - CDEBUG(D_NET, "Check advert: %s "LPX64" "LPX64":%04x\n", - libcfs_nid2str(nid), svc->RID.ServiceID, - svc->RID.ServiceGID.Type.Global.InterfaceID, - svc->RID.ServiceP_Key); - - if (nid != kibnal_data.kib_ni->ni_nid) { - CERROR("Check advert: Bad NID %s (%s expected)\n", - libcfs_nid2str(nid), - libcfs_nid2str(kibnal_data.kib_ni->ni_nid)); - *rcp = -EINVAL; - goto out; - } - - if (svc->RID.ServiceID != *kibnal_tunables.kib_service_number) { - CERROR("Check advert: Bad ServiceID "LPX64" (%x expected)\n", - svc->RID.ServiceID, - *kibnal_tunables.kib_service_number); - *rcp = -EINVAL; - goto out; - } - - if (svc->RID.ServiceGID.Type.Global.InterfaceID != - kibnal_data.kib_port_guid) { - CERROR("Check advert: Bad GUID "LPX64" ("LPX64" expected)\n", - svc->RID.ServiceGID.Type.Global.InterfaceID, - kibnal_data.kib_port_guid); - *rcp = -EINVAL; - goto out; - } - - if (svc->RID.ServiceP_Key != kibnal_data.kib_port_pkey) { - CERROR("Check advert: Bad PKEY %04x (%04x expected)\n", - svc->RID.ServiceP_Key, kibnal_data.kib_port_pkey); - *rcp = -EINVAL; - goto out; - } - - CDEBUG(D_NET, "Check advert OK\n"); - *rcp = 0; - - out: - up (&kibnal_data.kib_listener_signal); -} - -int -kibnal_check_advert (void) -{ - /* single-threaded */ - static QUERY qry; - - FSTATUS frc; - int rc; - - memset (&qry, 0, sizeof(qry)); - qry.InputType = InputTypeServiceRecord; - qry.OutputType = OutputTypeServiceRecord; - kibnal_set_service_keys(&qry.InputValue.ServiceRecordValue.ServiceRecord, - kibnal_data.kib_ni->ni_nid); - qry.InputValue.ServiceRecordValue.ComponentMask = KIBNAL_SERVICE_KEY_MASK; - - frc = iba_sd_query_port_fabric_info(kibnal_data.kib_sd, - kibnal_data.kib_port_guid, - &qry, - kibnal_service_query_done, - &kibnal_data.kib_sdretry, - &rc); - if (frc != FPENDING) { - CERROR ("Immediate error %d checking SM service\n", frc); - return -EIO; - } - - down (&kibnal_data.kib_listener_signal); - - if (rc != 0) - CERROR ("Error %d checking SM service\n", rc); - return rc; -} -#else -int -kibnal_check_advert(void) -{ - return 0; -} -#endif - -void -kibnal_fill_fod(FABRIC_OPERATION_DATA *fod, FABRIC_OPERATION_TYPE type) -{ - IB_SERVICE_RECORD *svc; - - memset (fod, 0, sizeof(*fod)); - fod->Type = type; - - svc = &fod->Value.ServiceRecordValue.ServiceRecord; - svc->RID.ServiceID = *kibnal_tunables.kib_service_number; - svc->RID.ServiceGID.Type.Global.InterfaceID = kibnal_data.kib_port_guid; - svc->RID.ServiceGID.Type.Global.SubnetPrefix = DEFAULT_SUBNET_PREFIX; - svc->RID.ServiceP_Key = kibnal_data.kib_port_pkey; - svc->ServiceLease = 0xffffffff; - - kibnal_set_service_keys(svc, kibnal_data.kib_ni->ni_nid); -} - 
-void -kibnal_service_setunset_done (void *arg, FABRIC_OPERATION_DATA *fod, - FSTATUS frc, uint32 madrc) -{ - *(FSTATUS *)arg = frc; - up (&kibnal_data.kib_listener_signal); -} - -int -kibnal_advertise (void) -{ - /* Single threaded here */ - static FABRIC_OPERATION_DATA fod; - - IB_SERVICE_RECORD *svc = &fod.Value.ServiceRecordValue.ServiceRecord; - FSTATUS frc; - FSTATUS frc2; - - if (strlen(*kibnal_tunables.kib_service_name) >= - sizeof(svc->ServiceName)) { - CERROR("Service name '%s' too long (%d chars max)\n", - *kibnal_tunables.kib_service_name, - (int)sizeof(svc->ServiceName) - 1); - return -EINVAL; - } - - kibnal_fill_fod(&fod, FabOpSetServiceRecord); - - CDEBUG(D_NET, "Advertising service id "LPX64" %s:%s\n", - svc->RID.ServiceID, svc->ServiceName, - libcfs_nid2str(le64_to_cpu(*kibnal_service_nid_field(svc)))); - - frc = iba_sd_port_fabric_operation(kibnal_data.kib_sd, - kibnal_data.kib_port_guid, - &fod, - kibnal_service_setunset_done, - &kibnal_data.kib_sdretry, - &frc2); - - if (frc != FSUCCESS && frc != FPENDING) { - CERROR ("Immediate error %d advertising NID %s\n", - frc, libcfs_nid2str(kibnal_data.kib_ni->ni_nid)); - return -EIO; - } - - down (&kibnal_data.kib_listener_signal); - - frc = frc2; - if (frc == FSUCCESS) - return 0; - - CERROR ("Error %d advertising %s\n", - frc, libcfs_nid2str(kibnal_data.kib_ni->ni_nid)); - return -EIO; -} - -void -kibnal_unadvertise (int expect_success) -{ - /* single threaded */ - static FABRIC_OPERATION_DATA fod; - - IB_SERVICE_RECORD *svc = &fod.Value.ServiceRecordValue.ServiceRecord; - FSTATUS frc; - FSTATUS frc2; - - LASSERT (kibnal_data.kib_ni->ni_nid != LNET_NID_ANY); - - kibnal_fill_fod(&fod, FabOpDeleteServiceRecord); - - CDEBUG(D_NET, "Unadvertising service %s:%s\n", - svc->ServiceName, - libcfs_nid2str(le64_to_cpu(*kibnal_service_nid_field(svc)))); - - frc = iba_sd_port_fabric_operation(kibnal_data.kib_sd, - kibnal_data.kib_port_guid, - &fod, - kibnal_service_setunset_done, - &kibnal_data.kib_sdretry, - &frc2); - if (frc != FSUCCESS && frc != FPENDING) { - CERROR ("Immediate error %d unadvertising NID %s\n", - frc, libcfs_nid2str(kibnal_data.kib_ni->ni_nid)); - return; - } - - down (&kibnal_data.kib_listener_signal); - - CDEBUG(D_NET, "Unadvertise rc: %d\n", frc2); - - if ((frc2 == FSUCCESS) == !!expect_success) - return; - - if (expect_success) - CERROR("Error %d unadvertising NID %s\n", - frc2, libcfs_nid2str(kibnal_data.kib_ni->ni_nid)); - else - CWARN("Removed conflicting NID %s\n", - libcfs_nid2str(kibnal_data.kib_ni->ni_nid)); -} - -void -kibnal_stop_listener(int normal_shutdown) -{ - /* NB this also disables peer creation and destroys all existing - * peers */ - IB_HANDLE cep = kibnal_data.kib_listener_cep; - unsigned long flags; - FSTATUS frc; - - LASSERT (cep != NULL); - - kibnal_unadvertise(normal_shutdown); - - frc = iba_cm_cancel(cep); - if (frc != FSUCCESS && frc != FPENDING) - CERROR ("Error %d stopping listener\n", frc); - - down(&kibnal_data.kib_listener_signal); - - frc = iba_cm_destroy_cep(cep); - if (frc != FSUCCESS) - CERROR ("Error %d destroying listener CEP\n", frc); - - write_lock_irqsave(&kibnal_data.kib_global_lock, flags); - /* This assignment disables peer creation */ - kibnal_data.kib_listener_cep = NULL; - write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); - - /* Start to tear down any peers created while the listener was - * running */ - kibnal_del_peer(LNET_NID_ANY); -} - -int -kibnal_start_listener(void) -{ - /* NB this also enables peer creation */ - - IB_HANDLE cep; - CM_LISTEN_INFO 
info; - unsigned long flags; - int rc; - FSTATUS frc; - - LASSERT (kibnal_data.kib_listener_cep == NULL); - init_MUTEX_LOCKED (&kibnal_data.kib_listener_signal); - - cep = kibnal_create_cep(LNET_NID_ANY); - if (cep == NULL) - return -ENOMEM; - - memset (&info, 0, sizeof(info)); - info.ListenAddr.EndPt.SID = *kibnal_tunables.kib_service_number; - - frc = iba_cm_listen(cep, &info, kibnal_listen_callback, NULL); - if (frc != FSUCCESS && frc != FPENDING) { - CERROR ("iba_cm_listen error: %d\n", frc); - - iba_cm_destroy_cep(cep); - return -EIO; - } - - write_lock_irqsave(&kibnal_data.kib_global_lock, flags); - /* This assignment enables peer creation */ - kibnal_data.kib_listener_cep = cep; - write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); - - rc = kibnal_advertise(); - if (rc == 0) - rc = kibnal_check_advert(); - - if (rc == 0) - return 0; - - kibnal_stop_listener(0); - return rc; -} - -int -kibnal_create_peer (kib_peer_t **peerp, lnet_nid_t nid) -{ - kib_peer_t *peer; - unsigned long flags; - int rc; - - LASSERT (nid != LNET_NID_ANY); - - LIBCFS_ALLOC (peer, sizeof (*peer)); - if (peer == NULL) { - CERROR("Cannot allocate peer\n"); - return -ENOMEM; - } - - memset(peer, 0, sizeof(*peer)); /* zero flags etc */ - - peer->ibp_nid = nid; - atomic_set (&peer->ibp_refcount, 1); /* 1 ref for caller */ - - INIT_LIST_HEAD (&peer->ibp_list); /* not in the peer table yet */ - INIT_LIST_HEAD (&peer->ibp_conns); - INIT_LIST_HEAD (&peer->ibp_tx_queue); - - peer->ibp_error = 0; - peer->ibp_last_alive = cfs_time_current(); - peer->ibp_reconnect_interval = 0; /* OK to connect at any time */ - - write_lock_irqsave(&kibnal_data.kib_global_lock, flags); - - if (atomic_read(&kibnal_data.kib_npeers) >= - *kibnal_tunables.kib_concurrent_peers) { - rc = -EOVERFLOW; /* !! but at least it distinguishes */ - } else if (kibnal_data.kib_listener_cep == NULL) { - rc = -ESHUTDOWN; /* shutdown has started */ - } else { - rc = 0; - /* npeers only grows with the global lock held */ - atomic_inc(&kibnal_data.kib_npeers); - } - - write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); - - if (rc != 0) { - CERROR("Can't create peer: %s\n", - (rc == -ESHUTDOWN) ? "shutting down" : - "too many peers"); - LIBCFS_FREE(peer, sizeof(*peer)); - } else { - *peerp = peer; - } - - return rc; -} - -void -kibnal_destroy_peer (kib_peer_t *peer) -{ - - LASSERT (atomic_read (&peer->ibp_refcount) == 0); - LASSERT (peer->ibp_persistence == 0); - LASSERT (!kibnal_peer_active(peer)); - LASSERT (!kibnal_peer_connecting(peer)); - LASSERT (list_empty (&peer->ibp_conns)); - LASSERT (list_empty (&peer->ibp_tx_queue)); - - LIBCFS_FREE (peer, sizeof (*peer)); - - /* NB a peer's connections keep a reference on their peer until - * they are destroyed, so we can be assured that _all_ state to do - * with this peer has been cleaned up when its refcount drops to - * zero. 
*/ - atomic_dec (&kibnal_data.kib_npeers); -} - -/* the caller is responsible for accounting for the additional reference - * that this creates */ -kib_peer_t * -kibnal_find_peer_locked (lnet_nid_t nid) -{ - struct list_head *peer_list = kibnal_nid2peerlist (nid); - struct list_head *tmp; - kib_peer_t *peer; - - list_for_each (tmp, peer_list) { - - peer = list_entry (tmp, kib_peer_t, ibp_list); - - LASSERT (peer->ibp_persistence != 0 || - kibnal_peer_connecting(peer) || - !list_empty (&peer->ibp_conns)); - - if (peer->ibp_nid != nid) - continue; - - CDEBUG(D_NET, "got peer %s (%d)\n", - libcfs_nid2str(nid), atomic_read (&peer->ibp_refcount)); - return (peer); - } - return (NULL); -} - -void -kibnal_unlink_peer_locked (kib_peer_t *peer) -{ - LASSERT (peer->ibp_persistence == 0); - LASSERT (list_empty(&peer->ibp_conns)); - - LASSERT (kibnal_peer_active(peer)); - list_del_init (&peer->ibp_list); - /* lose peerlist's ref */ - kibnal_peer_decref(peer); -} - -int -kibnal_get_peer_info (int index, lnet_nid_t *nidp, int *persistencep) -{ - kib_peer_t *peer; - struct list_head *ptmp; - unsigned long flags; - int i; - - read_lock_irqsave(&kibnal_data.kib_global_lock, flags); - - for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) { - - list_for_each (ptmp, &kibnal_data.kib_peers[i]) { - - peer = list_entry (ptmp, kib_peer_t, ibp_list); - LASSERT (peer->ibp_persistence != 0 || - kibnal_peer_connecting(peer) || - !list_empty (&peer->ibp_conns)); - - if (index-- > 0) - continue; - - *nidp = peer->ibp_nid; - *persistencep = peer->ibp_persistence; - - read_unlock_irqrestore(&kibnal_data.kib_global_lock, - flags); - return (0); - } - } - - read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); - return (-ENOENT); -} - -int -kibnal_add_persistent_peer (lnet_nid_t nid) -{ - unsigned long flags; - kib_peer_t *peer; - kib_peer_t *peer2; - int rc; - - if (nid == LNET_NID_ANY) - return (-EINVAL); - - rc = kibnal_create_peer(&peer, nid); - if (rc != 0) - return rc; - - write_lock_irqsave (&kibnal_data.kib_global_lock, flags); - - /* I'm always called with a reference on kibnal_data.kib_ni - * so shutdown can't have started */ - LASSERT (kibnal_data.kib_listener_cep != NULL); - - peer2 = kibnal_find_peer_locked (nid); - if (peer2 != NULL) { - kibnal_peer_decref (peer); - peer = peer2; - } else { - /* peer table takes existing ref on peer */ - list_add_tail (&peer->ibp_list, - kibnal_nid2peerlist (nid)); - } - - peer->ibp_persistence++; - - write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); - return (0); -} - -void -kibnal_del_peer_locked (kib_peer_t *peer) -{ - struct list_head *ctmp; - struct list_head *cnxt; - kib_conn_t *conn; - - peer->ibp_persistence = 0; - - if (list_empty(&peer->ibp_conns)) { - kibnal_unlink_peer_locked(peer); - } else { - list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) { - conn = list_entry(ctmp, kib_conn_t, ibc_list); - - kibnal_close_conn_locked (conn, 0); - } - /* NB peer is no longer persistent; closing its last conn - * unlinked it. */ - } - /* NB peer now unlinked; might even be freed if the peer table had the - * last ref on it. 
*/ -} - -int -kibnal_del_peer (lnet_nid_t nid) -{ - unsigned long flags; - CFS_LIST_HEAD (zombies); - struct list_head *ptmp; - struct list_head *pnxt; - kib_peer_t *peer; - int lo; - int hi; - int i; - int rc = -ENOENT; - - write_lock_irqsave (&kibnal_data.kib_global_lock, flags); - - if (nid != LNET_NID_ANY) - lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers; - else { - lo = 0; - hi = kibnal_data.kib_peer_hash_size - 1; - } - - for (i = lo; i <= hi; i++) { - list_for_each_safe (ptmp, pnxt, &kibnal_data.kib_peers[i]) { - peer = list_entry (ptmp, kib_peer_t, ibp_list); - LASSERT (peer->ibp_persistence != 0 || - kibnal_peer_connecting(peer) || - !list_empty (&peer->ibp_conns)); - - if (!(nid == LNET_NID_ANY || peer->ibp_nid == nid)) - continue; - - if (!list_empty(&peer->ibp_tx_queue)) { - LASSERT (list_empty(&peer->ibp_conns)); - - list_splice_init(&peer->ibp_tx_queue, &zombies); - } - - kibnal_del_peer_locked (peer); - rc = 0; /* matched something */ - } - } - - write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); - - kibnal_txlist_done(&zombies, -EIO); - - return (rc); -} - -kib_conn_t * -kibnal_get_conn_by_idx (int index) -{ - kib_peer_t *peer; - struct list_head *ptmp; - kib_conn_t *conn; - struct list_head *ctmp; - unsigned long flags; - int i; - - read_lock_irqsave(&kibnal_data.kib_global_lock, flags); - - for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) { - list_for_each (ptmp, &kibnal_data.kib_peers[i]) { - - peer = list_entry (ptmp, kib_peer_t, ibp_list); - LASSERT (peer->ibp_persistence != 0 || - kibnal_peer_connecting(peer) || - !list_empty (&peer->ibp_conns)); - - list_for_each (ctmp, &peer->ibp_conns) { - if (index-- > 0) - continue; - - conn = list_entry (ctmp, kib_conn_t, ibc_list); - kibnal_conn_addref(conn); - read_unlock_irqrestore(&kibnal_data.kib_global_lock, - flags); - return (conn); - } - } - } - - read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); - return (NULL); -} - -int -kibnal_conn_rts(kib_conn_t *conn, - __u32 qpn, __u8 resp_res, __u8 init_depth, __u32 psn) -{ - IB_PATH_RECORD *path = &conn->ibc_cvars->cv_path; - IB_HANDLE qp = conn->ibc_qp; - IB_QP_ATTRIBUTES_MODIFY modify_attr; - FSTATUS frc; - int rc; - - if (resp_res > kibnal_data.kib_hca_attrs.MaxQPResponderResources) - resp_res = kibnal_data.kib_hca_attrs.MaxQPResponderResources; - - if (init_depth > kibnal_data.kib_hca_attrs.MaxQPInitiatorDepth) - init_depth = kibnal_data.kib_hca_attrs.MaxQPInitiatorDepth; - - modify_attr = (IB_QP_ATTRIBUTES_MODIFY) { - .RequestState = QPStateReadyToRecv, - .RecvPSN = IBNAL_STARTING_PSN, - .DestQPNumber = qpn, - .ResponderResources = resp_res, - .MinRnrTimer = UsecToRnrNakTimer(2000), /* 20 ms */ - .Attrs = (IB_QP_ATTR_RECVPSN | - IB_QP_ATTR_DESTQPNUMBER | - IB_QP_ATTR_RESPONDERRESOURCES | - IB_QP_ATTR_DESTAV | - IB_QP_ATTR_PATHMTU | - IB_QP_ATTR_MINRNRTIMER), - }; - GetAVFromPath(0, path, &modify_attr.PathMTU, NULL, - &modify_attr.DestAV); - - frc = iba_modify_qp(qp, &modify_attr, NULL); - if (frc != FSUCCESS) { - CERROR("Can't set QP %s ready to receive: %d\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid), frc); - return -EIO; - } - - rc = kibnal_post_receives(conn); - if (rc != 0) { - CERROR("Can't post receives for %s: %d\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid), rc); - return rc; - } - - modify_attr = (IB_QP_ATTRIBUTES_MODIFY) { - .RequestState = QPStateReadyToSend, - .FlowControl = TRUE, - .InitiatorDepth = init_depth, - .SendPSN = psn, - .LocalAckTimeout = path->PktLifeTime + 2, /* 2 or 1? 
*/ - .RetryCount = IBNAL_RETRY, - .RnrRetryCount = IBNAL_RNR_RETRY, - .Attrs = (IB_QP_ATTR_FLOWCONTROL | - IB_QP_ATTR_INITIATORDEPTH | - IB_QP_ATTR_SENDPSN | - IB_QP_ATTR_LOCALACKTIMEOUT | - IB_QP_ATTR_RETRYCOUNT | - IB_QP_ATTR_RNRRETRYCOUNT), - }; - - frc = iba_modify_qp(qp, &modify_attr, NULL); - if (frc != FSUCCESS) { - CERROR("Can't set QP %s ready to send: %d\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid), frc); - return -EIO; - } - - frc = iba_query_qp(conn->ibc_qp, &conn->ibc_cvars->cv_qpattrs, NULL); - if (frc != FSUCCESS) { - CERROR ("Can't query QP %s attributes: %d\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid), frc); - return -EIO; - } - - return 0; -} - -kib_conn_t * -kibnal_create_conn (lnet_nid_t nid, int proto_version) -{ - kib_conn_t *conn; - int i; - int page_offset; - int ipage; - int rc; - FSTATUS frc; - union { - IB_QP_ATTRIBUTES_CREATE qp_create; - IB_QP_ATTRIBUTES_MODIFY qp_attr; - } params; - - LIBCFS_ALLOC (conn, sizeof (*conn)); - if (conn == NULL) { - CERROR ("Can't allocate connection for %s\n", - libcfs_nid2str(nid)); - return (NULL); - } - - /* zero flags, NULL pointers etc... */ - memset (conn, 0, sizeof (*conn)); - conn->ibc_state = IBNAL_CONN_INIT_NOTHING; - conn->ibc_version = proto_version; - - INIT_LIST_HEAD (&conn->ibc_early_rxs); - INIT_LIST_HEAD (&conn->ibc_tx_queue_nocred); - INIT_LIST_HEAD (&conn->ibc_tx_queue); - INIT_LIST_HEAD (&conn->ibc_tx_queue_rsrvd); - INIT_LIST_HEAD (&conn->ibc_active_txs); - spin_lock_init (&conn->ibc_lock); - - atomic_inc (&kibnal_data.kib_nconns); - /* well not really, but I call destroy() on failure, which decrements */ - - LIBCFS_ALLOC(conn->ibc_cvars, sizeof (*conn->ibc_cvars)); - if (conn->ibc_cvars == NULL) { - CERROR ("Can't allocate connvars for %s\n", - libcfs_nid2str(nid)); - goto failed; - } - memset(conn->ibc_cvars, 0, sizeof (*conn->ibc_cvars)); - - LIBCFS_ALLOC(conn->ibc_rxs, IBNAL_RX_MSGS * sizeof (kib_rx_t)); - if (conn->ibc_rxs == NULL) { - CERROR("Cannot allocate RX descriptors for %s\n", - libcfs_nid2str(nid)); - goto failed; - } - memset (conn->ibc_rxs, 0, IBNAL_RX_MSGS * sizeof(kib_rx_t)); - - rc = kibnal_alloc_pages(&conn->ibc_rx_pages, IBNAL_RX_MSG_PAGES); - if (rc != 0) { - CERROR("Can't allocate RX buffers for %s\n", - libcfs_nid2str(nid)); - goto failed; - } - - for (i = ipage = page_offset = 0; i < IBNAL_RX_MSGS; i++) { - struct page *page = conn->ibc_rx_pages->ibp_pages[ipage]; - kib_rx_t *rx = &conn->ibc_rxs[i]; - - rx->rx_conn = conn; - rx->rx_msg = (kib_msg_t *)(((char *)page_address(page)) + - page_offset); - - rx->rx_hca_msg = kibnal_data.kib_whole_mem.md_addr + - lnet_page2phys(page) + page_offset; - - page_offset += IBNAL_MSG_SIZE; - LASSERT (page_offset <= PAGE_SIZE); - - if (page_offset == PAGE_SIZE) { - page_offset = 0; - ipage++; - LASSERT (ipage <= IBNAL_RX_MSG_PAGES); - } - } - - params.qp_create = (IB_QP_ATTRIBUTES_CREATE) { - .Type = QPTypeReliableConnected, - .SendQDepth = (1 + IBNAL_MAX_RDMA_FRAGS) * - (*kibnal_tunables.kib_concurrent_sends), - .RecvQDepth = IBNAL_RX_MSGS, - .SendDSListDepth = 1, - .RecvDSListDepth = 1, - .SendCQHandle = kibnal_data.kib_cq, - .RecvCQHandle = kibnal_data.kib_cq, - .PDHandle = kibnal_data.kib_pd, - .SendSignaledCompletions = TRUE, - }; - frc = iba_create_qp(kibnal_data.kib_hca, ¶ms.qp_create, NULL, - &conn->ibc_qp, &conn->ibc_cvars->cv_qpattrs); - if (frc != 0) { - CERROR ("Can't create QP %s: %d\n", libcfs_nid2str(nid), frc); - goto failed; - } - - /* Mark QP created */ - kibnal_set_conn_state(conn, IBNAL_CONN_INIT_QP); - - params.qp_attr = 
(IB_QP_ATTRIBUTES_MODIFY) { - .RequestState = QPStateInit, - .Attrs = (IB_QP_ATTR_PORTGUID | - IB_QP_ATTR_PKEYINDEX | - IB_QP_ATTR_ACCESSCONTROL), - .PortGUID = kibnal_data.kib_port_guid, - .PkeyIndex = 0, - .AccessControl = { - .s = { - .RdmaWrite = 1, - .RdmaRead = 1, - }, - }, - }; - frc = iba_modify_qp(conn->ibc_qp, ¶ms.qp_attr, NULL); - if (frc != 0) { - CERROR ("Can't set QP %s state to INIT: %d\n", - libcfs_nid2str(nid), frc); - goto failed; - } - - frc = iba_query_qp(conn->ibc_qp, &conn->ibc_cvars->cv_qpattrs, NULL); - if (frc != FSUCCESS) { - CERROR ("Can't query QP %s attributes: %d\n", - libcfs_nid2str(nid), frc); - goto failed; - } - - /* 1 ref for caller */ - atomic_set (&conn->ibc_refcount, 1); - CDEBUG(D_NET, "New conn %p\n", conn); - return (conn); - - failed: - kibnal_destroy_conn (conn); - return (NULL); -} - -void -kibnal_destroy_conn (kib_conn_t *conn) -{ - FSTATUS frc; - - LASSERT (!in_interrupt()); - - CDEBUG (D_NET, "connection %s\n", - (conn->ibc_peer) == NULL ? "" : - libcfs_nid2str(conn->ibc_peer->ibp_nid)); - - LASSERT (atomic_read (&conn->ibc_refcount) == 0); - LASSERT (list_empty(&conn->ibc_early_rxs)); - LASSERT (list_empty(&conn->ibc_tx_queue)); - LASSERT (list_empty(&conn->ibc_tx_queue_rsrvd)); - LASSERT (list_empty(&conn->ibc_tx_queue_nocred)); - LASSERT (list_empty(&conn->ibc_active_txs)); - LASSERT (conn->ibc_nsends_posted == 0); - - switch (conn->ibc_state) { - case IBNAL_CONN_INIT_NOTHING: - case IBNAL_CONN_INIT_QP: - case IBNAL_CONN_DISCONNECTED: - break; - - default: - /* conn must either have never engaged with the CM, or have - * completely disengaged from it */ - CERROR("Bad conn %s state %d\n", - (conn->ibc_peer) == NULL ? "" : - libcfs_nid2str(conn->ibc_peer->ibp_nid), conn->ibc_state); - LBUG(); - } - - if (conn->ibc_cep != NULL) { - frc = iba_cm_destroy_cep(conn->ibc_cep); - if (frc != FSUCCESS) - CERROR("Error destroying CEP %p: %d\n", - conn->ibc_cep, frc); - } - - if (conn->ibc_qp != NULL) { - frc = iba_destroy_qp(conn->ibc_qp); - if (frc != FSUCCESS) - CERROR("Error destroying QP %p: %d\n", - conn->ibc_qp, frc); - } - - if (conn->ibc_rx_pages != NULL) - kibnal_free_pages(conn->ibc_rx_pages); - - if (conn->ibc_rxs != NULL) - LIBCFS_FREE(conn->ibc_rxs, - IBNAL_RX_MSGS * sizeof(kib_rx_t)); - - if (conn->ibc_cvars != NULL) - LIBCFS_FREE(conn->ibc_cvars, sizeof(*conn->ibc_cvars)); - - if (conn->ibc_peer != NULL) - kibnal_peer_decref(conn->ibc_peer); - - LIBCFS_FREE(conn, sizeof (*conn)); - - atomic_dec(&kibnal_data.kib_nconns); -} - -int -kibnal_close_peer_conns_locked (kib_peer_t *peer, int why) -{ - kib_conn_t *conn; - struct list_head *ctmp; - struct list_head *cnxt; - int count = 0; - - list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) { - conn = list_entry (ctmp, kib_conn_t, ibc_list); - - count++; - kibnal_close_conn_locked (conn, why); - } - - return (count); -} - -int -kibnal_close_stale_conns_locked (kib_peer_t *peer, __u64 incarnation) -{ - kib_conn_t *conn; - struct list_head *ctmp; - struct list_head *cnxt; - int count = 0; - - list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) { - conn = list_entry (ctmp, kib_conn_t, ibc_list); - - if (conn->ibc_incarnation == incarnation) - continue; - - CDEBUG(D_NET, "Closing stale conn nid:%s incarnation:"LPX64"("LPX64")\n", - libcfs_nid2str(peer->ibp_nid), - conn->ibc_incarnation, incarnation); - - count++; - kibnal_close_conn_locked (conn, -ESTALE); - } - - return (count); -} - -int -kibnal_close_matching_conns (lnet_nid_t nid) -{ - unsigned long flags; - kib_peer_t *peer; - struct list_head 
*ptmp; - struct list_head *pnxt; - int lo; - int hi; - int i; - int count = 0; - - write_lock_irqsave (&kibnal_data.kib_global_lock, flags); - - if (nid != LNET_NID_ANY) - lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers; - else { - lo = 0; - hi = kibnal_data.kib_peer_hash_size - 1; - } - - for (i = lo; i <= hi; i++) { - list_for_each_safe (ptmp, pnxt, &kibnal_data.kib_peers[i]) { - - peer = list_entry (ptmp, kib_peer_t, ibp_list); - LASSERT (peer->ibp_persistence != 0 || - kibnal_peer_connecting(peer) || - !list_empty (&peer->ibp_conns)); - - if (!(nid == LNET_NID_ANY || nid == peer->ibp_nid)) - continue; - - count += kibnal_close_peer_conns_locked (peer, 0); - } - } - - write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); - - /* wildcards always succeed */ - if (nid == LNET_NID_ANY) - return (0); - - return (count == 0 ? -ENOENT : 0); -} - -int -kibnal_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg) -{ - struct libcfs_ioctl_data *data = arg; - int rc = -EINVAL; - ENTRY; - - LASSERT (ni == kibnal_data.kib_ni); - - switch(cmd) { - case IOC_LIBCFS_GET_PEER: { - lnet_nid_t nid = 0; - int share_count = 0; - - rc = kibnal_get_peer_info(data->ioc_count, - &nid, &share_count); - data->ioc_nid = nid; - data->ioc_count = share_count; - break; - } - case IOC_LIBCFS_ADD_PEER: { - rc = kibnal_add_persistent_peer (data->ioc_nid); - break; - } - case IOC_LIBCFS_DEL_PEER: { - rc = kibnal_del_peer (data->ioc_nid); - break; - } - case IOC_LIBCFS_GET_CONN: { - kib_conn_t *conn = kibnal_get_conn_by_idx (data->ioc_count); - - if (conn == NULL) - rc = -ENOENT; - else { - rc = 0; - data->ioc_nid = conn->ibc_peer->ibp_nid; - kibnal_conn_decref(conn); - } - break; - } - case IOC_LIBCFS_CLOSE_CONNECTION: { - rc = kibnal_close_matching_conns (data->ioc_nid); - break; - } - case IOC_LIBCFS_REGISTER_MYNID: { - if (ni->ni_nid == data->ioc_nid) { - rc = 0; - } else { - CERROR("obsolete IOC_LIBCFS_REGISTER_MYNID: %s(%s)\n", - libcfs_nid2str(data->ioc_nid), - libcfs_nid2str(ni->ni_nid)); - rc = -EINVAL; - } - break; - } - } - - RETURN(rc); -} - -void -kibnal_free_pages (kib_pages_t *p) -{ - int npages = p->ibp_npages; - int i; - - for (i = 0; i < npages; i++) - if (p->ibp_pages[i] != NULL) - __free_page(p->ibp_pages[i]); - - LIBCFS_FREE (p, offsetof(kib_pages_t, ibp_pages[npages])); -} - -int -kibnal_alloc_pages (kib_pages_t **pp, int npages) -{ - kib_pages_t *p; - int i; - - LIBCFS_ALLOC(p, offsetof(kib_pages_t, ibp_pages[npages])); - if (p == NULL) { - CERROR ("Can't allocate buffer %d\n", npages); - return (-ENOMEM); - } - - memset (p, 0, offsetof(kib_pages_t, ibp_pages[npages])); - p->ibp_npages = npages; - - for (i = 0; i < npages; i++) { - p->ibp_pages[i] = alloc_page (GFP_KERNEL); - if (p->ibp_pages[i] == NULL) { - CERROR ("Can't allocate page %d of %d\n", i, npages); - kibnal_free_pages(p); - return (-ENOMEM); - } - } - - *pp = p; - return (0); -} - -int -kibnal_alloc_tx_descs (void) -{ - int i; - - LIBCFS_ALLOC (kibnal_data.kib_tx_descs, - IBNAL_TX_MSGS() * sizeof(kib_tx_t)); - if (kibnal_data.kib_tx_descs == NULL) - return -ENOMEM; - - memset(kibnal_data.kib_tx_descs, 0, - IBNAL_TX_MSGS() * sizeof(kib_tx_t)); - - for (i = 0; i < IBNAL_TX_MSGS(); i++) { - kib_tx_t *tx = &kibnal_data.kib_tx_descs[i]; - -#if IBNAL_USE_FMR - LIBCFS_ALLOC(tx->tx_pages, LNET_MAX_IOV * - sizeof(*tx->tx_pages)); - if (tx->tx_pages == NULL) - return -ENOMEM; -#else - LIBCFS_ALLOC(tx->tx_wrq, - (1 + IBNAL_MAX_RDMA_FRAGS) * - sizeof(*tx->tx_wrq)); - if (tx->tx_wrq == NULL) - return -ENOMEM; - - LIBCFS_ALLOC(tx->tx_gl, 
- (1 + IBNAL_MAX_RDMA_FRAGS) * - sizeof(*tx->tx_gl)); - if (tx->tx_gl == NULL) - return -ENOMEM; - - LIBCFS_ALLOC(tx->tx_rd, - offsetof(kib_rdma_desc_t, - rd_frags[IBNAL_MAX_RDMA_FRAGS])); - if (tx->tx_rd == NULL) - return -ENOMEM; -#endif - } - - return 0; -} - -void -kibnal_free_tx_descs (void) -{ - int i; - - if (kibnal_data.kib_tx_descs == NULL) - return; - - for (i = 0; i < IBNAL_TX_MSGS(); i++) { - kib_tx_t *tx = &kibnal_data.kib_tx_descs[i]; - -#if IBNAL_USE_FMR - if (tx->tx_pages != NULL) - LIBCFS_FREE(tx->tx_pages, LNET_MAX_IOV * - sizeof(*tx->tx_pages)); -#else - if (tx->tx_wrq != NULL) - LIBCFS_FREE(tx->tx_wrq, - (1 + IBNAL_MAX_RDMA_FRAGS) * - sizeof(*tx->tx_wrq)); - - if (tx->tx_gl != NULL) - LIBCFS_FREE(tx->tx_gl, - (1 + IBNAL_MAX_RDMA_FRAGS) * - sizeof(*tx->tx_gl)); - - if (tx->tx_rd != NULL) - LIBCFS_FREE(tx->tx_rd, - offsetof(kib_rdma_desc_t, - rd_frags[IBNAL_MAX_RDMA_FRAGS])); -#endif - } - - LIBCFS_FREE(kibnal_data.kib_tx_descs, - IBNAL_TX_MSGS() * sizeof(kib_tx_t)); -} - -int -kibnal_setup_tx_descs (void) -{ - int ipage = 0; - int page_offset = 0; - struct page *page; - kib_tx_t *tx; - int i; - int rc; - - /* pre-mapped messages are not bigger than 1 page */ - CLASSERT (IBNAL_MSG_SIZE <= PAGE_SIZE); - - /* No fancy arithmetic when we do the buffer calculations */ - CLASSERT (PAGE_SIZE % IBNAL_MSG_SIZE == 0); - - rc = kibnal_alloc_pages(&kibnal_data.kib_tx_pages, - IBNAL_TX_MSG_PAGES()); - if (rc != 0) - return (rc); - - for (i = 0; i < IBNAL_TX_MSGS(); i++) { - page = kibnal_data.kib_tx_pages->ibp_pages[ipage]; - tx = &kibnal_data.kib_tx_descs[i]; - -#if IBNAL_USE_FMR - /* Allocate an FMR for this TX so it can map src/sink buffers - * for large transfers */ -#endif - tx->tx_msg = (kib_msg_t *)(((char *)page_address(page)) + - page_offset); - - tx->tx_hca_msg = kibnal_data.kib_whole_mem.md_addr + - lnet_page2phys(page) + page_offset; - - CDEBUG(D_NET, "Tx[%d] %p->%p - "LPX64"\n", - i, tx, tx->tx_msg, tx->tx_hca_msg); - - list_add (&tx->tx_list, &kibnal_data.kib_idle_txs); - - page_offset += IBNAL_MSG_SIZE; - LASSERT (page_offset <= PAGE_SIZE); - - if (page_offset == PAGE_SIZE) { - page_offset = 0; - ipage++; - LASSERT (ipage <= IBNAL_TX_MSG_PAGES()); - } - } - - return (0); -} - -int -kibnal_register_all_memory(void) -{ - /* CAVEAT EMPTOR: this assumes all physical memory is in 1 contiguous - * chunk starting at 0 */ - struct sysinfo si; - __u64 total; - __u64 total2; - __u64 roundup = (128<<20); /* round up in big chunks */ - IB_MR_PHYS_BUFFER phys; - IB_ACCESS_CONTROL access; - FSTATUS frc; - - memset(&access, 0, sizeof(access)); - access.s.MWBindable = 1; - access.s.LocalWrite = 1; - access.s.RdmaRead = 1; - access.s.RdmaWrite = 1; - - /* XXX we don't bother with first-gen cards */ - if (kibnal_data.kib_hca_attrs.VendorId == 0xd0b7 && - kibnal_data.kib_hca_attrs.DeviceId == 0x3101) { - CERROR("Can't register all memory on first generation HCAs\n"); - return -EINVAL; - } - - si_meminfo(&si); - - CDEBUG(D_NET, "si_meminfo: %lu/%u, num_physpages %lu/%lu\n", - si.totalram, si.mem_unit, num_physpages, PAGE_SIZE); - - total = ((__u64)si.totalram) * si.mem_unit; - total2 = num_physpages * PAGE_SIZE; - if (total < total2) - total = total2; - - if (total == 0) { - CERROR("Can't determine memory size\n"); - return -ENOMEM; - } - - roundup = (128<<20); - total = (total + (roundup - 1)) & ~(roundup - 1); - - phys.PhysAddr = 0; - phys.Length = total; - - frc = iba_register_contig_pmr(kibnal_data.kib_hca, 0, &phys, 1, 0, - kibnal_data.kib_pd, access, - 
&kibnal_data.kib_whole_mem.md_handle, - &kibnal_data.kib_whole_mem.md_addr, - &kibnal_data.kib_whole_mem.md_lkey, - &kibnal_data.kib_whole_mem.md_rkey); - - if (frc != FSUCCESS) { - CERROR("registering physical memory failed: %d\n", frc); - return -EIO; - } - - CDEBUG(D_WARNING, "registered phys mem from 0("LPX64") for "LPU64"("LPU64") -> "LPX64"\n", - phys.PhysAddr, total, phys.Length, kibnal_data.kib_whole_mem.md_addr); - - return 0; -} - -void -kibnal_shutdown (lnet_ni_t *ni) -{ - int i; - int rc; - - LASSERT (ni == kibnal_data.kib_ni); - LASSERT (ni->ni_data == &kibnal_data); - - CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n", - atomic_read (&libcfs_kmemory)); - - switch (kibnal_data.kib_init) { - default: - CERROR ("Unexpected state %d\n", kibnal_data.kib_init); - LBUG(); - - case IBNAL_INIT_ALL: - /* stop accepting connections, prevent new peers and start to - * tear down all existing ones... */ - kibnal_stop_listener(1); - - /* Wait for all peer state to clean up */ - i = 2; - while (atomic_read (&kibnal_data.kib_npeers) != 0) { - i++; - CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */ - "waiting for %d peers to disconnect\n", - atomic_read (&kibnal_data.kib_npeers)); - set_current_state (TASK_UNINTERRUPTIBLE); - schedule_timeout (HZ); - } - /* fall through */ - - case IBNAL_INIT_CQ: - rc = iba_destroy_cq(kibnal_data.kib_cq); - if (rc != 0) - CERROR ("Destroy CQ error: %d\n", rc); - /* fall through */ - - case IBNAL_INIT_TXD: - kibnal_free_pages (kibnal_data.kib_tx_pages); - /* fall through */ - - case IBNAL_INIT_MD: - rc = iba_deregister_mr(kibnal_data.kib_whole_mem.md_handle); - if (rc != FSUCCESS) - CERROR ("Deregister memory: %d\n", rc); - /* fall through */ - - case IBNAL_INIT_PD: - rc = iba_free_pd(kibnal_data.kib_pd); - if (rc != 0) - CERROR ("Destroy PD error: %d\n", rc); - /* fall through */ - - case IBNAL_INIT_SD: - rc = iba_sd_deregister(kibnal_data.kib_sd); - if (rc != 0) - CERROR ("Deregister SD error: %d\n", rc); - /* fall through */ - - case IBNAL_INIT_PORTATTRS: - LIBCFS_FREE(kibnal_data.kib_hca_attrs.PortAttributesList, - kibnal_data.kib_hca_attrs.PortAttributesListSize); - /* fall through */ - - case IBNAL_INIT_HCA: - rc = iba_close_ca(kibnal_data.kib_hca); - if (rc != 0) - CERROR ("Close HCA error: %d\n", rc); - /* fall through */ - - case IBNAL_INIT_DATA: - LASSERT (atomic_read (&kibnal_data.kib_npeers) == 0); - LASSERT (kibnal_data.kib_peers != NULL); - for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) { - LASSERT (list_empty (&kibnal_data.kib_peers[i])); - } - LASSERT (atomic_read (&kibnal_data.kib_nconns) == 0); - LASSERT (list_empty (&kibnal_data.kib_connd_zombies)); - LASSERT (list_empty (&kibnal_data.kib_connd_conns)); - LASSERT (list_empty (&kibnal_data.kib_connd_peers)); - - /* flag threads to terminate; wake and wait for them to die */ - kibnal_data.kib_shutdown = 1; - wake_up_all (&kibnal_data.kib_sched_waitq); - wake_up_all (&kibnal_data.kib_connd_waitq); - - i = 2; - while (atomic_read (&kibnal_data.kib_nthreads) != 0) { - i++; - CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? 
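(note: (i & (-i)) == i is true only when i is a power of two, so with i bumped once per one-second pass the wait is promoted to D_WARNING at exponentially increasing intervals and stays at D_NET otherwise)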
*/ - "Waiting for %d threads to terminate\n", - atomic_read (&kibnal_data.kib_nthreads)); - set_current_state (TASK_INTERRUPTIBLE); - schedule_timeout (HZ); - } - /* fall through */ - - case IBNAL_INIT_NOTHING: - break; - } - - kibnal_free_tx_descs(); - - if (kibnal_data.kib_peers != NULL) - LIBCFS_FREE (kibnal_data.kib_peers, - sizeof (struct list_head) * - kibnal_data.kib_peer_hash_size); - - CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n", - atomic_read (&libcfs_kmemory)); - - kibnal_data.kib_init = IBNAL_INIT_NOTHING; - PORTAL_MODULE_UNUSE; -} - -int -kibnal_get_ipif_name(char *ifname, int ifname_size, int idx) -{ - char *basename = *kibnal_tunables.kib_ipif_basename; - int n = strlen(basename); - int baseidx; - int m; - - if (n == 0) { /* empty string */ - CERROR("Empty IP interface basename specified\n"); - return -EINVAL; - } - - for (m = n; m > 0; m--) /* find max numeric postfix */ - if (sscanf(basename + m - 1, "%d", &baseidx) != 1) - break; - - if (m == 0) /* just a number */ - m = n; - - if (m == n) /* no postfix */ - baseidx = 1; /* default to 1 */ - - if (m >= ifname_size) - m = ifname_size - 1; - - memcpy(ifname, basename, m); /* copy prefix name */ - - snprintf(ifname + m, ifname_size - m, "%d", baseidx + idx); - - if (strlen(ifname) == ifname_size - 1) { - CERROR("IP interface basename %s too long\n", basename); - return -EINVAL; - } - - return 0; -} - -int -kibnal_startup (lnet_ni_t *ni) -{ - char ipif_name[32]; - __u32 ip; - __u32 netmask; - int up; - int nob; - struct timeval tv; - IB_PORT_ATTRIBUTES *pattr; - FSTATUS frc; - int rc; - __u32 n; - int i; - - LASSERT (ni->ni_lnd == &the_kiblnd); - - /* Only 1 instance supported */ - if (kibnal_data.kib_init != IBNAL_INIT_NOTHING) { - CERROR ("Only 1 instance supported\n"); - return -EPERM; - } - - if (*kibnal_tunables.kib_credits > *kibnal_tunables.kib_ntx) { - CERROR ("Can't set credits(%d) > ntx(%d)\n", - *kibnal_tunables.kib_credits, - *kibnal_tunables.kib_ntx); - return -EINVAL; - } - - ni->ni_maxtxcredits = *kibnal_tunables.kib_credits; - ni->ni_peertxcredits = *kibnal_tunables.kib_peercredits; - - CLASSERT (LNET_MAX_INTERFACES > 1); - - if (ni->ni_interfaces[0] == NULL) { - kibnal_data.kib_hca_idx = 0; - } else { - /* Use the HCA specified in 'networks=' */ - if (ni->ni_interfaces[1] != NULL) { - CERROR("Multiple interfaces not supported\n"); - return -EPERM; - } - - /* Parse into kib_hca_idx */ - nob = strlen(ni->ni_interfaces[0]); - if (sscanf(ni->ni_interfaces[0], "%d%n", - &kibnal_data.kib_hca_idx, &nob) < 1 || - nob != strlen(ni->ni_interfaces[0])) { - CERROR("Can't parse interface '%s'\n", - ni->ni_interfaces[0]); - return -EINVAL; - } - } - - rc = kibnal_get_ipif_name(ipif_name, sizeof(ipif_name), - kibnal_data.kib_hca_idx); - if (rc != 0) - return rc; - - rc = libcfs_ipif_query(ipif_name, &up, &ip, &netmask); - if (rc != 0) { - CERROR("Can't query IPoIB interface %s: %d\n", ipif_name, rc); - return -ENETDOWN; - } - - if (!up) { - CERROR("Can't query IPoIB interface %s: it's down\n", ipif_name); - return -ENETDOWN; - } - - ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), ip); - - ni->ni_data = &kibnal_data; - kibnal_data.kib_ni = ni; - - do_gettimeofday(&tv); - kibnal_data.kib_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec; - - PORTAL_MODULE_USE; - - rwlock_init(&kibnal_data.kib_global_lock); - - kibnal_data.kib_peer_hash_size = IBNAL_PEER_HASH_SIZE; - LIBCFS_ALLOC (kibnal_data.kib_peers, - sizeof (struct list_head) * kibnal_data.kib_peer_hash_size); - if (kibnal_data.kib_peers == NULL) { - goto 
failed; - } - for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) - INIT_LIST_HEAD(&kibnal_data.kib_peers[i]); - - spin_lock_init (&kibnal_data.kib_connd_lock); - INIT_LIST_HEAD (&kibnal_data.kib_connd_peers); - INIT_LIST_HEAD (&kibnal_data.kib_connd_conns); - INIT_LIST_HEAD (&kibnal_data.kib_connd_zombies); - init_waitqueue_head (&kibnal_data.kib_connd_waitq); - - spin_lock_init (&kibnal_data.kib_sched_lock); - init_waitqueue_head (&kibnal_data.kib_sched_waitq); - - spin_lock_init (&kibnal_data.kib_tx_lock); - INIT_LIST_HEAD (&kibnal_data.kib_idle_txs); - - rc = kibnal_alloc_tx_descs(); - if (rc != 0) { - CERROR("Can't allocate tx descs\n"); - goto failed; - } - - /* lists/ptrs/locks initialised */ - kibnal_data.kib_init = IBNAL_INIT_DATA; - /*****************************************************/ - - kibnal_data.kib_sdretry.RetryCount = *kibnal_tunables.kib_sd_retries; - kibnal_data.kib_sdretry.Timeout = (*kibnal_tunables.kib_timeout * 1000)/ - *kibnal_tunables.kib_sd_retries; - - for (i = 0; i < IBNAL_N_SCHED; i++) { - rc = kibnal_thread_start (kibnal_scheduler, - (void *)(unsigned long)i); - if (rc != 0) { - CERROR("Can't spawn iib scheduler[%d]: %d\n", - i, rc); - goto failed; - } - } - - rc = kibnal_thread_start (kibnal_connd, NULL); - if (rc != 0) { - CERROR ("Can't spawn iib connd: %d\n", rc); - goto failed; - } - - n = sizeof(kibnal_data.kib_hca_guids) / - sizeof(kibnal_data.kib_hca_guids[0]); - frc = iba_get_caguids(&n, kibnal_data.kib_hca_guids); - if (frc != FSUCCESS) { - CERROR ("Can't get HCA guids: %d\n", frc); - goto failed; - } - - if (n == 0) { - CERROR ("No HCAs found\n"); - goto failed; - } - - if (n <= kibnal_data.kib_hca_idx) { - CERROR("Invalid HCA %d requested: (must be 0 - %d inclusive)\n", - kibnal_data.kib_hca_idx, n - 1); - goto failed; - } - - /* Infinicon has per-HCA notification callbacks */ - frc = iba_open_ca(kibnal_data.kib_hca_guids[kibnal_data.kib_hca_idx], - kibnal_hca_callback, - kibnal_hca_async_callback, - NULL, - &kibnal_data.kib_hca); - if (frc != FSUCCESS) { - CERROR ("Can't open HCA[%d]: %d\n", - kibnal_data.kib_hca_idx, frc); - goto failed; - } - - /* Channel Adapter opened */ - kibnal_data.kib_init = IBNAL_INIT_HCA; - /*****************************************************/ - - kibnal_data.kib_hca_attrs.PortAttributesList = NULL; - kibnal_data.kib_hca_attrs.PortAttributesListSize = 0; - frc = iba_query_ca(kibnal_data.kib_hca, - &kibnal_data.kib_hca_attrs, NULL); - if (frc != FSUCCESS) { - CERROR ("Can't size port attrs: %d\n", frc); - goto failed; - } - - LIBCFS_ALLOC(kibnal_data.kib_hca_attrs.PortAttributesList, - kibnal_data.kib_hca_attrs.PortAttributesListSize); - if (kibnal_data.kib_hca_attrs.PortAttributesList == NULL) - goto failed; - - /* Port attrs allocated */ - kibnal_data.kib_init = IBNAL_INIT_PORTATTRS; - /*****************************************************/ - - frc = iba_query_ca(kibnal_data.kib_hca, &kibnal_data.kib_hca_attrs, - NULL); - if (frc != FSUCCESS) { - CERROR ("Can't get port attrs for HCA %d: %d\n", - kibnal_data.kib_hca_idx, frc); - goto failed; - } - - for (i = 0, pattr = kibnal_data.kib_hca_attrs.PortAttributesList; - pattr != NULL; - i++, pattr = pattr->Next) { - switch (pattr->PortState) { - default: - CERROR("Unexpected port[%d] state %d\n", - i, pattr->PortState); - continue; - case PortStateDown: - CDEBUG(D_NET, "port[%d] Down\n", i); - continue; - case PortStateInit: - CDEBUG(D_NET, "port[%d] Init\n", i); - continue; - case PortStateArmed: - CDEBUG(D_NET, "port[%d] Armed\n", i); - continue; - - case 
PortStateActive: - CDEBUG(D_NET, "port[%d] Active\n", i); - kibnal_data.kib_port = i; - kibnal_data.kib_port_guid = pattr->GUID; - kibnal_data.kib_port_pkey = pattr->PkeyTable[0]; - break; - } - break; - } - - if (pattr == NULL) { - CERROR ("Can't find an active port\n"); - goto failed; - } - - CDEBUG(D_NET, "got guid "LPX64"\n", kibnal_data.kib_port_guid); - - frc = iba_sd_register(&kibnal_data.kib_sd, NULL); - if (frc != FSUCCESS) { - CERROR ("Can't register with SD: %d\n", frc); - goto failed; - } - - /* Registered with SD OK */ - kibnal_data.kib_init = IBNAL_INIT_SD; - /*****************************************************/ - - frc = iba_alloc_pd(kibnal_data.kib_hca, 0, &kibnal_data.kib_pd); - if (frc != FSUCCESS) { - CERROR ("Can't create PD: %d\n", rc); - goto failed; - } - - /* flag PD initialised */ - kibnal_data.kib_init = IBNAL_INIT_PD; - /*****************************************************/ - - rc = kibnal_register_all_memory(); - if (rc != 0) { - CERROR ("Can't register all memory\n"); - goto failed; - } - - /* flag whole memory MD initialised */ - kibnal_data.kib_init = IBNAL_INIT_MD; - /*****************************************************/ - - rc = kibnal_setup_tx_descs(); - if (rc != 0) { - CERROR ("Can't register tx descs: %d\n", rc); - goto failed; - } - - /* flag TX descs initialised */ - kibnal_data.kib_init = IBNAL_INIT_TXD; - /*****************************************************/ - - frc = iba_create_cq(kibnal_data.kib_hca, IBNAL_CQ_ENTRIES(), - &kibnal_data.kib_cq, &kibnal_data.kib_cq, - &n); - if (frc != FSUCCESS) { - CERROR ("Can't create RX CQ: %d\n", frc); - goto failed; - } - - /* flag CQ initialised */ - kibnal_data.kib_init = IBNAL_INIT_CQ; - /*****************************************************/ - - if (n < IBNAL_CQ_ENTRIES()) { - CERROR ("CQ only has %d entries: %d needed\n", - n, IBNAL_CQ_ENTRIES()); - goto failed; - } - - rc = iba_rearm_cq(kibnal_data.kib_cq, CQEventSelNextWC); - if (rc != 0) { - CERROR ("Failed to re-arm completion queue: %d\n", rc); - goto failed; - } - - rc = kibnal_start_listener(); - if (rc != 0) { - CERROR("Can't start listener: %d\n", rc); - goto failed; - } - - /* flag everything initialised */ - kibnal_data.kib_init = IBNAL_INIT_ALL; - /*****************************************************/ - - return (0); - - failed: - kibnal_shutdown (ni); - return (-ENETDOWN); -} - -void __exit -kibnal_module_fini (void) -{ - lnet_unregister_lnd(&the_kiblnd); - kibnal_tunables_fini(); -} - -int __init -kibnal_module_init (void) -{ - int rc; - - rc = kibnal_tunables_init(); - if (rc != 0) - return rc; - - lnet_register_lnd(&the_kiblnd); - - return 0; -} - -MODULE_AUTHOR("Sun Microsystems, Inc. "); -MODULE_DESCRIPTION("Kernel Infinicon IB LND v1.00"); -MODULE_LICENSE("GPL"); - -module_init(kibnal_module_init); -module_exit(kibnal_module_fini); diff --git a/lnet/klnds/iiblnd/iiblnd.h b/lnet/klnds/iiblnd/iiblnd.h deleted file mode 100644 index ae37d4d..0000000 --- a/lnet/klnds/iiblnd/iiblnd.h +++ /dev/null @@ -1,757 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. 
- * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf - * - * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, - * CA 95054 USA or visit www.sun.com if you need additional information or - * have any questions. - * - * GPL HEADER END - */ -/* - * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lnet/klnds/iiblnd/iiblnd.h - * - * Author: Eric Barton - */ - -#ifndef EXPORT_SYMTAB -# define EXPORT_SYMTAB -#endif -#ifndef AUTOCONF_INCLUDED -#include -#endif -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -#define DEBUG_SUBSYSTEM S_LND - -#include -#include -#include -#include - -#include - -#define GCC_VERSION (__GNUC__ * 10000 \ - + __GNUC_MINOR__ * 100 \ - + __GNUC_PATCHLEVEL__) - -/* Test for GCC > 3.2.2 */ -#if GCC_VERSION <= 30202 -/* GCC 3.2.2, and presumably several versions before it, will - * miscompile this driver. See - * http://gcc.gnu.org/bugzilla/show_bug.cgi?id=9853. */ -#error Invalid GCC version. Must use GCC >= 3.2.3 -#endif - -#ifdef CONFIG_SMP -# define IBNAL_N_SCHED num_online_cpus() /* # schedulers */ -#else -# define IBNAL_N_SCHED 1 /* # schedulers */ -#endif - -#define IBNAL_USE_FMR 0 /* map on demand v. use whole mem mapping */ -#define KIBLND_DETAILED_DEBUG 0 - -/* tunables fixed at compile time */ -#define IBNAL_PEER_HASH_SIZE 101 /* # peer lists */ -#define IBNAL_RESCHED 100 /* # scheduler loops before reschedule */ -#define IBNAL_MSG_QUEUE_SIZE 8 /* # messages/RDMAs in-flight */ -#define IBNAL_CREDIT_HIGHWATER 7 /* when to eagerly return credits */ -#define IBNAL_MSG_SIZE (4<<10) /* max size of queued messages (inc hdr) */ -#define IBNAL_RDMA_BASE 0x0eeb0000 -#define IBNAL_STARTING_PSN 1 - -/* QP tunables */ -/* 7 indicates infinite retry attempts, Infinicon recommended 5 */ -#define IBNAL_RETRY 5 /* # times to retry */ -#define IBNAL_RNR_RETRY 5 /* */ -#define IBNAL_CM_RETRY 5 /* # times to retry connection */ -#define IBNAL_FLOW_CONTROL 1 -#define IBNAL_ACK_TIMEOUT 20 /* supposedly 4 secs */ -#define IBNAL_EE_FLOW 1 -#define IBNAL_LOCAL_SUB 1 -#define IBNAL_FAILOVER_ACCEPTED 0 - -/************************/ -/* derived constants... 
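(as a worked example, assuming a hypothetical ntx tunable of 256 together with the 4K IBNAL_MSG_SIZE above: IBNAL_TX_MSGS() = 256, IBNAL_TX_MSG_BYTES() = 256 * 4K = 1MB, and IBNAL_TX_MSG_PAGES() = 256 on 4K pages)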
*/ - -/* TX messages (shared by all connections) */ -#define IBNAL_TX_MSGS() (*kibnal_tunables.kib_ntx) -#define IBNAL_TX_MSG_BYTES() (IBNAL_TX_MSGS() * IBNAL_MSG_SIZE) -#define IBNAL_TX_MSG_PAGES() ((IBNAL_TX_MSG_BYTES() + PAGE_SIZE - 1)/PAGE_SIZE) - -#if IBNAL_USE_FMR -# define IBNAL_MAX_RDMA_FRAGS 1 -# define IBNAL_CONCURRENT_SENDS IBNAL_RX_MSGS -#else -# define IBNAL_MAX_RDMA_FRAGS LNET_MAX_IOV -# define IBNAL_CONCURRENT_SENDS IBNAL_MSG_QUEUE_SIZE -#endif - -/* RX messages (per connection) */ -#define IBNAL_RX_MSGS (IBNAL_MSG_QUEUE_SIZE * 2) -#define IBNAL_RX_MSG_BYTES (IBNAL_RX_MSGS * IBNAL_MSG_SIZE) -#define IBNAL_RX_MSG_PAGES ((IBNAL_RX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE) - -#define IBNAL_CQ_ENTRIES() (IBNAL_TX_MSGS() * (1 + IBNAL_MAX_RDMA_FRAGS) + \ - (IBNAL_RX_MSGS * *kibnal_tunables.kib_concurrent_peers)) - -typedef struct -{ - char **kib_hca_basename; /* HCA base name */ - char **kib_ipif_basename; /* IPoIB interface base name */ - char **kib_service_name; /* global service name */ - unsigned int *kib_service_number; /* global service number */ - int *kib_min_reconnect_interval; /* min connect retry seconds... */ - int *kib_max_reconnect_interval; /* max connect retry seconds */ - int *kib_concurrent_peers; /* max # peers */ - int *kib_cksum; /* checksum kib_msg_t? */ - int *kib_timeout; /* comms timeout (seconds) */ - int *kib_keepalive; /* keepalive timeout (seconds) */ - int *kib_ntx; /* # tx descs */ - int *kib_credits; /* # concurrent sends */ - int *kib_peercredits; /* # concurrent sends to 1 peer */ - int *kib_sd_retries; /* # concurrent sends to 1 peer */ - int *kib_concurrent_sends; /* send work queue sizing */ -#if defined(CONFIG_SYSCTL) && !CFS_SYSFS_MODULE_PARM - cfs_sysctl_table_header_t *kib_sysctl; /* sysctl interface */ -#endif -} kib_tunables_t; - -/* NB The Infinicon stack has specific typedefs for some things - * (e.g. IB_{L,R}_KEY), that just map back to __u32 etc */ -typedef struct -{ - int ibp_npages; /* # pages */ - struct page *ibp_pages[0]; -} kib_pages_t; - -typedef struct -{ - IB_HANDLE md_handle; - __u32 md_lkey; - __u32 md_rkey; - __u64 md_addr; -} kib_md_t; - -typedef struct -{ - int kib_init; /* initialisation state */ - __u64 kib_incarnation; /* which one am I */ - int kib_shutdown; /* shut down? 
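(set to 1 by kibnal_shutdown(), which then wakes kib_sched_waitq and kib_connd_waitq so the scheduler and connd threads notice the flag and exit)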
*/ - atomic_t kib_nthreads; /* # live threads */ - lnet_ni_t *kib_ni; /* _the_ iib instance */ - - __u64 kib_port_guid; /* my GUID (lo 64 of GID)*/ - __u16 kib_port_pkey; /* my pkey, whatever that is */ - struct semaphore kib_listener_signal; /* signal completion */ - IB_HANDLE kib_listener_cep; /* connection end point */ - - rwlock_t kib_global_lock; /* stabilize peer/conn ops */ - int kib_ready; /* CQ callback fired */ - int kib_checking_cq; /* a scheduler is checking the CQ */ - - struct list_head *kib_peers; /* hash table of all my known peers */ - int kib_peer_hash_size; /* size of kib_peers */ - atomic_t kib_npeers; /* # peers extant */ - atomic_t kib_nconns; /* # connections extant */ - - struct list_head kib_connd_zombies; /* connections to free */ - struct list_head kib_connd_conns; /* connections to progress */ - struct list_head kib_connd_peers; /* peers waiting for a connection */ - wait_queue_head_t kib_connd_waitq; /* connection daemon sleep here */ - spinlock_t kib_connd_lock; /* serialise */ - - wait_queue_head_t kib_sched_waitq; /* schedulers sleep here */ - spinlock_t kib_sched_lock; /* serialise */ - - struct kib_tx *kib_tx_descs; /* all the tx descriptors */ - kib_pages_t *kib_tx_pages; /* premapped tx msg pages */ - - struct list_head kib_idle_txs; /* idle tx descriptors */ - __u64 kib_next_tx_cookie; /* RDMA completion cookie */ - spinlock_t kib_tx_lock; /* serialise */ - - IB_HANDLE kib_hca; /* The HCA */ - int kib_port; /* port on the device */ - IB_HANDLE kib_pd; /* protection domain */ - IB_HANDLE kib_sd; /* SD handle */ - IB_HANDLE kib_cq; /* completion queue */ - kib_md_t kib_whole_mem; /* whole-mem registration */ - - int kib_hca_idx; /* my HCA number */ - uint64 kib_hca_guids[8]; /* all the HCA guids */ - IB_CA_ATTRIBUTES kib_hca_attrs; /* where to get HCA attrs */ - - COMMAND_CONTROL_PARAMETERS kib_sdretry; /* control SD query retries */ -} kib_data_t; - -#define IBNAL_INIT_NOTHING 0 -#define IBNAL_INIT_DATA 1 -#define IBNAL_INIT_LIB 2 -#define IBNAL_INIT_HCA 3 -#define IBNAL_INIT_PORTATTRS 4 -#define IBNAL_INIT_SD 5 -#define IBNAL_INIT_PD 6 -#define IBNAL_INIT_MD 7 -#define IBNAL_INIT_TXD 8 -#define IBNAL_INIT_CQ 9 -#define IBNAL_INIT_ALL 10 - -/************************************************************************ - * Wire message structs. - * These are sent in sender's byte order (i.e. receiver flips). - * CAVEAT EMPTOR: other structs communicated between nodes (e.g. MAD - * private data and SM service info), is LE on the wire. 
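The first two fields of kib_msg_t below (ibm_magic, ibm_version) are fixed precisely so a receiver can recognise a byte-flipped sender. A minimal sketch of the usual LNET convention, with the real check living in kibnal_unpack_msg():

        if (msg->ibm_magic == IBNAL_MSG_MAGIC)
                flip = 0;                               /* same byte order as me */
        else if (msg->ibm_magic == __swab32(IBNAL_MSG_MAGIC))
                flip = 1;                               /* peer is other-endian */
        else
                return -EPROTO;                         /* not an iiblnd message */

When flip is set, every remaining header field is byte-swapped before use.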
- */ - -typedef struct kib_connparams -{ - __u32 ibcp_queue_depth; - __u32 ibcp_max_msg_size; - __u32 ibcp_max_frags; -} WIRE_ATTR kib_connparams_t; - -typedef struct -{ - lnet_hdr_t ibim_hdr; /* portals header */ - char ibim_payload[0]; /* piggy-backed payload */ -} WIRE_ATTR kib_immediate_msg_t; - -#if IBNAL_USE_FMR -typedef struct -{ - __u64 rd_addr; /* IO VMA address */ - __u32 rd_nob; /* # of bytes */ - __u32 rd_key; /* remote key */ -} WIRE_ATTR kib_rdma_desc_t; -#else -typedef struct -{ - __u32 rf_nob; /* # of bytes */ - __u64 rf_addr; /* remote io vaddr */ -} WIRE_ATTR kib_rdma_frag_t; - -typedef struct -{ - __u32 rd_key; /* local/remote key */ - __u32 rd_nfrag; /* # fragments */ - kib_rdma_frag_t rd_frags[0]; /* buffer frags */ -} WIRE_ATTR kib_rdma_desc_t; -#endif - -typedef struct -{ - lnet_hdr_t ibprm_hdr; /* LNET header */ - __u64 ibprm_cookie; /* opaque completion cookie */ -} WIRE_ATTR kib_putreq_msg_t; - -typedef struct -{ - __u64 ibpam_src_cookie; /* reflected completion cookie */ - __u64 ibpam_dst_cookie; /* opaque completion cookie */ - kib_rdma_desc_t ibpam_rd; /* sender's sink buffer */ -} WIRE_ATTR kib_putack_msg_t; - -typedef struct -{ - lnet_hdr_t ibgm_hdr; /* LNET header */ - __u64 ibgm_cookie; /* opaque completion cookie */ - kib_rdma_desc_t ibgm_rd; /* sender's sink buffer */ -} WIRE_ATTR kib_get_msg_t; - -typedef struct -{ - __u64 ibcm_cookie; /* opaque completion cookie */ - __u32 ibcm_status; /* completion status */ -} WIRE_ATTR kib_completion_msg_t; - -typedef struct -{ - /* First 2 fields fixed FOR ALL TIME */ - __u32 ibm_magic; /* I'm an openibnal message */ - __u16 ibm_version; /* this is my version number */ - - __u8 ibm_type; /* msg type */ - __u8 ibm_credits; /* returned credits */ - __u32 ibm_nob; /* # bytes in whole message */ - __u32 ibm_cksum; /* checksum (0 == no checksum) */ - __u64 ibm_srcnid; /* sender's NID */ - __u64 ibm_srcstamp; /* sender's incarnation */ - __u64 ibm_dstnid; /* destination's NID */ - __u64 ibm_dststamp; /* destination's incarnation */ - __u64 ibm_seq; /* sequence number */ - - union { - kib_connparams_t connparams; - kib_immediate_msg_t immediate; - kib_putreq_msg_t putreq; - kib_putack_msg_t putack; - kib_get_msg_t get; - kib_completion_msg_t completion; - } WIRE_ATTR ibm_u; -} WIRE_ATTR kib_msg_t; - -#define IBNAL_MSG_MAGIC LNET_PROTO_IIB_MAGIC /* unique magic */ -#define IBNAL_MSG_VERSION 2 /* current protocol version */ -#define IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD 1 /* previous version */ - -#define IBNAL_MSG_CONNREQ 0xc0 /* connection request */ -#define IBNAL_MSG_CONNACK 0xc1 /* connection acknowledge */ -#define IBNAL_MSG_NOOP 0xd0 /* nothing (just credits) */ -#define IBNAL_MSG_IMMEDIATE 0xd1 /* immediate */ -#define IBNAL_MSG_PUT_REQ 0xd2 /* putreq (src->sink) */ -#define IBNAL_MSG_PUT_NAK 0xd3 /* completion (sink->src) */ -#define IBNAL_MSG_PUT_ACK 0xd4 /* putack (sink->src) */ -#define IBNAL_MSG_PUT_DONE 0xd5 /* completion (src->sink) */ -#define IBNAL_MSG_GET_REQ 0xd6 /* getreq (sink->src) */ -#define IBNAL_MSG_GET_DONE 0xd7 /* completion (src->sink: all OK) */ - -/* connection rejection reasons */ -#define IBNAL_REJECT_CONN_RACE 0 /* You lost connection race */ -#define IBNAL_REJECT_NO_RESOURCES 1 /* Out of memory/conns etc */ -#define IBNAL_REJECT_FATAL 2 /* Anything else */ - -/***********************************************************************/ - -typedef struct kib_rx /* receive message */ -{ - struct list_head rx_list; /* queue for attention */ - struct kib_conn *rx_conn; /* owning conn */ - int rx_nob; 
/* # bytes received (-1 while posted) */ - __u64 rx_hca_msg; /* pre-mapped buffer (hca vaddr) */ - kib_msg_t *rx_msg; /* pre-mapped buffer (host vaddr) */ - IB_WORK_REQ2 rx_wrq; - IB_LOCAL_DATASEGMENT rx_gl; /* and its memory */ -} kib_rx_t; - -typedef struct kib_tx /* transmit message */ -{ - struct list_head tx_list; /* queue on idle_txs ibc_tx_queue etc. */ - struct kib_conn *tx_conn; /* owning conn */ - int tx_mapped; /* mapped for RDMA? */ - int tx_sending; /* # tx callbacks outstanding */ - int tx_queued; /* queued for sending */ - int tx_waiting; /* waiting for peer */ - int tx_status; /* completion status */ - unsigned long tx_deadline; /* completion deadline */ - __u64 tx_cookie; /* completion cookie */ - lnet_msg_t *tx_lntmsg[2]; /* lnet msgs to finalize on completion */ - kib_msg_t *tx_msg; /* pre-mapped buffer (host vaddr) */ - __u64 tx_hca_msg; /* pre-mapped buffer (HCA vaddr) */ - int tx_nwrq; /* # send work items */ -#if IBNAL_USE_FMR - IB_WORK_REQ2 tx_wrq[2]; /* send work items... */ - IB_LOCAL_DATASEGMENT tx_gl[2]; /* ...and their memory */ - kib_rdma_desc_t tx_rd[1]; /* rdma descriptor */ - kib_md_t tx_md; /* mapping */ - __u64 *tx_pages; /* page phys addrs */ -#else - IB_WORK_REQ2 *tx_wrq; /* send work items... */ - IB_LOCAL_DATASEGMENT *tx_gl; /* ...and their memory */ - kib_rdma_desc_t *tx_rd; /* rdma descriptor (src buffers) */ -#endif -} kib_tx_t; - -typedef struct -{ - /* scratchpad during connection establishment */ - IB_QP_ATTRIBUTES_QUERY cv_qpattrs; - QUERY cv_query; - IB_SERVICE_RECORD cv_svcrec; - IB_PATH_RECORD cv_path; - CM_CONN_INFO cv_cmci; -} kib_connvars_t; - -typedef struct kib_conn -{ - struct kib_peer *ibc_peer; /* owning peer */ - struct list_head ibc_list; /* stash on peer's conn list */ - __u64 ibc_incarnation; /* which instance of the peer */ - __u64 ibc_txseq; /* tx sequence number */ - __u64 ibc_rxseq; /* rx sequence number */ - __u32 ibc_version; /* peer protocol version */ - atomic_t ibc_refcount; /* # users */ - int ibc_state; /* what's happening */ - int ibc_nsends_posted; /* # uncompleted sends */ - int ibc_credits; /* # credits I have */ - int ibc_outstanding_credits; /* # credits to return */ - int ibc_reserved_credits; /* # credits for ACK/DONE msgs */ - unsigned long ibc_last_send; /* time of last send */ - struct list_head ibc_early_rxs; /* rxs completed before ESTABLISHED */ - struct list_head ibc_tx_queue_nocred; /* sends that don't need a cred */ - struct list_head ibc_tx_queue_rsrvd; /* sends that need a reserved cred */ - struct list_head ibc_tx_queue; /* send queue */ - struct list_head ibc_active_txs; /* active tx awaiting completion */ - spinlock_t ibc_lock; /* serialise */ - kib_rx_t *ibc_rxs; /* the rx descs */ - kib_pages_t *ibc_rx_pages; /* premapped rx msg pages */ - IB_HANDLE ibc_qp; /* queue pair */ - IB_HANDLE ibc_cep; /* CM endpoint */ - kib_connvars_t *ibc_cvars; /* connection scratchpad */ -} kib_conn_t; - -#define IBNAL_CONN_INIT_NOTHING 0 /* initial state */ -#define IBNAL_CONN_INIT_QP 1 /* ibc_qp set up */ -#define IBNAL_CONN_CONNECTING 2 /* started to connect */ -#define IBNAL_CONN_ESTABLISHED 3 /* connection established */ -#define IBNAL_CONN_DISCONNECTING 4 /* to send disconnect req */ -#define IBNAL_CONN_DISCONNECTED 5 /* no more QP or CM traffic */ - -/* types of connection */ -#define IBNAL_CONN_ACTIVE 0 /* active connect */ -#define IBNAL_CONN_PASSIVE 1 /* passive connect */ -#define IBNAL_CONN_WAITING 2 /* waiting for connect */ - -typedef struct kib_peer -{ - struct list_head ibp_list; /* stash on global 
peer list */ - struct list_head ibp_connd_list; /* schedule on kib_connd_peers */ - lnet_nid_t ibp_nid; /* who's on the other end(s) */ - atomic_t ibp_refcount; /* # users */ - int ibp_persistence; /* "known" peer refs */ - int ibp_version; /* protocol version */ - struct list_head ibp_conns; /* all active connections */ - struct list_head ibp_tx_queue; /* msgs waiting for a conn */ - int ibp_connecting; /* active connects in progress */ - int ibp_accepting; /* passive connects in progress */ - int ibp_passivewait; /* waiting for peer to connect */ - unsigned long ibp_passivewait_deadline; /* when passive wait must complete */ - unsigned long ibp_reconnect_time; /* when reconnect may be attempted */ - unsigned long ibp_reconnect_interval; /* exponential backoff */ - int ibp_error; /* errno on closing this peer */ - cfs_time_t ibp_last_alive; /* when (in jiffies) I was last alive */ -} kib_peer_t; - - -extern kib_data_t kibnal_data; -extern kib_tunables_t kibnal_tunables; - -/******************************************************************************/ - -/* these are purposely avoiding using local vars so they don't increase - * stack consumption. */ - -#define kibnal_conn_addref(conn) \ -do { \ - CDEBUG(D_NET, "conn[%p] (%d)++\n", \ - (conn), atomic_read(&(conn)->ibc_refcount)); \ - LASSERT(atomic_read(&(conn)->ibc_refcount) > 0); \ - atomic_inc(&(conn)->ibc_refcount); \ -} while (0) - -#define kibnal_conn_decref(conn) \ -do { \ - unsigned long flags; \ - \ - CDEBUG(D_NET, "conn[%p] (%d)--\n", \ - (conn), atomic_read(&(conn)->ibc_refcount)); \ - LASSERT(atomic_read(&(conn)->ibc_refcount) > 0); \ - if (atomic_dec_and_test(&(conn)->ibc_refcount)) { \ - spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags); \ - list_add_tail(&(conn)->ibc_list, \ - &kibnal_data.kib_connd_zombies); \ - wake_up(&kibnal_data.kib_connd_waitq); \ - spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags); \ - } \ -} while (0) - -#define kibnal_peer_addref(peer) \ -do { \ - CDEBUG(D_NET, "peer[%p] -> %s (%d)++\n", \ - (peer), libcfs_nid2str((peer)->ibp_nid), \ - atomic_read (&(peer)->ibp_refcount)); \ - LASSERT(atomic_read(&(peer)->ibp_refcount) > 0); \ - atomic_inc(&(peer)->ibp_refcount); \ -} while (0) - -#define kibnal_peer_decref(peer) \ -do { \ - CDEBUG(D_NET, "peer[%p] -> %s (%d)--\n", \ - (peer), libcfs_nid2str((peer)->ibp_nid), \ - atomic_read (&(peer)->ibp_refcount)); \ - LASSERT(atomic_read(&(peer)->ibp_refcount) > 0); \ - if (atomic_dec_and_test(&(peer)->ibp_refcount)) \ - kibnal_destroy_peer(peer); \ -} while (0) - -/******************************************************************************/ - -static inline struct list_head * -kibnal_nid2peerlist (lnet_nid_t nid) -{ - unsigned int hash = ((unsigned int)nid) % kibnal_data.kib_peer_hash_size; - - return (&kibnal_data.kib_peers [hash]); -} - -static inline int -kibnal_peer_active(kib_peer_t *peer) -{ - /* Am I in the peer hash table? */ - return (!list_empty(&peer->ibp_list)); -} - -static inline int -kibnal_peer_connecting(kib_peer_t *peer) -{ - /* Am I expecting a connection to materialise? 
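(i.e. an active connect, a passive accept or a passive wait is still in progress, any of which must keep the peer from being freed)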
*/ - return (peer->ibp_connecting != 0 || - peer->ibp_accepting != 0 || - peer->ibp_passivewait); -} - -static inline void -kibnal_queue_tx_locked (kib_tx_t *tx, kib_conn_t *conn) -{ - struct list_head *q; - - LASSERT (tx->tx_nwrq > 0); /* work items set up */ - LASSERT (!tx->tx_queued); /* not queued for sending already */ - - tx->tx_queued = 1; - tx->tx_deadline = jiffies + (*kibnal_tunables.kib_timeout * HZ); - - if (tx->tx_conn == NULL) { - kibnal_conn_addref(conn); - tx->tx_conn = conn; - LASSERT (tx->tx_msg->ibm_type != IBNAL_MSG_PUT_DONE); - } else { - LASSERT (tx->tx_conn == conn); - LASSERT (tx->tx_msg->ibm_type == IBNAL_MSG_PUT_DONE); - } - - if (conn->ibc_version == IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD) { - /* All messages have simple credit control */ - q = &conn->ibc_tx_queue; - } else { - LASSERT (conn->ibc_version == IBNAL_MSG_VERSION); - - switch (tx->tx_msg->ibm_type) { - case IBNAL_MSG_PUT_REQ: - case IBNAL_MSG_GET_REQ: - /* RDMA request: reserve a buffer for the RDMA reply - * before sending */ - q = &conn->ibc_tx_queue_rsrvd; - break; - - case IBNAL_MSG_PUT_NAK: - case IBNAL_MSG_PUT_ACK: - case IBNAL_MSG_PUT_DONE: - case IBNAL_MSG_GET_DONE: - /* RDMA reply/completion: no credits; peer has reserved - * a reply buffer */ - q = &conn->ibc_tx_queue_nocred; - break; - - case IBNAL_MSG_NOOP: - case IBNAL_MSG_IMMEDIATE: - /* Otherwise: consume a credit before sending */ - q = &conn->ibc_tx_queue; - break; - - default: - LBUG(); - q = NULL; - } - } - - list_add_tail(&tx->tx_list, q); -} - -static inline int -kibnal_send_keepalive(kib_conn_t *conn) -{ - return (*kibnal_tunables.kib_keepalive > 0) && - time_after(jiffies, conn->ibc_last_send + - *kibnal_tunables.kib_keepalive*HZ); -} - -#define KIBNAL_SERVICE_KEY_MASK (IB_SERVICE_RECORD_COMP_SERVICENAME | \ - IB_SERVICE_RECORD_COMP_SERVICEDATA8_1 | \ - IB_SERVICE_RECORD_COMP_SERVICEDATA8_2 | \ - IB_SERVICE_RECORD_COMP_SERVICEDATA8_3 | \ - IB_SERVICE_RECORD_COMP_SERVICEDATA8_4 | \ - IB_SERVICE_RECORD_COMP_SERVICEDATA8_5 | \ - IB_SERVICE_RECORD_COMP_SERVICEDATA8_6 | \ - IB_SERVICE_RECORD_COMP_SERVICEDATA8_7 | \ - IB_SERVICE_RECORD_COMP_SERVICEDATA8_8) - -static inline __u64* -kibnal_service_nid_field(IB_SERVICE_RECORD *srv) -{ - /* must be consistent with KIBNAL_SERVICE_KEY_MASK */ - return (__u64 *)srv->ServiceData8; -} - -static inline void -kibnal_set_service_keys(IB_SERVICE_RECORD *srv, lnet_nid_t nid) -{ - char *svc_name = *kibnal_tunables.kib_service_name; - - LASSERT (strlen(svc_name) < sizeof(srv->ServiceName)); - memset (srv->ServiceName, 0, sizeof(srv->ServiceName)); - strcpy (srv->ServiceName, svc_name); - - *kibnal_service_nid_field(srv) = cpu_to_le64(nid); -} - -/* CAVEAT EMPTOR: We rely on tx/rx descriptor alignment to allow us to use the - * lowest 2 bits of the work request id to stash the work item type (the op - * field is not valid when the wc completes in error). 
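A completion handler recovers both the descriptor pointer and its type with the helpers below; a minimal dispatch sketch (only kibnal_rx_complete() is a real name here, the other branches are illustrative):

        switch (kibnal_wreqid2type(wc->WorkReqId)) {
        case IBNAL_WID_RX:
                /* kibnal_wreqid2ptr() recovers the kib_rx_t that was posted */
                kibnal_rx_complete(wc, rxseq);
                break;
        case IBNAL_WID_TX:
        case IBNAL_WID_RDMA:
                /* recover the kib_tx_t the same way and complete the send */
                break;
        default:
                LBUG();
        }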
*/ - -#define IBNAL_WID_TX 0 -#define IBNAL_WID_RX 1 -#define IBNAL_WID_RDMA 2 -#define IBNAL_WID_MASK 3UL - -static inline __u64 -kibnal_ptr2wreqid (void *ptr, int type) -{ - unsigned long lptr = (unsigned long)ptr; - - LASSERT ((lptr & IBNAL_WID_MASK) == 0); - LASSERT ((type & ~IBNAL_WID_MASK) == 0); - return (__u64)(lptr | type); -} - -static inline void * -kibnal_wreqid2ptr (__u64 wreqid) -{ - return (void *)(((unsigned long)wreqid) & ~IBNAL_WID_MASK); -} - -static inline int -kibnal_wreqid2type (__u64 wreqid) -{ - return (wreqid & IBNAL_WID_MASK); -} - -static inline void -kibnal_set_conn_state (kib_conn_t *conn, int state) -{ - CDEBUG(D_NET,"%p state %d\n", conn, state); - conn->ibc_state = state; - mb(); -} - -#if IBNAL_USE_FMR - -static inline int -kibnal_rd_size (kib_rdma_desc_t *rd) -{ - return rd->rd_nob; -} - -#else -static inline int -kibnal_rd_size (kib_rdma_desc_t *rd) -{ - int i; - int size; - - for (i = size = 0; i < rd->rd_nfrag; i++) - size += rd->rd_frags[i].rf_nob; - - return size; -} -#endif - -int kibnal_startup (lnet_ni_t *ni); -void kibnal_shutdown (lnet_ni_t *ni); -int kibnal_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg); -int kibnal_send (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg); -int kibnal_eager_recv (lnet_ni_t *ni, void *private, - lnet_msg_t *lntmsg, void **new_private); -int kibnal_recv (lnet_ni_t *ni, void *private, lnet_msg_t *msg, - int delayed, unsigned int niov, - struct iovec *iov, lnet_kiov_t *kiov, - unsigned int offset, unsigned int mlen, unsigned int rlen); -void kibnal_init_msg(kib_msg_t *msg, int type, int body_nob); -void kibnal_pack_msg(kib_msg_t *msg, __u32 version, int credits, - lnet_nid_t dstnid, __u64 dststamp, __u64 seq); -void kibnal_pack_connmsg(kib_msg_t *msg, __u32 version, int nob, int type, - lnet_nid_t dstnid, __u64 dststamp); -int kibnal_unpack_msg(kib_msg_t *msg, __u32 expected_version, int nob); -IB_HANDLE kibnal_create_cep(lnet_nid_t nid); -int kibnal_create_peer (kib_peer_t **peerp, lnet_nid_t nid); -void kibnal_destroy_peer (kib_peer_t *peer); -kib_peer_t *kibnal_find_peer_locked (lnet_nid_t nid); -int kibnal_del_peer (lnet_nid_t nid); -void kibnal_peer_alive (kib_peer_t *peer); -void kibnal_unlink_peer_locked (kib_peer_t *peer); -int kibnal_add_persistent_peer (lnet_nid_t nid); -int kibnal_close_stale_conns_locked (kib_peer_t *peer, - __u64 incarnation); -int kibnal_conn_rts(kib_conn_t *conn, - __u32 qpn, __u8 resp_res, __u8 init_depth, __u32 psn); -kib_conn_t *kibnal_create_conn (lnet_nid_t nid, int proto_version); -void kibnal_destroy_conn (kib_conn_t *conn); -void kibnal_listen_callback(IB_HANDLE cep, CM_CONN_INFO *info, void *arg); -int kibnal_alloc_pages (kib_pages_t **pp, int npages); -void kibnal_free_pages (kib_pages_t *p); -void kibnal_queue_tx (kib_tx_t *tx, kib_conn_t *conn); -void kibnal_txlist_done (struct list_head *txlist, int status); -int kibnal_post_receives (kib_conn_t *conn); -int kibnal_init_rdma (kib_tx_t *tx, int type, int nob, - kib_rdma_desc_t *dstrd, __u64 dstcookie); -void kibnal_check_sends (kib_conn_t *conn); -void kibnal_close_conn_locked (kib_conn_t *conn, int error); -int kibnal_thread_start (int (*fn)(void *arg), void *arg); -int kibnal_scheduler(void *arg); -int kibnal_connd (void *arg); -void kibnal_init_tx_msg (kib_tx_t *tx, int type, int body_nob); -void kibnal_close_conn (kib_conn_t *conn, int why); -void kibnal_start_active_rdma (int type, int status, - kib_rx_t *rx, lnet_msg_t *lntmsg, - unsigned int niov, - struct iovec *iov, lnet_kiov_t *kiov, - unsigned int offset, 
unsigned int nob); -void kibnal_hca_async_callback (void *hca_arg, IB_EVENT_RECORD *ev); -void kibnal_hca_callback (void *hca_arg, void *cq_arg); -int kibnal_tunables_init (void); -void kibnal_tunables_fini (void); diff --git a/lnet/klnds/iiblnd/iiblnd_cb.c b/lnet/klnds/iiblnd/iiblnd_cb.c deleted file mode 100644 index 1726f18..0000000 --- a/lnet/klnds/iiblnd/iiblnd_cb.c +++ /dev/null @@ -1,3416 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf - * - * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, - * CA 95054 USA or visit www.sun.com if you need additional information or - * have any questions. - * - * GPL HEADER END - */ -/* - * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lnet/klnds/iiblnd/iiblnd_cb.c - * - * Author: Eric Barton - */ - -#include "iiblnd.h" - -void -hexdump(char *string, void *ptr, int len) -{ - unsigned char *c = ptr; - int i; - - return; - - if (len < 0 || len > 2048) { - printk("XXX what the hell? 
%d\n",len); - return; - } - - printk("%d bytes of '%s' from 0x%p\n", len, string, ptr); - - for (i = 0; i < len;) { - printk("%02x",*(c++)); - i++; - if (!(i & 15)) { - printk("\n"); - } else if (!(i&1)) { - printk(" "); - } - } - - if(len & 15) { - printk("\n"); - } -} - -void -kibnal_tx_done (kib_tx_t *tx) -{ - lnet_msg_t *lntmsg[2]; - int rc = tx->tx_status; - int i; - - LASSERT (!in_interrupt()); - LASSERT (!tx->tx_queued); /* mustn't be queued for sending */ - LASSERT (tx->tx_sending == 0); /* mustn't be awaiting sent callback */ - LASSERT (!tx->tx_waiting); /* mustn't be awaiting peer response */ - -#if IBNAL_USE_FMR - /* Handle unmapping if required */ -#endif - /* tx may have up to 2 lnet msgs to finalise */ - lntmsg[0] = tx->tx_lntmsg[0]; tx->tx_lntmsg[0] = NULL; - lntmsg[1] = tx->tx_lntmsg[1]; tx->tx_lntmsg[1] = NULL; - - if (tx->tx_conn != NULL) { - kibnal_conn_decref(tx->tx_conn); - tx->tx_conn = NULL; - } - - tx->tx_nwrq = 0; - tx->tx_status = 0; - - spin_lock(&kibnal_data.kib_tx_lock); - - list_add (&tx->tx_list, &kibnal_data.kib_idle_txs); - - spin_unlock(&kibnal_data.kib_tx_lock); - - /* delay finalize until my descs have been freed */ - for (i = 0; i < 2; i++) { - if (lntmsg[i] == NULL) - continue; - - lnet_finalize (kibnal_data.kib_ni, lntmsg[i], rc); - } -} - -kib_tx_t * -kibnal_get_idle_tx (void) -{ - kib_tx_t *tx; - - spin_lock(&kibnal_data.kib_tx_lock); - - if (list_empty (&kibnal_data.kib_idle_txs)) { - spin_unlock(&kibnal_data.kib_tx_lock); - return NULL; - } - - tx = list_entry (kibnal_data.kib_idle_txs.next, kib_tx_t, tx_list); - list_del (&tx->tx_list); - - /* Allocate a new completion cookie. It might not be needed, - * but we've got a lock right now and we're unlikely to - * wrap... */ - tx->tx_cookie = kibnal_data.kib_next_tx_cookie++; - - spin_unlock(&kibnal_data.kib_tx_lock); - - LASSERT (tx->tx_nwrq == 0); - LASSERT (!tx->tx_queued); - LASSERT (tx->tx_sending == 0); - LASSERT (!tx->tx_waiting); - LASSERT (tx->tx_status == 0); - LASSERT (tx->tx_conn == NULL); - LASSERT (tx->tx_lntmsg[0] == NULL); - LASSERT (tx->tx_lntmsg[1] == NULL); - - return tx; -} - -int -kibnal_post_rx (kib_rx_t *rx, int credit, int rsrvd_credit) -{ - kib_conn_t *conn = rx->rx_conn; - int rc = 0; - FSTATUS frc; - - LASSERT (!in_interrupt()); - /* old peers don't reserve rxs for RDMA replies */ - LASSERT (!rsrvd_credit || - conn->ibc_version != IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD); - - rx->rx_gl = (IB_LOCAL_DATASEGMENT) { - .Address = rx->rx_hca_msg, - .Lkey = kibnal_data.kib_whole_mem.md_lkey, - .Length = IBNAL_MSG_SIZE, - }; - - rx->rx_wrq = (IB_WORK_REQ2) { - .Next = NULL, - .WorkReqId = kibnal_ptr2wreqid(rx, IBNAL_WID_RX), - .MessageLen = IBNAL_MSG_SIZE, - .DSList = &rx->rx_gl, - .DSListDepth = 1, - .Operation = WROpRecv, - }; - - LASSERT (conn->ibc_state >= IBNAL_CONN_CONNECTING); - LASSERT (rx->rx_nob >= 0); /* not posted */ - - CDEBUG(D_NET, "posting rx [%d %x "LPX64"]\n", - rx->rx_wrq.DSList->Length, - rx->rx_wrq.DSList->Lkey, - rx->rx_wrq.DSList->Address); - - if (conn->ibc_state > IBNAL_CONN_ESTABLISHED) { - /* No more posts for this rx; so lose its ref */ - kibnal_conn_decref(conn); - return 0; - } - - rx->rx_nob = -1; /* flag posted */ - mb(); - - frc = iba_post_recv2(conn->ibc_qp, &rx->rx_wrq, NULL); - if (frc == FSUCCESS) { - if (credit || rsrvd_credit) { - spin_lock(&conn->ibc_lock); - - if (credit) - conn->ibc_outstanding_credits++; - if (rsrvd_credit) - conn->ibc_reserved_credits++; - - spin_unlock(&conn->ibc_lock); - - kibnal_check_sends(conn); - } - return 0; - } - - 
CERROR ("post rx -> %s failed %d\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid), frc); - rc = -EIO; - kibnal_close_conn(rx->rx_conn, rc); - /* No more posts for this rx; so lose its ref */ - kibnal_conn_decref(conn); - return rc; -} - -int -kibnal_post_receives (kib_conn_t *conn) -{ - int i; - int rc; - - LASSERT (conn->ibc_state == IBNAL_CONN_CONNECTING); - - for (i = 0; i < IBNAL_RX_MSGS; i++) { - /* +1 ref for rx desc. This ref remains until kibnal_post_rx - * fails (i.e. actual failure or we're disconnecting) */ - kibnal_conn_addref(conn); - rc = kibnal_post_rx (&conn->ibc_rxs[i], 0, 0); - if (rc != 0) - return rc; - } - - return 0; -} - -kib_tx_t * -kibnal_find_waiting_tx_locked(kib_conn_t *conn, int txtype, __u64 cookie) -{ - struct list_head *tmp; - - list_for_each(tmp, &conn->ibc_active_txs) { - kib_tx_t *tx = list_entry(tmp, kib_tx_t, tx_list); - - LASSERT (!tx->tx_queued); - LASSERT (tx->tx_sending != 0 || tx->tx_waiting); - - if (tx->tx_cookie != cookie) - continue; - - if (tx->tx_waiting && - tx->tx_msg->ibm_type == txtype) - return tx; - - CWARN("Bad completion: %swaiting, type %x (wanted %x)\n", - tx->tx_waiting ? "" : "NOT ", - tx->tx_msg->ibm_type, txtype); - } - return NULL; -} - -void -kibnal_handle_completion(kib_conn_t *conn, int txtype, int status, __u64 cookie) -{ - kib_tx_t *tx; - int idle; - - spin_lock(&conn->ibc_lock); - - tx = kibnal_find_waiting_tx_locked(conn, txtype, cookie); - if (tx == NULL) { - spin_unlock(&conn->ibc_lock); - - CWARN("Unmatched completion type %x cookie "LPX64" from %s\n", - txtype, cookie, libcfs_nid2str(conn->ibc_peer->ibp_nid)); - kibnal_close_conn (conn, -EPROTO); - return; - } - - if (tx->tx_status == 0) { /* success so far */ - if (status < 0) { /* failed? */ - tx->tx_status = status; - } else if (txtype == IBNAL_MSG_GET_REQ) { - lnet_set_reply_msg_len(kibnal_data.kib_ni, - tx->tx_lntmsg[1], status); - } - } - - tx->tx_waiting = 0; - - idle = !tx->tx_queued && (tx->tx_sending == 0); - if (idle) - list_del(&tx->tx_list); - - spin_unlock(&conn->ibc_lock); - - if (idle) - kibnal_tx_done(tx); -} - -void -kibnal_send_completion (kib_conn_t *conn, int type, int status, __u64 cookie) -{ - kib_tx_t *tx = kibnal_get_idle_tx(); - - if (tx == NULL) { - CERROR("Can't get tx for completion %x for %s\n", - type, libcfs_nid2str(conn->ibc_peer->ibp_nid)); - return; - } - - tx->tx_msg->ibm_u.completion.ibcm_status = status; - tx->tx_msg->ibm_u.completion.ibcm_cookie = cookie; - kibnal_init_tx_msg(tx, type, sizeof(kib_completion_msg_t)); - - kibnal_queue_tx(tx, conn); -} - -void -kibnal_handle_rx (kib_rx_t *rx) -{ - kib_msg_t *msg = rx->rx_msg; - kib_conn_t *conn = rx->rx_conn; - int credits = msg->ibm_credits; - kib_tx_t *tx; - int rc = 0; - int repost = 1; - int rsrvd_credit = 0; - int rc2; - - LASSERT (conn->ibc_state >= IBNAL_CONN_ESTABLISHED); - - CDEBUG (D_NET, "Received %x[%d] from %s\n", - msg->ibm_type, credits, libcfs_nid2str(conn->ibc_peer->ibp_nid)); - - if (credits != 0) { - /* Have I received credits that will let me send? 
*/ - spin_lock(&conn->ibc_lock); - conn->ibc_credits += credits; - spin_unlock(&conn->ibc_lock); - - kibnal_check_sends(conn); - } - - switch (msg->ibm_type) { - default: - CERROR("Bad IBNAL message type %x from %s\n", - msg->ibm_type, libcfs_nid2str(conn->ibc_peer->ibp_nid)); - rc = -EPROTO; - break; - - case IBNAL_MSG_NOOP: - break; - - case IBNAL_MSG_IMMEDIATE: - rc = lnet_parse(kibnal_data.kib_ni, &msg->ibm_u.immediate.ibim_hdr, - msg->ibm_srcnid, rx, 0); - repost = rc < 0; /* repost on error */ - break; - - case IBNAL_MSG_PUT_REQ: - rc = lnet_parse(kibnal_data.kib_ni, &msg->ibm_u.putreq.ibprm_hdr, - msg->ibm_srcnid, rx, 1); - repost = rc < 0; /* repost on error */ - break; - - case IBNAL_MSG_PUT_NAK: - rsrvd_credit = 1; /* rdma reply (was pre-reserved) */ - - CWARN ("PUT_NACK from %s\n", libcfs_nid2str(conn->ibc_peer->ibp_nid)); - kibnal_handle_completion(conn, IBNAL_MSG_PUT_REQ, - msg->ibm_u.completion.ibcm_status, - msg->ibm_u.completion.ibcm_cookie); - break; - - case IBNAL_MSG_PUT_ACK: - rsrvd_credit = 1; /* rdma reply (was pre-reserved) */ - - spin_lock(&conn->ibc_lock); - tx = kibnal_find_waiting_tx_locked(conn, IBNAL_MSG_PUT_REQ, - msg->ibm_u.putack.ibpam_src_cookie); - if (tx != NULL) - list_del(&tx->tx_list); - spin_unlock(&conn->ibc_lock); - - if (tx == NULL) { - CERROR("Unmatched PUT_ACK from %s\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid)); - rc = -EPROTO; - break; - } - - LASSERT (tx->tx_waiting); - /* CAVEAT EMPTOR: I could be racing with tx_complete, but... - * (a) I can overwrite tx_msg since my peer has received it! - * (b) tx_waiting set tells tx_complete() it's not done. */ - - tx->tx_nwrq = 0; /* overwrite PUT_REQ */ - - rc2 = kibnal_init_rdma(tx, IBNAL_MSG_PUT_DONE, - kibnal_rd_size(&msg->ibm_u.putack.ibpam_rd), - &msg->ibm_u.putack.ibpam_rd, - msg->ibm_u.putack.ibpam_dst_cookie); - if (rc2 < 0) - CERROR("Can't setup rdma for PUT to %s: %d\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid), rc2); - - spin_lock(&conn->ibc_lock); - if (tx->tx_status == 0 && rc2 < 0) - tx->tx_status = rc2; - tx->tx_waiting = 0; /* clear waiting and queue atomically */ - kibnal_queue_tx_locked(tx, conn); - spin_unlock(&conn->ibc_lock); - break; - - case IBNAL_MSG_PUT_DONE: - /* This buffer was pre-reserved by not returning the credit - * when the PUT_REQ's buffer was reposted, so I just return it - * now */ - kibnal_handle_completion(conn, IBNAL_MSG_PUT_ACK, - msg->ibm_u.completion.ibcm_status, - msg->ibm_u.completion.ibcm_cookie); - break; - - case IBNAL_MSG_GET_REQ: - rc = lnet_parse(kibnal_data.kib_ni, &msg->ibm_u.get.ibgm_hdr, - msg->ibm_srcnid, rx, 1); - repost = rc < 0; /* repost on error */ - break; - - case IBNAL_MSG_GET_DONE: - rsrvd_credit = 1; /* rdma reply (was pre-reserved) */ - - kibnal_handle_completion(conn, IBNAL_MSG_GET_REQ, - msg->ibm_u.completion.ibcm_status, - msg->ibm_u.completion.ibcm_cookie); - break; - } - - if (rc < 0) /* protocol error */ - kibnal_close_conn(conn, rc); - - if (repost) { - if (conn->ibc_version == IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD) - rsrvd_credit = 0; /* peer isn't pre-reserving */ - - kibnal_post_rx(rx, !rsrvd_credit, rsrvd_credit); - } -} - -void -kibnal_rx_complete (IB_WORK_COMPLETION *wc, __u64 rxseq) -{ - kib_rx_t *rx = (kib_rx_t *)kibnal_wreqid2ptr(wc->WorkReqId); - int nob = wc->Length; - kib_msg_t *msg = rx->rx_msg; - kib_conn_t *conn = rx->rx_conn; - unsigned long flags; - int rc; - int err = -EIO; - - LASSERT (rx->rx_nob < 0); /* was posted */ - rx->rx_nob = 0; /* isn't now */ - mb(); - - /* receives complete with error in any case 
after we've started - * disconnecting */ - if (conn->ibc_state > IBNAL_CONN_ESTABLISHED) - goto ignore; - - if (wc->Status != WRStatusSuccess) { - CERROR("Rx from %s failed: %d\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid), wc->Status); - goto failed; - } - - rc = kibnal_unpack_msg(msg, conn->ibc_version, nob); - if (rc != 0) { - CERROR ("Error %d unpacking rx from %s\n", - rc, libcfs_nid2str(conn->ibc_peer->ibp_nid)); - goto failed; - } - - rx->rx_nob = nob; /* Now I know nob > 0 */ - mb(); - - if (msg->ibm_srcnid != conn->ibc_peer->ibp_nid || - msg->ibm_dstnid != kibnal_data.kib_ni->ni_nid || - msg->ibm_srcstamp != conn->ibc_incarnation || - msg->ibm_dststamp != kibnal_data.kib_incarnation) { - CERROR ("Stale rx from %s\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid)); - err = -ESTALE; - goto failed; - } - - if (msg->ibm_seq != rxseq) { - CERROR ("Out-of-sequence rx from %s" - ": got "LPD64" but expected "LPD64"\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid), - msg->ibm_seq, rxseq); - goto failed; - } - - /* set time last known alive */ - kibnal_peer_alive(conn->ibc_peer); - - /* racing with connection establishment/teardown! */ - - if (conn->ibc_state < IBNAL_CONN_ESTABLISHED) { - write_lock_irqsave(&kibnal_data.kib_global_lock, flags); - /* must check holding global lock to eliminate race */ - if (conn->ibc_state < IBNAL_CONN_ESTABLISHED) { - list_add_tail(&rx->rx_list, &conn->ibc_early_rxs); - write_unlock_irqrestore(&kibnal_data.kib_global_lock, - flags); - return; - } - write_unlock_irqrestore(&kibnal_data.kib_global_lock, - flags); - } - kibnal_handle_rx(rx); - return; - - failed: - kibnal_close_conn(conn, err); - ignore: - /* Don't re-post rx & drop its ref on conn */ - kibnal_conn_decref(conn); -} - -struct page * -kibnal_kvaddr_to_page (unsigned long vaddr) -{ - struct page *page; - - if (vaddr >= VMALLOC_START && - vaddr < VMALLOC_END) { - page = vmalloc_to_page ((void *)vaddr); - LASSERT (page != NULL); - return page; - } -#ifdef CONFIG_HIGHMEM - if (vaddr >= PKMAP_BASE && - vaddr < (PKMAP_BASE + LAST_PKMAP * PAGE_SIZE)) { - /* No highmem pages only used for bulk (kiov) I/O */ - CERROR("find page for address in highmem\n"); - LBUG(); - } -#endif - page = virt_to_page (vaddr); - LASSERT (page != NULL); - return page; -} - -#if !IBNAL_USE_FMR -int -kibnal_append_rdfrag(kib_rdma_desc_t *rd, int active, struct page *page, - unsigned long page_offset, unsigned long len) -{ - kib_rdma_frag_t *frag = &rd->rd_frags[rd->rd_nfrag]; - - if (rd->rd_nfrag >= IBNAL_MAX_RDMA_FRAGS) { - CERROR ("Too many RDMA fragments\n"); - return -EMSGSIZE; - } - - if (active) { - if (rd->rd_nfrag == 0) - rd->rd_key = kibnal_data.kib_whole_mem.md_lkey; - } else { - if (rd->rd_nfrag == 0) - rd->rd_key = kibnal_data.kib_whole_mem.md_rkey; - } - - frag->rf_nob = len; - frag->rf_addr = kibnal_data.kib_whole_mem.md_addr + - lnet_page2phys(page) + page_offset; - - CDEBUG(D_NET,"map key %x frag [%d]["LPX64" for %d]\n", - rd->rd_key, rd->rd_nfrag, frag->rf_addr, frag->rf_nob); - - rd->rd_nfrag++; - return 0; -} - -int -kibnal_setup_rd_iov(kib_tx_t *tx, kib_rdma_desc_t *rd, int active, - unsigned int niov, struct iovec *iov, int offset, int nob) - -{ - int fragnob; - int rc; - unsigned long vaddr; - struct page *page; - int page_offset; - - LASSERT (nob > 0); - LASSERT (niov > 0); - LASSERT ((rd != tx->tx_rd) == !active); - - while (offset >= iov->iov_len) { - offset -= iov->iov_len; - niov--; - iov++; - LASSERT (niov > 0); - } - - rd->rd_nfrag = 0; - do { - LASSERT (niov > 0); - - vaddr = ((unsigned 
long)iov->iov_base) + offset; - page_offset = vaddr & (PAGE_SIZE - 1); - page = kibnal_kvaddr_to_page(vaddr); - if (page == NULL) { - CERROR ("Can't find page\n"); - return -EFAULT; - } - - fragnob = min((int)(iov->iov_len - offset), nob); - fragnob = min(fragnob, (int)PAGE_SIZE - page_offset); - - rc = kibnal_append_rdfrag(rd, active, page, - page_offset, fragnob); - if (rc != 0) - return rc; - - if (offset + fragnob < iov->iov_len) { - offset += fragnob; - } else { - offset = 0; - iov++; - niov--; - } - nob -= fragnob; - } while (nob > 0); - - return 0; -} - -int -kibnal_setup_rd_kiov (kib_tx_t *tx, kib_rdma_desc_t *rd, int active, - int nkiov, lnet_kiov_t *kiov, int offset, int nob) -{ - int fragnob; - int rc; - - CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob); - - LASSERT (nob > 0); - LASSERT (nkiov > 0); - LASSERT ((rd != tx->tx_rd) == !active); - - while (offset >= kiov->kiov_len) { - offset -= kiov->kiov_len; - nkiov--; - kiov++; - LASSERT (nkiov > 0); - } - - rd->rd_nfrag = 0; - do { - LASSERT (nkiov > 0); - fragnob = min((int)(kiov->kiov_len - offset), nob); - - rc = kibnal_append_rdfrag(rd, active, kiov->kiov_page, - kiov->kiov_offset + offset, - fragnob); - if (rc != 0) - return rc; - - offset = 0; - kiov++; - nkiov--; - nob -= fragnob; - } while (nob > 0); - - return 0; -} -#else -int -kibnal_map_tx (kib_tx_t *tx, kib_rdma_desc_t *rd, int active, - int npages, unsigned long page_offset, int nob) -{ - IB_ACCESS_CONTROL access = {0,}; - FSTATUS frc; - - LASSERT ((rd != tx->tx_rd) == !active); - LASSERT (!tx->tx_md.md_active); - LASSERT (tx->tx_md.md_fmrcount > 0); - LASSERT (page_offset < PAGE_SIZE); - LASSERT (npages >= (1 + ((page_offset + nob - 1)>>PAGE_SHIFT))); - LASSERT (npages <= LNET_MAX_IOV); - - if (!active) { - // access.s.MWBindable = 1; - access.s.LocalWrite = 1; - access.s.RdmaWrite = 1; - } - - /* Map the memory described by tx->tx_pages - frc = iibt_register_physical_memory(kibnal_data.kib_hca, - IBNAL_RDMA_BASE, - tx->tx_pages, npages, - page_offset, - kibnal_data.kib_pd, - access, - &tx->tx_md.md_handle, - &tx->tx_md.md_addr, - &tx->tx_md.md_lkey, - &tx->tx_md.md_rkey); - */ - return -EINVAL; -} - -int -kibnal_setup_rd_iov (kib_tx_t *tx, kib_rdma_desc_t *rd, int active, - unsigned int niov, struct iovec *iov, int offset, int nob) - -{ - int resid; - int fragnob; - struct page *page; - int npages; - unsigned long page_offset; - unsigned long vaddr; - - LASSERT (nob > 0); - LASSERT (niov > 0); - - while (offset >= iov->iov_len) { - offset -= iov->iov_len; - niov--; - iov++; - LASSERT (niov > 0); - } - - if (nob > iov->iov_len - offset) { - CERROR ("Can't map multiple vaddr fragments\n"); - return (-EMSGSIZE); - } - - vaddr = ((unsigned long)iov->iov_base) + offset; - - page_offset = vaddr & (PAGE_SIZE - 1); - resid = nob; - npages = 0; - - do { - LASSERT (npages < LNET_MAX_IOV); - - page = kibnal_kvaddr_to_page(vaddr); - if (page == NULL) { - CERROR("Can't find page for %lu\n", vaddr); - return -EFAULT; - } - - tx->tx_pages[npages++] = lnet_page2phys(page); - - fragnob = PAGE_SIZE - (vaddr & (PAGE_SIZE - 1)); - vaddr += fragnob; - resid -= fragnob; - - } while (resid > 0); - - return kibnal_map_tx(tx, rd, active, npages, page_offset, nob); -} - -int -kibnal_setup_rd_kiov (kib_tx_t *tx, kib_rdma_desc_t *rd, int active, - int nkiov, lnet_kiov_t *kiov, int offset, int nob) -{ - int resid; - int npages; - unsigned long page_offset; - - CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob); - - LASSERT (nob > 0); - LASSERT (nkiov > 0); - 
LASSERT (nkiov <= LNET_MAX_IOV); - LASSERT (!tx->tx_md.md_active); - LASSERT ((rd != tx->tx_rd) == !active); - - while (offset >= kiov->kiov_len) { - offset -= kiov->kiov_len; - nkiov--; - kiov++; - LASSERT (nkiov > 0); - } - - page_offset = kiov->kiov_offset + offset; - - resid = offset + nob; - npages = 0; - - do { - LASSERT (npages < LNET_MAX_IOV); - LASSERT (nkiov > 0); - - if ((npages > 0 && kiov->kiov_offset != 0) || - (resid > kiov->kiov_len && - (kiov->kiov_offset + kiov->kiov_len) != PAGE_SIZE)) { - /* Can't have gaps */ - CERROR ("Can't make payload contiguous in I/O VM:" - "page %d, offset %d, len %d \n", - npages, kiov->kiov_offset, kiov->kiov_len); - - return -EINVAL; - } - - tx->tx_pages[npages++] = lnet_page2phys(kiov->kiov_page); - resid -= kiov->kiov_len; - kiov++; - nkiov--; - } while (resid > 0); - - return kibnal_map_tx(tx, rd, active, npages, page_offset, nob); -} -#endif - -kib_conn_t * -kibnal_find_conn_locked (kib_peer_t *peer) -{ - struct list_head *tmp; - - /* just return the first connection */ - list_for_each (tmp, &peer->ibp_conns) { - return (list_entry(tmp, kib_conn_t, ibc_list)); - } - - return (NULL); -} - -void -kibnal_check_sends (kib_conn_t *conn) -{ - kib_tx_t *tx; - FSTATUS frc; - int rc; - int consume_cred; - int done; - - LASSERT (conn->ibc_state >= IBNAL_CONN_ESTABLISHED); - - spin_lock(&conn->ibc_lock); - - LASSERT (conn->ibc_nsends_posted <= - *kibnal_tunables.kib_concurrent_sends); - LASSERT (conn->ibc_reserved_credits >= 0); - - while (conn->ibc_reserved_credits > 0 && - !list_empty(&conn->ibc_tx_queue_rsrvd)) { - LASSERT (conn->ibc_version != - IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD); - tx = list_entry(conn->ibc_tx_queue_rsrvd.next, - kib_tx_t, tx_list); - list_del(&tx->tx_list); - list_add_tail(&tx->tx_list, &conn->ibc_tx_queue); - conn->ibc_reserved_credits--; - } - - if (list_empty(&conn->ibc_tx_queue) && - list_empty(&conn->ibc_tx_queue_nocred) && - (conn->ibc_outstanding_credits >= IBNAL_CREDIT_HIGHWATER || - kibnal_send_keepalive(conn))) { - spin_unlock(&conn->ibc_lock); - - tx = kibnal_get_idle_tx(); - if (tx != NULL) - kibnal_init_tx_msg(tx, IBNAL_MSG_NOOP, 0); - - spin_lock(&conn->ibc_lock); - - if (tx != NULL) - kibnal_queue_tx_locked(tx, conn); - } - - for (;;) { - if (!list_empty(&conn->ibc_tx_queue_nocred)) { - LASSERT (conn->ibc_version != - IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD); - tx = list_entry (conn->ibc_tx_queue_nocred.next, - kib_tx_t, tx_list); - consume_cred = 0; - } else if (!list_empty (&conn->ibc_tx_queue)) { - tx = list_entry (conn->ibc_tx_queue.next, - kib_tx_t, tx_list); - consume_cred = 1; - } else { - /* nothing waiting */ - break; - } - - LASSERT (tx->tx_queued); - /* We rely on this for QP sizing */ - LASSERT (tx->tx_nwrq > 0 && tx->tx_nwrq <= 1 + IBNAL_MAX_RDMA_FRAGS); - - LASSERT (conn->ibc_outstanding_credits >= 0); - LASSERT (conn->ibc_outstanding_credits <= IBNAL_MSG_QUEUE_SIZE); - LASSERT (conn->ibc_credits >= 0); - LASSERT (conn->ibc_credits <= IBNAL_MSG_QUEUE_SIZE); - - if (conn->ibc_nsends_posted == - *kibnal_tunables.kib_concurrent_sends) { - /* We've got some tx completions outstanding... 
*/ - CDEBUG(D_NET, "%s: posted enough\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid)); - break; - } - - if (consume_cred) { - if (conn->ibc_credits == 0) { /* no credits */ - CDEBUG(D_NET, "%s: no credits\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid)); - break; - } - - if (conn->ibc_credits == 1 && /* last credit reserved for */ - conn->ibc_outstanding_credits == 0) { /* giving back credits */ - CDEBUG(D_NET, "%s: not using last credit\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid)); - break; - } - } - - list_del (&tx->tx_list); - tx->tx_queued = 0; - - /* NB don't drop ibc_lock before bumping tx_sending */ - - if (tx->tx_msg->ibm_type == IBNAL_MSG_NOOP && - (!list_empty(&conn->ibc_tx_queue) || - !list_empty(&conn->ibc_tx_queue_nocred) || - (conn->ibc_outstanding_credits < IBNAL_CREDIT_HIGHWATER && - !kibnal_send_keepalive(conn)))) { - /* redundant NOOP */ - spin_unlock(&conn->ibc_lock); - kibnal_tx_done(tx); - spin_lock(&conn->ibc_lock); - CDEBUG(D_NET, "%s: redundant noop\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid)); - continue; - } - - kibnal_pack_msg(tx->tx_msg, conn->ibc_version, - conn->ibc_outstanding_credits, - conn->ibc_peer->ibp_nid, conn->ibc_incarnation, - conn->ibc_txseq); - - conn->ibc_txseq++; - conn->ibc_outstanding_credits = 0; - conn->ibc_nsends_posted++; - if (consume_cred) - conn->ibc_credits--; - - /* CAVEAT EMPTOR! This tx could be the PUT_DONE of an RDMA - * PUT. If so, it was first queued here as a PUT_REQ, sent and - * stashed on ibc_active_txs, matched by an incoming PUT_ACK, - * and then re-queued here. It's (just) possible that - * tx_sending is non-zero if we've not done the tx_complete() from - * the first send; hence the ++ rather than = below. */ - tx->tx_sending++; - - list_add (&tx->tx_list, &conn->ibc_active_txs); - - LASSERT (tx->tx_nwrq > 0); - - rc = 0; - frc = FSUCCESS; - if (conn->ibc_state != IBNAL_CONN_ESTABLISHED) { - rc = -ECONNABORTED; - } else { - frc = iba_post_send2(conn->ibc_qp, tx->tx_wrq, NULL); - if (frc != FSUCCESS) - rc = -EIO; - } - - conn->ibc_last_send = jiffies; - - if (rc != 0) { - /* NB credits are transferred in the actual - * message, which can only be the last work item */ - conn->ibc_outstanding_credits += tx->tx_msg->ibm_credits; - if (consume_cred) - conn->ibc_credits++; - conn->ibc_nsends_posted--; - - tx->tx_status = rc; - tx->tx_waiting = 0; - tx->tx_sending--; - - done = (tx->tx_sending == 0); - if (done) - list_del (&tx->tx_list); - - spin_unlock(&conn->ibc_lock); - - if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) - CERROR ("Error %d posting transmit to %s\n", - frc, libcfs_nid2str(conn->ibc_peer->ibp_nid)); - else - CDEBUG (D_NET, "Error %d posting transmit to %s\n", - rc, libcfs_nid2str(conn->ibc_peer->ibp_nid)); - - kibnal_close_conn (conn, rc); - - if (done) - kibnal_tx_done (tx); - return; - } - } - - spin_unlock(&conn->ibc_lock); -} - -void -kibnal_tx_complete (IB_WORK_COMPLETION *wc) -{ - kib_tx_t *tx = (kib_tx_t *)kibnal_wreqid2ptr(wc->WorkReqId); - kib_conn_t *conn = tx->tx_conn; - int failed = wc->Status != WRStatusSuccess; - int idle; - - CDEBUG(D_NET, "%s: sending %d nwrq %d status %d\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid), - tx->tx_sending, tx->tx_nwrq, wc->Status); - - LASSERT (tx->tx_sending > 0); - - if (failed && - tx->tx_status == 0 && - conn->ibc_state == IBNAL_CONN_ESTABLISHED) { -#if KIBLND_DETAILED_DEBUG - int i; - IB_WORK_REQ2 *wrq = &tx->tx_wrq[0]; - IB_LOCAL_DATASEGMENT *gl = &tx->tx_gl[0]; - lnet_msg_t *lntmsg = tx->tx_lntmsg[0]; -#endif - CDEBUG(D_NETERROR, "tx -> %s type %x cookie 
"LPX64 - " sending %d waiting %d failed %d nwrk %d\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid), - tx->tx_msg->ibm_type, tx->tx_cookie, - tx->tx_sending, tx->tx_waiting, wc->Status, - tx->tx_nwrq); -#if KIBLND_DETAILED_DEBUG - for (i = 0; i < tx->tx_nwrq; i++, wrq++, gl++) { - switch (wrq->Operation) { - default: - CDEBUG(D_NETERROR, " [%3d] Addr %p Next %p OP %d " - "DSList %p(%p)/%d: "LPX64"/%d K %x\n", - i, wrq, wrq->Next, wrq->Operation, - wrq->DSList, gl, wrq->DSListDepth, - gl->Address, gl->Length, gl->Lkey); - break; - case WROpSend: - CDEBUG(D_NETERROR, " [%3d] Addr %p Next %p SEND " - "DSList %p(%p)/%d: "LPX64"/%d K %x\n", - i, wrq, wrq->Next, - wrq->DSList, gl, wrq->DSListDepth, - gl->Address, gl->Length, gl->Lkey); - break; - case WROpRdmaWrite: - CDEBUG(D_NETERROR, " [%3d] Addr %p Next %p DMA " - "DSList: %p(%p)/%d "LPX64"/%d K %x -> " - LPX64" K %x\n", - i, wrq, wrq->Next, - wrq->DSList, gl, wrq->DSListDepth, - gl->Address, gl->Length, gl->Lkey, - wrq->Req.SendRC.RemoteDS.Address, - wrq->Req.SendRC.RemoteDS.Rkey); - break; - } - } - - switch (tx->tx_msg->ibm_type) { - default: - CDEBUG(D_NETERROR, " msg type %x %p/%d, No RDMA\n", - tx->tx_msg->ibm_type, - tx->tx_msg, tx->tx_msg->ibm_nob); - break; - - case IBNAL_MSG_PUT_DONE: - case IBNAL_MSG_GET_DONE: - CDEBUG(D_NETERROR, " msg type %x %p/%d, RDMA key %x frags %d...\n", - tx->tx_msg->ibm_type, - tx->tx_msg, tx->tx_msg->ibm_nob, - tx->tx_rd->rd_key, tx->tx_rd->rd_nfrag); - for (i = 0; i < tx->tx_rd->rd_nfrag; i++) - CDEBUG(D_NETERROR, " [%d] "LPX64"/%d\n", i, - tx->tx_rd->rd_frags[i].rf_addr, - tx->tx_rd->rd_frags[i].rf_nob); - if (lntmsg == NULL) { - CDEBUG(D_NETERROR, " No lntmsg\n"); - } else if (lntmsg->msg_iov != NULL) { - CDEBUG(D_NETERROR, " lntmsg in %d VIRT frags...\n", - lntmsg->msg_niov); - for (i = 0; i < lntmsg->msg_niov; i++) - CDEBUG(D_NETERROR, " [%d] %p/%d\n", i, - lntmsg->msg_iov[i].iov_base, - lntmsg->msg_iov[i].iov_len); - } else if (lntmsg->msg_kiov != NULL) { - CDEBUG(D_NETERROR, " lntmsg in %d PAGE frags...\n", - lntmsg->msg_niov); - for (i = 0; i < lntmsg->msg_niov; i++) - CDEBUG(D_NETERROR, " [%d] %p+%d/%d\n", i, - lntmsg->msg_kiov[i].kiov_page, - lntmsg->msg_kiov[i].kiov_offset, - lntmsg->msg_kiov[i].kiov_len); - } else { - CDEBUG(D_NETERROR, " lntmsg in %d frags\n", - lntmsg->msg_niov); - } - - break; - } -#endif - } - - spin_lock(&conn->ibc_lock); - - /* I could be racing with rdma completion. Whoever makes 'tx' idle - * gets to free it, which also drops its ref on 'conn'. */ - - tx->tx_sending--; - conn->ibc_nsends_posted--; - - if (failed) { - tx->tx_waiting = 0; - tx->tx_status = -EIO; - } - - idle = (tx->tx_sending == 0) && /* This is the final callback */ - !tx->tx_waiting && /* Not waiting for peer */ - !tx->tx_queued; /* Not re-queued (PUT_DONE) */ - if (idle) - list_del(&tx->tx_list); - - kibnal_conn_addref(conn); /* 1 ref for me.... 
*/ - - spin_unlock(&conn->ibc_lock); - - if (idle) - kibnal_tx_done (tx); - - if (failed) { - kibnal_close_conn (conn, -EIO); - } else { - kibnal_peer_alive(conn->ibc_peer); - kibnal_check_sends(conn); - } - - kibnal_conn_decref(conn); /* ...until here */ -} - -void -kibnal_init_tx_msg (kib_tx_t *tx, int type, int body_nob) -{ - IB_LOCAL_DATASEGMENT *gl = &tx->tx_gl[tx->tx_nwrq]; - IB_WORK_REQ2 *wrq = &tx->tx_wrq[tx->tx_nwrq]; - int nob = offsetof (kib_msg_t, ibm_u) + body_nob; - - LASSERT (tx->tx_nwrq >= 0 && - tx->tx_nwrq < (1 + IBNAL_MAX_RDMA_FRAGS)); - LASSERT (nob <= IBNAL_MSG_SIZE); - - kibnal_init_msg(tx->tx_msg, type, body_nob); - - *gl = (IB_LOCAL_DATASEGMENT) { - .Address = tx->tx_hca_msg, - .Length = IBNAL_MSG_SIZE, - .Lkey = kibnal_data.kib_whole_mem.md_lkey, - }; - - wrq->Next = NULL; /* This is the last one */ - - wrq->WorkReqId = kibnal_ptr2wreqid(tx, IBNAL_WID_TX); - wrq->Operation = WROpSend; - wrq->DSList = gl; - wrq->DSListDepth = 1; - wrq->MessageLen = nob; - wrq->Req.SendRC.ImmediateData = 0; - wrq->Req.SendRC.Options.s.SolicitedEvent = 1; - wrq->Req.SendRC.Options.s.SignaledCompletion = 1; - wrq->Req.SendRC.Options.s.ImmediateData = 0; - wrq->Req.SendRC.Options.s.Fence = 0; - /* fence only needed on RDMA reads */ - - tx->tx_nwrq++; -} - -int -kibnal_init_rdma (kib_tx_t *tx, int type, int nob, - kib_rdma_desc_t *dstrd, __u64 dstcookie) -{ - kib_msg_t *ibmsg = tx->tx_msg; - kib_rdma_desc_t *srcrd = tx->tx_rd; - IB_LOCAL_DATASEGMENT *gl; - IB_WORK_REQ2 *wrq; - int rc; - -#if IBNAL_USE_FMR - LASSERT (tx->tx_nwrq == 0); - - gl = &tx->tx_gl[0]; - gl->Length = nob; - gl->Address = srcrd->rd_addr; - gl->Lkey = srcrd->rd_key; - - wrq = &tx->tx_wrq[0]; - - wrq->Next = wrq + 1; - wrq->WorkReqId = kibnal_ptr2wreqid(tx, IBNAL_WID_RDMA); - wrq->Operation = WROpRdmaWrite; - wrq->DSList = gl; - wrq->DSListDepth = 1; - wrq->MessageLen = nob; - - wrq->Req.SendRC.ImmediateData = 0; - wrq->Req.SendRC.Options.s.SolicitedEvent = 0; - wrq->Req.SendRC.Options.s.SignaledCompletion = 0; - wrq->Req.SendRC.Options.s.ImmediateData = 0; - wrq->Req.SendRC.Options.s.Fence = 0; - - wrq->Req.SendRC.RemoteDS.Address = dstrd->rd_addr; - wrq->Req.SendRC.RemoteDS.Rkey = dstrd->rd_key; - - tx->tx_nwrq = 1; - rc = nob; -#else - /* CAVEAT EMPTOR: this 'consumes' the frags in 'dstrd' */ - int resid = nob; - kib_rdma_frag_t *srcfrag; - int srcidx; - kib_rdma_frag_t *dstfrag; - int dstidx; - int wrknob; - - /* Called by scheduler */ - LASSERT (!in_interrupt()); - - LASSERT (type == IBNAL_MSG_GET_DONE || - type == IBNAL_MSG_PUT_DONE); - - srcidx = dstidx = 0; - srcfrag = &srcrd->rd_frags[0]; - dstfrag = &dstrd->rd_frags[0]; - rc = resid; - - while (resid > 0) { - if (srcidx >= srcrd->rd_nfrag) { - CERROR("Src buffer exhausted: %d frags\n", srcidx); - rc = -EPROTO; - break; - } - - if (dstidx == dstrd->rd_nfrag) { - CERROR("Dst buffer exhausted: %d frags\n", dstidx); - rc = -EPROTO; - break; - } - - if (tx->tx_nwrq == IBNAL_MAX_RDMA_FRAGS) { - CERROR("RDMA too fragmented: %d/%d src %d/%d dst frags\n", - srcidx, srcrd->rd_nfrag, - dstidx, dstrd->rd_nfrag); - rc = -EMSGSIZE; - break; - } - - wrknob = MIN(MIN(srcfrag->rf_nob, dstfrag->rf_nob), resid); - - gl = &tx->tx_gl[tx->tx_nwrq]; - gl->Length = wrknob; - gl->Address = srcfrag->rf_addr; - gl->Lkey = srcrd->rd_key; - - wrq = &tx->tx_wrq[tx->tx_nwrq]; - - wrq->Next = wrq + 1; - wrq->WorkReqId = kibnal_ptr2wreqid(tx, IBNAL_WID_RDMA); - wrq->Operation = WROpRdmaWrite; - wrq->DSList = gl; - wrq->DSListDepth = 1; - wrq->MessageLen = nob; - - 
wrq->Req.SendRC.ImmediateData = 0; - wrq->Req.SendRC.Options.s.SolicitedEvent = 0; - wrq->Req.SendRC.Options.s.SignaledCompletion = 0; - wrq->Req.SendRC.Options.s.ImmediateData = 0; - wrq->Req.SendRC.Options.s.Fence = 0; - - wrq->Req.SendRC.RemoteDS.Address = dstfrag->rf_addr; - wrq->Req.SendRC.RemoteDS.Rkey = dstrd->rd_key; - - resid -= wrknob; - if (wrknob < srcfrag->rf_nob) { - srcfrag->rf_addr += wrknob; - srcfrag->rf_nob -= wrknob; - } else { - srcfrag++; - srcidx++; - } - - if (wrknob < dstfrag->rf_nob) { - dstfrag->rf_addr += wrknob; - dstfrag->rf_nob -= wrknob; - } else { - dstfrag++; - dstidx++; - } - - tx->tx_nwrq++; - } - - if (rc < 0) /* no RDMA if completing with failure */ - tx->tx_nwrq = 0; -#endif - - ibmsg->ibm_u.completion.ibcm_status = rc; - ibmsg->ibm_u.completion.ibcm_cookie = dstcookie; - kibnal_init_tx_msg(tx, type, sizeof (kib_completion_msg_t)); - - return rc; -} - -void -kibnal_queue_tx (kib_tx_t *tx, kib_conn_t *conn) -{ - spin_lock(&conn->ibc_lock); - kibnal_queue_tx_locked (tx, conn); - spin_unlock(&conn->ibc_lock); - - kibnal_check_sends(conn); -} - -void -kibnal_schedule_active_connect_locked (kib_peer_t *peer, int proto_version) -{ - /* Called holding kib_global_lock exclusive with IRQs disabled */ - - peer->ibp_version = proto_version; /* proto version for new conn */ - peer->ibp_connecting++; /* I'm connecting */ - kibnal_peer_addref(peer); /* extra ref for connd */ - - spin_lock(&kibnal_data.kib_connd_lock); - - list_add_tail (&peer->ibp_connd_list, &kibnal_data.kib_connd_peers); - wake_up (&kibnal_data.kib_connd_waitq); - - spin_unlock(&kibnal_data.kib_connd_lock); -} - -void -kibnal_schedule_active_connect (kib_peer_t *peer, int proto_version) -{ - unsigned long flags; - - write_lock_irqsave(&kibnal_data.kib_global_lock, flags); - - kibnal_schedule_active_connect_locked(peer, proto_version); - - write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); -} - -void -kibnal_launch_tx (kib_tx_t *tx, lnet_nid_t nid) -{ - kib_peer_t *peer; - kib_conn_t *conn; - unsigned long flags; - rwlock_t *g_lock = &kibnal_data.kib_global_lock; - int retry; - int rc; - - /* If I get here, I've committed to send, so I complete the tx with - * failure on any problems */ - - LASSERT (tx->tx_conn == NULL); /* only set when assigned a conn */ - LASSERT (tx->tx_nwrq > 0); /* work items have been set up */ - - for (retry = 0; ; retry = 1) { - read_lock_irqsave(g_lock, flags); - - peer = kibnal_find_peer_locked (nid); - if (peer != NULL) { - conn = kibnal_find_conn_locked (peer); - if (conn != NULL) { - kibnal_conn_addref(conn); /* 1 ref for me... */ - read_unlock_irqrestore(g_lock, flags); - - kibnal_queue_tx (tx, conn); - kibnal_conn_decref(conn); /* ...to here */ - return; - } - } - - /* Making one or more connections; I'll need a write lock... */ - read_unlock(g_lock); - write_lock(g_lock); - - peer = kibnal_find_peer_locked (nid); - if (peer != NULL) - break; - - write_unlock_irqrestore(g_lock, flags); - - if (retry) { - CERROR("Can't find peer %s\n", libcfs_nid2str(nid)); - - tx->tx_status = -EHOSTUNREACH; - tx->tx_waiting = 0; - kibnal_tx_done (tx); - return; - } - - rc = kibnal_add_persistent_peer(nid); - if (rc != 0) { - CERROR("Can't add peer %s: %d\n", - libcfs_nid2str(nid), rc); - - tx->tx_status = -EHOSTUNREACH; - tx->tx_waiting = 0; - kibnal_tx_done (tx); - return; - } - } - - conn = kibnal_find_conn_locked (peer); - if (conn != NULL) { - /* Connection exists; queue message on it */ - kibnal_conn_addref(conn); /* 1 ref for me... 
*/ - write_unlock_irqrestore(g_lock, flags); - - kibnal_queue_tx (tx, conn); - kibnal_conn_decref(conn); /* ...until here */ - return; - } - - if (!kibnal_peer_connecting(peer)) { - if (!(peer->ibp_reconnect_interval == 0 || /* first attempt */ - time_after_eq(jiffies, peer->ibp_reconnect_time))) { - write_unlock_irqrestore(g_lock, flags); - tx->tx_status = -EHOSTUNREACH; - tx->tx_waiting = 0; - kibnal_tx_done (tx); - return; - } - - kibnal_schedule_active_connect_locked(peer, IBNAL_MSG_VERSION); - } - - /* A connection is being established; queue the message... */ - list_add_tail (&tx->tx_list, &peer->ibp_tx_queue); - - write_unlock_irqrestore(g_lock, flags); -} - -void -kibnal_txlist_done (struct list_head *txlist, int status) -{ - kib_tx_t *tx; - - while (!list_empty (txlist)) { - tx = list_entry (txlist->next, kib_tx_t, tx_list); - - list_del (&tx->tx_list); - /* complete now */ - tx->tx_waiting = 0; - tx->tx_status = status; - kibnal_tx_done (tx); - } -} - -int -kibnal_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg) -{ - lnet_hdr_t *hdr = &lntmsg->msg_hdr; - int type = lntmsg->msg_type; - lnet_process_id_t target = lntmsg->msg_target; - int target_is_router = lntmsg->msg_target_is_router; - int routing = lntmsg->msg_routing; - unsigned int payload_niov = lntmsg->msg_niov; - struct iovec *payload_iov = lntmsg->msg_iov; - lnet_kiov_t *payload_kiov = lntmsg->msg_kiov; - unsigned int payload_offset = lntmsg->msg_offset; - unsigned int payload_nob = lntmsg->msg_len; - kib_msg_t *ibmsg; - kib_tx_t *tx; - int nob; - int rc; - - /* NB 'private' is different depending on what we're sending.... */ - - CDEBUG(D_NET, "sending %d bytes in %d frags to %s\n", - payload_nob, payload_niov, libcfs_id2str(target)); - - LASSERT (payload_nob == 0 || payload_niov > 0); - LASSERT (payload_niov <= LNET_MAX_IOV); - - /* Thread context */ - LASSERT (!in_interrupt()); - /* payload is either all vaddrs or all pages */ - LASSERT (!(payload_kiov != NULL && payload_iov != NULL)); - - switch (type) { - default: - LBUG(); - return (-EIO); - - case LNET_MSG_ACK: - LASSERT (payload_nob == 0); - break; - - case LNET_MSG_GET: - if (routing || target_is_router) - break; /* send IMMEDIATE */ - - /* is the REPLY message too small for RDMA? 
*/ - nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[lntmsg->msg_md->md_length]); - if (nob <= IBNAL_MSG_SIZE) - break; /* send IMMEDIATE */ - - tx = kibnal_get_idle_tx(); - if (tx == NULL) { - CERROR("Can allocate txd for GET to %s: \n", - libcfs_nid2str(target.nid)); - return -ENOMEM; - } - - ibmsg = tx->tx_msg; - ibmsg->ibm_u.get.ibgm_hdr = *hdr; - ibmsg->ibm_u.get.ibgm_cookie = tx->tx_cookie; - - if ((lntmsg->msg_md->md_options & LNET_MD_KIOV) == 0) - rc = kibnal_setup_rd_iov(tx, &ibmsg->ibm_u.get.ibgm_rd, - 0, - lntmsg->msg_md->md_niov, - lntmsg->msg_md->md_iov.iov, - 0, lntmsg->msg_md->md_length); - else - rc = kibnal_setup_rd_kiov(tx, &ibmsg->ibm_u.get.ibgm_rd, - 0, - lntmsg->msg_md->md_niov, - lntmsg->msg_md->md_iov.kiov, - 0, lntmsg->msg_md->md_length); - if (rc != 0) { - CERROR("Can't setup GET sink for %s: %d\n", - libcfs_nid2str(target.nid), rc); - kibnal_tx_done(tx); - return -EIO; - } - -#if IBNAL_USE_FMR - nob = sizeof(kib_get_msg_t); -#else - { - int n = ibmsg->ibm_u.get.ibgm_rd.rd_nfrag; - - nob = offsetof(kib_get_msg_t, ibgm_rd.rd_frags[n]); - } -#endif - kibnal_init_tx_msg(tx, IBNAL_MSG_GET_REQ, nob); - - tx->tx_lntmsg[1] = lnet_create_reply_msg(kibnal_data.kib_ni, - lntmsg); - if (tx->tx_lntmsg[1] == NULL) { - CERROR("Can't create reply for GET -> %s\n", - libcfs_nid2str(target.nid)); - kibnal_tx_done(tx); - return -EIO; - } - - tx->tx_lntmsg[0] = lntmsg; /* finalise lntmsg[0,1] on completion */ - tx->tx_waiting = 1; /* waiting for GET_DONE */ - kibnal_launch_tx(tx, target.nid); - return 0; - - case LNET_MSG_REPLY: - case LNET_MSG_PUT: - /* Is the payload small enough not to need RDMA? */ - nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]); - if (nob <= IBNAL_MSG_SIZE) - break; /* send IMMEDIATE */ - - tx = kibnal_get_idle_tx(); - if (tx == NULL) { - CERROR("Can't allocate %s txd for %s\n", - type == LNET_MSG_PUT ? 
"PUT" : "REPLY", - libcfs_nid2str(target.nid)); - return -ENOMEM; - } - - if (payload_kiov == NULL) - rc = kibnal_setup_rd_iov(tx, tx->tx_rd, 1, - payload_niov, payload_iov, - payload_offset, payload_nob); - else - rc = kibnal_setup_rd_kiov(tx, tx->tx_rd, 1, - payload_niov, payload_kiov, - payload_offset, payload_nob); - if (rc != 0) { - CERROR("Can't setup PUT src for %s: %d\n", - libcfs_nid2str(target.nid), rc); - kibnal_tx_done(tx); - return -EIO; - } - - ibmsg = tx->tx_msg; - ibmsg->ibm_u.putreq.ibprm_hdr = *hdr; - ibmsg->ibm_u.putreq.ibprm_cookie = tx->tx_cookie; - kibnal_init_tx_msg(tx, IBNAL_MSG_PUT_REQ, sizeof(kib_putreq_msg_t)); - - tx->tx_lntmsg[0] = lntmsg; /* finalise lntmsg on completion */ - tx->tx_waiting = 1; /* waiting for PUT_{ACK,NAK} */ - kibnal_launch_tx(tx, target.nid); - return 0; - } - - /* send IMMEDIATE */ - - LASSERT (offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]) - <= IBNAL_MSG_SIZE); - - tx = kibnal_get_idle_tx(); - if (tx == NULL) { - CERROR ("Can't send %d to %s: tx descs exhausted\n", - type, libcfs_nid2str(target.nid)); - return -ENOMEM; - } - - ibmsg = tx->tx_msg; - ibmsg->ibm_u.immediate.ibim_hdr = *hdr; - - if (payload_kiov != NULL) - lnet_copy_kiov2flat(IBNAL_MSG_SIZE, ibmsg, - offsetof(kib_msg_t, ibm_u.immediate.ibim_payload), - payload_niov, payload_kiov, - payload_offset, payload_nob); - else - lnet_copy_iov2flat(IBNAL_MSG_SIZE, ibmsg, - offsetof(kib_msg_t, ibm_u.immediate.ibim_payload), - payload_niov, payload_iov, - payload_offset, payload_nob); - - nob = offsetof(kib_immediate_msg_t, ibim_payload[payload_nob]); - kibnal_init_tx_msg (tx, IBNAL_MSG_IMMEDIATE, nob); - - tx->tx_lntmsg[0] = lntmsg; /* finalise lntmsg on completion */ - kibnal_launch_tx(tx, target.nid); - return 0; -} - -void -kibnal_reply(lnet_ni_t *ni, kib_rx_t *rx, lnet_msg_t *lntmsg) -{ - lnet_process_id_t target = lntmsg->msg_target; - unsigned int niov = lntmsg->msg_niov; - struct iovec *iov = lntmsg->msg_iov; - lnet_kiov_t *kiov = lntmsg->msg_kiov; - unsigned int offset = lntmsg->msg_offset; - unsigned int nob = lntmsg->msg_len; - kib_tx_t *tx; - int rc; - - tx = kibnal_get_idle_tx(); - if (tx == NULL) { - CERROR("Can't get tx for REPLY to %s\n", - libcfs_nid2str(target.nid)); - goto failed_0; - } - - if (nob == 0) - rc = 0; - else if (kiov == NULL) - rc = kibnal_setup_rd_iov(tx, tx->tx_rd, 1, - niov, iov, offset, nob); - else - rc = kibnal_setup_rd_kiov(tx, tx->tx_rd, 1, - niov, kiov, offset, nob); - - if (rc != 0) { - CERROR("Can't setup GET src for %s: %d\n", - libcfs_nid2str(target.nid), rc); - goto failed_1; - } - - rc = kibnal_init_rdma(tx, IBNAL_MSG_GET_DONE, nob, - &rx->rx_msg->ibm_u.get.ibgm_rd, - rx->rx_msg->ibm_u.get.ibgm_cookie); - if (rc < 0) { - CERROR("Can't setup rdma for GET from %s: %d\n", - libcfs_nid2str(target.nid), rc); - goto failed_1; - } - - if (rc == 0) { - /* No RDMA: local completion may happen now! */ - lnet_finalize(ni, lntmsg, 0); - } else { - /* RDMA: lnet_finalize(lntmsg) when it - * completes */ - tx->tx_lntmsg[0] = lntmsg; - } - - kibnal_queue_tx(tx, rx->rx_conn); - return; - - failed_1: - kibnal_tx_done(tx); - failed_0: - lnet_finalize(ni, lntmsg, -EIO); -} - -int -kibnal_eager_recv (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, - void **new_private) -{ - kib_rx_t *rx = private; - kib_conn_t *conn = rx->rx_conn; - - if (conn->ibc_version == IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD) { - /* Can't block if RDMA completions need normal credits */ - LCONSOLE_ERROR_MSG(0x12d, "Dropping message from %s: no " - "buffers free. 
%s is running an old version" - " of LNET that may deadlock if messages " - "wait for buffers)\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid), - libcfs_nid2str(conn->ibc_peer->ibp_nid)); - return -EDEADLK; - } - - *new_private = private; - return 0; -} - -int -kibnal_recv (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, int delayed, - unsigned int niov, struct iovec *iov, lnet_kiov_t *kiov, - unsigned int offset, unsigned int mlen, unsigned int rlen) -{ - kib_rx_t *rx = private; - kib_msg_t *rxmsg = rx->rx_msg; - kib_conn_t *conn = rx->rx_conn; - kib_tx_t *tx; - kib_msg_t *txmsg; - int nob; - int post_cred = 1; - int rc = 0; - - LASSERT (mlen <= rlen); - LASSERT (!in_interrupt()); - /* Either all pages or all vaddrs */ - LASSERT (!(kiov != NULL && iov != NULL)); - - switch (rxmsg->ibm_type) { - default: - LBUG(); - - case IBNAL_MSG_IMMEDIATE: - nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[rlen]); - if (nob > rx->rx_nob) { - CERROR ("Immediate message from %s too big: %d(%d)\n", - libcfs_nid2str(rxmsg->ibm_u.immediate.ibim_hdr.src_nid), - nob, rx->rx_nob); - rc = -EPROTO; - break; - } - - if (kiov != NULL) - lnet_copy_flat2kiov(niov, kiov, offset, - IBNAL_MSG_SIZE, rxmsg, - offsetof(kib_msg_t, ibm_u.immediate.ibim_payload), - mlen); - else - lnet_copy_flat2iov(niov, iov, offset, - IBNAL_MSG_SIZE, rxmsg, - offsetof(kib_msg_t, ibm_u.immediate.ibim_payload), - mlen); - lnet_finalize (ni, lntmsg, 0); - break; - - case IBNAL_MSG_PUT_REQ: - if (mlen == 0) { - lnet_finalize(ni, lntmsg, 0); - kibnal_send_completion(rx->rx_conn, IBNAL_MSG_PUT_NAK, 0, - rxmsg->ibm_u.putreq.ibprm_cookie); - break; - } - - tx = kibnal_get_idle_tx(); - if (tx == NULL) { - CERROR("Can't allocate tx for %s\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid)); - /* Not replying will break the connection */ - rc = -ENOMEM; - break; - } - - txmsg = tx->tx_msg; - if (kiov == NULL) - rc = kibnal_setup_rd_iov(tx, - &txmsg->ibm_u.putack.ibpam_rd, - 0, - niov, iov, offset, mlen); - else - rc = kibnal_setup_rd_kiov(tx, - &txmsg->ibm_u.putack.ibpam_rd, - 0, - niov, kiov, offset, mlen); - if (rc != 0) { - CERROR("Can't setup PUT sink for %s: %d\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid), rc); - kibnal_tx_done(tx); - /* tell peer it's over */ - kibnal_send_completion(rx->rx_conn, IBNAL_MSG_PUT_NAK, rc, - rxmsg->ibm_u.putreq.ibprm_cookie); - break; - } - - txmsg->ibm_u.putack.ibpam_src_cookie = rxmsg->ibm_u.putreq.ibprm_cookie; - txmsg->ibm_u.putack.ibpam_dst_cookie = tx->tx_cookie; -#if IBNAL_USE_FMR - nob = sizeof(kib_putack_msg_t); -#else - { - int n = tx->tx_msg->ibm_u.putack.ibpam_rd.rd_nfrag; - - nob = offsetof(kib_putack_msg_t, ibpam_rd.rd_frags[n]); - } -#endif - kibnal_init_tx_msg(tx, IBNAL_MSG_PUT_ACK, nob); - - tx->tx_lntmsg[0] = lntmsg; /* finalise lntmsg on completion */ - tx->tx_waiting = 1; /* waiting for PUT_DONE */ - kibnal_queue_tx(tx, conn); - - if (conn->ibc_version != IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD) - post_cred = 0; /* peer still owns 'rx' for sending PUT_DONE */ - break; - - case IBNAL_MSG_GET_REQ: - if (lntmsg != NULL) { - /* Optimized GET; RDMA lntmsg's payload */ - kibnal_reply(ni, rx, lntmsg); - } else { - /* GET didn't match anything */ - kibnal_send_completion(rx->rx_conn, IBNAL_MSG_GET_DONE, - -ENODATA, - rxmsg->ibm_u.get.ibgm_cookie); - } - break; - } - - kibnal_post_rx(rx, post_cred, 0); - return rc; -} - -int -kibnal_thread_start (int (*fn)(void *arg), void *arg) -{ - long pid = kernel_thread (fn, arg, 0); - - if (pid < 0) - return ((int)pid); - - atomic_inc (&kibnal_data.kib_nthreads); - 
return (0); -} - -void -kibnal_thread_fini (void) -{ - atomic_dec (&kibnal_data.kib_nthreads); -} - -void -kibnal_peer_alive (kib_peer_t *peer) -{ - /* This is racy, but everyone's only writing cfs_time_current() */ - peer->ibp_last_alive = cfs_time_current(); - mb(); -} - -void -kibnal_peer_notify (kib_peer_t *peer) -{ - time_t last_alive = 0; - int error = 0; - unsigned long flags; - - read_lock_irqsave(&kibnal_data.kib_global_lock, flags); - - if (list_empty(&peer->ibp_conns) && - peer->ibp_accepting == 0 && - peer->ibp_connecting == 0 && - peer->ibp_error != 0) { - error = peer->ibp_error; - peer->ibp_error = 0; - last_alive = cfs_time_current_sec() - - cfs_duration_sec(cfs_time_current() - - peer->ibp_last_alive); - } - - read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); - - if (error != 0) - lnet_notify(kibnal_data.kib_ni, peer->ibp_nid, 0, last_alive); -} - -void -kibnal_schedule_conn (kib_conn_t *conn) -{ - unsigned long flags; - - kibnal_conn_addref(conn); /* ++ref for connd */ - - spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags); - - list_add_tail (&conn->ibc_list, &kibnal_data.kib_connd_conns); - wake_up (&kibnal_data.kib_connd_waitq); - - spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags); -} - -void -kibnal_close_conn_locked (kib_conn_t *conn, int error) -{ - /* This just does the immediate housekeeping to start shutdown of an - * established connection. 'error' is zero for a normal shutdown. - * Caller holds kib_global_lock exclusively in irq context */ - kib_peer_t *peer = conn->ibc_peer; - - LASSERT (conn->ibc_state >= IBNAL_CONN_ESTABLISHED); - - if (conn->ibc_state != IBNAL_CONN_ESTABLISHED) - return; /* already being handled */ - - /* NB Can't take ibc_lock here (could be in IRQ context), without - * risking deadlock, so access to ibc_{tx_queue,active_txs} is racey */ - - if (error == 0 && - list_empty(&conn->ibc_tx_queue) && - list_empty(&conn->ibc_tx_queue_rsrvd) && - list_empty(&conn->ibc_tx_queue_nocred) && - list_empty(&conn->ibc_active_txs)) { - CDEBUG(D_NET, "closing conn to %s" - " rx# "LPD64" tx# "LPD64"\n", - libcfs_nid2str(peer->ibp_nid), - conn->ibc_txseq, conn->ibc_rxseq); - } else { - CDEBUG(D_NETERROR, "Closing conn to %s: error %d%s%s%s%s" - " rx# "LPD64" tx# "LPD64"\n", - libcfs_nid2str(peer->ibp_nid), error, - list_empty(&conn->ibc_tx_queue) ? "" : "(sending)", - list_empty(&conn->ibc_tx_queue_rsrvd) ? "" : "(sending_rsrvd)", - list_empty(&conn->ibc_tx_queue_nocred) ? "" : "(sending_nocred)", - list_empty(&conn->ibc_active_txs) ? 
"" : "(waiting)", - conn->ibc_txseq, conn->ibc_rxseq); -#if 0 - /* can't skip down the queue without holding ibc_lock (see above) */ - list_for_each(tmp, &conn->ibc_tx_queue) { - kib_tx_t *tx = list_entry(tmp, kib_tx_t, tx_list); - - CERROR(" queued tx type %x cookie "LPX64 - " sending %d waiting %d ticks %ld/%d\n", - tx->tx_msg->ibm_type, tx->tx_cookie, - tx->tx_sending, tx->tx_waiting, - (long)(tx->tx_deadline - jiffies), HZ); - } - - list_for_each(tmp, &conn->ibc_active_txs) { - kib_tx_t *tx = list_entry(tmp, kib_tx_t, tx_list); - - CERROR(" active tx type %x cookie "LPX64 - " sending %d waiting %d ticks %ld/%d\n", - tx->tx_msg->ibm_type, tx->tx_cookie, - tx->tx_sending, tx->tx_waiting, - (long)(tx->tx_deadline - jiffies), HZ); - } -#endif - } - - list_del (&conn->ibc_list); - - if (list_empty (&peer->ibp_conns)) { /* no more conns */ - if (peer->ibp_persistence == 0 && /* non-persistent peer */ - kibnal_peer_active(peer)) /* still in peer table */ - kibnal_unlink_peer_locked (peer); - - peer->ibp_error = error; /* set/clear error on last conn */ - } - - kibnal_set_conn_state(conn, IBNAL_CONN_DISCONNECTING); - - kibnal_schedule_conn(conn); - kibnal_conn_decref(conn); /* lose ibc_list's ref */ -} - -void -kibnal_close_conn (kib_conn_t *conn, int error) -{ - unsigned long flags; - - write_lock_irqsave(&kibnal_data.kib_global_lock, flags); - - kibnal_close_conn_locked (conn, error); - - write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); -} - -void -kibnal_handle_early_rxs(kib_conn_t *conn) -{ - unsigned long flags; - kib_rx_t *rx; - - LASSERT (!in_interrupt()); - LASSERT (conn->ibc_state >= IBNAL_CONN_ESTABLISHED); - - write_lock_irqsave(&kibnal_data.kib_global_lock, flags); - while (!list_empty(&conn->ibc_early_rxs)) { - rx = list_entry(conn->ibc_early_rxs.next, - kib_rx_t, rx_list); - list_del(&rx->rx_list); - write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); - - kibnal_handle_rx(rx); - - write_lock_irqsave(&kibnal_data.kib_global_lock, flags); - } - write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); -} - -void -kibnal_abort_txs(kib_conn_t *conn, struct list_head *txs) -{ - LIST_HEAD (zombies); - struct list_head *tmp; - struct list_head *nxt; - kib_tx_t *tx; - - spin_lock(&conn->ibc_lock); - - list_for_each_safe (tmp, nxt, txs) { - tx = list_entry (tmp, kib_tx_t, tx_list); - - if (txs == &conn->ibc_active_txs) { - LASSERT (!tx->tx_queued); - LASSERT (tx->tx_waiting || tx->tx_sending != 0); - } else { - LASSERT (tx->tx_queued); - } - - tx->tx_status = -ECONNABORTED; - tx->tx_queued = 0; - tx->tx_waiting = 0; - - if (tx->tx_sending == 0) { - list_del (&tx->tx_list); - list_add (&tx->tx_list, &zombies); - } - } - - spin_unlock(&conn->ibc_lock); - - kibnal_txlist_done(&zombies, -ECONNABORTED); -} - -void -kibnal_conn_disconnected(kib_conn_t *conn) -{ - static IB_QP_ATTRIBUTES_MODIFY qpam = {.RequestState = QPStateError}; - - FSTATUS frc; - - LASSERT (conn->ibc_state >= IBNAL_CONN_INIT_QP); - - kibnal_set_conn_state(conn, IBNAL_CONN_DISCONNECTED); - - /* move QP to error state to make posted work items complete */ - frc = iba_modify_qp(conn->ibc_qp, &qpam, NULL); - if (frc != FSUCCESS) - CERROR("can't move qp state to error: %d\n", frc); - - /* Complete all tx descs not waiting for sends to complete. 
- * NB we should be safe from RDMA now that the QP has changed state */ - - kibnal_abort_txs(conn, &conn->ibc_tx_queue); - kibnal_abort_txs(conn, &conn->ibc_tx_queue_rsrvd); - kibnal_abort_txs(conn, &conn->ibc_tx_queue); - kibnal_abort_txs(conn, &conn->ibc_active_txs); - - kibnal_handle_early_rxs(conn); -} - -void -kibnal_peer_connect_failed (kib_peer_t *peer, int type, int error) -{ - LIST_HEAD (zombies); - unsigned long flags; - - LASSERT (error != 0); - LASSERT (!in_interrupt()); - - write_lock_irqsave(&kibnal_data.kib_global_lock, flags); - - LASSERT (kibnal_peer_connecting(peer)); - - switch (type) { - case IBNAL_CONN_ACTIVE: - LASSERT (peer->ibp_connecting > 0); - peer->ibp_connecting--; - break; - - case IBNAL_CONN_PASSIVE: - LASSERT (peer->ibp_accepting > 0); - peer->ibp_accepting--; - break; - - case IBNAL_CONN_WAITING: - /* Can't assert; I might be racing with a successful connection - * which clears passivewait */ - peer->ibp_passivewait = 0; - break; - default: - LBUG(); - } - - if (kibnal_peer_connecting(peer) || /* another attempt underway */ - !list_empty(&peer->ibp_conns)) { /* got connected */ - write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); - return; - } - - /* Say when active connection can be re-attempted */ - peer->ibp_reconnect_interval *= 2; - peer->ibp_reconnect_interval = - MAX(peer->ibp_reconnect_interval, - *kibnal_tunables.kib_min_reconnect_interval); - peer->ibp_reconnect_interval = - MIN(peer->ibp_reconnect_interval, - *kibnal_tunables.kib_max_reconnect_interval); - - peer->ibp_reconnect_time = jiffies + peer->ibp_reconnect_interval * HZ; - - /* Take peer's blocked transmits to complete with error */ - list_add(&zombies, &peer->ibp_tx_queue); - list_del_init(&peer->ibp_tx_queue); - - if (kibnal_peer_active(peer) && - peer->ibp_persistence == 0) { - /* failed connection attempt on non-persistent peer */ - kibnal_unlink_peer_locked (peer); - } - - peer->ibp_error = error; - - write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); - - kibnal_peer_notify(peer); - - if (list_empty (&zombies)) - return; - - CDEBUG (D_NETERROR, "Deleting messages for %s: connection failed\n", - libcfs_nid2str(peer->ibp_nid)); - - kibnal_txlist_done (&zombies, -EHOSTUNREACH); -} - -void -kibnal_connreq_done (kib_conn_t *conn, int type, int status) -{ - kib_peer_t *peer = conn->ibc_peer; - struct list_head txs; - kib_tx_t *tx; - unsigned long flags; - - LASSERT (!in_interrupt()); - LASSERT (type == IBNAL_CONN_ACTIVE || type == IBNAL_CONN_PASSIVE); - LASSERT (conn->ibc_state >= IBNAL_CONN_INIT_QP); - LASSERT (conn->ibc_state < IBNAL_CONN_ESTABLISHED); - LASSERT (kibnal_peer_connecting(peer)); - - LIBCFS_FREE(conn->ibc_cvars, sizeof(*conn->ibc_cvars)); - conn->ibc_cvars = NULL; - - if (status != 0) { - /* failed to establish connection */ - kibnal_peer_connect_failed(conn->ibc_peer, type, status); - kibnal_conn_disconnected(conn); - kibnal_conn_decref(conn); /* Lose CM's ref */ - return; - } - - /* connection established */ - LASSERT(conn->ibc_state == IBNAL_CONN_CONNECTING); - - conn->ibc_last_send = jiffies; - kibnal_set_conn_state(conn, IBNAL_CONN_ESTABLISHED); - kibnal_peer_alive(peer); - - CDEBUG(D_NET, "Connection %s ESTABLISHED\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid)); - - write_lock_irqsave(&kibnal_data.kib_global_lock, flags); - - peer->ibp_passivewait = 0; /* not waiting (got conn now) */ - kibnal_conn_addref(conn); /* +1 ref for ibc_list */ - list_add_tail(&conn->ibc_list, &peer->ibp_conns); - - if (!kibnal_peer_active(peer)) { - /* peer has been 
deleted */ - kibnal_close_conn_locked(conn, -ECONNABORTED); - write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); - - kibnal_peer_connect_failed(conn->ibc_peer, type, -ECONNABORTED); - kibnal_conn_decref(conn); /* lose CM's ref */ - return; - } - - switch (type) { - case IBNAL_CONN_ACTIVE: - LASSERT (peer->ibp_connecting > 0); - peer->ibp_connecting--; - break; - - case IBNAL_CONN_PASSIVE: - LASSERT (peer->ibp_accepting > 0); - peer->ibp_accepting--; - break; - default: - LBUG(); - } - - peer->ibp_reconnect_interval = 0; /* OK to reconnect at any time */ - - /* Nuke any dangling conns from a different peer instance... */ - kibnal_close_stale_conns_locked(peer, conn->ibc_incarnation); - - /* grab txs blocking for a conn */ - list_add(&txs, &peer->ibp_tx_queue); - list_del_init(&peer->ibp_tx_queue); - - write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); - - /* Schedule blocked txs */ - spin_lock (&conn->ibc_lock); - while (!list_empty (&txs)) { - tx = list_entry (txs.next, kib_tx_t, tx_list); - list_del (&tx->tx_list); - - kibnal_queue_tx_locked (tx, conn); - } - spin_unlock (&conn->ibc_lock); - kibnal_check_sends (conn); -} - -void -kibnal_reject (lnet_nid_t nid, IB_HANDLE cep, int why) -{ - static CM_REJECT_INFO msgs[3]; - CM_REJECT_INFO *msg = &msgs[why]; - FSTATUS frc; - - LASSERT (why >= 0 && why < sizeof(msgs)/sizeof(msgs[0])); - - /* If I wasn't so lazy, I'd initialise this only once; it's effectively - * read-only... */ - msg->Reason = RC_USER_REJ; - msg->PrivateData[0] = (IBNAL_MSG_MAGIC) & 0xff; - msg->PrivateData[1] = (IBNAL_MSG_MAGIC >> 8) & 0xff; - msg->PrivateData[2] = (IBNAL_MSG_MAGIC >> 16) & 0xff; - msg->PrivateData[3] = (IBNAL_MSG_MAGIC >> 24) & 0xff; - msg->PrivateData[4] = (IBNAL_MSG_VERSION) & 0xff; - msg->PrivateData[5] = (IBNAL_MSG_VERSION >> 8) & 0xff; - msg->PrivateData[6] = why; - - frc = iba_cm_reject(cep, msg); - if (frc != FSUCCESS) - CERROR("Error %d rejecting %s\n", frc, libcfs_nid2str(nid)); -} - -void -kibnal_check_connreject(kib_conn_t *conn, int type, CM_REJECT_INFO *rej) -{ - kib_peer_t *peer = conn->ibc_peer; - unsigned long flags; - int magic; - int version; - int why; - - LASSERT (type == IBNAL_CONN_ACTIVE || - type == IBNAL_CONN_PASSIVE); - - CDEBUG(D_NET, "%s connection with %s rejected: %d\n", - (type == IBNAL_CONN_ACTIVE) ? 
"Active" : "Passive", - libcfs_nid2str(peer->ibp_nid), rej->Reason); - - switch (rej->Reason) { - case RC_STALE_CONN: - if (type == IBNAL_CONN_PASSIVE) { - CERROR("Connection to %s rejected (stale QP)\n", - libcfs_nid2str(peer->ibp_nid)); - } else { - CWARN("Connection from %s rejected (stale QP): " - "retrying...\n", libcfs_nid2str(peer->ibp_nid)); - - /* retry from scratch to allocate a new conn - * which will use a different QP */ - kibnal_schedule_active_connect(peer, peer->ibp_version); - } - - /* An FCM_DISCONNECTED callback is still outstanding: give it a - * ref since kibnal_connreq_done() drops the CM's ref on conn - * on failure */ - kibnal_conn_addref(conn); - break; - - case RC_USER_REJ: - magic = (rej->PrivateData[0]) | - (rej->PrivateData[1] << 8) | - (rej->PrivateData[2] << 16) | - (rej->PrivateData[3] << 24); - version = (rej->PrivateData[4]) | - (rej->PrivateData[5] << 8); - why = (rej->PrivateData[6]); - - /* retry with old proto version */ - if (magic == IBNAL_MSG_MAGIC && - version == IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD && - conn->ibc_version == IBNAL_MSG_VERSION && - type != IBNAL_CONN_PASSIVE) { - /* retry with a new conn */ - CWARN ("Connection to %s refused: " - "retrying with old protocol version 0x%x\n", - libcfs_nid2str(peer->ibp_nid), version); - kibnal_schedule_active_connect(peer, version); - break; - } - - if (magic != IBNAL_MSG_MAGIC || - version != IBNAL_MSG_VERSION) { - CERROR("%s connection with %s rejected " - "(magic/ver %08x/%d why %d): " - "incompatible protocol\n", - (type == IBNAL_CONN_ACTIVE) ? - "Active" : "Passive", - libcfs_nid2str(peer->ibp_nid), - magic, version, why); - break; - } - - if (type == IBNAL_CONN_ACTIVE && - why == IBNAL_REJECT_CONN_RACE) { - /* lost connection race */ - CWARN("Connection to %s rejected: " - "lost connection race\n", - libcfs_nid2str(peer->ibp_nid)); - - write_lock_irqsave(&kibnal_data.kib_global_lock, - flags); - - if (list_empty(&peer->ibp_conns)) { - peer->ibp_passivewait = 1; - peer->ibp_passivewait_deadline = - jiffies + - (*kibnal_tunables.kib_timeout * HZ); - } - write_unlock_irqrestore(&kibnal_data.kib_global_lock, - flags); - break; - } - - CERROR("%s connection with %s rejected: %d\n", - (type == IBNAL_CONN_ACTIVE) ? "Active" : "Passive", - libcfs_nid2str(peer->ibp_nid), why); - break; - - default: - CERROR("%s connection with %s rejected: %d\n", - (type == IBNAL_CONN_ACTIVE) ? 
"Active" : "Passive", - libcfs_nid2str(peer->ibp_nid), rej->Reason); - } - - kibnal_connreq_done(conn, type, -ECONNREFUSED); -} - -void -kibnal_cm_disconnect_callback(kib_conn_t *conn, CM_CONN_INFO *info) -{ - CDEBUG(D_NET, "%s: state %d, status 0x%x\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid), - conn->ibc_state, info->Status); - - LASSERT (conn->ibc_state >= IBNAL_CONN_ESTABLISHED); - - switch (info->Status) { - default: - LBUG(); - break; - - case FCM_DISCONNECT_REQUEST: - /* Schedule conn to iba_cm_disconnect() if it wasn't already */ - kibnal_close_conn (conn, 0); - break; - - case FCM_DISCONNECT_REPLY: /* peer acks my disconnect req */ - case FCM_DISCONNECTED: /* end of TIME_WAIT */ - CDEBUG(D_NET, "Connection %s disconnected.\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid)); - kibnal_conn_decref(conn); /* Lose CM's ref */ - break; - } -} - -void -kibnal_cm_passive_callback(IB_HANDLE cep, CM_CONN_INFO *info, void *arg) -{ - kib_conn_t *conn = arg; - - CDEBUG(D_NET, "status 0x%x\n", info->Status); - - /* Established Connection Notifier */ - switch (info->Status) { - default: - CERROR("Unexpected status %d on Connection %s\n", - info->Status, libcfs_nid2str(conn->ibc_peer->ibp_nid)); - LBUG(); - break; - - case FCM_CONNECT_TIMEOUT: - kibnal_connreq_done(conn, IBNAL_CONN_PASSIVE, -ETIMEDOUT); - break; - - case FCM_CONNECT_REJECT: - kibnal_check_connreject(conn, IBNAL_CONN_PASSIVE, - &info->Info.Reject); - break; - - case FCM_CONNECT_ESTABLISHED: - kibnal_connreq_done(conn, IBNAL_CONN_PASSIVE, 0); - break; - - case FCM_DISCONNECT_REQUEST: - case FCM_DISCONNECT_REPLY: - case FCM_DISCONNECTED: - kibnal_cm_disconnect_callback(conn, info); - break; - } -} - -int -kibnal_accept (kib_conn_t **connp, IB_HANDLE cep, kib_msg_t *msg, int nob) -{ - lnet_nid_t nid; - kib_conn_t *conn; - kib_peer_t *peer; - kib_peer_t *peer2; - unsigned long flags; - int rc; - - rc = kibnal_unpack_msg(msg, 0, nob); - if (rc != 0) { - /* SILENT! 
kibnal_unpack_msg() complains if required */ - kibnal_reject(LNET_NID_ANY, cep, IBNAL_REJECT_FATAL); - return -EPROTO; - } - - nid = msg->ibm_srcnid; - - if (msg->ibm_version != IBNAL_MSG_VERSION) - CWARN("Connection from %s: old protocol version 0x%x\n", - libcfs_nid2str(nid), msg->ibm_version); - - if (msg->ibm_type != IBNAL_MSG_CONNREQ) { - CERROR("Can't accept %s: bad request type %d (%d expected)\n", - libcfs_nid2str(nid), msg->ibm_type, IBNAL_MSG_CONNREQ); - kibnal_reject(nid, cep, IBNAL_REJECT_FATAL); - return -EPROTO; - } - - if (msg->ibm_dstnid != kibnal_data.kib_ni->ni_nid) { - CERROR("Can't accept %s: bad dst NID %s (%s expected)\n", - libcfs_nid2str(nid), - libcfs_nid2str(msg->ibm_dstnid), - libcfs_nid2str(kibnal_data.kib_ni->ni_nid)); - kibnal_reject(nid, cep, IBNAL_REJECT_FATAL); - return -EPROTO; - } - - if (msg->ibm_u.connparams.ibcp_queue_depth != IBNAL_MSG_QUEUE_SIZE || - msg->ibm_u.connparams.ibcp_max_msg_size > IBNAL_MSG_SIZE || - msg->ibm_u.connparams.ibcp_max_frags > IBNAL_MAX_RDMA_FRAGS) { - CERROR("Reject %s: q %d sz %d frag %d, (%d %d %d expected)\n", - libcfs_nid2str(nid), - msg->ibm_u.connparams.ibcp_queue_depth, - msg->ibm_u.connparams.ibcp_max_msg_size, - msg->ibm_u.connparams.ibcp_max_frags, - IBNAL_MSG_QUEUE_SIZE, - IBNAL_MSG_SIZE, - IBNAL_MAX_RDMA_FRAGS); - kibnal_reject(nid, cep, IBNAL_REJECT_FATAL); - return -EPROTO; - } - - conn = kibnal_create_conn(nid, msg->ibm_version); - if (conn == NULL) { - kibnal_reject(nid, cep, IBNAL_REJECT_NO_RESOURCES); - return -ENOMEM; - } - - /* assume 'nid' is a new peer */ - rc = kibnal_create_peer(&peer, nid); - if (rc != 0) { - kibnal_conn_decref(conn); - kibnal_reject(nid, cep, IBNAL_REJECT_NO_RESOURCES); - return -ENOMEM; - } - - write_lock_irqsave (&kibnal_data.kib_global_lock, flags); - - if (kibnal_data.kib_listener_cep == NULL) { /* shutdown started */ - write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); - - kibnal_peer_decref(peer); - kibnal_conn_decref(conn); - kibnal_reject(nid, cep, IBNAL_REJECT_NO_RESOURCES); - return -ESHUTDOWN; - } - - peer2 = kibnal_find_peer_locked(nid); - if (peer2 == NULL) { - /* peer table takes my ref on peer */ - list_add_tail (&peer->ibp_list, kibnal_nid2peerlist(nid)); - LASSERT (peer->ibp_connecting == 0); - } else { - kibnal_peer_decref(peer); - peer = peer2; - - if (peer->ibp_connecting != 0 && - peer->ibp_nid < kibnal_data.kib_ni->ni_nid) { - /* Resolve concurrent connection attempts in favour of - * the higher NID */ - write_unlock_irqrestore(&kibnal_data.kib_global_lock, - flags); - kibnal_conn_decref(conn); - kibnal_reject(nid, cep, IBNAL_REJECT_CONN_RACE); - return -EALREADY; - } - } - - kibnal_peer_addref(peer); /* +1 ref for conn */ - peer->ibp_accepting++; - - kibnal_set_conn_state(conn, IBNAL_CONN_CONNECTING); - conn->ibc_peer = peer; - conn->ibc_incarnation = msg->ibm_srcstamp; - conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE; - conn->ibc_reserved_credits = IBNAL_MSG_QUEUE_SIZE; - LASSERT (conn->ibc_credits + conn->ibc_reserved_credits - <= IBNAL_RX_MSGS); - - write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); - - *connp = conn; - return 0; -} - -void -kibnal_listen_callback(IB_HANDLE cep, CM_CONN_INFO *info, void *arg) -{ - - CM_REQUEST_INFO *req = &info->Info.Request; - CM_REPLY_INFO *rep; - kib_conn_t *conn; - FSTATUS frc; - int rc; - - LASSERT(arg == NULL); /* no conn yet for passive */ - - CDEBUG(D_NET, "%x\n", info->Status); - - if (info->Status == FCM_CONNECT_CANCEL) { - up(&kibnal_data.kib_listener_signal); - return; - } - - LASSERT 
(info->Status == FCM_CONNECT_REQUEST); - - rc = kibnal_accept(&conn, cep, (kib_msg_t *)req->PrivateData, - CM_REQUEST_INFO_USER_LEN); - if (rc != 0) /* kibnal_accept has rejected */ - return; - - conn->ibc_cvars->cv_path = req->PathInfo.Path; - - rc = kibnal_conn_rts(conn, - req->CEPInfo.QPN, - req->CEPInfo.OfferedInitiatorDepth, - req->CEPInfo.OfferedResponderResources, - req->CEPInfo.StartingPSN); - if (rc != 0) { - kibnal_reject(conn->ibc_peer->ibp_nid, cep, - IBNAL_REJECT_NO_RESOURCES); - kibnal_connreq_done(conn, IBNAL_CONN_PASSIVE, -ECONNABORTED); - return; - } - - memset(&conn->ibc_cvars->cv_cmci, 0, sizeof(conn->ibc_cvars->cv_cmci)); - rep = &conn->ibc_cvars->cv_cmci.Info.Reply; - - rep->QPN = conn->ibc_cvars->cv_qpattrs.QPNumber; - rep->QKey = conn->ibc_cvars->cv_qpattrs.Qkey; - rep->StartingPSN = conn->ibc_cvars->cv_qpattrs.RecvPSN; - rep->EndToEndFlowControl = conn->ibc_cvars->cv_qpattrs.FlowControl; - rep->ArbInitiatorDepth = conn->ibc_cvars->cv_qpattrs.InitiatorDepth; - rep->ArbResponderResources = conn->ibc_cvars->cv_qpattrs.ResponderResources; - rep->TargetAckDelay = kibnal_data.kib_hca_attrs.LocalCaAckDelay; - rep->FailoverAccepted = IBNAL_FAILOVER_ACCEPTED; - rep->RnRRetryCount = req->CEPInfo.RnrRetryCount; - - CLASSERT (CM_REPLY_INFO_USER_LEN >= - offsetof(kib_msg_t, ibm_u) + sizeof(kib_connparams_t)); - - kibnal_pack_connmsg((kib_msg_t *)rep->PrivateData, - conn->ibc_version, - CM_REPLY_INFO_USER_LEN, - IBNAL_MSG_CONNACK, - conn->ibc_peer->ibp_nid, conn->ibc_incarnation); - - LASSERT (conn->ibc_cep == NULL); - kibnal_set_conn_state(conn, IBNAL_CONN_CONNECTING); - - frc = iba_cm_accept(cep, - &conn->ibc_cvars->cv_cmci, - NULL, - kibnal_cm_passive_callback, conn, - &conn->ibc_cep); - - if (frc == FSUCCESS || frc == FPENDING) - return; - - CERROR("iba_cm_accept(%s) failed: %d\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid), frc); - kibnal_connreq_done(conn, IBNAL_CONN_PASSIVE, -ECONNABORTED); -} - -void -kibnal_check_connreply(kib_conn_t *conn, CM_REPLY_INFO *rep) -{ - kib_msg_t *msg = (kib_msg_t *)rep->PrivateData; - lnet_nid_t nid = conn->ibc_peer->ibp_nid; - FSTATUS frc; - int rc; - - rc = kibnal_unpack_msg(msg, conn->ibc_version, CM_REPLY_INFO_USER_LEN); - if (rc != 0) { - CERROR ("Error %d unpacking connack from %s\n", - rc, libcfs_nid2str(nid)); - kibnal_reject(nid, conn->ibc_cep, IBNAL_REJECT_FATAL); - kibnal_connreq_done(conn, IBNAL_CONN_ACTIVE, -EPROTO); - return; - } - - if (msg->ibm_type != IBNAL_MSG_CONNACK) { - CERROR("Bad connack request type %d (%d expected) from %s\n", - msg->ibm_type, IBNAL_MSG_CONNREQ, - libcfs_nid2str(msg->ibm_srcnid)); - kibnal_reject(nid, conn->ibc_cep, IBNAL_REJECT_FATAL); - kibnal_connreq_done(conn, IBNAL_CONN_ACTIVE, -EPROTO); - return; - } - - if (msg->ibm_srcnid != conn->ibc_peer->ibp_nid || - msg->ibm_dstnid != kibnal_data.kib_ni->ni_nid || - msg->ibm_dststamp != kibnal_data.kib_incarnation) { - CERROR("Stale connack from %s(%s): %s(%s), "LPX64"("LPX64")\n", - libcfs_nid2str(msg->ibm_srcnid), - libcfs_nid2str(conn->ibc_peer->ibp_nid), - libcfs_nid2str(msg->ibm_dstnid), - libcfs_nid2str(kibnal_data.kib_ni->ni_nid), - msg->ibm_dststamp, kibnal_data.kib_incarnation); - kibnal_reject(nid, conn->ibc_cep, IBNAL_REJECT_FATAL); - kibnal_connreq_done(conn, IBNAL_CONN_ACTIVE, -ESTALE); - return; - } - - if (msg->ibm_u.connparams.ibcp_queue_depth != IBNAL_MSG_QUEUE_SIZE || - msg->ibm_u.connparams.ibcp_max_msg_size > IBNAL_MSG_SIZE || - msg->ibm_u.connparams.ibcp_max_frags > IBNAL_MAX_RDMA_FRAGS) { - CERROR("Reject %s: q %d sz %d frag %d, (%d 
%d %d expected)\n", - libcfs_nid2str(msg->ibm_srcnid), - msg->ibm_u.connparams.ibcp_queue_depth, - msg->ibm_u.connparams.ibcp_max_msg_size, - msg->ibm_u.connparams.ibcp_max_frags, - IBNAL_MSG_QUEUE_SIZE, - IBNAL_MSG_SIZE, - IBNAL_MAX_RDMA_FRAGS); - kibnal_reject(nid, conn->ibc_cep, IBNAL_REJECT_FATAL); - kibnal_connreq_done(conn, IBNAL_CONN_ACTIVE, -EPROTO); - return; - } - - CDEBUG(D_NET, "Connection %s REP_RECEIVED.\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid)); - - conn->ibc_incarnation = msg->ibm_srcstamp; - conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE; - conn->ibc_reserved_credits = IBNAL_MSG_QUEUE_SIZE; - LASSERT (conn->ibc_credits + conn->ibc_reserved_credits - <= IBNAL_RX_MSGS); - - rc = kibnal_conn_rts(conn, - rep->QPN, - rep->ArbInitiatorDepth, - rep->ArbResponderResources, - rep->StartingPSN); - if (rc != 0) { - kibnal_reject(nid, conn->ibc_cep, IBNAL_REJECT_NO_RESOURCES); - kibnal_connreq_done(conn, IBNAL_CONN_ACTIVE, -EIO); - return; - } - - memset(&conn->ibc_cvars->cv_cmci, 0, sizeof(conn->ibc_cvars->cv_cmci)); - - frc = iba_cm_accept(conn->ibc_cep, - &conn->ibc_cvars->cv_cmci, - NULL, NULL, NULL, NULL); - - if (frc == FCM_CONNECT_ESTABLISHED) { - kibnal_connreq_done(conn, IBNAL_CONN_ACTIVE, 0); - return; - } - - CERROR("Connection %s CMAccept failed: %d\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid), frc); - kibnal_connreq_done(conn, IBNAL_CONN_ACTIVE, -ECONNABORTED); -} - -void -kibnal_cm_active_callback(IB_HANDLE cep, CM_CONN_INFO *info, void *arg) -{ - kib_conn_t *conn = arg; - - CDEBUG(D_NET, "status 0x%x\n", info->Status); - - switch (info->Status) { - default: - CERROR("unknown status %d on Connection %s\n", - info->Status, libcfs_nid2str(conn->ibc_peer->ibp_nid)); - LBUG(); - break; - - case FCM_CONNECT_TIMEOUT: - kibnal_connreq_done(conn, IBNAL_CONN_ACTIVE, -ETIMEDOUT); - break; - - case FCM_CONNECT_REJECT: - kibnal_check_connreject(conn, IBNAL_CONN_ACTIVE, - &info->Info.Reject); - break; - - case FCM_CONNECT_REPLY: - kibnal_check_connreply(conn, &info->Info.Reply); - break; - - case FCM_DISCONNECT_REQUEST: - case FCM_DISCONNECT_REPLY: - case FCM_DISCONNECTED: - kibnal_cm_disconnect_callback(conn, info); - break; - } -} - -void -dump_path_records(PATH_RESULTS *results) -{ - IB_PATH_RECORD *path; - int i; - - for (i = 0; i < results->NumPathRecords; i++) { - path = &results->PathRecords[i]; - CDEBUG(D_NET, "%d: sgid "LPX64":"LPX64" dgid " - LPX64":"LPX64" pkey %x\n", - i, - path->SGID.Type.Global.SubnetPrefix, - path->SGID.Type.Global.InterfaceID, - path->DGID.Type.Global.SubnetPrefix, - path->DGID.Type.Global.InterfaceID, - path->P_Key); - } -} - -void -kibnal_pathreq_callback (void *arg, QUERY *qry, - QUERY_RESULT_VALUES *qrslt) -{ - IB_CA_ATTRIBUTES *ca_attr = &kibnal_data.kib_hca_attrs; - kib_conn_t *conn = arg; - CM_REQUEST_INFO *req = &conn->ibc_cvars->cv_cmci.Info.Request; - PATH_RESULTS *path = (PATH_RESULTS *)qrslt->QueryResult; - FSTATUS frc; - - if (qrslt->Status != FSUCCESS || - qrslt->ResultDataSize < sizeof(*path)) { - CDEBUG (D_NETERROR, "pathreq %s failed: status %d data size %d\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid), - qrslt->Status, qrslt->ResultDataSize); - kibnal_connreq_done(conn, IBNAL_CONN_ACTIVE, -EHOSTUNREACH); - return; - } - - if (path->NumPathRecords < 1) { - CDEBUG (D_NETERROR, "pathreq %s failed: no path records\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid)); - kibnal_connreq_done(conn, IBNAL_CONN_ACTIVE, -EHOSTUNREACH); - return; - } - - //dump_path_records(path); - conn->ibc_cvars->cv_path = path->PathRecords[0]; - - LASSERT 
(conn->ibc_cep == NULL); - - conn->ibc_cep = kibnal_create_cep(conn->ibc_peer->ibp_nid); - if (conn->ibc_cep == NULL) { - kibnal_connreq_done(conn, IBNAL_CONN_ACTIVE, -ENOMEM); - return; - } - - memset(req, 0, sizeof(*req)); - req->SID = conn->ibc_cvars->cv_svcrec.RID.ServiceID; - req->CEPInfo.CaGUID = kibnal_data.kib_hca_guids[kibnal_data.kib_hca_idx]; - req->CEPInfo.EndToEndFlowControl = IBNAL_EE_FLOW; - req->CEPInfo.PortGUID = conn->ibc_cvars->cv_path.SGID.Type.Global.InterfaceID; - req->CEPInfo.RetryCount = IBNAL_RETRY; - req->CEPInfo.RnrRetryCount = IBNAL_RNR_RETRY; - req->CEPInfo.AckTimeout = IBNAL_ACK_TIMEOUT; - req->CEPInfo.StartingPSN = IBNAL_STARTING_PSN; - req->CEPInfo.QPN = conn->ibc_cvars->cv_qpattrs.QPNumber; - req->CEPInfo.QKey = conn->ibc_cvars->cv_qpattrs.Qkey; - req->CEPInfo.OfferedResponderResources = ca_attr->MaxQPResponderResources; - req->CEPInfo.OfferedInitiatorDepth = ca_attr->MaxQPInitiatorDepth; - req->PathInfo.bSubnetLocal = IBNAL_LOCAL_SUB; - req->PathInfo.Path = conn->ibc_cvars->cv_path; - - CLASSERT (CM_REQUEST_INFO_USER_LEN >= - offsetof(kib_msg_t, ibm_u) + sizeof(kib_connparams_t)); - - kibnal_pack_connmsg((kib_msg_t *)req->PrivateData, - conn->ibc_version, - CM_REQUEST_INFO_USER_LEN, - IBNAL_MSG_CONNREQ, - conn->ibc_peer->ibp_nid, 0); - - if (the_lnet.ln_testprotocompat != 0) { - /* single-shot proto test */ - LNET_LOCK(); - if ((the_lnet.ln_testprotocompat & 1) != 0) { - ((kib_msg_t *)req->PrivateData)->ibm_version++; - the_lnet.ln_testprotocompat &= ~1; - } - if ((the_lnet.ln_testprotocompat & 2) != 0) { - ((kib_msg_t *)req->PrivateData)->ibm_magic = - LNET_PROTO_MAGIC; - the_lnet.ln_testprotocompat &= ~2; - } - LNET_UNLOCK(); - } - - /* Flag I'm getting involved with the CM... */ - kibnal_set_conn_state(conn, IBNAL_CONN_CONNECTING); - - /* cm callback gets my conn ref */ - frc = iba_cm_connect(conn->ibc_cep, req, - kibnal_cm_active_callback, conn); - if (frc == FPENDING || frc == FSUCCESS) - return; - - CERROR ("Connect %s failed: %d\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid), frc); - kibnal_connreq_done(conn, IBNAL_CONN_ACTIVE, -EHOSTUNREACH); -} - -void -kibnal_dump_service_records(SERVICE_RECORD_RESULTS *results) -{ - IB_SERVICE_RECORD *svc; - int i; - - for (i = 0; i < results->NumServiceRecords; i++) { - svc = &results->ServiceRecords[i]; - CDEBUG(D_NET, "%d: sid "LPX64" gid "LPX64":"LPX64" pkey %x\n", - i, - svc->RID.ServiceID, - svc->RID.ServiceGID.Type.Global.SubnetPrefix, - svc->RID.ServiceGID.Type.Global.InterfaceID, - svc->RID.ServiceP_Key); - } -} - -void -kibnal_service_get_callback (void *arg, QUERY *qry, - QUERY_RESULT_VALUES *qrslt) -{ - kib_conn_t *conn = arg; - SERVICE_RECORD_RESULTS *svc; - FSTATUS frc; - - if (qrslt->Status != FSUCCESS || - qrslt->ResultDataSize < sizeof(*svc)) { - CDEBUG (D_NETERROR, "Lookup %s failed: status %d data size %d\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid), - qrslt->Status, qrslt->ResultDataSize); - kibnal_connreq_done(conn, IBNAL_CONN_ACTIVE, -EHOSTUNREACH); - return; - } - - svc = (SERVICE_RECORD_RESULTS *)qrslt->QueryResult; - if (svc->NumServiceRecords < 1) { - CDEBUG (D_NETERROR, "lookup %s failed: no service records\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid)); - kibnal_connreq_done(conn, IBNAL_CONN_ACTIVE, -EHOSTUNREACH); - return; - } - - //kibnal_dump_service_records(svc); - conn->ibc_cvars->cv_svcrec = svc->ServiceRecords[0]; - - qry = &conn->ibc_cvars->cv_query; - memset(qry, 0, sizeof(*qry)); - - qry->OutputType = OutputTypePathRecord; - qry->InputType = InputTypePortGuidPair; - - 
qry->InputValue.PortGuidPair.SourcePortGuid = - kibnal_data.kib_port_guid; - qry->InputValue.PortGuidPair.DestPortGuid = - conn->ibc_cvars->cv_svcrec.RID.ServiceGID.Type.Global.InterfaceID; - - /* kibnal_pathreq_callback gets my conn ref */ - frc = iba_sd_query_port_fabric_info(kibnal_data.kib_sd, - kibnal_data.kib_port_guid, - qry, - kibnal_pathreq_callback, - &kibnal_data.kib_sdretry, - conn); - if (frc == FPENDING) - return; - - CERROR ("pathreq %s failed: %d\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid), frc); - kibnal_connreq_done(conn, IBNAL_CONN_ACTIVE, -EHOSTUNREACH); -} - -void -kibnal_connect_peer (kib_peer_t *peer) -{ - QUERY *qry; - FSTATUS frc; - kib_conn_t *conn; - - LASSERT (peer->ibp_connecting != 0); - - conn = kibnal_create_conn(peer->ibp_nid, peer->ibp_version); - if (conn == NULL) { - CERROR ("Can't allocate conn\n"); - kibnal_peer_connect_failed(peer, IBNAL_CONN_ACTIVE, -ENOMEM); - return; - } - - conn->ibc_peer = peer; - kibnal_peer_addref(peer); - - qry = &conn->ibc_cvars->cv_query; - memset(qry, 0, sizeof(*qry)); - - qry->OutputType = OutputTypeServiceRecord; - qry->InputType = InputTypeServiceRecord; - - qry->InputValue.ServiceRecordValue.ComponentMask = - KIBNAL_SERVICE_KEY_MASK; - kibnal_set_service_keys( - &qry->InputValue.ServiceRecordValue.ServiceRecord, - peer->ibp_nid); - - /* kibnal_service_get_callback gets my conn ref */ - frc = iba_sd_query_port_fabric_info(kibnal_data.kib_sd, - kibnal_data.kib_port_guid, - qry, - kibnal_service_get_callback, - &kibnal_data.kib_sdretry, - conn); - if (frc == FPENDING) - return; - - CERROR("Lookup %s failed: %d\n", libcfs_nid2str(peer->ibp_nid), frc); - kibnal_connreq_done(conn, IBNAL_CONN_ACTIVE, -EHOSTUNREACH); -} - -int -kibnal_check_txs (kib_conn_t *conn, struct list_head *txs) -{ - kib_tx_t *tx; - struct list_head *ttmp; - int timed_out = 0; - - spin_lock(&conn->ibc_lock); - - list_for_each (ttmp, txs) { - tx = list_entry (ttmp, kib_tx_t, tx_list); - - if (txs == &conn->ibc_active_txs) { - LASSERT (!tx->tx_queued); - LASSERT (tx->tx_waiting || tx->tx_sending != 0); - } else { - LASSERT (tx->tx_queued); - } - - if (time_after_eq (jiffies, tx->tx_deadline)) { - timed_out = 1; - break; - } - } - - spin_unlock(&conn->ibc_lock); - return timed_out; -} - -int -kibnal_conn_timed_out (kib_conn_t *conn) -{ - return kibnal_check_txs(conn, &conn->ibc_tx_queue) || - kibnal_check_txs(conn, &conn->ibc_tx_queue_rsrvd) || - kibnal_check_txs(conn, &conn->ibc_tx_queue_nocred) || - kibnal_check_txs(conn, &conn->ibc_active_txs); -} - -void -kibnal_check_peers (int idx) -{ - rwlock_t *rwlock = &kibnal_data.kib_global_lock; - struct list_head *peers = &kibnal_data.kib_peers[idx]; - struct list_head *ptmp; - kib_peer_t *peer; - kib_conn_t *conn; - struct list_head *ctmp; - unsigned long flags; - - again: - /* NB. We expect to have a look at all the peers and not find any - * rdmas to time out, so we just use a shared lock while we - * take a look... */ - read_lock_irqsave(rwlock, flags); - - list_for_each (ptmp, peers) { - peer = list_entry (ptmp, kib_peer_t, ibp_list); - - if (peer->ibp_passivewait) { - LASSERT (list_empty(&peer->ibp_conns)); - - if (!time_after_eq(jiffies, - peer->ibp_passivewait_deadline)) - continue; - - kibnal_peer_addref(peer); /* ++ ref for me... 
*/ - read_unlock_irqrestore(rwlock, flags); - - kibnal_peer_connect_failed(peer, IBNAL_CONN_WAITING, - -ETIMEDOUT); - kibnal_peer_decref(peer); /* ...until here */ - - /* start again now I've dropped the lock */ - goto again; - } - - list_for_each (ctmp, &peer->ibp_conns) { - conn = list_entry (ctmp, kib_conn_t, ibc_list); - - LASSERT (conn->ibc_state == IBNAL_CONN_ESTABLISHED); - - /* In case we have enough credits to return via a - * NOOP, but there were no non-blocking tx descs - * free to do it last time... */ - kibnal_check_sends(conn); - - if (!kibnal_conn_timed_out(conn)) - continue; - - /* Handle timeout by closing the whole connection. We - * can only be sure RDMA activity has ceased once the - * QP has been modified. */ - - kibnal_conn_addref(conn); /* 1 ref for me... */ - - read_unlock_irqrestore(rwlock, flags); - - CERROR("Timed out RDMA with %s\n", - libcfs_nid2str(peer->ibp_nid)); - - kibnal_close_conn (conn, -ETIMEDOUT); - kibnal_conn_decref(conn); /* ...until here */ - - /* start again now I've dropped the lock */ - goto again; - } - } - - read_unlock_irqrestore(rwlock, flags); -} - -void -kibnal_disconnect_conn (kib_conn_t *conn) -{ - FSTATUS frc; - - LASSERT (conn->ibc_state == IBNAL_CONN_DISCONNECTING); - - kibnal_conn_disconnected(conn); - - frc = iba_cm_disconnect(conn->ibc_cep, NULL, NULL); - switch (frc) { - case FSUCCESS: - break; - - case FINSUFFICIENT_RESOURCES: - CERROR("ENOMEM disconnecting %s\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid)); - /* This might cause the module to become unloadable since the - * FCM_DISCONNECTED callback is still outstanding */ - break; - - default: - CERROR("Unexpected error disconnecting %s: %d\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid), frc); - LBUG(); - } - - kibnal_peer_notify(conn->ibc_peer); -} - -int -kibnal_connd (void *arg) -{ - wait_queue_t wait; - unsigned long flags; - kib_conn_t *conn; - kib_peer_t *peer; - int timeout; - int i; - int did_something; - int peer_index = 0; - unsigned long deadline = jiffies; - - cfs_daemonize ("kibnal_connd"); - cfs_block_allsigs (); - - init_waitqueue_entry (&wait, current); - - spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags); - - while (!kibnal_data.kib_shutdown) { - did_something = 0; - - if (!list_empty (&kibnal_data.kib_connd_zombies)) { - conn = list_entry (kibnal_data.kib_connd_zombies.next, - kib_conn_t, ibc_list); - list_del (&conn->ibc_list); - spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags); - did_something = 1; - - kibnal_destroy_conn(conn); - - spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags); - } - - if (!list_empty (&kibnal_data.kib_connd_conns)) { - conn = list_entry (kibnal_data.kib_connd_conns.next, - kib_conn_t, ibc_list); - list_del (&conn->ibc_list); - spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags); - did_something = 1; - - kibnal_disconnect_conn(conn); - kibnal_conn_decref(conn); - - spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags); - } - - if (!list_empty (&kibnal_data.kib_connd_peers)) { - peer = list_entry (kibnal_data.kib_connd_peers.next, - kib_peer_t, ibp_connd_list); - - list_del_init (&peer->ibp_connd_list); - spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags); - did_something = 1; - - kibnal_connect_peer (peer); - kibnal_peer_decref (peer); - - spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags); - } - - /* careful with the jiffy wrap... 
*/ - while ((timeout = (int)(deadline - jiffies)) <= 0) { - const int n = 4; - const int p = 1; - int chunk = kibnal_data.kib_peer_hash_size; - - spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags); - - /* Time to check for RDMA timeouts on a few more - * peers: I do checks every 'p' seconds on a - * proportion of the peer table and I need to check - * every connection 'n' times within a timeout - * interval, to ensure I detect a timeout on any - * connection within (n+1)/n times the timeout - * interval. */ - - if (*kibnal_tunables.kib_timeout > n * p) - chunk = (chunk * n * p) / - *kibnal_tunables.kib_timeout; - if (chunk == 0) - chunk = 1; - - for (i = 0; i < chunk; i++) { - kibnal_check_peers (peer_index); - peer_index = (peer_index + 1) % - kibnal_data.kib_peer_hash_size; - } - - deadline += p * HZ; - spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags); - did_something = 1; - } - - if (did_something) - continue; - - spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags); - - set_current_state (TASK_INTERRUPTIBLE); - add_wait_queue (&kibnal_data.kib_connd_waitq, &wait); - - if (!kibnal_data.kib_shutdown && - list_empty (&kibnal_data.kib_connd_conns) && - list_empty (&kibnal_data.kib_connd_peers)) - schedule_timeout (timeout); - - set_current_state (TASK_RUNNING); - remove_wait_queue (&kibnal_data.kib_connd_waitq, &wait); - - spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags); - } - - spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags); - - kibnal_thread_fini (); - return (0); -} - - -void -kibnal_hca_async_callback (void *hca_arg, IB_EVENT_RECORD *ev) -{ - /* XXX flesh out. this seems largely for async errors */ - CERROR("type: %d code: %u\n", ev->EventType, ev->EventCode); -} - -void -kibnal_hca_callback (void *hca_arg, void *cq_arg) -{ - unsigned long flags; - - spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags); - kibnal_data.kib_ready = 1; - wake_up(&kibnal_data.kib_sched_waitq); - spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags); -} - -int -kibnal_scheduler(void *arg) -{ - long id = (long)arg; - wait_queue_t wait; - char name[16]; - FSTATUS frc; - FSTATUS frc2; - IB_WORK_COMPLETION wc; - kib_rx_t *rx; - unsigned long flags; - __u64 rxseq = 0; - int busy_loops = 0; - - snprintf(name, sizeof(name), "kibnal_sd_%02ld", id); - cfs_daemonize(name); - cfs_block_allsigs(); - - init_waitqueue_entry(&wait, current); - - spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags); - - while (!kibnal_data.kib_shutdown) { - if (busy_loops++ >= IBNAL_RESCHED) { - spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, - flags); - - cfs_cond_resched(); - busy_loops = 0; - - spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags); - } - - if (kibnal_data.kib_ready && - !kibnal_data.kib_checking_cq) { - /* take ownership of completion polling */ - kibnal_data.kib_checking_cq = 1; - /* Assume I'll exhaust the CQ */ - kibnal_data.kib_ready = 0; - spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, - flags); - - frc = iba_poll_cq(kibnal_data.kib_cq, &wc); - if (frc == FNOT_DONE) { - /* CQ empty */ - frc2 = iba_rearm_cq(kibnal_data.kib_cq, - CQEventSelNextWC); - LASSERT (frc2 == FSUCCESS); - } - - if (frc == FSUCCESS && - kibnal_wreqid2type(wc.WorkReqId) == IBNAL_WID_RX) { - rx = (kib_rx_t *)kibnal_wreqid2ptr(wc.WorkReqId); - - /* Grab the RX sequence number NOW before - * anyone else can get an RX completion */ - rxseq = rx->rx_conn->ibc_rxseq++; - } - - spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags); - /* give up ownership of completion polling */ - 
kibnal_data.kib_checking_cq = 0; - - if (frc == FNOT_DONE) - continue; - - LASSERT (frc == FSUCCESS); - /* Assume there's more: get another scheduler to check - * while I handle this completion... */ - - kibnal_data.kib_ready = 1; - wake_up(&kibnal_data.kib_sched_waitq); - - spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, - flags); - - switch (kibnal_wreqid2type(wc.WorkReqId)) { - case IBNAL_WID_RX: - kibnal_rx_complete(&wc, rxseq); - break; - - case IBNAL_WID_TX: - kibnal_tx_complete(&wc); - break; - - case IBNAL_WID_RDMA: - /* We only get RDMA completion notification if - * it fails. So we just ignore them completely - * because... - * - * 1) If an RDMA fails, all subsequent work - * items, including the final SEND will fail - * too, so I'm still guaranteed to notice that - * this connection is hosed. - * - * 2) It's positively dangerous to look inside - * the tx descriptor obtained from an RDMA work - * item. As soon as I drop the kib_sched_lock, - * I give a scheduler on another CPU a chance - * to get the final SEND completion, so the tx - * descriptor can get freed as I inspect it. */ - CERROR ("RDMA failed: %d\n", wc.Status); - break; - - default: - LBUG(); - } - - spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags); - continue; - } - - /* Nothing to do; sleep... */ - - set_current_state(TASK_INTERRUPTIBLE); - add_wait_queue_exclusive(&kibnal_data.kib_sched_waitq, &wait); - spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, - flags); - - schedule(); - - remove_wait_queue(&kibnal_data.kib_sched_waitq, &wait); - set_current_state(TASK_RUNNING); - spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags); - } - - spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags); - - kibnal_thread_fini(); - return (0); -} diff --git a/lnet/klnds/iiblnd/iiblnd_modparams.c b/lnet/klnds/iiblnd/iiblnd_modparams.c deleted file mode 100644 index 010337c..0000000 --- a/lnet/klnds/iiblnd/iiblnd_modparams.c +++ /dev/null @@ -1,325 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf - * - * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, - * CA 95054 USA or visit www.sun.com if you need additional information or - * have any questions. - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. 
- * - * lnet/klnds/iiblnd/iiblnd_modparams.c - * - * Author: Eric Barton - */ - -#include "iiblnd.h" - -static char *ipif_basename = "ib"; -CFS_MODULE_PARM(ipif_basename, "s", charp, 0444, - "IPoIB interface base name"); - -static char *service_name = "iiblnd"; -CFS_MODULE_PARM(service_name, "s", charp, 0444, - "IB service name"); - -static int service_number = 0x11b9a2; -CFS_MODULE_PARM(service_number, "i", int, 0444, - "IB service number"); - -static int min_reconnect_interval = 1; -CFS_MODULE_PARM(min_reconnect_interval, "i", int, 0644, - "minimum connection retry interval (seconds)"); - -static int max_reconnect_interval = 60; -CFS_MODULE_PARM(max_reconnect_interval, "i", int, 0644, - "maximum connection retry interval (seconds)"); - -static int concurrent_peers = 1152; -CFS_MODULE_PARM(concurrent_peers, "i", int, 0444, - "maximum number of peers that may connect"); - -static int cksum = 0; -CFS_MODULE_PARM(cksum, "i", int, 0644, - "set non-zero to enable message (not RDMA) checksums"); - -static int timeout = 50; -CFS_MODULE_PARM(timeout, "i", int, 0644, - "timeout (seconds)"); - -static int ntx = 256; -CFS_MODULE_PARM(ntx, "i", int, 0444, - "# of message descriptors"); - -static int credits = 128; -CFS_MODULE_PARM(credits, "i", int, 0444, - "# concurrent sends"); - -static int peer_credits = 8; -CFS_MODULE_PARM(peer_credits, "i", int, 0444, - "# concurrent sends to 1 peer"); - -static int sd_retries = 8; -CFS_MODULE_PARM(sd_retries, "i", int, 0444, - "# times to retry SD queries"); - -static int keepalive = 100; -CFS_MODULE_PARM(keepalive, "i", int, 0644, - "Idle time in seconds before sending a keepalive"); - -static int concurrent_sends = IBNAL_RX_MSGS; -CFS_MODULE_PARM(concurrent_sends, "i", int, 0644, - "Send work queue sizing"); - -kib_tunables_t kibnal_tunables = { - .kib_ipif_basename = &ipif_basename, - .kib_service_name = &service_name, - .kib_service_number = &service_number, - .kib_min_reconnect_interval = &min_reconnect_interval, - .kib_max_reconnect_interval = &max_reconnect_interval, - .kib_concurrent_peers = &concurrent_peers, - .kib_cksum = &cksum, - .kib_timeout = &timeout, - .kib_keepalive = &keepalive, - .kib_ntx = &ntx, - .kib_credits = &credits, - .kib_peercredits = &peer_credits, - .kib_sd_retries = &sd_retries, - .kib_concurrent_sends = &concurrent_sends, -}; - -#if defined(CONFIG_SYSCTL) && !CFS_SYSFS_MODULE_PARM - -/* NB max_size specified for proc_dostring entries only needs to be big enough - * not to truncate the printout; it only needs to be the actual size of the - * string buffer if we allow writes (and we don't) */ - -#ifdef HAVE_SYSCTL_UNNUMBERED - -enum { - IIBLND_IPIF_BASENAME = 1, - IIBLND_SERVICE_NAME, - IIBLND_SERVICE_NUMBER, - IIBLND_RECONNECT_MIN, - IIBLND_RECONNECT_MAX, - IIBLND_CONCURRENT_PEERS, - IIBLND_CKSUM, - IIBLND_TIMEOUT, - IIBLND_NTX, - IIBLND_CREDITS, - IIBLND_PEER_CREDITS, - IIBLND_SD_RETRIES, - IIBLND_KEEPALIVE, - IIBLND_CONCURRENT_SENDS -}; - -#else - -#define IIBLND_IPIF_BASENAME CTL_UNNUMBERED -#define IIBLND_SERVICE_NAME CTL_UNNUMBERED -#define IIBLND_SERVICE_NUMBER CTL_UNNUMBERED -#define IIBLND_RECONNECT_MIN CTL_UNNUMBERED -#define IIBLND_RECONNECT_MAX CTL_UNNUMBERED -#define IIBLND_CONCURRENT_PEERS CTL_UNNUMBERED -#define IIBLND_CKSUM CTL_UNNUMBERED -#define IIBLND_TIMEOUT CTL_UNNUMBERED -#define IIBLND_NTX CTL_UNNUMBERED -#define IIBLND_CREDITS CTL_UNNUMBERED -#define IIBLND_PEER_CREDITS CTL_UNNUMBERED -#define IIBLND_SD_RETRIES CTL_UNNUMBERED -#define IIBLND_KEEPALIVE CTL_UNNUMBERED -#define IIBLND_CONCURRENT_SENDS 
CTL_UNNUMBERED - -#endif - -static cfs_sysctl_table_t kibnal_ctl_table[] = { - { - .ctl_name = IBBLND_IPIF_BASENAME, - .procname = "ipif_basename", - .data = &ipif_basename, - .maxlen = 1024, - .mode = 0444, - .proc_handler = &proc_dostring - }, - { - .ctl_name = IIBLND_SERVICE_NAME, - .procname = "service_name", - .data = &service_name, - .maxlen = 1024, - .mode = 0444, - .proc_handler = &proc_dostring - }, - { - .ctl_name = IIBLND_SERVICE_NUMBER, - .procname = "service_number", - .data = &service_number, - .maxlen = sizeof(int), - .mode = 0444, - .proc_handler = &proc_dointvec - }, - { - .ctl_name = IIBLND_RECONNECT_MIN, - .procname = "min_reconnect_interval", - .data = &min_reconnect_interval, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec - }, - { - .ctl_name = IIBLND_RECONNECT_MAX, - .procname = "max_reconnect_interval", - .data = &max_reconnect_interval, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec - }, - { - .ctl_name = IIBLND_CONCURRENT_PEERS, - .procname = "concurrent_peers", - .data = &concurrent_peers, - .maxlen = sizeof(int), - .mode = 0444, - .proc_handler = &proc_dointvec - }, - { - .ctl_name = IIBLND_CKSUM, - .procname = "cksum", - .data = &cksum, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec - }, - { - .ctl_name = IIBLND_TIMEOUT, - .procname = "timeout", - .data = &timeout, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec - }, - { - .ctl_name = IIBLND_NTX, - .procname = "ntx", - .data = &ntx, - .maxlen = sizeof(int), - .mode = 0444, - .proc_handler = &proc_dointvec - }, - { - .ctl_name = IIBLND_CREDITS, - .procname = "credits", - .data = &credits, - .maxlen = sizeof(int), - .mode = 0444, - .proc_handler = &proc_dointvec - }, - { - .ctl_name = IIBLND_PEER_CREDITS, - .procname = "peer_credits", - .data = &peer_credits, - .maxlen = sizeof(int), - .mode = 0444, - .proc_handler = &proc_dointvec - }, - { - .ctl_name = IIBLND_SD_RETRIES, - .procname = "sd_retries", - .data = &sd_retries, - .maxlen = sizeof(int), - .mode = 0444, - .proc_handler = &proc_dointvec - }, - { - .ctl_name = IIBLND_KEEPALIVE, - .procname = "keepalive", - .data = &keepalive, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec - }, - { - .ctl_name = IIBLND_CONCURRENT_SENDS, - .procname = "concurrent_sends", - .data = &concurrent_sends, - .maxlen = sizeof(int), - .mode = 0444, - .proc_handler = &proc_dointvec - }, - {0} -}; - -static cfs_sysctl_table_t kibnal_top_ctl_table[] = { - { - .ctl_name = CTL_IIBLND, - .procname = "iibnal", - .data = NULL, - .maxlen = 0, - .mode = 0555, - .child = kibnal_ctl_table - }, - {0} -}; - -int -kibnal_tunables_init () -{ - kibnal_tunables.kib_sysctl = - cfs_register_sysctl_table(kibnal_top_ctl_table, 0); - - if (kibnal_tunables.kib_sysctl == NULL) - CWARN("Can't setup /proc tunables\n"); - - if (*kibnal_tunables.kib_concurrent_sends > IBNAL_RX_MSGS) - *kibnal_tunables.kib_concurrent_sends = IBNAL_RX_MSGS; - if (*kibnal_tunables.kib_concurrent_sends < IBNAL_MSG_QUEUE_SIZE) - *kibnal_tunables.kib_concurrent_sends = IBNAL_MSG_QUEUE_SIZE; - - return 0; -} - -void -kibnal_tunables_fini () -{ - if (kibnal_tunables.kib_sysctl != NULL) - cfs_unregister_sysctl_table(kibnal_tunables.kib_sysctl); -} - -#else - -int -kibnal_tunables_init () -{ - return 0; -} - -void -kibnal_tunables_fini () -{ -} - -#endif diff --git a/lnet/klnds/o2iblnd/o2iblnd.h b/lnet/klnds/o2iblnd/o2iblnd.h index 9122f2a..f946762 100644 --- a/lnet/klnds/o2iblnd/o2iblnd.h +++ 
b/lnet/klnds/o2iblnd/o2iblnd.h @@ -412,7 +412,7 @@ typedef struct typedef struct { /* First 2 fields fixed FOR ALL TIME */ - __u32 ibm_magic; /* I'm an openibnal message */ + __u32 ibm_magic; /* I'm an ibnal message */ __u16 ibm_version; /* this is my version number */ __u8 ibm_type; /* msg type */ diff --git a/lnet/klnds/openiblnd/.gitignore b/lnet/klnds/openiblnd/.gitignore deleted file mode 100644 index b5d0279..0000000 --- a/lnet/klnds/openiblnd/.gitignore +++ /dev/null @@ -1,10 +0,0 @@ -/.deps -/Makefile -/.*.cmd -/autoMakefile.in -/autoMakefile -/*.ko -/*.mod.c -/.*.flags -/.tmp_versions -/.depend diff --git a/lnet/klnds/openiblnd/Makefile.in b/lnet/klnds/openiblnd/Makefile.in deleted file mode 100644 index 86fa9cd..0000000 --- a/lnet/klnds/openiblnd/Makefile.in +++ /dev/null @@ -1,6 +0,0 @@ -MODULES := kopeniblnd -kopeniblnd-objs := openiblnd.o openiblnd_cb.o openiblnd_modparams.o - -EXTRA_POST_CFLAGS := @OPENIBCPPFLAGS@ - -@INCLUDE_RULES@ diff --git a/lnet/klnds/openiblnd/autoMakefile.am b/lnet/klnds/openiblnd/autoMakefile.am deleted file mode 100644 index bee527b..0000000 --- a/lnet/klnds/openiblnd/autoMakefile.am +++ /dev/null @@ -1,44 +0,0 @@ -# -# GPL HEADER START -# -# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License version 2 only, -# as published by the Free Software Foundation. -# -# This program is distributed in the hope that it will be useful, but -# WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# General Public License version 2 for more details (a copy is included -# in the LICENSE file that accompanied this code). -# -# You should have received a copy of the GNU General Public License -# version 2 along with this program; If not, see -# http://www.sun.com/software/products/lustre/docs/GPLv2.pdf -# -# Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, -# CA 95054 USA or visit www.sun.com if you need additional information or -# have any questions. -# -# GPL HEADER END -# - -# -# Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. -# Use is subject to license terms. -# - -# -# This file is part of Lustre, http://www.lustre.org/ -# Lustre is a trademark of Sun Microsystems, Inc. -# - -if MODULES -if BUILD_OPENIBLND -modulenet_DATA = kopeniblnd$(KMODEXT) -endif -endif - -MOSTLYCLEANFILES = @MOSTLYCLEANFILES@ -DIST_SOURCES = $(kopeniblnd-objs:%.o=%.c) openiblnd.h diff --git a/lnet/klnds/openiblnd/openiblnd.c b/lnet/klnds/openiblnd/openiblnd.c deleted file mode 100644 index 1bb72b7..0000000 --- a/lnet/klnds/openiblnd/openiblnd.c +++ /dev/null @@ -1,1875 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). 
- * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf - * - * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, - * CA 95054 USA or visit www.sun.com if you need additional information or - * have any questions. - * - * GPL HEADER END - */ -/* - * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lnet/klnds/openiblnd/openiblnd.c - * - * Author: Eric Barton - */ - -#include "openiblnd.h" - -lnd_t the_kiblnd = { -#ifdef USING_TSAPI - .lnd_type = CIBLND, -#else - .lnd_type = OPENIBLND, -#endif - .lnd_startup = kibnal_startup, - .lnd_shutdown = kibnal_shutdown, - .lnd_ctl = kibnal_ctl, - .lnd_send = kibnal_send, - .lnd_recv = kibnal_recv, - .lnd_eager_recv = kibnal_eager_recv, - .lnd_accept = kibnal_accept, -}; - -kib_data_t kibnal_data; - -__u32 -kibnal_cksum (void *ptr, int nob) -{ - char *c = ptr; - __u32 sum = 0; - - while (nob-- > 0) - sum = ((sum << 1) | (sum >> 31)) + *c++; - - /* ensure I don't return 0 (== no checksum) */ - return (sum == 0) ? 1 : sum; -} - -void -kibnal_init_msg(kib_msg_t *msg, int type, int body_nob) -{ - msg->ibm_type = type; - msg->ibm_nob = offsetof(kib_msg_t, ibm_u) + body_nob; -} - -void -kibnal_pack_msg(kib_msg_t *msg, int version, int credits, - lnet_nid_t dstnid, __u64 dststamp) -{ - /* CAVEAT EMPTOR! all message fields not set here should have been - * initialised previously. */ - msg->ibm_magic = IBNAL_MSG_MAGIC; - msg->ibm_version = version; - /* ibm_type */ - msg->ibm_credits = credits; - /* ibm_nob */ - msg->ibm_cksum = 0; - msg->ibm_srcnid = kibnal_data.kib_ni->ni_nid; - msg->ibm_srcstamp = kibnal_data.kib_incarnation; - msg->ibm_dstnid = dstnid; - msg->ibm_dststamp = dststamp; - - if (*kibnal_tunables.kib_cksum) { - /* NB ibm_cksum zero while computing cksum */ - msg->ibm_cksum = kibnal_cksum(msg, msg->ibm_nob); - } -} - -int -kibnal_unpack_msg(kib_msg_t *msg, int expected_version, int nob) -{ - const int hdr_size = offsetof(kib_msg_t, ibm_u); - __u32 msg_cksum; - int msg_version; - int flip; - int msg_nob; - - if (nob < 6) { - CERROR("Short message: %d\n", nob); - return -EPROTO; - } - - if (msg->ibm_magic == IBNAL_MSG_MAGIC) { - flip = 0; - } else if (msg->ibm_magic == __swab32(IBNAL_MSG_MAGIC)) { - flip = 1; - } else { - CERROR("Bad magic: %08x\n", msg->ibm_magic); - return -EPROTO; - } - - msg_version = flip ? __swab16(msg->ibm_version) : msg->ibm_version; - if ((expected_version == 0) ? - (msg_version != IBNAL_MSG_VERSION && - msg_version != IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD) : - (msg_version != expected_version)) { - CERROR("Bad version: %x\n", msg_version); - return -EPROTO; - } - - if (nob < hdr_size) { - CERROR("Short message: %d\n", nob); - return -EPROTO; - } - - msg_nob = flip ? __swab32(msg->ibm_nob) : msg->ibm_nob; - if (msg_nob > nob) { - CERROR("Short message: got %d, wanted %d\n", nob, msg_nob); - return -EPROTO; - } - - /* checksum must be computed with ibm_cksum zero and BEFORE anything - * gets flipped */ - msg_cksum = flip ? 
__swab32(msg->ibm_cksum) : msg->ibm_cksum; - msg->ibm_cksum = 0; - if (msg_cksum != 0 && - msg_cksum != kibnal_cksum(msg, msg_nob)) { - CERROR("Bad checksum\n"); - return -EPROTO; - } - msg->ibm_cksum = msg_cksum; - - if (flip) { - /* leave magic unflipped as a clue to peer endianness */ - msg->ibm_version = msg_version; - LASSERT (sizeof(msg->ibm_type) == 1); - LASSERT (sizeof(msg->ibm_credits) == 1); - msg->ibm_nob = msg_nob; - __swab64s(&msg->ibm_srcnid); - __swab64s(&msg->ibm_srcstamp); - __swab64s(&msg->ibm_dstnid); - __swab64s(&msg->ibm_dststamp); - } - - if (msg->ibm_srcnid == LNET_NID_ANY) { - CERROR("Bad src nid: %s\n", libcfs_nid2str(msg->ibm_srcnid)); - return -EPROTO; - } - - switch (msg->ibm_type) { - default: - CERROR("Unknown message type %x\n", msg->ibm_type); - return -EPROTO; - - case IBNAL_MSG_SVCQRY: - case IBNAL_MSG_NOOP: - break; - - case IBNAL_MSG_SVCRSP: - if (msg_nob < hdr_size + sizeof(msg->ibm_u.svcrsp)) { - CERROR("Short SVCRSP: %d(%d)\n", msg_nob, - (int)(hdr_size + sizeof(msg->ibm_u.svcrsp))); - return -EPROTO; - } - if (flip) { - __swab64s(&msg->ibm_u.svcrsp.ibsr_svc_id); - __swab16s(&msg->ibm_u.svcrsp.ibsr_svc_pkey); - } - break; - - case IBNAL_MSG_CONNREQ: - case IBNAL_MSG_CONNACK: - if (msg_nob < hdr_size + sizeof(msg->ibm_u.connparams)) { - CERROR("Short CONNREQ: %d(%d)\n", msg_nob, - (int)(hdr_size + sizeof(msg->ibm_u.connparams))); - return -EPROTO; - } - if (flip) - __swab32s(&msg->ibm_u.connparams.ibcp_queue_depth); - break; - - case IBNAL_MSG_IMMEDIATE: - if (msg_nob < offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[0])) { - CERROR("Short IMMEDIATE: %d(%d)\n", msg_nob, - (int)offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[0])); - return -EPROTO; - } - break; - - case IBNAL_MSG_PUT_RDMA: - case IBNAL_MSG_GET_RDMA: - if (msg_nob < hdr_size + sizeof(msg->ibm_u.rdma)) { - CERROR("Short RDMA req: %d(%d)\n", msg_nob, - (int)(hdr_size + sizeof(msg->ibm_u.rdma))); - return -EPROTO; - } - if (flip) { - __swab32s(&msg->ibm_u.rdma.ibrm_desc.rd_key); - __swab32s(&msg->ibm_u.rdma.ibrm_desc.rd_nob); - __swab64s(&msg->ibm_u.rdma.ibrm_desc.rd_addr); - } - break; - - case IBNAL_MSG_PUT_DONE: - case IBNAL_MSG_GET_DONE: - if (msg_nob < hdr_size + sizeof(msg->ibm_u.completion)) { - CERROR("Short RDMA completion: %d(%d)\n", msg_nob, - (int)(hdr_size + sizeof(msg->ibm_u.completion))); - return -EPROTO; - } - if (flip) - __swab32s(&msg->ibm_u.completion.ibcm_status); - break; - } - return 0; -} - -int -kibnal_make_svcqry (kib_conn_t *conn) -{ - kib_peer_t *peer = conn->ibc_peer; - int version = IBNAL_MSG_VERSION; - int msg_version; - kib_msg_t *msg; - struct socket *sock; - int rc; - int nob; - - LASSERT (conn->ibc_connreq != NULL); - msg = &conn->ibc_connreq->cr_msg; - - again: - kibnal_init_msg(msg, IBNAL_MSG_SVCQRY, 0); - kibnal_pack_msg(msg, version, 0, peer->ibp_nid, 0); - - rc = lnet_connect(&sock, peer->ibp_nid, - 0, peer->ibp_ip, peer->ibp_port); - if (rc != 0) - return -ECONNABORTED; - - rc = libcfs_sock_write(sock, msg, msg->ibm_nob, - lnet_acceptor_timeout()); - if (rc != 0) { - CERROR("Error %d sending svcqry to %s at %u.%u.%u.%u/%d\n", - rc, libcfs_nid2str(peer->ibp_nid), - HIPQUAD(peer->ibp_ip), peer->ibp_port); - goto out; - } - - /* The first 6 bytes are invariably MAGIC + proto version */ - rc = libcfs_sock_read(sock, msg, 6, *kibnal_tunables.kib_timeout); - if (rc != 0) { - CERROR("Error %d receiving svcrsp from %s at %u.%u.%u.%u/%d\n", - rc, libcfs_nid2str(peer->ibp_nid), - HIPQUAD(peer->ibp_ip), peer->ibp_port); - goto out; - } - - if (msg->ibm_magic 
!= IBNAL_MSG_MAGIC && - msg->ibm_magic != __swab32(IBNAL_MSG_MAGIC)) { - CERROR("Bad magic: %08x from %s at %u.%u.%u.%u/%d\n", - msg->ibm_magic, libcfs_nid2str(peer->ibp_nid), - HIPQUAD(peer->ibp_ip), peer->ibp_port); - rc = -EPROTO; - goto out; - } - - msg_version = (msg->ibm_magic == IBNAL_MSG_MAGIC) ? - msg->ibm_version : __swab16(msg->ibm_version); - if (msg_version != version) { - if (version == IBNAL_MSG_VERSION) { - /* retry with previous version */ - libcfs_sock_release(sock); - version = IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD; - goto again; - } - - CERROR("Bad version %x from %s at %u.%u.%u.%u/%d\n", - msg_version, libcfs_nid2str(peer->ibp_nid), - HIPQUAD(peer->ibp_ip), peer->ibp_port); - rc = -EPROTO; - goto out; - } - - /* Read in the rest of the message now we know the expected format */ - nob = offsetof(kib_msg_t, ibm_u) + sizeof(kib_svcrsp_t); - rc = libcfs_sock_read(sock, ((char *)msg) + 6, nob - 6, - *kibnal_tunables.kib_timeout); - if (rc != 0) { - CERROR("Error %d receiving svcrsp from %s at %u.%u.%u.%u/%d\n", - rc, libcfs_nid2str(peer->ibp_nid), - HIPQUAD(peer->ibp_ip), peer->ibp_port); - goto out; - } - - rc = kibnal_unpack_msg(msg, version, nob); - if (rc != 0) { - CERROR("Error %d unpacking svcrsp from %s at %u.%u.%u.%u/%d\n", - rc, libcfs_nid2str(peer->ibp_nid), - HIPQUAD(peer->ibp_ip), peer->ibp_port); - goto out; - } - - if (msg->ibm_type != IBNAL_MSG_SVCRSP) { - CERROR("Unexpected response type %d from %s at %u.%u.%u.%u/%d\n", - msg->ibm_type, libcfs_nid2str(peer->ibp_nid), - HIPQUAD(peer->ibp_ip), peer->ibp_port); - rc = -EPROTO; - goto out; - } - - if (kibnal_data.kib_ni->ni_nid != msg->ibm_dstnid || - msg->ibm_dststamp != kibnal_data.kib_incarnation) { - CERROR("Unexpected dst NID/stamp %s/"LPX64" from " - "%s at %u.%u.%u.%u/%d\n", - libcfs_nid2str(msg->ibm_dstnid), msg->ibm_dststamp, - libcfs_nid2str(peer->ibp_nid), HIPQUAD(peer->ibp_ip), - peer->ibp_port); - rc = -EPROTO; - goto out; - } - - if (peer->ibp_nid != msg->ibm_srcnid) { - CERROR("Unexpected src NID %s from %s at %u.%u.%u.%u/%d\n", - libcfs_nid2str(msg->ibm_srcnid), - libcfs_nid2str(peer->ibp_nid), - HIPQUAD(peer->ibp_ip), peer->ibp_port); - rc = -EPROTO; - goto out; - } - - conn->ibc_incarnation = msg->ibm_srcstamp; - conn->ibc_connreq->cr_svcrsp = msg->ibm_u.svcrsp; - conn->ibc_version = version; - - out: - libcfs_sock_release(sock); - return rc; -} - -void -kibnal_handle_svcqry (struct socket *sock) -{ - __u32 peer_ip; - unsigned int peer_port; - kib_msg_t *msg; - __u64 srcnid; - __u64 srcstamp; - int version; - int reject = 0; - int rc; - - rc = libcfs_sock_getaddr(sock, 1, &peer_ip, &peer_port); - if (rc != 0) { - CERROR("Can't get peer's IP: %d\n", rc); - return; - } - - LIBCFS_ALLOC(msg, sizeof(*msg)); - if (msg == NULL) { - CERROR("Can't allocate msgs for %u.%u.%u.%u/%d\n", - HIPQUAD(peer_ip), peer_port); - return; - } - - rc = libcfs_sock_read(sock, &msg->ibm_magic, sizeof(msg->ibm_magic), - lnet_acceptor_timeout()); - if (rc != 0) { - CERROR("Error %d receiving svcqry(1) from %u.%u.%u.%u/%d\n", - rc, HIPQUAD(peer_ip), peer_port); - goto out; - } - - if (msg->ibm_magic != IBNAL_MSG_MAGIC && - msg->ibm_magic != __swab32(IBNAL_MSG_MAGIC)) { - /* Unexpected magic! */ - if (msg->ibm_magic == LNET_PROTO_MAGIC || - msg->ibm_magic == __swab32(LNET_PROTO_MAGIC)) { - /* future protocol version compatibility! When LNET - * unifies protocols over all LNDs, the first thing - * sent will be a version query. 
I send back a reply - * in my current protocol to tell her I'm "old" */ - kibnal_init_msg(msg, 0, 0); - kibnal_pack_msg(msg, IBNAL_MSG_VERSION, 0, - LNET_NID_ANY, 0); - reject = 1; - goto reply; - } - - CERROR ("Bad magic(1) %#08x (%#08x expected) from " - "%u.%u.%u.%u/%d\n", msg->ibm_magic, - IBNAL_MSG_MAGIC, HIPQUAD(peer_ip), peer_port); - goto out; - } - - /* Now check version */ - - rc = libcfs_sock_read(sock, &msg->ibm_version, sizeof(msg->ibm_version), - lnet_acceptor_timeout()); - if (rc != 0) { - CERROR("Error %d receiving svcqry(2) from %u.%u.%u.%u/%d\n", - rc, HIPQUAD(peer_ip), peer_port); - goto out; - } - - version = (msg->ibm_magic == IBNAL_MSG_MAGIC) ? - msg->ibm_version : __swab16(msg->ibm_version); - /* Peer is a different protocol version: reply in my current protocol - * to tell her I'm "old" */ - if (version != IBNAL_MSG_VERSION && - version != IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD) { - kibnal_init_msg(msg, 0, 0); - kibnal_pack_msg(msg, IBNAL_MSG_VERSION, 0, LNET_NID_ANY, 0); - reject = 1; - goto reply; - } - - /* Now read in all the rest */ - rc = libcfs_sock_read(sock, &msg->ibm_type, - offsetof(kib_msg_t, ibm_u) - - offsetof(kib_msg_t, ibm_type), - lnet_acceptor_timeout()); - if (rc != 0) { - CERROR("Error %d receiving svcqry(3) from %u.%u.%u.%u/%d\n", - rc, HIPQUAD(peer_ip), peer_port); - goto out; - } - - rc = kibnal_unpack_msg(msg, version, offsetof(kib_msg_t, ibm_u)); - if (rc != 0) { - CERROR("Error %d unpacking svcqry from %u.%u.%u.%u/%d\n", - rc, HIPQUAD(peer_ip), peer_port); - goto out; - } - - if (msg->ibm_type != IBNAL_MSG_SVCQRY) { - CERROR("Unexpected message %d from %u.%u.%u.%u/%d\n", - msg->ibm_type, HIPQUAD(peer_ip), peer_port); - goto out; - } - - if (kibnal_data.kib_ni->ni_nid != msg->ibm_dstnid) { - CERROR("Unexpected dstnid %s: expected %s from %u.%u.%u.%u/%d\n", - libcfs_nid2str(msg->ibm_dstnid), - libcfs_nid2str(kibnal_data.kib_ni->ni_nid), - HIPQUAD(peer_ip), peer_port); - goto out; - } - - srcnid = msg->ibm_srcnid; - srcstamp = msg->ibm_srcstamp; - - kibnal_init_msg(msg, IBNAL_MSG_SVCRSP, sizeof(msg->ibm_u.svcrsp)); - - msg->ibm_u.svcrsp.ibsr_svc_id = kibnal_data.kib_svc_id; - memcpy(msg->ibm_u.svcrsp.ibsr_svc_gid, kibnal_data.kib_svc_gid, - sizeof(kibnal_data.kib_svc_gid)); - msg->ibm_u.svcrsp.ibsr_svc_pkey = kibnal_data.kib_svc_pkey; - - kibnal_pack_msg(msg, version, 0, srcnid, srcstamp); - - reply: - rc = libcfs_sock_write (sock, msg, msg->ibm_nob, - lnet_acceptor_timeout()); - if (!reject && rc != 0) { - /* Only complain if we're not rejecting */ - CERROR("Error %d replying to svcqry from %u.%u.%u.%u/%d\n", - rc, HIPQUAD(peer_ip), peer_port); - } - - out: - LIBCFS_FREE(msg, sizeof(*msg)); -} - -void -kibnal_free_acceptsock (kib_acceptsock_t *as) -{ - libcfs_sock_release(as->ibas_sock); - LIBCFS_FREE(as, sizeof(*as)); -} - -int -kibnal_accept(lnet_ni_t *ni, struct socket *sock) -{ - kib_acceptsock_t *as; - unsigned long flags; - - LIBCFS_ALLOC(as, sizeof(*as)); - if (as == NULL) { - CERROR("Out of Memory\n"); - return -ENOMEM; - } - - as->ibas_sock = sock; - - spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags); - - list_add_tail(&as->ibas_list, &kibnal_data.kib_connd_acceptq); - wake_up(&kibnal_data.kib_connd_waitq); - - spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags); - return 0; -} - -int -kibnal_start_ib_listener (void) -{ - int rc; - - LASSERT (kibnal_data.kib_listen_handle == NULL); - - kibnal_data.kib_svc_id = ib_cm_service_assign(); - CDEBUG(D_NET, "svc id "LPX64"\n", kibnal_data.kib_svc_id); - - rc = 
ib_cached_gid_get(kibnal_data.kib_device, - kibnal_data.kib_port, 0, - kibnal_data.kib_svc_gid); - if (rc != 0) { - CERROR("Can't get port %d GID: %d\n", - kibnal_data.kib_port, rc); - return rc; - } - - rc = ib_cached_pkey_get(kibnal_data.kib_device, - kibnal_data.kib_port, 0, - &kibnal_data.kib_svc_pkey); - if (rc != 0) { - CERROR ("Can't get port %d PKEY: %d\n", - kibnal_data.kib_port, rc); - return rc; - } - - rc = ib_cm_listen(kibnal_data.kib_svc_id, - TS_IB_CM_SERVICE_EXACT_MASK, - kibnal_passive_conn_callback, NULL, - &kibnal_data.kib_listen_handle); - if (rc != 0) { - kibnal_data.kib_listen_handle = NULL; - CERROR ("Can't create IB listener: %d\n", rc); - return rc; - } - - LASSERT (kibnal_data.kib_listen_handle != NULL); - return 0; -} - -void -kibnal_stop_ib_listener (void) -{ - int rc; - - LASSERT (kibnal_data.kib_listen_handle != NULL); - - rc = ib_cm_listen_stop (kibnal_data.kib_listen_handle); - if (rc != 0) - CERROR("Error stopping IB listener: %d\n", rc); - - kibnal_data.kib_listen_handle = NULL; -} - -int -kibnal_create_peer (kib_peer_t **peerp, lnet_nid_t nid) -{ - kib_peer_t *peer; - unsigned long flags; - int rc; - - LASSERT (nid != LNET_NID_ANY); - - LIBCFS_ALLOC(peer, sizeof (*peer)); - if (peer == NULL) { - CERROR("Cannot allocate peer\n"); - return -ENOMEM; - } - - memset(peer, 0, sizeof(*peer)); /* zero flags etc */ - - peer->ibp_nid = nid; - atomic_set (&peer->ibp_refcount, 1); /* 1 ref for caller */ - - INIT_LIST_HEAD (&peer->ibp_list); /* not in the peer table yet */ - INIT_LIST_HEAD (&peer->ibp_conns); - INIT_LIST_HEAD (&peer->ibp_tx_queue); - INIT_LIST_HEAD (&peer->ibp_connd_list); /* not queued for connecting */ - - peer->ibp_error = 0; - peer->ibp_last_alive = cfs_time_current(); - peer->ibp_reconnect_interval = 0; /* OK to connect at any time */ - - write_lock_irqsave(&kibnal_data.kib_global_lock, flags); - - if (atomic_read(&kibnal_data.kib_npeers) >= - *kibnal_tunables.kib_concurrent_peers) { - rc = -EOVERFLOW; /* !! but at least it distinguishes */ - } else if (kibnal_data.kib_nonewpeers) { - rc = -ESHUTDOWN; /* shutdown has started */ - } else { - rc = 0; - /* npeers only grows with kib_global_lock held */ - atomic_inc(&kibnal_data.kib_npeers); - } - - write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); - - if (rc != 0) { - CERROR("Can't create peer: %s\n", - (rc == -ESHUTDOWN) ? "shutting down" : - "too many peers"); - LIBCFS_FREE(peer, sizeof(*peer)); - } else { - *peerp = peer; - } - - return rc; -} - -void -kibnal_destroy_peer (kib_peer_t *peer) -{ - CDEBUG (D_NET, "peer %s %p deleted\n", - libcfs_nid2str(peer->ibp_nid), peer); - - LASSERT (atomic_read (&peer->ibp_refcount) == 0); - LASSERT (peer->ibp_persistence == 0); - LASSERT (!kibnal_peer_active(peer)); - LASSERT (peer->ibp_connecting == 0); - LASSERT (peer->ibp_accepting == 0); - LASSERT (list_empty (&peer->ibp_connd_list)); - LASSERT (list_empty (&peer->ibp_conns)); - LASSERT (list_empty (&peer->ibp_tx_queue)); - - LIBCFS_FREE (peer, sizeof (*peer)); - - /* NB a peer's connections keep a reference on their peer until - * they are destroyed, so we can be assured that _all_ state to do - * with this peer has been cleaned up when its refcount drops to - * zero. 
*/ - atomic_dec(&kibnal_data.kib_npeers); -} - -kib_peer_t * -kibnal_find_peer_locked (lnet_nid_t nid) -{ - struct list_head *peer_list = kibnal_nid2peerlist (nid); - struct list_head *tmp; - kib_peer_t *peer; - - list_for_each (tmp, peer_list) { - - peer = list_entry (tmp, kib_peer_t, ibp_list); - - LASSERT (peer->ibp_persistence != 0 || /* persistent peer */ - peer->ibp_connecting != 0 || /* creating conns */ - peer->ibp_accepting != 0 || - !list_empty (&peer->ibp_conns)); /* active conn */ - - if (peer->ibp_nid != nid) - continue; - - return (peer); - } - return (NULL); -} - -kib_peer_t * -kibnal_get_peer (lnet_nid_t nid) -{ - kib_peer_t *peer; - unsigned long flags; - - read_lock_irqsave(&kibnal_data.kib_global_lock, flags); - peer = kibnal_find_peer_locked (nid); - if (peer != NULL) /* +1 ref for caller? */ - kibnal_peer_addref(peer); - read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); - - return (peer); -} - -void -kibnal_unlink_peer_locked (kib_peer_t *peer) -{ - LASSERT (peer->ibp_persistence == 0); - LASSERT (list_empty(&peer->ibp_conns)); - - LASSERT (kibnal_peer_active(peer)); - list_del_init (&peer->ibp_list); - /* lose peerlist's ref */ - kibnal_peer_decref(peer); -} - -int -kibnal_get_peer_info (int index, lnet_nid_t *nidp, __u32 *ipp, int *portp, - int *persistencep) -{ - kib_peer_t *peer; - struct list_head *ptmp; - unsigned long flags; - int i; - - read_lock_irqsave(&kibnal_data.kib_global_lock, flags); - - for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) { - - list_for_each (ptmp, &kibnal_data.kib_peers[i]) { - - peer = list_entry (ptmp, kib_peer_t, ibp_list); - LASSERT (peer->ibp_persistence != 0 || - peer->ibp_connecting != 0 || - peer->ibp_accepting != 0 || - !list_empty (&peer->ibp_conns)); - - if (index-- > 0) - continue; - - *nidp = peer->ibp_nid; - *ipp = peer->ibp_ip; - *portp = peer->ibp_port; - *persistencep = peer->ibp_persistence; - - read_unlock_irqrestore(&kibnal_data.kib_global_lock, - flags); - return (0); - } - } - - read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); - return (-ENOENT); -} - -int -kibnal_add_persistent_peer (lnet_nid_t nid, __u32 ip, int port) -{ - unsigned long flags; - kib_peer_t *peer; - kib_peer_t *peer2; - int rc; - - if (nid == LNET_NID_ANY) - return (-EINVAL); - - rc = kibnal_create_peer (&peer, nid); - if (rc != 0) - return rc; - - write_lock_irqsave (&kibnal_data.kib_global_lock, flags); - - /* I'm always called with a reference on kibnal_data.kib_ni - * so shutdown can't have started */ - LASSERT (kibnal_data.kib_nonewpeers == 0); - - peer2 = kibnal_find_peer_locked (nid); - if (peer2 != NULL) { - kibnal_peer_decref(peer); - peer = peer2; - } else { - /* peer table takes existing ref on peer */ - list_add_tail (&peer->ibp_list, - kibnal_nid2peerlist (nid)); - } - - peer->ibp_ip = ip; - peer->ibp_port = port; - peer->ibp_persistence++; - - write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); - return (0); -} - -void -kibnal_del_peer_locked (kib_peer_t *peer) -{ - struct list_head *ctmp; - struct list_head *cnxt; - kib_conn_t *conn; - - peer->ibp_persistence = 0; - - if (list_empty(&peer->ibp_conns)) { - kibnal_unlink_peer_locked(peer); - } else { - list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) { - conn = list_entry(ctmp, kib_conn_t, ibc_list); - - kibnal_close_conn_locked (conn, 0); - } - /* NB peer is no longer persistent; closing its last conn - * unlinked it. */ - } - /* NB peer now unlinked; might even be freed if the peer table had the - * last ref on it. 
*/ -} - -int -kibnal_del_peer (lnet_nid_t nid) -{ - unsigned long flags; - CFS_LIST_HEAD (zombies); - struct list_head *ptmp; - struct list_head *pnxt; - kib_peer_t *peer; - int lo; - int hi; - int i; - int rc = -ENOENT; - - write_lock_irqsave (&kibnal_data.kib_global_lock, flags); - - if (nid != LNET_NID_ANY) - lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers; - else { - lo = 0; - hi = kibnal_data.kib_peer_hash_size - 1; - } - - for (i = lo; i <= hi; i++) { - list_for_each_safe (ptmp, pnxt, &kibnal_data.kib_peers[i]) { - peer = list_entry (ptmp, kib_peer_t, ibp_list); - LASSERT (peer->ibp_persistence != 0 || - peer->ibp_connecting != 0 || - peer->ibp_accepting != 0 || - !list_empty (&peer->ibp_conns)); - - if (!(nid == LNET_NID_ANY || peer->ibp_nid == nid)) - continue; - - if (!list_empty(&peer->ibp_tx_queue)) { - LASSERT (list_empty(&peer->ibp_conns)); - - list_splice_init(&peer->ibp_tx_queue, &zombies); - } - - kibnal_del_peer_locked (peer); - rc = 0; /* matched something */ - } - } - - write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); - - kibnal_txlist_done(&zombies, -EIO); - - return (rc); -} - -kib_conn_t * -kibnal_get_conn_by_idx (int index) -{ - kib_peer_t *peer; - struct list_head *ptmp; - kib_conn_t *conn; - struct list_head *ctmp; - unsigned long flags; - int i; - - read_lock_irqsave(&kibnal_data.kib_global_lock, flags); - - for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) { - list_for_each (ptmp, &kibnal_data.kib_peers[i]) { - - peer = list_entry (ptmp, kib_peer_t, ibp_list); - LASSERT (peer->ibp_persistence > 0 || - peer->ibp_connecting != 0 || - peer->ibp_accepting != 0 || - !list_empty (&peer->ibp_conns)); - - list_for_each (ctmp, &peer->ibp_conns) { - if (index-- > 0) - continue; - - conn = list_entry (ctmp, kib_conn_t, ibc_list); - kibnal_conn_addref(conn); - read_unlock_irqrestore(&kibnal_data.kib_global_lock, - flags); - return (conn); - } - } - } - - read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); - return (NULL); -} - -kib_conn_t * -kibnal_create_conn (void) -{ - kib_conn_t *conn; - int i; - __u64 vaddr = 0; - __u64 vaddr_base; - int page_offset; - int ipage; - int rc; - union { - struct ib_qp_create_param qp_create; - struct ib_qp_attribute qp_attr; - } params; - - LIBCFS_ALLOC (conn, sizeof (*conn)); - if (conn == NULL) { - CERROR ("Can't allocate connection\n"); - return (NULL); - } - - /* zero flags, NULL pointers etc... 
*/ - memset (conn, 0, sizeof (*conn)); - - INIT_LIST_HEAD (&conn->ibc_tx_queue_nocred); - INIT_LIST_HEAD (&conn->ibc_tx_queue); - INIT_LIST_HEAD (&conn->ibc_tx_queue_rsrvd); - INIT_LIST_HEAD (&conn->ibc_active_txs); - spin_lock_init (&conn->ibc_lock); - - atomic_inc (&kibnal_data.kib_nconns); - /* well not really, but I call destroy() on failure, which decrements */ - - LIBCFS_ALLOC (conn->ibc_rxs, IBNAL_RX_MSGS * sizeof (kib_rx_t)); - if (conn->ibc_rxs == NULL) - goto failed; - memset (conn->ibc_rxs, 0, IBNAL_RX_MSGS * sizeof(kib_rx_t)); - - rc = kibnal_alloc_pages(&conn->ibc_rx_pages, - IBNAL_RX_MSG_PAGES, - IB_ACCESS_LOCAL_WRITE); - if (rc != 0) - goto failed; - - vaddr_base = vaddr = conn->ibc_rx_pages->ibp_vaddr; - - for (i = ipage = page_offset = 0; i < IBNAL_RX_MSGS; i++) { - struct page *page = conn->ibc_rx_pages->ibp_pages[ipage]; - kib_rx_t *rx = &conn->ibc_rxs[i]; - - rx->rx_conn = conn; - rx->rx_vaddr = vaddr; - rx->rx_msg = (kib_msg_t *)(((char *)page_address(page)) + page_offset); - - vaddr += IBNAL_MSG_SIZE; - LASSERT (vaddr <= vaddr_base + IBNAL_RX_MSG_BYTES); - - page_offset += IBNAL_MSG_SIZE; - LASSERT (page_offset <= PAGE_SIZE); - - if (page_offset == PAGE_SIZE) { - page_offset = 0; - ipage++; - LASSERT (ipage <= IBNAL_RX_MSG_PAGES); - } - } - - /* We can post up to IBNAL_RX_MSGS, which may also include an - * additional RDMA work item */ - - params.qp_create = (struct ib_qp_create_param) { - .limit = { - .max_outstanding_send_request = 2 * IBNAL_RX_MSGS, - .max_outstanding_receive_request = IBNAL_RX_MSGS, - .max_send_gather_element = 1, - .max_receive_scatter_element = 1, - }, - .pd = kibnal_data.kib_pd, - .send_queue = kibnal_data.kib_cq, - .receive_queue = kibnal_data.kib_cq, - .send_policy = IB_WQ_SIGNAL_SELECTABLE, - .receive_policy = IB_WQ_SIGNAL_SELECTABLE, - .rd_domain = 0, - .transport = IB_TRANSPORT_RC, - .device_specific = NULL, - }; - - rc = ib_qp_create (¶ms.qp_create, &conn->ibc_qp, &conn->ibc_qpn); - if (rc != 0) { - CERROR ("Failed to create queue pair: %d\n", rc); - goto failed; - } - - /* Mark QP created */ - conn->ibc_state = IBNAL_CONN_INIT_QP; - - params.qp_attr = (struct ib_qp_attribute) { - .state = IB_QP_STATE_INIT, - .port = kibnal_data.kib_port, - .enable_rdma_read = 1, - .enable_rdma_write = 1, - .valid_fields = (IB_QP_ATTRIBUTE_STATE | - IB_QP_ATTRIBUTE_PORT | - IB_QP_ATTRIBUTE_PKEY_INDEX | - IB_QP_ATTRIBUTE_RDMA_ATOMIC_ENABLE), - }; - rc = ib_qp_modify(conn->ibc_qp, ¶ms.qp_attr); - if (rc != 0) { - CERROR ("Failed to modify queue pair: %d\n", rc); - goto failed; - } - - /* 1 ref for caller */ - atomic_set (&conn->ibc_refcount, 1); - return (conn); - - failed: - kibnal_destroy_conn (conn); - return (NULL); -} - -void -kibnal_destroy_conn (kib_conn_t *conn) -{ - int rc; - - CDEBUG (D_NET, "connection %p\n", conn); - - LASSERT (atomic_read (&conn->ibc_refcount) == 0); - LASSERT (list_empty(&conn->ibc_tx_queue)); - LASSERT (list_empty(&conn->ibc_tx_queue_rsrvd)); - LASSERT (list_empty(&conn->ibc_tx_queue_nocred)); - LASSERT (list_empty(&conn->ibc_active_txs)); - LASSERT (conn->ibc_nsends_posted == 0); - LASSERT (conn->ibc_connreq == NULL); - - switch (conn->ibc_state) { - case IBNAL_CONN_ZOMBIE: - /* called after connection sequence initiated */ - - case IBNAL_CONN_INIT_QP: - rc = ib_qp_destroy(conn->ibc_qp); - if (rc != 0) - CERROR("Can't destroy QP: %d\n", rc); - /* fall through */ - - case IBNAL_CONN_INIT_NOTHING: - break; - - default: - LASSERT (0); - } - - if (conn->ibc_rx_pages != NULL) - kibnal_free_pages(conn->ibc_rx_pages); - - if 
(conn->ibc_rxs != NULL) - LIBCFS_FREE(conn->ibc_rxs, - IBNAL_RX_MSGS * sizeof(kib_rx_t)); - - if (conn->ibc_peer != NULL) - kibnal_peer_decref(conn->ibc_peer); - - LIBCFS_FREE(conn, sizeof (*conn)); - - atomic_dec(&kibnal_data.kib_nconns); - - if (atomic_read (&kibnal_data.kib_nconns) == 0 && - kibnal_data.kib_shutdown) { - /* I just nuked the last connection on shutdown; wake up - * everyone so they can exit. */ - wake_up_all(&kibnal_data.kib_sched_waitq); - wake_up_all(&kibnal_data.kib_reaper_waitq); - } -} - -int -kibnal_close_peer_conns_locked (kib_peer_t *peer, int why) -{ - kib_conn_t *conn; - struct list_head *ctmp; - struct list_head *cnxt; - int count = 0; - - list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) { - conn = list_entry (ctmp, kib_conn_t, ibc_list); - - count++; - kibnal_close_conn_locked (conn, why); - } - - return (count); -} - -int -kibnal_close_stale_conns_locked (kib_peer_t *peer, __u64 incarnation) -{ - kib_conn_t *conn; - struct list_head *ctmp; - struct list_head *cnxt; - int count = 0; - - list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) { - conn = list_entry (ctmp, kib_conn_t, ibc_list); - - if (conn->ibc_incarnation == incarnation) - continue; - - CDEBUG(D_NET, "Closing stale conn %p nid: %s" - " incarnation:"LPX64"("LPX64")\n", conn, - libcfs_nid2str(peer->ibp_nid), - conn->ibc_incarnation, incarnation); - - count++; - kibnal_close_conn_locked (conn, -ESTALE); - } - - return (count); -} - -int -kibnal_close_matching_conns (lnet_nid_t nid) -{ - unsigned long flags; - kib_peer_t *peer; - struct list_head *ptmp; - struct list_head *pnxt; - int lo; - int hi; - int i; - int count = 0; - - write_lock_irqsave (&kibnal_data.kib_global_lock, flags); - - if (nid != LNET_NID_ANY) - lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers; - else { - lo = 0; - hi = kibnal_data.kib_peer_hash_size - 1; - } - - for (i = lo; i <= hi; i++) { - list_for_each_safe (ptmp, pnxt, &kibnal_data.kib_peers[i]) { - - peer = list_entry (ptmp, kib_peer_t, ibp_list); - LASSERT (peer->ibp_persistence != 0 || - peer->ibp_connecting != 0 || - peer->ibp_accepting != 0 || - !list_empty (&peer->ibp_conns)); - - if (!(nid == LNET_NID_ANY || nid == peer->ibp_nid)) - continue; - - count += kibnal_close_peer_conns_locked (peer, 0); - } - } - - write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); - - /* wildcards always succeed */ - if (nid == LNET_NID_ANY) - return (0); - - return (count == 0 ? 
-ENOENT : 0); -} - -int -kibnal_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg) -{ - struct libcfs_ioctl_data *data = arg; - int rc = -EINVAL; - - LASSERT (ni == kibnal_data.kib_ni); - - switch(cmd) { - case IOC_LIBCFS_GET_PEER: { - lnet_nid_t nid = 0; - __u32 ip = 0; - int port = 0; - int share_count = 0; - - rc = kibnal_get_peer_info(data->ioc_count, - &nid, &ip, &port, &share_count); - data->ioc_nid = nid; - data->ioc_count = share_count; - data->ioc_u32[0] = ip; - data->ioc_u32[1] = port; - break; - } - case IOC_LIBCFS_ADD_PEER: { - rc = kibnal_add_persistent_peer (data->ioc_nid, - data->ioc_u32[0], /* IP */ - data->ioc_u32[1]); /* port */ - break; - } - case IOC_LIBCFS_DEL_PEER: { - rc = kibnal_del_peer (data->ioc_nid); - break; - } - case IOC_LIBCFS_GET_CONN: { - kib_conn_t *conn = kibnal_get_conn_by_idx (data->ioc_count); - - if (conn == NULL) - rc = -ENOENT; - else { - rc = 0; - data->ioc_nid = conn->ibc_peer->ibp_nid; - kibnal_conn_decref(conn); - } - break; - } - case IOC_LIBCFS_CLOSE_CONNECTION: { - rc = kibnal_close_matching_conns (data->ioc_nid); - break; - } - case IOC_LIBCFS_REGISTER_MYNID: { - /* Ignore if this is a noop */ - if (data->ioc_nid == ni->ni_nid) { - rc = 0; - } else { - CERROR("obsolete IOC_LIBCFS_REGISTER_MYNID: %s(%s)\n", - libcfs_nid2str(data->ioc_nid), - libcfs_nid2str(ni->ni_nid)); - rc = -EINVAL; - } - break; - } - } - - return rc; -} - -void -kibnal_free_pages (kib_pages_t *p) -{ - int npages = p->ibp_npages; - int rc; - int i; - - if (p->ibp_mapped) { - rc = ib_memory_deregister(p->ibp_handle); - if (rc != 0) - CERROR ("Deregister error: %d\n", rc); - } - - for (i = 0; i < npages; i++) - if (p->ibp_pages[i] != NULL) - __free_page(p->ibp_pages[i]); - - LIBCFS_FREE (p, offsetof(kib_pages_t, ibp_pages[npages])); -} - -int -kibnal_alloc_pages (kib_pages_t **pp, int npages, int access) -{ - kib_pages_t *p; - struct ib_physical_buffer *phys_pages; - int i; - int rc; - - LIBCFS_ALLOC(p, offsetof(kib_pages_t, ibp_pages[npages])); - if (p == NULL) { - CERROR ("Can't allocate buffer %d\n", npages); - return (-ENOMEM); - } - - memset (p, 0, offsetof(kib_pages_t, ibp_pages[npages])); - p->ibp_npages = npages; - - for (i = 0; i < npages; i++) { - p->ibp_pages[i] = alloc_page (GFP_KERNEL); - if (p->ibp_pages[i] == NULL) { - CERROR ("Can't allocate page %d of %d\n", i, npages); - kibnal_free_pages(p); - return (-ENOMEM); - } - } - - LIBCFS_ALLOC(phys_pages, npages * sizeof(*phys_pages)); - if (phys_pages == NULL) { - CERROR ("Can't allocate physarray for %d pages\n", npages); - kibnal_free_pages(p); - return (-ENOMEM); - } - - for (i = 0; i < npages; i++) { - phys_pages[i].size = PAGE_SIZE; - phys_pages[i].address = - lnet_page2phys(p->ibp_pages[i]); - } - - p->ibp_vaddr = 0; - rc = ib_memory_register_physical(kibnal_data.kib_pd, - phys_pages, npages, - &p->ibp_vaddr, - npages * PAGE_SIZE, 0, - access, - &p->ibp_handle, - &p->ibp_lkey, - &p->ibp_rkey); - - LIBCFS_FREE(phys_pages, npages * sizeof(*phys_pages)); - - if (rc != 0) { - CERROR ("Error %d mapping %d pages\n", rc, npages); - kibnal_free_pages(p); - return (rc); - } - - p->ibp_mapped = 1; - *pp = p; - return (0); -} - -int -kibnal_setup_tx_descs (void) -{ - int ipage = 0; - int page_offset = 0; - __u64 vaddr; - __u64 vaddr_base; - struct page *page; - kib_tx_t *tx; - int i; - int rc; - - /* pre-mapped messages are not bigger than 1 page */ - LASSERT (IBNAL_MSG_SIZE <= PAGE_SIZE); - - /* No fancy arithmetic when we do the buffer calculations */ - LASSERT (PAGE_SIZE % IBNAL_MSG_SIZE == 0); - - rc = 
kibnal_alloc_pages(&kibnal_data.kib_tx_pages, - IBNAL_TX_MSG_PAGES(), - 0); /* local read access only */ - if (rc != 0) - return (rc); - - vaddr = vaddr_base = kibnal_data.kib_tx_pages->ibp_vaddr; - - for (i = 0; i < IBNAL_TX_MSGS(); i++) { - page = kibnal_data.kib_tx_pages->ibp_pages[ipage]; - tx = &kibnal_data.kib_tx_descs[i]; - - memset (tx, 0, sizeof(*tx)); /* zero flags etc */ - - tx->tx_msg = (kib_msg_t *)(((char *)page_address(page)) + page_offset); - tx->tx_vaddr = vaddr; - tx->tx_mapped = KIB_TX_UNMAPPED; - - CDEBUG(D_NET, "Tx[%d] %p->%p - "LPX64"\n", - i, tx, tx->tx_msg, tx->tx_vaddr); - - list_add (&tx->tx_list, &kibnal_data.kib_idle_txs); - - vaddr += IBNAL_MSG_SIZE; - LASSERT (vaddr <= vaddr_base + IBNAL_TX_MSG_BYTES()); - - page_offset += IBNAL_MSG_SIZE; - LASSERT (page_offset <= PAGE_SIZE); - - if (page_offset == PAGE_SIZE) { - page_offset = 0; - ipage++; - LASSERT (ipage <= IBNAL_TX_MSG_PAGES()); - } - } - - return (0); -} - -void -kibnal_shutdown (lnet_ni_t *ni) -{ - int i; - int rc; - unsigned long flags; - - CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n", - atomic_read (&libcfs_kmemory)); - - LASSERT(ni == kibnal_data.kib_ni); - LASSERT(ni->ni_data == &kibnal_data); - - switch (kibnal_data.kib_init) { - default: - CERROR ("Unexpected state %d\n", kibnal_data.kib_init); - LBUG(); - - case IBNAL_INIT_ALL: - /* Prevent new peers from being created */ - write_lock_irqsave(&kibnal_data.kib_global_lock, flags); - kibnal_data.kib_nonewpeers = 1; - write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); - - kibnal_stop_ib_listener(); - - /* Remove all existing peers from the peer table */ - kibnal_del_peer(LNET_NID_ANY); - - /* Wait for pending conn reqs to be handled */ - i = 2; - spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags); - while (!list_empty(&kibnal_data.kib_connd_acceptq)) { - spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, - flags); - i++; - CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* 2**n */ - "waiting for conn reqs to clean up\n"); - cfs_pause(cfs_time_seconds(1)); - - spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags); - } - spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags); - - /* Wait for all peer state to clean up */ - i = 2; - while (atomic_read(&kibnal_data.kib_npeers) != 0) { - i++; - CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? 
*/ - "waiting for %d peers to close down\n", - atomic_read(&kibnal_data.kib_npeers)); - cfs_pause(cfs_time_seconds(1)); - } - /* fall through */ - - case IBNAL_INIT_CQ: - rc = ib_cq_destroy (kibnal_data.kib_cq); - if (rc != 0) - CERROR ("Destroy CQ error: %d\n", rc); - /* fall through */ - - case IBNAL_INIT_TXD: - kibnal_free_pages (kibnal_data.kib_tx_pages); - /* fall through */ -#if IBNAL_FMR - case IBNAL_INIT_FMR: - rc = ib_fmr_pool_destroy (kibnal_data.kib_fmr_pool); - if (rc != 0) - CERROR ("Destroy FMR pool error: %d\n", rc); - /* fall through */ -#endif - case IBNAL_INIT_PD: - rc = ib_pd_destroy(kibnal_data.kib_pd); - if (rc != 0) - CERROR ("Destroy PD error: %d\n", rc); - /* fall through */ - - case IBNAL_INIT_DATA: - /* Module refcount only gets to zero when all peers - * have been closed so all lists must be empty */ - LASSERT (atomic_read(&kibnal_data.kib_npeers) == 0); - LASSERT (kibnal_data.kib_peers != NULL); - for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) { - LASSERT (list_empty (&kibnal_data.kib_peers[i])); - } - LASSERT (atomic_read (&kibnal_data.kib_nconns) == 0); - LASSERT (list_empty (&kibnal_data.kib_sched_rxq)); - LASSERT (list_empty (&kibnal_data.kib_sched_txq)); - LASSERT (list_empty (&kibnal_data.kib_reaper_conns)); - LASSERT (list_empty (&kibnal_data.kib_connd_peers)); - LASSERT (list_empty (&kibnal_data.kib_connd_acceptq)); - - /* flag threads to terminate; wake and wait for them to die */ - kibnal_data.kib_shutdown = 1; - wake_up_all (&kibnal_data.kib_sched_waitq); - wake_up_all (&kibnal_data.kib_reaper_waitq); - wake_up_all (&kibnal_data.kib_connd_waitq); - - i = 2; - while (atomic_read (&kibnal_data.kib_nthreads) != 0) { - i++; - CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */ - "Waiting for %d threads to terminate\n", - atomic_read (&kibnal_data.kib_nthreads)); - cfs_pause(cfs_time_seconds(1)); - } - /* fall through */ - - case IBNAL_INIT_NOTHING: - break; - } - - if (kibnal_data.kib_tx_descs != NULL) - LIBCFS_FREE (kibnal_data.kib_tx_descs, - IBNAL_TX_MSGS() * sizeof(kib_tx_t)); - - if (kibnal_data.kib_peers != NULL) - LIBCFS_FREE (kibnal_data.kib_peers, - sizeof (struct list_head) * - kibnal_data.kib_peer_hash_size); - - CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n", - atomic_read (&libcfs_kmemory)); - - kibnal_data.kib_init = IBNAL_INIT_NOTHING; - PORTAL_MODULE_UNUSE; -} - -int -kibnal_get_ipoibidx(void) -{ - /* NB single threaded! 
*/ - static struct ib_port_properties port_props; - - int ipoibidx = 0; - int devidx; - int port; - int rc; - struct ib_device *device; - - for (devidx = 0; devidx <= kibnal_data.kib_hca_idx; devidx++) { - device = ib_device_get_by_index(devidx); - - if (device == NULL) { - CERROR("Can't get IB device %d\n", devidx); - return -1; - } - - for (port = 1; port <= 2; port++) { - if (devidx == kibnal_data.kib_hca_idx && - port == kibnal_data.kib_port) - return ipoibidx; - - rc = ib_port_properties_get(device, port, - &port_props); - if (rc == 0) - ipoibidx++; - } - } - - LBUG(); - return -1; -} - -int -kibnal_startup (lnet_ni_t *ni) -{ - char ipif_name[32]; - __u32 ip; - __u32 netmask; - int up; - struct timeval tv; - int rc; - int hca; - int port; - int i; - int nob; - - LASSERT (ni->ni_lnd == &the_kiblnd); - - /* Only 1 instance supported */ - if (kibnal_data.kib_init != IBNAL_INIT_NOTHING) { - CERROR ("Only 1 instance supported\n"); - return -EPERM; - } - - if (*kibnal_tunables.kib_credits > *kibnal_tunables.kib_ntx) { - CERROR ("Can't set credits(%d) > ntx(%d)\n", - *kibnal_tunables.kib_credits, - *kibnal_tunables.kib_ntx); - return -EINVAL; - } - - memset (&kibnal_data, 0, sizeof (kibnal_data)); /* zero pointers, flags etc */ - - ni->ni_maxtxcredits = *kibnal_tunables.kib_credits; - ni->ni_peertxcredits = *kibnal_tunables.kib_peercredits; - - CLASSERT (LNET_MAX_INTERFACES > 1); - - - kibnal_data.kib_hca_idx = 0; /* default: first HCA */ - kibnal_data.kib_port = 0; /* any port */ - - if (ni->ni_interfaces[0] != NULL) { - /* hca.port specified in 'networks=openib(h.p)' */ - if (ni->ni_interfaces[1] != NULL) { - CERROR("Multiple interfaces not supported\n"); - return -EPERM; - } - - nob = strlen(ni->ni_interfaces[0]); - i = sscanf(ni->ni_interfaces[0], "%d.%d%n", &hca, &port, &nob); - if (i >= 2 && nob == strlen(ni->ni_interfaces[0])) { - kibnal_data.kib_hca_idx = hca; - kibnal_data.kib_port = port; - } else { - nob = strlen(ni->ni_interfaces[0]); - i = sscanf(ni->ni_interfaces[0], "%d%n", &hca, &nob); - - if (i >= 1 && nob == strlen(ni->ni_interfaces[0])) { - kibnal_data.kib_hca_idx = hca; - } else { - CERROR("Can't parse interface '%s'\n", - ni->ni_interfaces[0]); - return -EINVAL; - } - } - } - - kibnal_data.kib_ni = ni; - ni->ni_data = &kibnal_data; - - do_gettimeofday(&tv); - kibnal_data.kib_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec; - - PORTAL_MODULE_USE; - - rwlock_init(&kibnal_data.kib_global_lock); - - kibnal_data.kib_peer_hash_size = IBNAL_PEER_HASH_SIZE; - LIBCFS_ALLOC (kibnal_data.kib_peers, - sizeof (struct list_head) * kibnal_data.kib_peer_hash_size); - if (kibnal_data.kib_peers == NULL) { - goto failed; - } - for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) - INIT_LIST_HEAD(&kibnal_data.kib_peers[i]); - - spin_lock_init (&kibnal_data.kib_reaper_lock); - INIT_LIST_HEAD (&kibnal_data.kib_reaper_conns); - init_waitqueue_head (&kibnal_data.kib_reaper_waitq); - - spin_lock_init (&kibnal_data.kib_connd_lock); - INIT_LIST_HEAD (&kibnal_data.kib_connd_acceptq); - INIT_LIST_HEAD (&kibnal_data.kib_connd_peers); - init_waitqueue_head (&kibnal_data.kib_connd_waitq); - - spin_lock_init (&kibnal_data.kib_sched_lock); - INIT_LIST_HEAD (&kibnal_data.kib_sched_txq); - INIT_LIST_HEAD (&kibnal_data.kib_sched_rxq); - init_waitqueue_head (&kibnal_data.kib_sched_waitq); - - spin_lock_init (&kibnal_data.kib_tx_lock); - INIT_LIST_HEAD (&kibnal_data.kib_idle_txs); - - LIBCFS_ALLOC (kibnal_data.kib_tx_descs, - IBNAL_TX_MSGS() * sizeof(kib_tx_t)); - if (kibnal_data.kib_tx_descs == 
NULL) { - CERROR ("Can't allocate tx descs\n"); - goto failed; - } - - /* lists/ptrs/locks initialised */ - kibnal_data.kib_init = IBNAL_INIT_DATA; - /*****************************************************/ - - for (i = 0; i < IBNAL_N_SCHED; i++) { - rc = kibnal_thread_start (kibnal_scheduler, - (void *)((unsigned long)i)); - if (rc != 0) { - CERROR("Can't spawn openibnal scheduler[%d]: %d\n", - i, rc); - goto failed; - } - } - - /* must have at least 2 connds to remain responsive to svcqry while - * connecting */ - if (*kibnal_tunables.kib_n_connd < 2) - *kibnal_tunables.kib_n_connd = 2; - - - for (i = 0; i < *kibnal_tunables.kib_n_connd; i++) { - rc = kibnal_thread_start (kibnal_connd, - (void *)((unsigned long)i)); - if (rc != 0) { - CERROR("Can't spawn openibnal connd[%d]: %d\n", - i, rc); - goto failed; - } - } - - rc = kibnal_thread_start (kibnal_reaper, NULL); - if (rc != 0) { - CERROR ("Can't spawn openibnal reaper: %d\n", rc); - goto failed; - } - - kibnal_data.kib_device = ib_device_get_by_index(kibnal_data.kib_hca_idx); - if (kibnal_data.kib_device == NULL) { - CERROR ("Can't open ib device %d\n", - kibnal_data.kib_hca_idx); - goto failed; - } - - rc = ib_device_properties_get(kibnal_data.kib_device, - &kibnal_data.kib_device_props); - if (rc != 0) { - CERROR ("Can't get device props: %d\n", rc); - goto failed; - } - - CDEBUG(D_NET, "Max Initiator: %d Max Responder %d\n", - kibnal_data.kib_device_props.max_initiator_per_qp, - kibnal_data.kib_device_props.max_responder_per_qp); - - if (kibnal_data.kib_port != 0) { - rc = ib_port_properties_get(kibnal_data.kib_device, - kibnal_data.kib_port, - &kibnal_data.kib_port_props); - if (rc != 0) { - CERROR("Error %d open port %d on HCA %d\n", rc, - kibnal_data.kib_port, - kibnal_data.kib_hca_idx); - goto failed; - } - } else { - for (i = 1; i <= 2; i++) { - rc = ib_port_properties_get(kibnal_data.kib_device, i, - &kibnal_data.kib_port_props); - if (rc == 0) { - kibnal_data.kib_port = i; - break; - } - } - if (kibnal_data.kib_port == 0) { - CERROR ("Can't find a port\n"); - goto failed; - } - } - - i = kibnal_get_ipoibidx(); - if (i < 0) - goto failed; - - snprintf(ipif_name, sizeof(ipif_name), "%s%d", - *kibnal_tunables.kib_ipif_basename, i); - if (strlen(ipif_name) == sizeof(ipif_name) - 1) { - CERROR("IPoIB interface name %s truncated\n", ipif_name); - return -EINVAL; - } - - rc = libcfs_ipif_query(ipif_name, &up, &ip, &netmask); - if (rc != 0) { - CERROR("Can't query IPoIB interface %s: %d\n", ipif_name, rc); - goto failed; - } - - if (!up) { - CERROR("Can't query IPoIB interface %s: it's down\n", ipif_name); - goto failed; - } - - ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), ip); - - rc = ib_pd_create(kibnal_data.kib_device, - NULL, &kibnal_data.kib_pd); - if (rc != 0) { - CERROR ("Can't create PD: %d\n", rc); - goto failed; - } - - /* flag PD initialised */ - kibnal_data.kib_init = IBNAL_INIT_PD; - /*****************************************************/ -#if IBNAL_FMR - { - const int pool_size = *kibnal_tunables.kib_ntx; - struct ib_fmr_pool_param params = { - .max_pages_per_fmr = LNET_MAX_PAYLOAD/PAGE_SIZE, - .access = (IB_ACCESS_LOCAL_WRITE | - IB_ACCESS_REMOTE_WRITE | - IB_ACCESS_REMOTE_READ), - .pool_size = pool_size, - .dirty_watermark = (pool_size * 3)/4, - .flush_function = NULL, - .flush_arg = NULL, - .cache = 1, - }; - rc = ib_fmr_pool_create(kibnal_data.kib_pd, ¶ms, - &kibnal_data.kib_fmr_pool); - if (rc != 0) { - CERROR ("Can't create FMR pool size %d: %d\n", - pool_size, rc); - goto failed; - } - } - - /* flag FMR pool 
initialised */ - kibnal_data.kib_init = IBNAL_INIT_FMR; -#endif - /*****************************************************/ - - rc = kibnal_setup_tx_descs(); - if (rc != 0) { - CERROR ("Can't register tx descs: %d\n", rc); - goto failed; - } - - /* flag TX descs initialised */ - kibnal_data.kib_init = IBNAL_INIT_TXD; - /*****************************************************/ - - { - struct ib_cq_callback callback = { - .context = IBNAL_CALLBACK_CTXT, - .policy = IB_CQ_PROVIDER_REARM, - .function = { - .entry = kibnal_callback, - }, - .arg = NULL, - }; - int nentries = IBNAL_CQ_ENTRIES(); - - rc = ib_cq_create (kibnal_data.kib_device, - &nentries, &callback, NULL, - &kibnal_data.kib_cq); - if (rc != 0) { - CERROR ("Can't create CQ: %d\n", rc); - goto failed; - } - - /* I only want solicited events */ - rc = ib_cq_request_notification(kibnal_data.kib_cq, 1); - LASSERT (rc == 0); - } - - /* flag CQ initialised */ - kibnal_data.kib_init = IBNAL_INIT_CQ; - /*****************************************************/ - - rc = kibnal_start_ib_listener(); - if (rc != 0) - goto failed; - - /* flag everything initialised */ - kibnal_data.kib_init = IBNAL_INIT_ALL; - /*****************************************************/ - - return 0; - - failed: - kibnal_shutdown(ni); - return -ENETDOWN; -} - -void __exit -kibnal_module_fini (void) -{ - lnet_unregister_lnd(&the_kiblnd); - kibnal_tunables_fini(); -} - -int __init -kibnal_module_init (void) -{ - int rc; - - rc = kibnal_tunables_init(); - if (rc != 0) - return rc; - - lnet_register_lnd(&the_kiblnd); - - return (0); -} - -MODULE_AUTHOR("Sun Microsystems, Inc. "); -#ifdef USING_TSAPI -MODULE_DESCRIPTION("Kernel Cisco IB LND v1.00"); -#else -MODULE_DESCRIPTION("Kernel OpenIB(gen1) LND v1.00"); -#endif -MODULE_LICENSE("GPL"); - -module_init(kibnal_module_init); -module_exit(kibnal_module_fini); diff --git a/lnet/klnds/openiblnd/openiblnd.h b/lnet/klnds/openiblnd/openiblnd.h deleted file mode 100644 index 24db148..0000000 --- a/lnet/klnds/openiblnd/openiblnd.h +++ /dev/null @@ -1,706 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf - * - * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, - * CA 95054 USA or visit www.sun.com if you need additional information or - * have any questions. - * - * GPL HEADER END - */ -/* - * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. 
- * - * lnet/klnds/openiblnd/openiblnd.h - * - * Author: Eric Barton - */ - -#ifndef EXPORT_SYMTAB -# define EXPORT_SYMTAB -#endif -#ifndef AUTOCONF_INCLUDED -#include -#endif -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#define DEBUG_SUBSYSTEM S_LND - -#include -#include -#include -#include - -#include -#include -#include - -#ifndef USING_TSAPI - -/* OpenIB Gen1 */ -typedef struct ib_qp ib_qp_t; -typedef struct ib_mr ib_mr_t; -typedef struct ib_fmr ib_fmr_t; -typedef struct ib_pd ib_pd_t; -typedef struct ib_cq ib_cq_t; -typedef struct ib_fmr_pool ib_fmr_pool_t; - -#else - -/* Cisco (topspin) */ -typedef void ib_qp_t; -typedef void ib_mr_t; -typedef void ib_fmr_t; -typedef void ib_pd_t; -typedef void ib_cq_t; -typedef void ib_fmr_pool_t; - -#define IB_ACCESS_LOCAL_WRITE TS_IB_ACCESS_LOCAL_WRITE -#define IB_WQ_SIGNAL_SELECTABLE TS_IB_ACCESS_LOCAL_WRITE -#define IB_TRANSPORT_RC TS_IB_TRANSPORT_RC -#define IB_QP_STATE_INIT TS_IB_QP_STATE_INIT -#define IB_QP_ATTRIBUTE_STATE TS_IB_QP_ATTRIBUTE_STATE -#define IB_QP_ATTRIBUTE_PORT TS_IB_QP_ATTRIBUTE_PORT -#define IB_QP_ATTRIBUTE_PKEY_INDEX TS_IB_QP_ATTRIBUTE_PKEY_INDEX -#define IB_QP_ATTRIBUTE_RDMA_ATOMIC_ENABLE TS_IB_QP_ATTRIBUTE_RDMA_ATOMIC_ENABLE -#define IB_ACCESS_LOCAL_WRITE TS_IB_ACCESS_LOCAL_WRITE -#define IB_ACCESS_REMOTE_WRITE TS_IB_ACCESS_REMOTE_WRITE -#define IB_ACCESS_REMOTE_READ TS_IB_ACCESS_REMOTE_READ -#define IB_CQ_CALLBACK_INTERRU TS_IB_CQ_CALLBACK_INTERRUPTPT -#define IB_CQ_PROVIDER_REARM TS_IB_CQ_PROVIDER_REARM -#define IB_CQ_CALLBACK_INTERRUPT TS_IB_CQ_CALLBACK_INTERRUPT -#define IB_COMPLETION_STATUS_SUCCESS TS_IB_COMPLETION_STATUS_SUCCESS -#define IB_OP_SEND TS_IB_OP_SEND -#define IB_OP_RDMA_WRITE TS_IB_OP_RDMA_WRITE -#define IB_OP_RDMA_READ TS_IB_OP_RDMA_READ - -#endif - -#ifdef CONFIG_SMP -# define IBNAL_N_SCHED num_online_cpus() /* # schedulers */ -#else -# define IBNAL_N_SCHED 1 /* # schedulers */ -#endif - -#define IBNAL_FMR 1 -//#define IBNAL_CALLBACK_CTXT IB_CQ_CALLBACK_PROCESS -#define IBNAL_CALLBACK_CTXT IB_CQ_CALLBACK_INTERRUPT - - -/* tunables fixed at compile time */ -#define IBNAL_PEER_HASH_SIZE 101 /* # peer lists */ -#define IBNAL_RESCHED 100 /* # scheduler loops before reschedule */ -#define IBNAL_MSG_QUEUE_SIZE 8 /* # messages/RDMAs in-flight */ -#define IBNAL_CREDIT_HIGHWATER 6 /* when to eagerly return credits */ -#define IBNAL_MSG_SIZE (4<<10) /* max size of queued messages (inc hdr) */ -#define IBNAL_RDMA_BASE 0x0eeb0000 - -/* QP tunables */ -#define IBNAL_RETRY 7 /* # times to retry */ -#define IBNAL_RNR_RETRY 7 /* */ -#define IBNAL_CM_RETRY 7 /* # times to retry connection */ -#define IBNAL_FLOW_CONTROL 1 -#define IBNAL_RESPONDER_RESOURCES 8 - -/************************/ -/* derived constants... 
*/ - -/* TX messages (shared by all connections) */ -#define IBNAL_TX_MSGS() (*kibnal_tunables.kib_ntx) -#define IBNAL_TX_MSG_BYTES() (IBNAL_TX_MSGS() * IBNAL_MSG_SIZE) -#define IBNAL_TX_MSG_PAGES() ((IBNAL_TX_MSG_BYTES() + PAGE_SIZE - 1)/PAGE_SIZE) - -/* RX messages (per connection) */ -#define IBNAL_RX_MSGS (IBNAL_MSG_QUEUE_SIZE * 2) -#define IBNAL_RX_MSG_BYTES (IBNAL_RX_MSGS * IBNAL_MSG_SIZE) -#define IBNAL_RX_MSG_PAGES ((IBNAL_RX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE) - -/* we may have up to 2 completions per transmit + - 1 completion per receive, per connection */ -#define IBNAL_CQ_ENTRIES() ((2*IBNAL_TX_MSGS()) + \ - (IBNAL_RX_MSGS * *kibnal_tunables.kib_concurrent_peers)) - -typedef struct -{ - char **kib_ipif_basename; /* IPoIB interface base name */ - int *kib_n_connd; /* # connection daemons */ - int *kib_min_reconnect_interval; /* min connect retry seconds... */ - int *kib_max_reconnect_interval; /* max connect retry seconds */ - int *kib_concurrent_peers; /* max # peers */ - int *kib_cksum; /* checksum kib_msg_t? */ - int *kib_timeout; /* comms timeout (seconds) */ - int *kib_keepalive; /* keepalive (seconds) */ - int *kib_ntx; /* # tx descs */ - int *kib_credits; /* # concurrent sends */ - int *kib_peercredits; /* # concurrent sends to 1 peer */ - - cfs_sysctl_table_header_t *kib_sysctl; /* sysctl interface */ -} kib_tunables_t; - -typedef struct -{ - int ibp_npages; /* # pages */ - int ibp_mapped; /* mapped? */ - __u64 ibp_vaddr; /* mapped region vaddr */ - __u32 ibp_lkey; /* mapped region lkey */ - __u32 ibp_rkey; /* mapped region rkey */ - ib_mr_t *ibp_handle; /* mapped region handle */ - struct page *ibp_pages[0]; -} kib_pages_t; - -typedef struct -{ - int kib_init; /* initialisation state */ - __u64 kib_incarnation; /* which one am I */ - int kib_shutdown; /* shut down? */ - atomic_t kib_nthreads; /* # live threads */ - lnet_ni_t *kib_ni; /* _the_ openib interface */ - - __u64 kib_svc_id; /* service number I listen on */ - tTS_IB_GID kib_svc_gid; /* device/port GID */ - __u16 kib_svc_pkey; /* device/port pkey */ - - void *kib_listen_handle; /* IB listen handle */ - - rwlock_t kib_global_lock; /* stabilize peer/conn ops */ - - struct list_head *kib_peers; /* hash table of all my known peers */ - int kib_peer_hash_size; /* size of kib_peers */ - int kib_nonewpeers; /* prevent new peers? 
*/ - atomic_t kib_npeers; /* # peers extant */ - atomic_t kib_nconns; /* # connections extant */ - - struct list_head kib_reaper_conns; /* connections to reap */ - wait_queue_head_t kib_reaper_waitq; /* reaper sleeps here */ - unsigned long kib_reaper_waketime; /* when reaper will wake */ - spinlock_t kib_reaper_lock; /* serialise */ - - struct list_head kib_connd_peers; /* peers waiting for a connection */ - struct list_head kib_connd_acceptq; /* accepted sockets to handle */ - wait_queue_head_t kib_connd_waitq; /* connection daemons sleep here */ - int kib_connd_connecting; /* # connds connecting */ - spinlock_t kib_connd_lock; /* serialise */ - - wait_queue_head_t kib_sched_waitq; /* schedulers sleep here */ - struct list_head kib_sched_txq; /* tx requiring attention */ - struct list_head kib_sched_rxq; /* rx requiring attention */ - spinlock_t kib_sched_lock; /* serialise */ - - struct kib_tx *kib_tx_descs; /* all the tx descriptors */ - kib_pages_t *kib_tx_pages; /* premapped tx msg pages */ - - struct list_head kib_idle_txs; /* idle tx descriptors */ - __u64 kib_next_tx_cookie; /* RDMA completion cookie */ - spinlock_t kib_tx_lock; /* serialise */ - - int kib_hca_idx; /* my HCA number */ - struct ib_device *kib_device; /* "the" device */ - struct ib_device_properties kib_device_props; /* its properties */ - int kib_port; /* port on the device */ - struct ib_port_properties kib_port_props; /* its properties */ - ib_pd_t *kib_pd; /* protection domain */ -#if IBNAL_FMR - ib_fmr_pool_t *kib_fmr_pool; /* fast memory region pool */ -#endif - ib_cq_t *kib_cq; /* completion queue */ - -} kib_data_t; - -#define IBNAL_INIT_NOTHING 0 -#define IBNAL_INIT_DATA 1 -#define IBNAL_INIT_LIB 2 -#define IBNAL_INIT_PD 3 -#define IBNAL_INIT_FMR 4 -#define IBNAL_INIT_TXD 5 -#define IBNAL_INIT_CQ 6 -#define IBNAL_INIT_ALL 7 - -typedef struct kib_acceptsock /* accepted socket queued for connd */ -{ - struct list_head ibas_list; /* queue for attention */ - struct socket *ibas_sock; /* the accepted socket */ -} kib_acceptsock_t; - -/************************************************************************ - * IB Wire message format. - * These are sent in sender's byte order (i.e. receiver flips). - * They may be sent via TCP/IP (service ID,GID,PKEY query/response), - * as private data in the connection request/response, or "normally". 
- */ - -typedef struct kib_svcrsp /* service response */ -{ - __u64 ibsr_svc_id; /* service's id */ - __u8 ibsr_svc_gid[16]; /* service's gid */ - __u16 ibsr_svc_pkey; /* service's pkey */ -} WIRE_ATTR kib_svcrsp_t; - -typedef struct kib_connparams -{ - __u32 ibcp_queue_depth; -} WIRE_ATTR kib_connparams_t; - -typedef struct -{ - union { - ib_mr_t *mr; - ib_fmr_t *fmr; - } md_handle; - __u32 md_lkey; - __u32 md_rkey; - __u64 md_addr; -} kib_md_t; - -typedef struct -{ - __u32 rd_key; /* remote key */ - __u32 rd_nob; /* # of bytes */ - __u64 rd_addr; /* remote io vaddr */ -} WIRE_ATTR kib_rdma_desc_t; - -typedef struct -{ - lnet_hdr_t ibim_hdr; /* portals header */ - char ibim_payload[0]; /* piggy-backed payload */ -} WIRE_ATTR kib_immediate_msg_t; - -typedef struct -{ - lnet_hdr_t ibrm_hdr; /* portals header */ - __u64 ibrm_cookie; /* opaque completion cookie */ - kib_rdma_desc_t ibrm_desc; /* where to suck/blow */ -} WIRE_ATTR kib_rdma_msg_t; - -typedef struct -{ - __u64 ibcm_cookie; /* opaque completion cookie */ - __u32 ibcm_status; /* completion status */ -} WIRE_ATTR kib_completion_msg_t; - -typedef struct -{ - /* First 2 fields fixed FOR ALL TIME */ - __u32 ibm_magic; /* I'm an openibnal message */ - __u16 ibm_version; /* this is my version number */ - - __u8 ibm_type; /* msg type */ - __u8 ibm_credits; /* returned credits */ - __u32 ibm_nob; /* # bytes in whole message */ - __u32 ibm_cksum; /* checksum (0 == no checksum) */ - __u64 ibm_srcnid; /* sender's NID */ - __u64 ibm_srcstamp; /* sender's incarnation */ - __u64 ibm_dstnid; /* destination's NID */ - __u64 ibm_dststamp; /* destination's incarnation */ - union { - kib_svcrsp_t svcrsp; - kib_connparams_t connparams; - kib_immediate_msg_t immediate; - kib_rdma_msg_t rdma; - kib_completion_msg_t completion; - } WIRE_ATTR ibm_u; -} WIRE_ATTR kib_msg_t; - -#define IBNAL_MSG_MAGIC LNET_PROTO_OPENIB_MAGIC /* unique magic */ -#define IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD 2 /* previous protocol version */ -#define IBNAL_MSG_VERSION 3 /* current protocol version */ - -#define IBNAL_MSG_SVCQRY 0xb0 /* service query */ -#define IBNAL_MSG_SVCRSP 0xb1 /* service response */ -#define IBNAL_MSG_CONNREQ 0xc0 /* connection request */ -#define IBNAL_MSG_CONNACK 0xc1 /* connection acknowledge */ -#define IBNAL_MSG_NOOP 0xd0 /* nothing (just credits) */ -#define IBNAL_MSG_IMMEDIATE 0xd1 /* portals hdr + payload */ -#define IBNAL_MSG_PUT_RDMA 0xd2 /* portals PUT hdr + source rdma desc */ -#define IBNAL_MSG_PUT_DONE 0xd3 /* signal PUT rdma completion */ -#define IBNAL_MSG_GET_RDMA 0xd4 /* portals GET hdr + sink rdma desc */ -#define IBNAL_MSG_GET_DONE 0xd5 /* signal GET rdma completion */ - -/***********************************************************************/ - -typedef struct kib_rx /* receive message */ -{ - struct list_head rx_list; /* queue for attention */ - struct kib_conn *rx_conn; /* owning conn */ - int rx_nob; /* # bytes received (-1 while posted) */ - __u64 rx_vaddr; /* pre-mapped buffer (hca vaddr) */ - kib_msg_t *rx_msg; /* pre-mapped buffer (host vaddr) */ - struct ib_receive_param rx_sp; /* receive work item */ - struct ib_gather_scatter rx_gl; /* and it's memory */ -} kib_rx_t; - -typedef struct kib_tx /* transmit message */ -{ - struct list_head tx_list; /* queue on idle_txs ibc_tx_queue etc. */ - struct kib_conn *tx_conn; /* owning conn */ - int tx_mapped; /* mapped for RDMA? 
*/ - int tx_sending; /* # tx callbacks outstanding */ - int tx_status; /* completion status */ - unsigned long tx_deadline; /* completion deadline */ - int tx_passive_rdma; /* peer sucks/blows */ - int tx_passive_rdma_wait; /* waiting for peer to complete */ - __u64 tx_passive_rdma_cookie; /* completion cookie */ - lnet_msg_t *tx_lntmsg[2]; /* ptl msgs to finalize on completion */ - kib_md_t tx_md; /* RDMA mapping (active/passive) */ - __u64 tx_vaddr; /* pre-mapped buffer (hca vaddr) */ - kib_msg_t *tx_msg; /* pre-mapped buffer (host vaddr) */ - int tx_nsp; /* # send work items */ - struct ib_send_param tx_sp[2]; /* send work items... */ - struct ib_gather_scatter tx_gl[2]; /* ...and their memory */ -} kib_tx_t; - -#define KIB_TX_UNMAPPED 0 -#define KIB_TX_MAPPED 1 -#define KIB_TX_MAPPED_FMR 2 - -typedef struct kib_connreq -{ - /* active connection-in-progress state */ - struct kib_conn *cr_conn; - kib_msg_t cr_msg; - __u64 cr_tid; - tTS_IB_GID cr_gid; - kib_svcrsp_t cr_svcrsp; - struct ib_path_record cr_path; - struct ib_cm_active_param cr_connparam; -} kib_connreq_t; - -typedef struct kib_conn -{ - struct kib_peer *ibc_peer; /* owning peer */ - struct list_head ibc_list; /* stash on peer's conn list */ - __u64 ibc_incarnation; /* which instance of the peer */ - int ibc_version; /* peer protocol version */ - atomic_t ibc_refcount; /* # users */ - int ibc_state; /* what's happening */ - int ibc_nsends_posted; /* # uncompleted sends */ - int ibc_credits; /* # credits I have */ - int ibc_outstanding_credits; /* # credits to return */ - int ibc_reserved_credits; /* # credits for ACK/DONE msgs */ - unsigned long ibc_last_send; /* time of last send */ - struct list_head ibc_tx_queue_nocred; /* sends that don't need a credit */ - struct list_head ibc_tx_queue_rsrvd; /* sends that need a reserved cred */ - struct list_head ibc_tx_queue; /* send queue */ - struct list_head ibc_active_txs; /* active tx awaiting completion */ - spinlock_t ibc_lock; /* serialise */ - kib_rx_t *ibc_rxs; /* the rx descs */ - kib_pages_t *ibc_rx_pages; /* premapped rx msg pages */ - ib_qp_t *ibc_qp; /* queue pair */ - __u32 ibc_qpn; /* queue pair number */ - tTS_IB_CM_COMM_ID ibc_comm_id; /* connection ID? 
*/ - kib_connreq_t *ibc_connreq; /* connection request state */ -} kib_conn_t; - -#define IBNAL_CONN_INIT_NOTHING 0 /* initial state */ -#define IBNAL_CONN_INIT_QP 1 /* ibc_qp set up */ -#define IBNAL_CONN_CONNECTING 2 /* started to connect */ -#define IBNAL_CONN_ESTABLISHED 3 /* connection established */ -#define IBNAL_CONN_DEATHROW 4 /* waiting to be closed */ -#define IBNAL_CONN_ZOMBIE 5 /* waiting to be freed */ - -typedef struct kib_peer -{ - struct list_head ibp_list; /* stash on global peer list */ - struct list_head ibp_connd_list; /* schedule on kib_connd_peers */ - lnet_nid_t ibp_nid; /* who's on the other end(s) */ - __u32 ibp_ip; /* IP to query for peer conn params */ - int ibp_port; /* port to qery for peer conn params */ - __u64 ibp_incarnation; /* peer's incarnation */ - atomic_t ibp_refcount; /* # users */ - int ibp_persistence; /* "known" peer refs */ - struct list_head ibp_conns; /* all active connections */ - struct list_head ibp_tx_queue; /* msgs waiting for a conn */ - int ibp_connecting; /* current active connection attempts */ - int ibp_accepting; /* current passive connection attempts */ - unsigned long ibp_reconnect_time; /* when reconnect may be attempted */ - unsigned long ibp_reconnect_interval; /* exponential backoff */ - int ibp_error; /* errno on closing this peer */ - cfs_time_t ibp_last_alive; /* when (in jiffies) I was last alive */ -} kib_peer_t; - -extern kib_data_t kibnal_data; -extern kib_tunables_t kibnal_tunables; - -/******************************************************************************/ - -/* these are purposely avoiding using local vars so they don't increase - * stack consumption. */ - -#define kibnal_conn_addref(conn) \ -do { \ - CDEBUG(D_NET, "conn[%p] (%d)++\n", \ - (conn), atomic_read(&(conn)->ibc_refcount)); \ - LASSERT(atomic_read(&(conn)->ibc_refcount) > 0); \ - atomic_inc(&(conn)->ibc_refcount); \ -} while (0) - -#define kibnal_conn_decref(conn) \ -do { \ - unsigned long flags; \ - \ - CDEBUG(D_NET, "conn[%p] (%d)--\n", \ - (conn), atomic_read(&(conn)->ibc_refcount)); \ - LASSERT(atomic_read(&(conn)->ibc_refcount) > 0); \ - if (atomic_dec_and_test(&(conn)->ibc_refcount)) { \ - spin_lock_irqsave(&kibnal_data.kib_reaper_lock, flags); \ - list_add_tail(&(conn)->ibc_list, \ - &kibnal_data.kib_reaper_conns); \ - wake_up(&kibnal_data.kib_reaper_waitq); \ - spin_unlock_irqrestore(&kibnal_data.kib_reaper_lock, flags); \ - } \ -} while (0) - -#define kibnal_peer_addref(peer) \ -do { \ - CDEBUG(D_NET, "peer[%p] -> %s (%d)++\n", \ - (peer), libcfs_nid2str((peer)->ibp_nid), \ - atomic_read (&(peer)->ibp_refcount)); \ - LASSERT(atomic_read(&(peer)->ibp_refcount) > 0); \ - atomic_inc(&(peer)->ibp_refcount); \ -} while (0) - -#define kibnal_peer_decref(peer) \ -do { \ - CDEBUG(D_NET, "peer[%p] -> %s (%d)--\n", \ - (peer), libcfs_nid2str((peer)->ibp_nid), \ - atomic_read (&(peer)->ibp_refcount)); \ - LASSERT(atomic_read(&(peer)->ibp_refcount) > 0); \ - if (atomic_dec_and_test(&(peer)->ibp_refcount)) \ - kibnal_destroy_peer(peer); \ -} while (0) - -/******************************************************************************/ - -static inline struct list_head * -kibnal_nid2peerlist (lnet_nid_t nid) -{ - unsigned int hash = ((unsigned int)nid) % kibnal_data.kib_peer_hash_size; - - return (&kibnal_data.kib_peers [hash]); -} - -static inline int -kibnal_peer_active(kib_peer_t *peer) -{ - /* Am I in the peer hash table? 
*/ - return (!list_empty(&peer->ibp_list)); -} - -static inline void -kibnal_queue_tx_locked (kib_tx_t *tx, kib_conn_t *conn) -{ - struct list_head *q; - - LASSERT (tx->tx_nsp > 0); /* work items set up */ - LASSERT (tx->tx_conn == NULL); /* only set here */ - - kibnal_conn_addref(conn); - tx->tx_conn = conn; - tx->tx_deadline = jiffies + *kibnal_tunables.kib_timeout * HZ; - - if (conn->ibc_version == IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD) { - /* All messages have simple credit control */ - q = &conn->ibc_tx_queue; - } else { - LASSERT (conn->ibc_version == IBNAL_MSG_VERSION); - - switch (tx->tx_msg->ibm_type) { - case IBNAL_MSG_PUT_RDMA: - case IBNAL_MSG_GET_RDMA: - /* RDMA request: reserve a buffer for the RDMA reply - * before sending */ - q = &conn->ibc_tx_queue_rsrvd; - break; - - case IBNAL_MSG_PUT_DONE: - case IBNAL_MSG_GET_DONE: - /* RDMA completion: no credits; peer has reserved a - * reply buffer */ - q = &conn->ibc_tx_queue_nocred; - break; - - case IBNAL_MSG_NOOP: - case IBNAL_MSG_IMMEDIATE: - /* Otherwise: consume a credit before sending */ - q = &conn->ibc_tx_queue; - break; - - default: - LBUG(); - q = NULL; - } - } - - list_add_tail(&tx->tx_list, q); -} - -static inline int -kibnal_send_keepalive(kib_conn_t *conn) -{ - return (*kibnal_tunables.kib_keepalive > 0) && - time_after(jiffies, conn->ibc_last_send + - *kibnal_tunables.kib_keepalive*HZ); -} - -/* CAVEAT EMPTOR: - * We rely on tx/rx descriptor alignment to allow us to use the lowest bit - * of the work request id as a flag to determine if the completion is for a - * transmit or a receive. It seems that that the CQ entry's 'op' field - * isn't always set correctly on completions that occur after QP teardown. */ - -static inline __u64 -kibnal_ptr2wreqid (void *ptr, int isrx) -{ - unsigned long lptr = (unsigned long)ptr; - - LASSERT ((lptr & 1) == 0); - return (__u64)(lptr | (isrx ? 
1 : 0)); -} - -static inline void * -kibnal_wreqid2ptr (__u64 wreqid) -{ - return (void *)(((unsigned long)wreqid) & ~1UL); -} - -static inline int -kibnal_wreqid_is_rx (__u64 wreqid) -{ - return (wreqid & 1) != 0; -} - -#if (IB_NTXRXPARAMS == 3) -static inline int -kibnal_ib_send(ib_qp_t *qp, struct ib_send_param *p) -{ - return ib_send(qp, p, 1); -} - -static inline int -kibnal_ib_receive(ib_qp_t *qp, struct ib_receive_param *p) -{ - return ib_receive(qp, p, 1); -} -#elif (IB_NTXRXPARAMS == 4) -static inline int -kibnal_ib_send(ib_qp_t *qp, struct ib_send_param *p) -{ - return ib_send(qp, p, 1, NULL); -} - -static inline int -kibnal_ib_receive(ib_qp_t *qp, struct ib_receive_param *p) -{ - return ib_receive(qp, p, 1, NULL); -} -#else - #error "IB_NTXRXPARAMS not set correctly" -#endif - -int kibnal_startup (lnet_ni_t *ni); -void kibnal_shutdown (lnet_ni_t *ni); -int kibnal_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg); -int kibnal_send (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg); -int kibnal_eager_recv (lnet_ni_t *ni, void *private, - lnet_msg_t *lntmsg, void **new_private); -int kibnal_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, - int delayed, unsigned int niov, - struct iovec *iov, lnet_kiov_t *kiov, - unsigned int offset, unsigned int mlen, unsigned int rlen); -int kibnal_accept(lnet_ni_t *ni, struct socket *sock); - -extern void kibnal_init_msg(kib_msg_t *msg, int type, int body_nob); -extern void kibnal_pack_msg(kib_msg_t *msg, int version, int credits, - lnet_nid_t dstnid, __u64 dststamp); -extern int kibnal_unpack_msg(kib_msg_t *msg, int expected_version, int nob); -extern void kibnal_handle_svcqry (struct socket *sock); -extern int kibnal_make_svcqry (kib_conn_t *conn); -extern void kibnal_free_acceptsock (kib_acceptsock_t *as); -extern int kibnal_create_peer (kib_peer_t **peerp, lnet_nid_t nid); -extern void kibnal_destroy_peer (kib_peer_t *peer); -extern int kibnal_add_persistent_peer(lnet_nid_t nid, __u32 ip, int port); -extern int kibnal_del_peer (lnet_nid_t nid); -extern kib_peer_t *kibnal_find_peer_locked (lnet_nid_t nid); -extern void kibnal_unlink_peer_locked (kib_peer_t *peer); -extern void kibnal_peer_alive(kib_peer_t *peer); -extern int kibnal_close_stale_conns_locked (kib_peer_t *peer, - __u64 incarnation); -extern kib_conn_t *kibnal_create_conn (void); -extern void kibnal_destroy_conn (kib_conn_t *conn); -extern int kibnal_alloc_pages (kib_pages_t **pp, int npages, int access); -extern void kibnal_free_pages (kib_pages_t *p); - -extern void kibnal_check_sends (kib_conn_t *conn); - -extern tTS_IB_CM_CALLBACK_RETURN -kibnal_bad_conn_callback (tTS_IB_CM_EVENT event, tTS_IB_CM_COMM_ID cid, - void *param, void *arg); -extern tTS_IB_CM_CALLBACK_RETURN -kibnal_conn_callback (tTS_IB_CM_EVENT event, tTS_IB_CM_COMM_ID cid, - void *param, void *arg); -extern tTS_IB_CM_CALLBACK_RETURN -kibnal_passive_conn_callback (tTS_IB_CM_EVENT event, tTS_IB_CM_COMM_ID cid, - void *param, void *arg); - -extern void kibnal_close_conn_locked (kib_conn_t *conn, int error); -extern void kibnal_destroy_conn (kib_conn_t *conn); -extern int kibnal_thread_start (int (*fn)(void *arg), void *arg); -extern int kibnal_scheduler(void *arg); -extern int kibnal_connd (void *arg); -extern int kibnal_reaper (void *arg); -extern void kibnal_callback (ib_cq_t *cq, struct ib_cq_entry *e, void *arg); -extern void kibnal_txlist_done (struct list_head *txlist, int status); -extern void kibnal_init_tx_msg (kib_tx_t *tx, int type, int body_nob); -extern int kibnal_close_conn (kib_conn_t *conn, int 
why); -extern void kibnal_start_active_rdma (int type, int status, - kib_rx_t *rx, lnet_msg_t *lntmsg, - unsigned int niov, - struct iovec *iov, lnet_kiov_t *kiov, - int offset, int nob); - -extern int kibnal_tunables_init(void); -extern void kibnal_tunables_fini(void); diff --git a/lnet/klnds/openiblnd/openiblnd_cb.c b/lnet/klnds/openiblnd/openiblnd_cb.c deleted file mode 100644 index 61d401d..0000000 --- a/lnet/klnds/openiblnd/openiblnd_cb.c +++ /dev/null @@ -1,2624 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf - * - * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, - * CA 95054 USA or visit www.sun.com if you need additional information or - * have any questions. - * - * GPL HEADER END - */ -/* - * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lnet/klnds/openiblnd/openiblnd_cb.c - * - * Author: Eric Barton - */ - -#include "openiblnd.h" - -/* - * LIB functions follow - * - */ -void -kibnal_schedule_tx_done (kib_tx_t *tx) -{ - unsigned long flags; - - spin_lock_irqsave (&kibnal_data.kib_sched_lock, flags); - - list_add_tail(&tx->tx_list, &kibnal_data.kib_sched_txq); - wake_up (&kibnal_data.kib_sched_waitq); - - spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags); -} - -void -kibnal_tx_done (kib_tx_t *tx) -{ - lnet_msg_t *lntmsg[2]; - unsigned long flags; - int i; - int rc; - - LASSERT (tx->tx_sending == 0); /* mustn't be awaiting callback */ - LASSERT (!tx->tx_passive_rdma_wait); /* mustn't be awaiting RDMA */ - - if (in_interrupt()) { - /* can't deregister memory/flush FMAs/finalize in IRQ context... */ - kibnal_schedule_tx_done(tx); - return; - } - - switch (tx->tx_mapped) { - default: - LBUG(); - - case KIB_TX_UNMAPPED: - break; - - case KIB_TX_MAPPED: - rc = ib_memory_deregister(tx->tx_md.md_handle.mr); - LASSERT (rc == 0); - tx->tx_mapped = KIB_TX_UNMAPPED; - break; - -#if IBNAL_FMR - case KIB_TX_MAPPED_FMR: - rc = ib_fmr_deregister(tx->tx_md.md_handle.fmr); - LASSERT (rc == 0); - -#ifndef USING_TSAPI - /* Somewhat belt-and-braces since the tx's conn has closed if - * this was a passive RDMA waiting to complete... 
*/ - if (tx->tx_status != 0) - ib_fmr_pool_force_flush(kibnal_data.kib_fmr_pool); -#endif - tx->tx_mapped = KIB_TX_UNMAPPED; - break; -#endif - } - - /* tx may have up to 2 ptlmsgs to finalise */ - lntmsg[0] = tx->tx_lntmsg[0]; tx->tx_lntmsg[0] = NULL; - lntmsg[1] = tx->tx_lntmsg[1]; tx->tx_lntmsg[1] = NULL; - rc = tx->tx_status; - - if (tx->tx_conn != NULL) { - kibnal_conn_decref(tx->tx_conn); - tx->tx_conn = NULL; - } - - tx->tx_nsp = 0; - tx->tx_passive_rdma = 0; - tx->tx_status = 0; - - spin_lock_irqsave (&kibnal_data.kib_tx_lock, flags); - - list_add_tail (&tx->tx_list, &kibnal_data.kib_idle_txs); - - spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags); - - /* delay finalize until my descs have been freed */ - for (i = 0; i < 2; i++) { - if (lntmsg[i] == NULL) - continue; - - lnet_finalize (kibnal_data.kib_ni, lntmsg[i], rc); - } -} - -kib_tx_t * -kibnal_get_idle_tx (void) -{ - unsigned long flags; - kib_tx_t *tx; - - spin_lock_irqsave (&kibnal_data.kib_tx_lock, flags); - - if (list_empty (&kibnal_data.kib_idle_txs)) { - spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags); - return NULL; - } - - tx = list_entry (kibnal_data.kib_idle_txs.next, kib_tx_t, tx_list); - list_del (&tx->tx_list); - - /* Allocate a new passive RDMA completion cookie. It might not be - * needed, but we've got a lock right now and we're unlikely to - * wrap... */ - tx->tx_passive_rdma_cookie = kibnal_data.kib_next_tx_cookie++; - - spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags); - - LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED); - LASSERT (tx->tx_nsp == 0); - LASSERT (tx->tx_sending == 0); - LASSERT (tx->tx_status == 0); - LASSERT (tx->tx_conn == NULL); - LASSERT (!tx->tx_passive_rdma); - LASSERT (!tx->tx_passive_rdma_wait); - LASSERT (tx->tx_lntmsg[0] == NULL); - LASSERT (tx->tx_lntmsg[1] == NULL); - - return tx; -} - -void -kibnal_complete_passive_rdma(kib_conn_t *conn, __u64 cookie, int status) -{ - struct list_head *ttmp; - unsigned long flags; - int idle; - - spin_lock_irqsave (&conn->ibc_lock, flags); - - list_for_each (ttmp, &conn->ibc_active_txs) { - kib_tx_t *tx = list_entry(ttmp, kib_tx_t, tx_list); - - LASSERT (tx->tx_passive_rdma || - !tx->tx_passive_rdma_wait); - - LASSERT (tx->tx_passive_rdma_wait || - tx->tx_sending != 0); - - if (!tx->tx_passive_rdma_wait || - tx->tx_passive_rdma_cookie != cookie) - continue; - - CDEBUG(D_NET, "Complete %p "LPD64": %d\n", tx, cookie, status); - - /* XXX Set mlength of reply here */ - - tx->tx_status = status; - tx->tx_passive_rdma_wait = 0; - idle = (tx->tx_sending == 0); - - if (idle) - list_del (&tx->tx_list); - - spin_unlock_irqrestore (&conn->ibc_lock, flags); - - /* I could be racing with tx callbacks. It's whoever - * _makes_ tx idle that frees it */ - if (idle) - kibnal_tx_done (tx); - return; - } - - spin_unlock_irqrestore (&conn->ibc_lock, flags); - - CERROR ("Unmatched (late?) 
RDMA completion "LPX64" from %s\n", - cookie, libcfs_nid2str(conn->ibc_peer->ibp_nid)); -} - -void -kibnal_post_rx (kib_rx_t *rx, int credit, int rsrvd_credit) -{ - kib_conn_t *conn = rx->rx_conn; - int rc; - unsigned long flags; - - LASSERT(!rsrvd_credit || - conn->ibc_version != IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD); - - rx->rx_gl = (struct ib_gather_scatter) { - .address = rx->rx_vaddr, - .length = IBNAL_MSG_SIZE, - .key = conn->ibc_rx_pages->ibp_lkey, - }; - - rx->rx_sp = (struct ib_receive_param) { - .work_request_id = kibnal_ptr2wreqid(rx, 1), - .scatter_list = &rx->rx_gl, - .num_scatter_entries = 1, - .device_specific = NULL, - .signaled = 1, - }; - - LASSERT (conn->ibc_state >= IBNAL_CONN_ESTABLISHED); - LASSERT (rx->rx_nob >= 0); /* not posted */ - rx->rx_nob = -1; /* is now */ - mb(); - - if (conn->ibc_state != IBNAL_CONN_ESTABLISHED) - rc = -ECONNABORTED; - else - rc = kibnal_ib_receive(conn->ibc_qp, &rx->rx_sp); - - if (rc == 0) { - if (credit || rsrvd_credit) { - spin_lock_irqsave(&conn->ibc_lock, flags); - - if (credit) - conn->ibc_outstanding_credits++; - if (rsrvd_credit) - conn->ibc_reserved_credits++; - - spin_unlock_irqrestore(&conn->ibc_lock, flags); - - kibnal_check_sends(conn); - } - return; - } - - if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) { - CERROR ("Error posting receive -> %s: %d\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid), rc); - kibnal_close_conn (rx->rx_conn, rc); - } else { - CDEBUG (D_NET, "Error posting receive -> %s: %d\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid), rc); - } - - /* Drop rx's ref */ - kibnal_conn_decref(conn); -} - -void -kibnal_rx_callback (struct ib_cq_entry *e) -{ - kib_rx_t *rx = (kib_rx_t *)kibnal_wreqid2ptr(e->work_request_id); - kib_msg_t *msg = rx->rx_msg; - kib_conn_t *conn = rx->rx_conn; - int credits; - unsigned long flags; - int rc; - int err = -ECONNABORTED; - - CDEBUG (D_NET, "rx %p conn %p\n", rx, conn); - LASSERT (rx->rx_nob < 0); /* was posted */ - rx->rx_nob = 0; /* isn't now */ - mb(); - - /* receives complete with error in any case after we've started - * closing the QP */ - if (conn->ibc_state >= IBNAL_CONN_DEATHROW) - goto failed; - - /* We don't post receives until the conn is established */ - LASSERT (conn->ibc_state == IBNAL_CONN_ESTABLISHED); - - if (e->status != IB_COMPLETION_STATUS_SUCCESS) { - CERROR("Rx from %s failed: %d\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid), e->status); - goto failed; - } - - LASSERT (e->bytes_transferred >= 0); - rx->rx_nob = e->bytes_transferred; - mb(); - - rc = kibnal_unpack_msg(msg, conn->ibc_version, rx->rx_nob); - if (rc != 0) { - CERROR ("Error %d unpacking rx from %s\n", - rc, libcfs_nid2str(conn->ibc_peer->ibp_nid)); - goto failed; - } - - if (conn->ibc_peer->ibp_nid != msg->ibm_srcnid || - kibnal_data.kib_ni->ni_nid != msg->ibm_dstnid || - msg->ibm_srcstamp != conn->ibc_incarnation || - msg->ibm_dststamp != kibnal_data.kib_incarnation) { - CERROR ("Stale rx from %s\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid)); - err = -ESTALE; - goto failed; - } - - /* Have I received credits that will let me send? 
*/ - credits = msg->ibm_credits; - if (credits != 0) { - spin_lock_irqsave(&conn->ibc_lock, flags); - conn->ibc_credits += credits; - spin_unlock_irqrestore(&conn->ibc_lock, flags); - - kibnal_check_sends(conn); - } - - switch (msg->ibm_type) { - case IBNAL_MSG_NOOP: - kibnal_post_rx (rx, 1, 0); - return; - - case IBNAL_MSG_IMMEDIATE: - break; - - case IBNAL_MSG_PUT_RDMA: - case IBNAL_MSG_GET_RDMA: - CDEBUG(D_NET, "%d RDMA: cookie "LPX64", key %x, addr "LPX64", nob %d\n", - msg->ibm_type, msg->ibm_u.rdma.ibrm_cookie, - msg->ibm_u.rdma.ibrm_desc.rd_key, - msg->ibm_u.rdma.ibrm_desc.rd_addr, - msg->ibm_u.rdma.ibrm_desc.rd_nob); - break; - - case IBNAL_MSG_PUT_DONE: - case IBNAL_MSG_GET_DONE: - CDEBUG(D_NET, "%d DONE: cookie "LPX64", status %d\n", - msg->ibm_type, msg->ibm_u.completion.ibcm_cookie, - msg->ibm_u.completion.ibcm_status); - - kibnal_complete_passive_rdma (conn, - msg->ibm_u.completion.ibcm_cookie, - msg->ibm_u.completion.ibcm_status); - - if (conn->ibc_version == IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD) { - kibnal_post_rx (rx, 1, 0); - } else { - /* this reply buffer was pre-reserved */ - kibnal_post_rx (rx, 0, 1); - } - return; - - default: - CERROR ("Bad msg type %x from %s\n", - msg->ibm_type, libcfs_nid2str(conn->ibc_peer->ibp_nid)); - goto failed; - } - - kibnal_peer_alive(conn->ibc_peer); - - /* schedule for kibnal_rx() in thread context */ - spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags); - - list_add_tail (&rx->rx_list, &kibnal_data.kib_sched_rxq); - wake_up (&kibnal_data.kib_sched_waitq); - - spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags); - return; - - failed: - CDEBUG(D_NET, "rx %p conn %p\n", rx, conn); - kibnal_close_conn(conn, err); - - /* Don't re-post rx & drop its ref on conn */ - kibnal_conn_decref(conn); -} - -void -kibnal_rx (kib_rx_t *rx) -{ - int rc = 0; - kib_msg_t *msg = rx->rx_msg; - - switch (msg->ibm_type) { - case IBNAL_MSG_GET_RDMA: - rc = lnet_parse(kibnal_data.kib_ni, &msg->ibm_u.rdma.ibrm_hdr, - msg->ibm_srcnid, rx, 1); - break; - - case IBNAL_MSG_PUT_RDMA: - rc = lnet_parse(kibnal_data.kib_ni, &msg->ibm_u.rdma.ibrm_hdr, - msg->ibm_srcnid, rx, 1); - break; - - case IBNAL_MSG_IMMEDIATE: - rc = lnet_parse(kibnal_data.kib_ni, &msg->ibm_u.immediate.ibim_hdr, - msg->ibm_srcnid, rx, 0); - break; - - default: - LBUG(); - break; - } - - if (rc < 0) { - kibnal_close_conn(rx->rx_conn, rc); - kibnal_post_rx (rx, 1, 0); - } -} - -#if 0 -int -kibnal_kvaddr_to_phys (unsigned long vaddr, __u64 *physp) -{ - struct page *page; - - if (vaddr >= VMALLOC_START && - vaddr < VMALLOC_END) - page = vmalloc_to_page ((void *)vaddr); -#ifdef CONFIG_HIGHMEM - else if (vaddr >= PKMAP_BASE && - vaddr < (PKMAP_BASE + LAST_PKMAP * PAGE_SIZE)) - page = vmalloc_to_page ((void *)vaddr); - /* in 2.4 ^ just walks the page tables */ -#endif - else - page = virt_to_page (vaddr); - - if (page == NULL || - !VALID_PAGE (page)) - return (-EFAULT); - - *physp = lnet_page2phys(page) + (vaddr & (PAGE_SIZE - 1)); - return (0); -} -#endif - -int -kibnal_map_iov (kib_tx_t *tx, int access, - unsigned int niov, struct iovec *iov, int offset, int nob) - -{ - void *vaddr; - int rc; - - LASSERT (nob > 0); - LASSERT (niov > 0); - LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED); - - while (offset >= iov->iov_len) { - offset -= iov->iov_len; - niov--; - iov++; - LASSERT (niov > 0); - } - - if (nob > iov->iov_len - offset) { - CERROR ("Can't map multiple vaddr fragments\n"); - return (-EMSGSIZE); - } - - vaddr = (void *)(((unsigned long)iov->iov_base) + offset); - tx->tx_md.md_addr = 
(__u64)((unsigned long)vaddr); - - rc = ib_memory_register (kibnal_data.kib_pd, - vaddr, nob, - access, - &tx->tx_md.md_handle.mr, - &tx->tx_md.md_lkey, - &tx->tx_md.md_rkey); - - if (rc != 0) { - CERROR ("Can't map vaddr: %d\n", rc); - return (rc); - } - - tx->tx_mapped = KIB_TX_MAPPED; - return (0); -} - -int -kibnal_map_kiov (kib_tx_t *tx, int access, - int nkiov, lnet_kiov_t *kiov, - int offset, int nob) -{ -#if IBNAL_FMR - __u64 *phys; - const int mapped = KIB_TX_MAPPED_FMR; -#else - struct ib_physical_buffer *phys; - const int mapped = KIB_TX_MAPPED; -#endif - int page_offset; - int nphys; - int resid; - int phys_size; - int rc; - - CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob); - - LASSERT (nob > 0); - LASSERT (nkiov > 0); - LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED); - - while (offset >= kiov->kiov_len) { - offset -= kiov->kiov_len; - nkiov--; - kiov++; - LASSERT (nkiov > 0); - } - - phys_size = nkiov * sizeof (*phys); - LIBCFS_ALLOC(phys, phys_size); - if (phys == NULL) { - CERROR ("Can't allocate tmp phys\n"); - return (-ENOMEM); - } - - page_offset = kiov->kiov_offset + offset; -#if IBNAL_FMR - phys[0] = lnet_page2phys(kiov->kiov_page); -#else - phys[0].address = lnet_page2phys(kiov->kiov_page); - phys[0].size = PAGE_SIZE; -#endif - nphys = 1; - resid = nob - (kiov->kiov_len - offset); - - while (resid > 0) { - kiov++; - nkiov--; - LASSERT (nkiov > 0); - - if (kiov->kiov_offset != 0 || - ((resid > PAGE_SIZE) && - kiov->kiov_len < PAGE_SIZE)) { - int i; - /* Can't have gaps */ - CERROR ("Can't make payload contiguous in I/O VM:" - "page %d, offset %d, len %d \n", nphys, - kiov->kiov_offset, kiov->kiov_len); - - for (i = -nphys; i < nkiov; i++) - { - CERROR("kiov[%d] %p +%d for %d\n", - i, kiov[i].kiov_page, kiov[i].kiov_offset, kiov[i].kiov_len); - } - - rc = -EINVAL; - goto out; - } - - if (nphys == LNET_MAX_IOV) { - CERROR ("payload too big (%d)\n", nphys); - rc = -EMSGSIZE; - goto out; - } - - LASSERT (nphys * sizeof (*phys) < phys_size); -#if IBNAL_FMR - phys[nphys] = lnet_page2phys(kiov->kiov_page); -#else - phys[nphys].address = lnet_page2phys(kiov->kiov_page); - phys[nphys].size = PAGE_SIZE; -#endif - nphys++; - - resid -= PAGE_SIZE; - } - - tx->tx_md.md_addr = IBNAL_RDMA_BASE; - -#if IBNAL_FMR - rc = ib_fmr_register_physical (kibnal_data.kib_fmr_pool, - phys, nphys, - &tx->tx_md.md_addr, - page_offset, - &tx->tx_md.md_handle.fmr, - &tx->tx_md.md_lkey, - &tx->tx_md.md_rkey); -#else - rc = ib_memory_register_physical (kibnal_data.kib_pd, - phys, nphys, - &tx->tx_md.md_addr, - nob, page_offset, - access, - &tx->tx_md.md_handle.mr, - &tx->tx_md.md_lkey, - &tx->tx_md.md_rkey); -#endif - if (rc == 0) { - CDEBUG(D_NET, "Mapped %d pages %d bytes @ offset %d: lkey %x, rkey %x\n", - nphys, nob, page_offset, tx->tx_md.md_lkey, tx->tx_md.md_rkey); - tx->tx_mapped = mapped; - } else { - CERROR ("Can't map phys: %d\n", rc); - rc = -EFAULT; - } - - out: - LIBCFS_FREE(phys, phys_size); - return (rc); -} - -kib_conn_t * -kibnal_find_conn_locked (kib_peer_t *peer) -{ - struct list_head *tmp; - - /* just return the first connection */ - list_for_each (tmp, &peer->ibp_conns) { - return (list_entry(tmp, kib_conn_t, ibc_list)); - } - - return (NULL); -} - -void -kibnal_check_sends (kib_conn_t *conn) -{ - unsigned long flags; - kib_tx_t *tx; - int rc; - int i; - int consume_credit; - int done; - int nwork; - - spin_lock_irqsave (&conn->ibc_lock, flags); - - LASSERT (conn->ibc_nsends_posted <= IBNAL_RX_MSGS); - LASSERT (conn->ibc_reserved_credits >= 0); - - while 
(conn->ibc_reserved_credits > 0 && - !list_empty(&conn->ibc_tx_queue_rsrvd)) { - LASSERT (conn->ibc_version != - IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD); - tx = list_entry(conn->ibc_tx_queue_rsrvd.next, - kib_tx_t, tx_list); - list_del(&tx->tx_list); - list_add_tail(&tx->tx_list, &conn->ibc_tx_queue); - conn->ibc_reserved_credits--; - } - - if (list_empty(&conn->ibc_tx_queue) && - list_empty(&conn->ibc_tx_queue_nocred) && - (conn->ibc_outstanding_credits >= IBNAL_CREDIT_HIGHWATER || - kibnal_send_keepalive(conn))) { - spin_unlock_irqrestore(&conn->ibc_lock, flags); - - tx = kibnal_get_idle_tx(); - if (tx != NULL) - kibnal_init_tx_msg(tx, IBNAL_MSG_NOOP, 0); - - spin_lock_irqsave(&conn->ibc_lock, flags); - - if (tx != NULL) - kibnal_queue_tx_locked(tx, conn); - } - - for (;;) { - if (!list_empty(&conn->ibc_tx_queue_nocred)) { - LASSERT (conn->ibc_version != - IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD); - tx = list_entry(conn->ibc_tx_queue_nocred.next, - kib_tx_t, tx_list); - consume_credit = 0; - } else if (!list_empty (&conn->ibc_tx_queue)) { - tx = list_entry (conn->ibc_tx_queue.next, - kib_tx_t, tx_list); - consume_credit = 1; - } else { - /* nothing waiting */ - break; - } - - /* We rely on this for QP sizing */ - LASSERT (tx->tx_nsp > 0 && tx->tx_nsp <= 2); - - LASSERT (conn->ibc_outstanding_credits >= 0); - LASSERT (conn->ibc_outstanding_credits <= IBNAL_MSG_QUEUE_SIZE); - LASSERT (conn->ibc_credits >= 0); - LASSERT (conn->ibc_credits <= IBNAL_MSG_QUEUE_SIZE); - - /* Not on ibc_rdma_queue */ - LASSERT (!tx->tx_passive_rdma_wait); - - if (conn->ibc_nsends_posted == IBNAL_RX_MSGS) - break; - - if (consume_credit) { - if (conn->ibc_credits == 0) /* no credits */ - break; - - if (conn->ibc_credits == 1 && /* last credit reserved for */ - conn->ibc_outstanding_credits == 0) /* giving back credits */ - break; - } - - list_del (&tx->tx_list); - - if (tx->tx_msg->ibm_type == IBNAL_MSG_NOOP && - (!list_empty(&conn->ibc_tx_queue) || - !list_empty(&conn->ibc_tx_queue_nocred) || - (conn->ibc_outstanding_credits < IBNAL_CREDIT_HIGHWATER && - !kibnal_send_keepalive(conn)))) { - /* redundant NOOP */ - spin_unlock_irqrestore(&conn->ibc_lock, flags); - kibnal_tx_done(tx); - spin_lock_irqsave(&conn->ibc_lock, flags); - continue; - } - - kibnal_pack_msg(tx->tx_msg, conn->ibc_version, - conn->ibc_outstanding_credits, - conn->ibc_peer->ibp_nid, conn->ibc_incarnation); - - conn->ibc_outstanding_credits = 0; - conn->ibc_nsends_posted++; - if (consume_credit) - conn->ibc_credits--; - - tx->tx_sending = tx->tx_nsp; - tx->tx_passive_rdma_wait = tx->tx_passive_rdma; - list_add (&tx->tx_list, &conn->ibc_active_txs); - - spin_unlock_irqrestore (&conn->ibc_lock, flags); - - /* NB the gap between removing tx from the queue and sending it - * allows message re-ordering to occur */ - - LASSERT (tx->tx_nsp > 0); - - rc = -ECONNABORTED; - nwork = 0; - if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) { - tx->tx_status = 0; - /* Driver only accepts 1 item at a time */ - for (i = 0; i < tx->tx_nsp; i++) { - rc = kibnal_ib_send(conn->ibc_qp, &tx->tx_sp[i]); - if (rc != 0) - break; - nwork++; - } - } - - conn->ibc_last_send = jiffies; - - spin_lock_irqsave (&conn->ibc_lock, flags); - if (rc != 0) { - /* NB credits are transferred in the actual - * message, which can only be the last work item */ - conn->ibc_outstanding_credits += tx->tx_msg->ibm_credits; - if (consume_credit) - conn->ibc_credits++; - conn->ibc_nsends_posted--; - - tx->tx_status = rc; - tx->tx_passive_rdma_wait = 0; - tx->tx_sending -= tx->tx_nsp - nwork; - - done = 
(tx->tx_sending == 0); - if (done) - list_del (&tx->tx_list); - - spin_unlock_irqrestore (&conn->ibc_lock, flags); - - if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) - CERROR ("Error %d posting transmit to %s\n", - rc, libcfs_nid2str(conn->ibc_peer->ibp_nid)); - else - CDEBUG (D_NET, "Error %d posting transmit to %s\n", - rc, libcfs_nid2str(conn->ibc_peer->ibp_nid)); - - kibnal_close_conn (conn, rc); - - if (done) - kibnal_tx_done (tx); - return; - } - - } - - spin_unlock_irqrestore (&conn->ibc_lock, flags); -} - -void -kibnal_tx_callback (struct ib_cq_entry *e) -{ - kib_tx_t *tx = (kib_tx_t *)kibnal_wreqid2ptr(e->work_request_id); - kib_conn_t *conn; - unsigned long flags; - int idle; - - conn = tx->tx_conn; - LASSERT (conn != NULL); - LASSERT (tx->tx_sending != 0); - - spin_lock_irqsave(&conn->ibc_lock, flags); - - CDEBUG(D_NET, "conn %p tx %p [%d/%d]: %d\n", conn, tx, - tx->tx_nsp - tx->tx_sending, tx->tx_nsp, - e->status); - - /* I could be racing with rdma completion. Whoever makes 'tx' idle - * gets to free it, which also drops its ref on 'conn'. If it's - * not me, then I take an extra ref on conn so it can't disappear - * under me. */ - - tx->tx_sending--; - idle = (tx->tx_sending == 0) && /* This is the final callback */ - (!tx->tx_passive_rdma_wait); /* Not waiting for RDMA completion */ - if (idle) - list_del(&tx->tx_list); - - kibnal_conn_addref(conn); - - if (tx->tx_sending == 0) - conn->ibc_nsends_posted--; - - if (e->status != IB_COMPLETION_STATUS_SUCCESS && - tx->tx_status == 0) - tx->tx_status = -ECONNABORTED; - - spin_unlock_irqrestore(&conn->ibc_lock, flags); - - if (idle) - kibnal_tx_done (tx); - - if (e->status != IB_COMPLETION_STATUS_SUCCESS) { - CDEBUG (D_NETERROR, "Tx completion to %s failed: %d\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid), e->status); - kibnal_close_conn (conn, -ENETDOWN); - } else { - kibnal_peer_alive(conn->ibc_peer); - /* can I shovel some more sends out the door? */ - kibnal_check_sends(conn); - } - - kibnal_conn_decref(conn); -} - -void -kibnal_callback (ib_cq_t *cq, struct ib_cq_entry *e, void *arg) -{ - if (kibnal_wreqid_is_rx(e->work_request_id)) - kibnal_rx_callback (e); - else - kibnal_tx_callback (e); -} - -void -kibnal_init_tx_msg (kib_tx_t *tx, int type, int body_nob) -{ - struct ib_gather_scatter *gl = &tx->tx_gl[tx->tx_nsp]; - struct ib_send_param *sp = &tx->tx_sp[tx->tx_nsp]; - int fence; - int nob = offsetof (kib_msg_t, ibm_u) + body_nob; - - LASSERT (tx->tx_nsp >= 0 && - tx->tx_nsp < sizeof(tx->tx_sp)/sizeof(tx->tx_sp[0])); - LASSERT (nob <= IBNAL_MSG_SIZE); - - kibnal_init_msg(tx->tx_msg, type, body_nob); - - /* Fence the message if it's bundled with an RDMA read */ - fence = (tx->tx_nsp > 0) && - (type == IBNAL_MSG_PUT_DONE); - - *gl = (struct ib_gather_scatter) { - .address = tx->tx_vaddr, - .length = nob, - .key = kibnal_data.kib_tx_pages->ibp_lkey, - }; - - /* NB If this is an RDMA read, the completion message must wait for - * the RDMA to complete. Sends wait for previous RDMA writes - * anyway... 
*/ - *sp = (struct ib_send_param) { - .work_request_id = kibnal_ptr2wreqid(tx, 0), - .op = IB_OP_SEND, - .gather_list = gl, - .num_gather_entries = 1, - .device_specific = NULL, - .solicited_event = 1, - .signaled = 1, - .immediate_data_valid = 0, - .fence = fence, - .inline_data = 0, - }; - - tx->tx_nsp++; -} - -void -kibnal_queue_tx (kib_tx_t *tx, kib_conn_t *conn) -{ - unsigned long flags; - - spin_lock_irqsave(&conn->ibc_lock, flags); - - kibnal_queue_tx_locked (tx, conn); - - spin_unlock_irqrestore(&conn->ibc_lock, flags); - - kibnal_check_sends(conn); -} - -void -kibnal_schedule_active_connect_locked (kib_peer_t *peer) -{ - /* Called with exclusive kib_global_lock */ - - peer->ibp_connecting++; - kibnal_peer_addref(peer); /* extra ref for connd */ - - spin_lock (&kibnal_data.kib_connd_lock); - - LASSERT (list_empty(&peer->ibp_connd_list)); - list_add_tail (&peer->ibp_connd_list, - &kibnal_data.kib_connd_peers); - wake_up (&kibnal_data.kib_connd_waitq); - - spin_unlock (&kibnal_data.kib_connd_lock); -} - -void -kibnal_launch_tx (kib_tx_t *tx, lnet_nid_t nid) -{ - unsigned long flags; - kib_peer_t *peer; - kib_conn_t *conn; - int retry; - int rc; - rwlock_t *g_lock = &kibnal_data.kib_global_lock; - - /* If I get here, I've committed to send, so I complete the tx with - * failure on any problems */ - - LASSERT (tx->tx_conn == NULL); /* only set when assigned a conn */ - LASSERT (tx->tx_nsp > 0); /* work items have been set up */ - - for (retry = 0; ; retry = 1) { - read_lock_irqsave(g_lock, flags); - - peer = kibnal_find_peer_locked (nid); - if (peer != NULL) { - conn = kibnal_find_conn_locked (peer); - if (conn != NULL) { - kibnal_conn_addref(conn); /* 1 ref for me...*/ - read_unlock_irqrestore(g_lock, flags); - - kibnal_queue_tx (tx, conn); - kibnal_conn_decref(conn); /* ...until here */ - return; - } - } - - /* Making one or more connections; I'll need a write lock... */ - read_unlock(g_lock); - write_lock(g_lock); - - peer = kibnal_find_peer_locked (nid); - if (peer != NULL) - break; - - write_unlock_irqrestore (g_lock, flags); - - if (retry) { - CERROR("Can't find peer %s\n", libcfs_nid2str(nid)); - tx->tx_status = -EHOSTUNREACH; - kibnal_tx_done (tx); - return; - } - - rc = kibnal_add_persistent_peer(nid, LNET_NIDADDR(nid), - lnet_acceptor_port()); - if (rc != 0) { - CERROR("Can't add peer %s: %d\n", - libcfs_nid2str(nid), rc); - tx->tx_status = rc; - kibnal_tx_done(tx); - return; - } - } - - conn = kibnal_find_conn_locked (peer); - if (conn != NULL) { - /* Connection exists; queue message on it */ - kibnal_conn_addref(conn); /* +1 ref from me... */ - write_unlock_irqrestore (g_lock, flags); - - kibnal_queue_tx (tx, conn); - kibnal_conn_decref(conn); /* ...until here */ - return; - } - - if (peer->ibp_connecting == 0 && - peer->ibp_accepting == 0) { - if (!(peer->ibp_reconnect_interval == 0 || /* first attempt */ - time_after_eq(jiffies, peer->ibp_reconnect_time))) { - write_unlock_irqrestore (g_lock, flags); - tx->tx_status = -EHOSTUNREACH; - kibnal_tx_done (tx); - return; - } - - kibnal_schedule_active_connect_locked(peer); - } - - /* A connection is being established; queue the message... 
*/ - list_add_tail (&tx->tx_list, &peer->ibp_tx_queue); - - write_unlock_irqrestore (g_lock, flags); -} - -void -kibnal_txlist_done (struct list_head *txlist, int status) -{ - kib_tx_t *tx; - - while (!list_empty(txlist)) { - tx = list_entry (txlist->next, kib_tx_t, tx_list); - - list_del (&tx->tx_list); - /* complete now */ - tx->tx_status = status; - kibnal_tx_done (tx); - } -} - -int -kibnal_start_passive_rdma (int type, lnet_msg_t *lntmsg, - int niov, struct iovec *iov, lnet_kiov_t *kiov, - int nob) -{ - lnet_nid_t nid = lntmsg->msg_target.nid; - kib_tx_t *tx; - kib_msg_t *ibmsg; - int rc; - int access; - - LASSERT (type == IBNAL_MSG_PUT_RDMA || - type == IBNAL_MSG_GET_RDMA); - LASSERT (nob > 0); - LASSERT (!in_interrupt()); /* Mapping could block */ - - if (type == IBNAL_MSG_PUT_RDMA) { - access = IB_ACCESS_REMOTE_READ; - } else { - access = IB_ACCESS_REMOTE_WRITE | - IB_ACCESS_LOCAL_WRITE; - } - - tx = kibnal_get_idle_tx (); - if (tx == NULL) { - CERROR("Can't allocate %s txd for %s\n", - (type == IBNAL_MSG_PUT_RDMA) ? "PUT/REPLY" : "GET", - libcfs_nid2str(nid)); - return -ENOMEM; - } - - - if (iov != NULL) - rc = kibnal_map_iov (tx, access, niov, iov, 0, nob); - else - rc = kibnal_map_kiov (tx, access, niov, kiov, 0, nob); - - if (rc != 0) { - CERROR ("Can't map RDMA for %s: %d\n", - libcfs_nid2str(nid), rc); - goto failed; - } - - if (type == IBNAL_MSG_GET_RDMA) { - /* reply gets finalized when tx completes */ - tx->tx_lntmsg[1] = lnet_create_reply_msg(kibnal_data.kib_ni, - lntmsg); - if (tx->tx_lntmsg[1] == NULL) { - CERROR ("Can't create reply for GET -> %s\n", - libcfs_nid2str(nid)); - rc = -ENOMEM; - goto failed; - } - } - - tx->tx_passive_rdma = 1; - - ibmsg = tx->tx_msg; - - ibmsg->ibm_u.rdma.ibrm_hdr = lntmsg->msg_hdr; - ibmsg->ibm_u.rdma.ibrm_cookie = tx->tx_passive_rdma_cookie; - ibmsg->ibm_u.rdma.ibrm_desc.rd_key = tx->tx_md.md_rkey; - ibmsg->ibm_u.rdma.ibrm_desc.rd_addr = tx->tx_md.md_addr; - ibmsg->ibm_u.rdma.ibrm_desc.rd_nob = nob; - - kibnal_init_tx_msg (tx, type, sizeof (kib_rdma_msg_t)); - - CDEBUG(D_NET, "Passive: %p cookie "LPX64", key %x, addr " - LPX64", nob %d\n", - tx, tx->tx_passive_rdma_cookie, tx->tx_md.md_rkey, - tx->tx_md.md_addr, nob); - - /* lntmsg gets finalized when tx completes. 
*/ - tx->tx_lntmsg[0] = lntmsg; - - kibnal_launch_tx(tx, nid); - return (0); - - failed: - tx->tx_status = rc; - kibnal_tx_done (tx); - return (-EIO); -} - -void -kibnal_start_active_rdma (int type, int status, - kib_rx_t *rx, lnet_msg_t *lntmsg, - unsigned int niov, - struct iovec *iov, lnet_kiov_t *kiov, - int offset, int nob) -{ - kib_msg_t *rxmsg = rx->rx_msg; - kib_msg_t *txmsg; - kib_tx_t *tx; - int access; - int rdma_op; - int rc; - - CDEBUG(D_NET, "type %d, status %d, niov %d, offset %d, nob %d\n", - type, status, niov, offset, nob); - - /* Called by scheduler */ - LASSERT (!in_interrupt ()); - - /* Either all pages or all vaddrs */ - LASSERT (!(kiov != NULL && iov != NULL)); - - /* No data if we're completing with failure */ - LASSERT (status == 0 || nob == 0); - - LASSERT (type == IBNAL_MSG_GET_DONE || - type == IBNAL_MSG_PUT_DONE); - - if (type == IBNAL_MSG_GET_DONE) { - access = 0; - rdma_op = IB_OP_RDMA_WRITE; - LASSERT (rxmsg->ibm_type == IBNAL_MSG_GET_RDMA); - } else { - access = IB_ACCESS_LOCAL_WRITE; - rdma_op = IB_OP_RDMA_READ; - LASSERT (rxmsg->ibm_type == IBNAL_MSG_PUT_RDMA); - } - - tx = kibnal_get_idle_tx (); - if (tx == NULL) { - CERROR ("tx descs exhausted on RDMA from %s" - " completing locally with failure\n", - libcfs_nid2str(rx->rx_conn->ibc_peer->ibp_nid)); - lnet_finalize (kibnal_data.kib_ni, lntmsg, -ENOMEM); - return; - } - LASSERT (tx->tx_nsp == 0); - - if (nob != 0) { - /* We actually need to transfer some data (the transfer - * size could get truncated to zero when the incoming - * message is matched) */ - - if (kiov != NULL) - rc = kibnal_map_kiov (tx, access, - niov, kiov, offset, nob); - else - rc = kibnal_map_iov (tx, access, - niov, iov, offset, nob); - - if (rc != 0) { - CERROR ("Can't map RDMA -> %s: %d\n", - libcfs_nid2str(rx->rx_conn->ibc_peer->ibp_nid), - rc); - /* We'll skip the RDMA and complete with failure. */ - status = rc; - nob = 0; - } else { - tx->tx_gl[0] = (struct ib_gather_scatter) { - .address = tx->tx_md.md_addr, - .length = nob, - .key = tx->tx_md.md_lkey, - }; - - tx->tx_sp[0] = (struct ib_send_param) { - .work_request_id = kibnal_ptr2wreqid(tx, 0), - .op = rdma_op, - .gather_list = &tx->tx_gl[0], - .num_gather_entries = 1, - .remote_address = rxmsg->ibm_u.rdma.ibrm_desc.rd_addr, - .rkey = rxmsg->ibm_u.rdma.ibrm_desc.rd_key, - .device_specific = NULL, - .solicited_event = 0, - .signaled = 1, - .immediate_data_valid = 0, - .fence = 0, - .inline_data = 0, - }; - - tx->tx_nsp = 1; - } - } - - txmsg = tx->tx_msg; - - txmsg->ibm_u.completion.ibcm_cookie = rxmsg->ibm_u.rdma.ibrm_cookie; - txmsg->ibm_u.completion.ibcm_status = status; - - kibnal_init_tx_msg(tx, type, sizeof (kib_completion_msg_t)); - - if (status == 0 && nob != 0) { - LASSERT (tx->tx_nsp > 1); - /* RDMA: lntmsg gets finalized when the tx completes. This - * is after the completion message has been sent, which in - * turn is after the RDMA has finished. */ - tx->tx_lntmsg[0] = lntmsg; - } else { - LASSERT (tx->tx_nsp == 1); - /* No RDMA: local completion happens now! */ - CDEBUG(D_NET, "No data: immediate completion\n"); - lnet_finalize (kibnal_data.kib_ni, lntmsg, - status == 0 ? 
0 : -EIO); - } - - kibnal_queue_tx(tx, rx->rx_conn); -} - -int -kibnal_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg) -{ - lnet_hdr_t *hdr = &lntmsg->msg_hdr; - int type = lntmsg->msg_type; - lnet_process_id_t target = lntmsg->msg_target; - int target_is_router = lntmsg->msg_target_is_router; - int routing = lntmsg->msg_routing; - unsigned int payload_niov = lntmsg->msg_niov; - struct iovec *payload_iov = lntmsg->msg_iov; - lnet_kiov_t *payload_kiov = lntmsg->msg_kiov; - unsigned int payload_offset = lntmsg->msg_offset; - unsigned int payload_nob = lntmsg->msg_len; - kib_msg_t *ibmsg; - kib_tx_t *tx; - int nob; - - /* NB 'private' is different depending on what we're sending.... */ - - CDEBUG(D_NET, "sending %d bytes in %d frags to %s\n", - payload_nob, payload_niov, libcfs_id2str(target)); - - LASSERT (payload_nob == 0 || payload_niov > 0); - LASSERT (payload_niov <= LNET_MAX_IOV); - - /* Thread context if we're sending payload */ - LASSERT (!in_interrupt() || payload_niov == 0); - /* payload is either all vaddrs or all pages */ - LASSERT (!(payload_kiov != NULL && payload_iov != NULL)); - - switch (type) { - default: - LBUG(); - return (-EIO); - - case LNET_MSG_ACK: - LASSERT (payload_nob == 0); - break; - - case LNET_MSG_GET: - if (routing || target_is_router) - break; /* send IMMEDIATE */ - - /* is the REPLY message too small for RDMA? */ - nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[lntmsg->msg_md->md_length]); - if (nob <= IBNAL_MSG_SIZE) - break; /* send IMMEDIATE */ - - if ((lntmsg->msg_md->md_options & LNET_MD_KIOV) == 0) - return kibnal_start_passive_rdma(IBNAL_MSG_GET_RDMA, lntmsg, - lntmsg->msg_md->md_niov, - lntmsg->msg_md->md_iov.iov, NULL, - lntmsg->msg_md->md_length); - - return kibnal_start_passive_rdma(IBNAL_MSG_GET_RDMA, lntmsg, - lntmsg->msg_md->md_niov, - NULL, lntmsg->msg_md->md_iov.kiov, - lntmsg->msg_md->md_length); - - case LNET_MSG_REPLY: - case LNET_MSG_PUT: - /* Is the payload small enough not to need RDMA? */ - nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]); - if (nob <= IBNAL_MSG_SIZE) - break; /* send IMMEDIATE */ - - return kibnal_start_passive_rdma(IBNAL_MSG_PUT_RDMA, lntmsg, - payload_niov, - payload_iov, payload_kiov, - payload_nob); - } - - /* Send IMMEDIATE */ - - tx = kibnal_get_idle_tx(); - if (tx == NULL) { - CERROR ("Can't send %d to %s: tx descs exhausted%s\n", - type, libcfs_nid2str(target.nid), - in_interrupt() ? " (intr)" : ""); - return (-ENOMEM); - } - - ibmsg = tx->tx_msg; - ibmsg->ibm_u.immediate.ibim_hdr = *hdr; - - if (payload_kiov != NULL) - lnet_copy_kiov2flat(IBNAL_MSG_SIZE, ibmsg, - offsetof(kib_msg_t, ibm_u.immediate.ibim_payload), - payload_niov, payload_kiov, - payload_offset, payload_nob); - else - lnet_copy_iov2flat(IBNAL_MSG_SIZE, ibmsg, - offsetof(kib_msg_t, ibm_u.immediate.ibim_payload), - payload_niov, payload_iov, - payload_offset, payload_nob); - - kibnal_init_tx_msg (tx, IBNAL_MSG_IMMEDIATE, - offsetof(kib_immediate_msg_t, - ibim_payload[payload_nob])); - - /* lntmsg gets finalized when tx completes */ - tx->tx_lntmsg[0] = lntmsg; - - kibnal_launch_tx(tx, target.nid); - return (0); -} - -int -kibnal_eager_recv (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, - void **new_private) -{ - kib_rx_t *rx = private; - kib_conn_t *conn = rx->rx_conn; - - if (conn->ibc_version == IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD) { - /* Can't block if RDMA completions need normal credits */ - LCONSOLE_ERROR_MSG(0x12a, - "Dropping message from %s: no buffers free. 
" - "%s is running an old version of LNET that may " - "deadlock if messages wait for buffers)\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid), - libcfs_nid2str(conn->ibc_peer->ibp_nid)); - return -EDEADLK; - } - - *new_private = private; - return 0; -} - -int -kibnal_recv (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, - int delayed, unsigned int niov, - struct iovec *iov, lnet_kiov_t *kiov, - unsigned int offset, unsigned int mlen, unsigned int rlen) -{ - kib_rx_t *rx = private; - kib_msg_t *rxmsg = rx->rx_msg; - int msg_nob; - int rc = 0; - - LASSERT (mlen <= rlen); - LASSERT (!in_interrupt ()); - /* Either all pages or all vaddrs */ - LASSERT (!(kiov != NULL && iov != NULL)); - - switch (rxmsg->ibm_type) { - default: - LBUG(); - - case IBNAL_MSG_IMMEDIATE: - msg_nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[rlen]); - if (msg_nob > rx->rx_nob) { - CERROR ("Immediate message from %s too big: %d(%d)\n", - libcfs_nid2str(rxmsg->ibm_u.immediate.ibim_hdr.src_nid), - msg_nob, rx->rx_nob); - rc = -EPROTO; - break; - } - - if (kiov != NULL) - lnet_copy_flat2kiov( - niov, kiov, offset, - IBNAL_MSG_SIZE, rxmsg, - offsetof(kib_msg_t, ibm_u.immediate.ibim_payload), - mlen); - else - lnet_copy_flat2iov( - niov, iov, offset, - IBNAL_MSG_SIZE, rxmsg, - offsetof(kib_msg_t, ibm_u.immediate.ibim_payload), - mlen); - - lnet_finalize (ni, lntmsg, 0); - break; - - case IBNAL_MSG_GET_RDMA: - if (lntmsg != NULL) { - /* GET matched: RDMA lntmsg's payload */ - kibnal_start_active_rdma(IBNAL_MSG_GET_DONE, 0, - rx, lntmsg, - lntmsg->msg_niov, - lntmsg->msg_iov, - lntmsg->msg_kiov, - lntmsg->msg_offset, - lntmsg->msg_len); - } else { - /* GET didn't match anything */ - kibnal_start_active_rdma (IBNAL_MSG_GET_DONE, -ENODATA, - rx, NULL, 0, NULL, NULL, 0, 0); - } - break; - - case IBNAL_MSG_PUT_RDMA: - kibnal_start_active_rdma (IBNAL_MSG_PUT_DONE, 0, rx, lntmsg, - niov, iov, kiov, offset, mlen); - break; - } - - kibnal_post_rx(rx, 1, 0); - return rc; -} - -int -kibnal_thread_start (int (*fn)(void *arg), void *arg) -{ - long pid = kernel_thread (fn, arg, 0); - - if (pid < 0) - return ((int)pid); - - atomic_inc (&kibnal_data.kib_nthreads); - return (0); -} - -void -kibnal_thread_fini (void) -{ - atomic_dec (&kibnal_data.kib_nthreads); -} - -void -kibnal_peer_alive (kib_peer_t *peer) -{ - /* This is racy, but everyone's only writing cfs_time_current() */ - peer->ibp_last_alive = cfs_time_current(); - mb(); -} - -void -kibnal_peer_notify (kib_peer_t *peer) -{ - time_t last_alive = 0; - int error = 0; - unsigned long flags; - - read_lock_irqsave(&kibnal_data.kib_global_lock, flags); - - if (list_empty(&peer->ibp_conns) && - peer->ibp_accepting == 0 && - peer->ibp_connecting == 0 && - peer->ibp_error != 0) { - error = peer->ibp_error; - peer->ibp_error = 0; - last_alive = cfs_time_current_sec() - - cfs_duration_sec(cfs_time_current() - - peer->ibp_last_alive); - } - - read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); - - if (error != 0) - lnet_notify(kibnal_data.kib_ni, peer->ibp_nid, 0, last_alive); -} - -void -kibnal_close_conn_locked (kib_conn_t *conn, int error) -{ - /* This just does the immmediate housekeeping, and schedules the - * connection for the reaper to finish off. - * Caller holds kib_global_lock exclusively in irq context */ - kib_peer_t *peer = conn->ibc_peer; - - CDEBUG (error == 0 ? 
D_NET : D_NETERROR, - "closing conn to %s: error %d\n", - libcfs_nid2str(peer->ibp_nid), error); - - LASSERT (conn->ibc_state == IBNAL_CONN_ESTABLISHED || - conn->ibc_state == IBNAL_CONN_CONNECTING); - - if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) { - /* kib_reaper_conns takes ibc_list's ref */ - list_del (&conn->ibc_list); - } else { - /* new ref for kib_reaper_conns */ - kibnal_conn_addref(conn); - } - - if (list_empty (&peer->ibp_conns)) { /* no more conns */ - if (peer->ibp_persistence == 0 && /* non-persistent peer */ - kibnal_peer_active(peer)) /* still in peer table */ - kibnal_unlink_peer_locked (peer); - - peer->ibp_error = error; /* set/clear error on last conn */ - } - - conn->ibc_state = IBNAL_CONN_DEATHROW; - - /* Schedule conn for closing/destruction */ - spin_lock (&kibnal_data.kib_reaper_lock); - - list_add_tail (&conn->ibc_list, &kibnal_data.kib_reaper_conns); - wake_up (&kibnal_data.kib_reaper_waitq); - - spin_unlock (&kibnal_data.kib_reaper_lock); -} - -int -kibnal_close_conn (kib_conn_t *conn, int why) -{ - unsigned long flags; - int count = 0; - - write_lock_irqsave (&kibnal_data.kib_global_lock, flags); - - LASSERT (conn->ibc_state >= IBNAL_CONN_CONNECTING); - - if (conn->ibc_state <= IBNAL_CONN_ESTABLISHED) { - count = 1; - kibnal_close_conn_locked (conn, why); - } - - write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); - return (count); -} - -void -kibnal_peer_connect_failed (kib_peer_t *peer, int active, int error) -{ - LIST_HEAD (zombies); - unsigned long flags; - - LASSERT(error != 0); - - write_lock_irqsave (&kibnal_data.kib_global_lock, flags); - - if (active) { - LASSERT (peer->ibp_connecting != 0); - peer->ibp_connecting--; - } else { - LASSERT (peer->ibp_accepting != 0); - peer->ibp_accepting--; - } - - if (peer->ibp_connecting != 0 || - peer->ibp_accepting != 0) { - /* another connection attempt under way... 
*/ - write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); - return; - } - - if (list_empty(&peer->ibp_conns)) { - /* Say when active connection can be re-attempted */ - peer->ibp_reconnect_interval *= 2; - peer->ibp_reconnect_interval = - MAX(peer->ibp_reconnect_interval, - *kibnal_tunables.kib_min_reconnect_interval); - peer->ibp_reconnect_interval = - MIN(peer->ibp_reconnect_interval, - *kibnal_tunables.kib_max_reconnect_interval); - - peer->ibp_reconnect_time = jiffies + - peer->ibp_reconnect_interval * HZ; - - /* Take peer's blocked transmits; I'll complete - * them with error */ - list_add(&zombies, &peer->ibp_tx_queue); - list_del_init(&peer->ibp_tx_queue); - - if (kibnal_peer_active(peer) && - (peer->ibp_persistence == 0)) { - /* failed connection attempt on non-persistent peer */ - kibnal_unlink_peer_locked (peer); - } - - peer->ibp_error = error; - } else { - /* Can't have blocked transmits if there are connections */ - LASSERT (list_empty(&peer->ibp_tx_queue)); - } - - write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); - - kibnal_peer_notify(peer); - - if (!list_empty (&zombies)) - CDEBUG (D_NETERROR, "Deleting messages for %s: connection failed\n", - libcfs_nid2str(peer->ibp_nid)); - - kibnal_txlist_done(&zombies, -EHOSTUNREACH); -} - -void -kibnal_connreq_done (kib_conn_t *conn, int active, int status) -{ - int state = conn->ibc_state; - kib_peer_t *peer = conn->ibc_peer; - kib_tx_t *tx; - unsigned long flags; - int rc; - int i; - - if (conn->ibc_connreq != NULL) { - LIBCFS_FREE (conn->ibc_connreq, sizeof (*conn->ibc_connreq)); - conn->ibc_connreq = NULL; - } - - switch (state) { - case IBNAL_CONN_CONNECTING: - /* conn has a CM comm_id */ - if (status == 0) { - /* Install common (active/passive) callback for - * disconnect/idle notification */ - rc = tsIbCmCallbackModify(conn->ibc_comm_id, - kibnal_conn_callback, - conn); - LASSERT (rc == 0); - } else { - /* LASSERT (no more CM callbacks) */ - rc = tsIbCmCallbackModify(conn->ibc_comm_id, - kibnal_bad_conn_callback, - conn); - LASSERT (rc == 0); - } - break; - - case IBNAL_CONN_INIT_QP: - LASSERT (status != 0); - break; - - default: - LBUG(); - } - - write_lock_irqsave (&kibnal_data.kib_global_lock, flags); - - if (active) - LASSERT (peer->ibp_connecting != 0); - else - LASSERT (peer->ibp_accepting != 0); - - if (status == 0 && /* connection established */ - kibnal_peer_active(peer)) { /* peer not deleted */ - - if (active) - peer->ibp_connecting--; - else - peer->ibp_accepting--; - - conn->ibc_last_send = jiffies; - conn->ibc_state = IBNAL_CONN_ESTABLISHED; - kibnal_peer_alive(peer); - - /* +1 ref for ibc_list; caller(== CM)'s ref remains until - * the IB_CM_IDLE callback */ - kibnal_conn_addref(conn); - list_add (&conn->ibc_list, &peer->ibp_conns); - - peer->ibp_reconnect_interval = 0; /* OK to reconnect at any time */ - - /* post blocked sends to the new connection */ - spin_lock (&conn->ibc_lock); - - while (!list_empty (&peer->ibp_tx_queue)) { - tx = list_entry (peer->ibp_tx_queue.next, - kib_tx_t, tx_list); - - list_del (&tx->tx_list); - - kibnal_queue_tx_locked (tx, conn); - } - - spin_unlock (&conn->ibc_lock); - - /* Nuke any dangling conns from a different peer instance... 
*/ - kibnal_close_stale_conns_locked (conn->ibc_peer, - conn->ibc_incarnation); - - write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); - - /* queue up all the receives */ - for (i = 0; i < IBNAL_RX_MSGS; i++) { - /* +1 ref for rx desc */ - kibnal_conn_addref(conn); - - CDEBUG(D_NET, "RX[%d] %p->%p - "LPX64"\n", - i, &conn->ibc_rxs[i], conn->ibc_rxs[i].rx_msg, - conn->ibc_rxs[i].rx_vaddr); - - kibnal_post_rx (&conn->ibc_rxs[i], 0, 0); - } - - kibnal_check_sends (conn); - return; - } - - if (status == 0) { - /* connection established, but peer was deleted. Schedule for - * reaper to cm_disconnect... */ - status = -ECONNABORTED; - kibnal_close_conn_locked (conn, status); - } else { - /* just waiting for refs to drain */ - conn->ibc_state = IBNAL_CONN_ZOMBIE; - } - - write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); - - kibnal_peer_connect_failed (conn->ibc_peer, active, status); -} - -int -kibnal_accept_connreq (kib_conn_t **connp, tTS_IB_CM_COMM_ID cid, - kib_msg_t *msg, int nob) -{ - kib_conn_t *conn; - kib_peer_t *peer; - kib_peer_t *peer2; - unsigned long flags; - int rc; - - rc = kibnal_unpack_msg(msg, 0, nob); - if (rc != 0) { - CERROR("Can't unpack connreq msg: %d\n", rc); - return -EPROTO; - } - - CDEBUG(D_NET, "connreq from %s\n", libcfs_nid2str(msg->ibm_srcnid)); - - if (msg->ibm_type != IBNAL_MSG_CONNREQ) { - CERROR("Unexpected connreq msg type: %x from %s\n", - msg->ibm_type, libcfs_nid2str(msg->ibm_srcnid)); - return -EPROTO; - } - - if (msg->ibm_u.connparams.ibcp_queue_depth != IBNAL_MSG_QUEUE_SIZE) { - CERROR("Can't accept %s: bad queue depth %d (%d expected)\n", - libcfs_nid2str(msg->ibm_srcnid), - msg->ibm_u.connparams.ibcp_queue_depth, - IBNAL_MSG_QUEUE_SIZE); - return (-EPROTO); - } - - conn = kibnal_create_conn(); - if (conn == NULL) - return (-ENOMEM); - - /* assume 'nid' is a new peer */ - rc = kibnal_create_peer(&peer, msg->ibm_srcnid); - if (rc != 0) { - kibnal_conn_decref(conn); - return (-ENOMEM); - } - - write_lock_irqsave (&kibnal_data.kib_global_lock, flags); - - if (kibnal_data.kib_nonewpeers) { - write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); - - CERROR ("Shutdown has started, drop connreq from %s\n", - libcfs_nid2str(msg->ibm_srcnid)); - kibnal_conn_decref(conn); - kibnal_peer_decref(peer); - return -ESHUTDOWN; - } - - /* Check I'm the same instance that gave the connection parameters. 
- * NB If my incarnation changes after this, the peer will get nuked and - * we'll spot that when the connection is finally added into the peer's - * connlist */ - if (kibnal_data.kib_ni->ni_nid != msg->ibm_dstnid || - msg->ibm_dststamp != kibnal_data.kib_incarnation) { - write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); - - CERROR("Stale connection params from %s\n", - libcfs_nid2str(msg->ibm_srcnid)); - kibnal_conn_decref(conn); - kibnal_peer_decref(peer); - return -ESTALE; - } - - peer2 = kibnal_find_peer_locked(msg->ibm_srcnid); - if (peer2 == NULL) { - /* Brand new peer */ - LASSERT (peer->ibp_accepting == 0); - - /* peer table takes my ref on peer */ - list_add_tail (&peer->ibp_list, - kibnal_nid2peerlist(msg->ibm_srcnid)); - } else { - /* tie-break connection race in favour of the higher NID */ - if (peer2->ibp_connecting != 0 && - msg->ibm_srcnid < kibnal_data.kib_ni->ni_nid) { - write_unlock_irqrestore(&kibnal_data.kib_global_lock, - flags); - CWARN("Conn race %s\n", - libcfs_nid2str(peer2->ibp_nid)); - - kibnal_conn_decref(conn); - kibnal_peer_decref(peer); - return -EALREADY; - } - - kibnal_peer_decref(peer); - peer = peer2; - } - - /* +1 ref for conn */ - kibnal_peer_addref(peer); - peer->ibp_accepting++; - - write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); - - conn->ibc_peer = peer; - conn->ibc_state = IBNAL_CONN_CONNECTING; - conn->ibc_comm_id = cid; - conn->ibc_incarnation = msg->ibm_srcstamp; - conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE; - conn->ibc_reserved_credits = IBNAL_MSG_QUEUE_SIZE; - conn->ibc_version = msg->ibm_version; - - *connp = conn; - return (0); -} - -tTS_IB_CM_CALLBACK_RETURN -kibnal_bad_conn_callback (tTS_IB_CM_EVENT event, - tTS_IB_CM_COMM_ID cid, - void *param, - void *arg) -{ - CERROR ("Unexpected event %d: conn %p\n", event, arg); - LBUG (); - return TS_IB_CM_CALLBACK_PROCEED; -} - -void -kibnal_abort_txs (kib_conn_t *conn, struct list_head *txs) -{ - LIST_HEAD (zombies); - struct list_head *tmp; - struct list_head *nxt; - kib_tx_t *tx; - unsigned long flags; - - spin_lock_irqsave (&conn->ibc_lock, flags); - - list_for_each_safe (tmp, nxt, txs) { - tx = list_entry (tmp, kib_tx_t, tx_list); - - if (txs == &conn->ibc_active_txs) { - LASSERT (tx->tx_passive_rdma || - !tx->tx_passive_rdma_wait); - - LASSERT (tx->tx_passive_rdma_wait || - tx->tx_sending != 0); - } else { - LASSERT (!tx->tx_passive_rdma_wait); - LASSERT (tx->tx_sending == 0); - } - - tx->tx_status = -ECONNABORTED; - tx->tx_passive_rdma_wait = 0; - - if (tx->tx_sending == 0) { - list_del (&tx->tx_list); - list_add (&tx->tx_list, &zombies); - } - } - - spin_unlock_irqrestore (&conn->ibc_lock, flags); - - kibnal_txlist_done (&zombies, -ECONNABORTED); -} - -tTS_IB_CM_CALLBACK_RETURN -kibnal_conn_callback (tTS_IB_CM_EVENT event, - tTS_IB_CM_COMM_ID cid, - void *param, - void *arg) -{ - kib_conn_t *conn = arg; - int rc; - - /* Established Connection Notifier */ - - switch (event) { - default: - CDEBUG(D_NETERROR, "Connection %p -> %s ERROR %d\n", - conn, libcfs_nid2str(conn->ibc_peer->ibp_nid), event); - kibnal_close_conn (conn, -ECONNABORTED); - break; - - case TS_IB_CM_DISCONNECTED: - CDEBUG(D_NETERROR, "Connection %p -> %s DISCONNECTED.\n", - conn, libcfs_nid2str(conn->ibc_peer->ibp_nid)); - kibnal_close_conn (conn, 0); - break; - - case TS_IB_CM_IDLE: - CDEBUG(D_NET, "Connection %p -> %s IDLE.\n", - conn, libcfs_nid2str(conn->ibc_peer->ibp_nid)); - - /* LASSERT (no further callbacks) */ - rc = tsIbCmCallbackModify(cid, kibnal_bad_conn_callback, conn); - LASSERT (rc == 
0); - - /* NB we wait until the connection has closed before - * completing outstanding passive RDMAs so we can be sure - * the network can't touch the mapped memory any more. */ - - kibnal_abort_txs(conn, &conn->ibc_tx_queue); - kibnal_abort_txs(conn, &conn->ibc_tx_queue_rsrvd); - kibnal_abort_txs(conn, &conn->ibc_tx_queue_nocred); - kibnal_abort_txs(conn, &conn->ibc_active_txs); - - kibnal_conn_decref(conn); /* Lose CM's ref */ - break; - } - - return TS_IB_CM_CALLBACK_PROCEED; -} - -tTS_IB_CM_CALLBACK_RETURN -kibnal_passive_conn_callback (tTS_IB_CM_EVENT event, - tTS_IB_CM_COMM_ID cid, - void *param, - void *arg) -{ - kib_conn_t *conn = arg; - int rc; - - switch (event) { - default: - if (conn == NULL) { - /* no connection yet */ - CERROR ("Unexpected event: %d\n", event); - return TS_IB_CM_CALLBACK_ABORT; - } - - CERROR ("%s event %p -> %s: %d\n", - (event == TS_IB_CM_IDLE) ? "IDLE" : "Unexpected", - conn, libcfs_nid2str(conn->ibc_peer->ibp_nid), event); - kibnal_connreq_done(conn, 0, -ECONNABORTED); - kibnal_conn_decref(conn); /* drop CM's ref */ - return TS_IB_CM_CALLBACK_ABORT; - - case TS_IB_CM_REQ_RECEIVED: { - struct ib_cm_req_received_param *req = param; - kib_msg_t *msg = req->remote_private_data; - - LASSERT (conn == NULL); - - /* Don't really know srcnid until successful unpack */ - CDEBUG(D_NET, "REQ from ?%s?\n", libcfs_nid2str(msg->ibm_srcnid)); - - rc = kibnal_accept_connreq(&conn, cid, msg, - req->remote_private_data_len); - if (rc != 0) { - CERROR ("Can't accept ?%s?: %d\n", - libcfs_nid2str(msg->ibm_srcnid), rc); - return TS_IB_CM_CALLBACK_ABORT; - } - - /* update 'arg' for next callback */ - rc = tsIbCmCallbackModify(cid, kibnal_passive_conn_callback, conn); - LASSERT (rc == 0); - - msg = req->accept_param.reply_private_data; - kibnal_init_msg(msg, IBNAL_MSG_CONNACK, - sizeof(msg->ibm_u.connparams)); - - msg->ibm_u.connparams.ibcp_queue_depth = IBNAL_MSG_QUEUE_SIZE; - - kibnal_pack_msg(msg, conn->ibc_version, 0, - conn->ibc_peer->ibp_nid, - conn->ibc_incarnation); - - req->accept_param.qp = conn->ibc_qp; - req->accept_param.reply_private_data_len = msg->ibm_nob; - req->accept_param.responder_resources = IBNAL_RESPONDER_RESOURCES; - req->accept_param.initiator_depth = IBNAL_RESPONDER_RESOURCES; - req->accept_param.rnr_retry_count = IBNAL_RNR_RETRY; - req->accept_param.flow_control = IBNAL_FLOW_CONTROL; - - CDEBUG(D_NET, "Proceeding\n"); - return TS_IB_CM_CALLBACK_PROCEED; /* CM takes my ref on conn */ - } - - case TS_IB_CM_ESTABLISHED: - LASSERT (conn != NULL); - CWARN("Connection %p -> %s ESTABLISHED.\n", - conn, libcfs_nid2str(conn->ibc_peer->ibp_nid)); - - kibnal_connreq_done(conn, 0, 0); - return TS_IB_CM_CALLBACK_PROCEED; - } -} - -tTS_IB_CM_CALLBACK_RETURN -kibnal_active_conn_callback (tTS_IB_CM_EVENT event, - tTS_IB_CM_COMM_ID cid, - void *param, - void *arg) -{ - kib_conn_t *conn = arg; - unsigned long flags; - - switch (event) { - case TS_IB_CM_REP_RECEIVED: { - struct ib_cm_rep_received_param *rep = param; - kib_msg_t *msg = rep->remote_private_data; - int nob = rep->remote_private_data_len; - int rc; - - rc = kibnal_unpack_msg(msg, conn->ibc_version, nob); - if (rc != 0) { - CERROR ("Error %d unpacking conn ack from %s\n", - rc, libcfs_nid2str(conn->ibc_peer->ibp_nid)); - kibnal_connreq_done(conn, 1, rc); - kibnal_conn_decref(conn); /* drop CM's ref */ - return TS_IB_CM_CALLBACK_ABORT; - } - - if (msg->ibm_type != IBNAL_MSG_CONNACK) { - CERROR ("Unexpected conn ack type %d from %s\n", - msg->ibm_type, - libcfs_nid2str(conn->ibc_peer->ibp_nid)); - 
kibnal_connreq_done(conn, 1, -EPROTO); - kibnal_conn_decref(conn); /* drop CM's ref */ - return TS_IB_CM_CALLBACK_ABORT; - } - - if (conn->ibc_peer->ibp_nid != msg->ibm_srcnid || - kibnal_data.kib_ni->ni_nid != msg->ibm_dstnid || - msg->ibm_srcstamp != conn->ibc_incarnation || - msg->ibm_dststamp != kibnal_data.kib_incarnation) { - CERROR("Stale conn ack from %s\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid)); - kibnal_connreq_done(conn, 1, -ESTALE); - kibnal_conn_decref(conn); /* drop CM's ref */ - return TS_IB_CM_CALLBACK_ABORT; - } - - if (msg->ibm_u.connparams.ibcp_queue_depth != IBNAL_MSG_QUEUE_SIZE) { - CERROR ("Bad queue depth %d from %s\n", - msg->ibm_u.connparams.ibcp_queue_depth, - libcfs_nid2str(conn->ibc_peer->ibp_nid)); - kibnal_connreq_done(conn, 1, -EPROTO); - kibnal_conn_decref(conn); /* drop CM's ref */ - return TS_IB_CM_CALLBACK_ABORT; - } - - CDEBUG(D_NET, "Connection %p -> %s REP_RECEIVED.\n", - conn, libcfs_nid2str(conn->ibc_peer->ibp_nid)); - - conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE; - conn->ibc_reserved_credits = IBNAL_MSG_QUEUE_SIZE; - return TS_IB_CM_CALLBACK_PROCEED; - } - - case TS_IB_CM_ESTABLISHED: - CWARN("Connection %p -> %s ESTABLISHED\n", - conn, libcfs_nid2str(conn->ibc_peer->ibp_nid)); - - kibnal_connreq_done(conn, 1, 0); - return TS_IB_CM_CALLBACK_PROCEED; - - case TS_IB_CM_IDLE: - CDEBUG(D_NETERROR, "Connection %p -> %s IDLE\n", - conn, libcfs_nid2str(conn->ibc_peer->ibp_nid)); - /* I assume this connection attempt was rejected because the - * peer found a stale QP; I'll just try again */ - write_lock_irqsave(&kibnal_data.kib_global_lock, flags); - kibnal_schedule_active_connect_locked(conn->ibc_peer); - write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); - - kibnal_connreq_done(conn, 1, -ECONNABORTED); - kibnal_conn_decref(conn); /* drop CM's ref */ - return TS_IB_CM_CALLBACK_ABORT; - - default: - CDEBUG(D_NETERROR, "Connection %p -> %s ERROR %d\n", - conn, libcfs_nid2str(conn->ibc_peer->ibp_nid), event); - kibnal_connreq_done(conn, 1, -ECONNABORTED); - kibnal_conn_decref(conn); /* drop CM's ref */ - return TS_IB_CM_CALLBACK_ABORT; - } -} - -int -kibnal_pathreq_callback (tTS_IB_CLIENT_QUERY_TID tid, int status, - struct ib_path_record *resp, int remaining, - void *arg) -{ - kib_conn_t *conn = arg; - kib_peer_t *peer = conn->ibc_peer; - kib_msg_t *msg = &conn->ibc_connreq->cr_msg; - - if (status != 0) { - CDEBUG (D_NETERROR, "Pathreq %p -> %s failed: %d\n", - conn, libcfs_nid2str(peer->ibp_nid), status); - kibnal_connreq_done(conn, 1, status); - kibnal_conn_decref(conn); /* drop callback's ref */ - return 1; /* non-zero prevents further callbacks */ - } - - conn->ibc_connreq->cr_path = *resp; - - kibnal_init_msg(msg, IBNAL_MSG_CONNREQ, sizeof(msg->ibm_u.connparams)); - msg->ibm_u.connparams.ibcp_queue_depth = IBNAL_MSG_QUEUE_SIZE; - kibnal_pack_msg(msg, conn->ibc_version, 0, - peer->ibp_nid, conn->ibc_incarnation); - - conn->ibc_connreq->cr_connparam = (struct ib_cm_active_param) { - .qp = conn->ibc_qp, - .req_private_data = msg, - .req_private_data_len = msg->ibm_nob, - .responder_resources = IBNAL_RESPONDER_RESOURCES, - .initiator_depth = IBNAL_RESPONDER_RESOURCES, - .retry_count = IBNAL_RETRY, - .rnr_retry_count = IBNAL_RNR_RETRY, - .cm_response_timeout = *kibnal_tunables.kib_timeout, - .max_cm_retries = IBNAL_CM_RETRY, - .flow_control = IBNAL_FLOW_CONTROL, - }; - - /* XXX set timeout just like SDP!!!*/ - conn->ibc_connreq->cr_path.packet_life = 13; - - /* Flag I'm getting involved with the CM... 
*/ - conn->ibc_state = IBNAL_CONN_CONNECTING; - - CDEBUG(D_NET, "Connecting to, service id "LPX64", on %s\n", - conn->ibc_connreq->cr_svcrsp.ibsr_svc_id, - libcfs_nid2str(peer->ibp_nid)); - - /* kibnal_connect_callback gets my conn ref */ - status = ib_cm_connect (&conn->ibc_connreq->cr_connparam, - &conn->ibc_connreq->cr_path, NULL, - conn->ibc_connreq->cr_svcrsp.ibsr_svc_id, 0, - kibnal_active_conn_callback, conn, - &conn->ibc_comm_id); - if (status != 0) { - CERROR ("Connect %p -> %s failed: %d\n", - conn, libcfs_nid2str(conn->ibc_peer->ibp_nid), status); - /* Back out state change: I've not got a CM comm_id yet... */ - conn->ibc_state = IBNAL_CONN_INIT_QP; - kibnal_connreq_done(conn, 1, status); - kibnal_conn_decref(conn); /* Drop callback's ref */ - } - - return 1; /* non-zero to prevent further callbacks */ -} - -void -kibnal_connect_peer (kib_peer_t *peer) -{ - kib_conn_t *conn; - int rc; - - conn = kibnal_create_conn(); - if (conn == NULL) { - CERROR ("Can't allocate conn\n"); - kibnal_peer_connect_failed (peer, 1, -ENOMEM); - return; - } - - conn->ibc_peer = peer; - kibnal_peer_addref(peer); - - LIBCFS_ALLOC (conn->ibc_connreq, sizeof (*conn->ibc_connreq)); - if (conn->ibc_connreq == NULL) { - CERROR ("Can't allocate connreq\n"); - kibnal_connreq_done(conn, 1, -ENOMEM); - kibnal_conn_decref(conn); /* drop my ref */ - return; - } - - memset(conn->ibc_connreq, 0, sizeof (*conn->ibc_connreq)); - - rc = kibnal_make_svcqry(conn); - if (rc != 0) { - kibnal_connreq_done (conn, 1, rc); - kibnal_conn_decref(conn); /* drop my ref */ - return; - } - - rc = ib_cached_gid_get(kibnal_data.kib_device, - kibnal_data.kib_port, 0, - conn->ibc_connreq->cr_gid); - LASSERT (rc == 0); - - /* kibnal_pathreq_callback gets my conn ref */ - rc = tsIbPathRecordRequest (kibnal_data.kib_device, - kibnal_data.kib_port, - conn->ibc_connreq->cr_gid, - conn->ibc_connreq->cr_svcrsp.ibsr_svc_gid, - conn->ibc_connreq->cr_svcrsp.ibsr_svc_pkey, - 0, - *kibnal_tunables.kib_timeout * HZ, - 0, - kibnal_pathreq_callback, conn, - &conn->ibc_connreq->cr_tid); - if (rc == 0) - return; /* callback now has my ref on conn */ - - CERROR ("Path record request %p -> %s failed: %d\n", - conn, libcfs_nid2str(conn->ibc_peer->ibp_nid), rc); - kibnal_connreq_done(conn, 1, rc); - kibnal_conn_decref(conn); /* drop my ref */ -} - -int -kibnal_check_txs (kib_conn_t *conn, struct list_head *txs) -{ - kib_tx_t *tx; - struct list_head *ttmp; - unsigned long flags; - int timed_out = 0; - - spin_lock_irqsave (&conn->ibc_lock, flags); - - list_for_each (ttmp, txs) { - tx = list_entry (ttmp, kib_tx_t, tx_list); - - if (txs == &conn->ibc_active_txs) { - LASSERT (tx->tx_passive_rdma || - !tx->tx_passive_rdma_wait); - - LASSERT (tx->tx_passive_rdma_wait || - tx->tx_sending != 0); - } else { - LASSERT (!tx->tx_passive_rdma_wait); - LASSERT (tx->tx_sending == 0); - } - - if (time_after_eq (jiffies, tx->tx_deadline)) { - timed_out = 1; - break; - } - } - - spin_unlock_irqrestore (&conn->ibc_lock, flags); - return timed_out; -} - -int -kibnal_conn_timed_out (kib_conn_t *conn) -{ - return kibnal_check_txs(conn, &conn->ibc_tx_queue) || - kibnal_check_txs(conn, &conn->ibc_tx_queue_rsrvd) || - kibnal_check_txs(conn, &conn->ibc_tx_queue_nocred) || - kibnal_check_txs(conn, &conn->ibc_active_txs); -} - -void -kibnal_check_conns (int idx) -{ - struct list_head *peers = &kibnal_data.kib_peers[idx]; - struct list_head *ptmp; - kib_peer_t *peer; - kib_conn_t *conn; - struct list_head *ctmp; - unsigned long flags; - - again: - /* NB. 
We expect to have a look at all the peers and not find any - * rdmas to time out, so we just use a shared lock while we - * take a look... */ - read_lock_irqsave(&kibnal_data.kib_global_lock, flags); - - list_for_each (ptmp, peers) { - peer = list_entry (ptmp, kib_peer_t, ibp_list); - - list_for_each (ctmp, &peer->ibp_conns) { - conn = list_entry (ctmp, kib_conn_t, ibc_list); - - LASSERT (conn->ibc_state == IBNAL_CONN_ESTABLISHED); - - - /* In case we have enough credits to return via a - * NOOP, but there were no non-blocking tx descs - * free to do it last time... */ - kibnal_check_sends(conn); - - if (!kibnal_conn_timed_out(conn)) - continue; - - kibnal_conn_addref(conn); - - read_unlock_irqrestore(&kibnal_data.kib_global_lock, - flags); - - CERROR("Timed out RDMA with %s\n", - libcfs_nid2str(peer->ibp_nid)); - - kibnal_close_conn (conn, -ETIMEDOUT); - kibnal_conn_decref(conn); - - /* start again now I've dropped the lock */ - goto again; - } - } - - read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); -} - -void -kibnal_terminate_conn (kib_conn_t *conn) -{ - int rc; - - CDEBUG(D_NET, "conn %p\n", conn); - LASSERT (conn->ibc_state == IBNAL_CONN_DEATHROW); - conn->ibc_state = IBNAL_CONN_ZOMBIE; - - rc = ib_cm_disconnect (conn->ibc_comm_id); - if (rc != 0) - CERROR ("Error %d disconnecting conn %p -> %s\n", - rc, conn, libcfs_nid2str(conn->ibc_peer->ibp_nid)); - - kibnal_peer_notify(conn->ibc_peer); -} - -int -kibnal_reaper (void *arg) -{ - wait_queue_t wait; - unsigned long flags; - kib_conn_t *conn; - int timeout; - int i; - int peer_index = 0; - unsigned long deadline = jiffies; - - cfs_daemonize ("kibnal_reaper"); - cfs_block_allsigs (); - - init_waitqueue_entry (&wait, current); - - spin_lock_irqsave (&kibnal_data.kib_reaper_lock, flags); - - while (!kibnal_data.kib_shutdown) { - if (!list_empty (&kibnal_data.kib_reaper_conns)) { - conn = list_entry (kibnal_data.kib_reaper_conns.next, - kib_conn_t, ibc_list); - list_del (&conn->ibc_list); - - spin_unlock_irqrestore (&kibnal_data.kib_reaper_lock, flags); - - switch (conn->ibc_state) { - case IBNAL_CONN_DEATHROW: - LASSERT (conn->ibc_comm_id != TS_IB_CM_COMM_ID_INVALID); - /* Disconnect: conn becomes a zombie in the - * callback and last ref reschedules it - * here... */ - kibnal_terminate_conn(conn); - kibnal_conn_decref(conn); - break; - - case IBNAL_CONN_INIT_QP: - case IBNAL_CONN_ZOMBIE: - kibnal_destroy_conn (conn); - break; - - default: - CERROR ("Bad conn %p state: %d\n", - conn, conn->ibc_state); - LBUG(); - } - - spin_lock_irqsave (&kibnal_data.kib_reaper_lock, flags); - continue; - } - - spin_unlock_irqrestore (&kibnal_data.kib_reaper_lock, flags); - - /* careful with the jiffy wrap... */ - while ((timeout = (int)(deadline - jiffies)) <= 0) { - const int n = 4; - const int p = 1; - int chunk = kibnal_data.kib_peer_hash_size; - - /* Time to check for RDMA timeouts on a few more - * peers: I do checks every 'p' seconds on a - * proportion of the peer table and I need to check - * every connection 'n' times within a timeout - * interval, to ensure I detect a timeout on any - * connection within (n+1)/n times the timeout - * interval. 
*/ - - if (*kibnal_tunables.kib_timeout > n * p) - chunk = (chunk * n * p) / - *kibnal_tunables.kib_timeout; - if (chunk == 0) - chunk = 1; - - for (i = 0; i < chunk; i++) { - kibnal_check_conns (peer_index); - peer_index = (peer_index + 1) % - kibnal_data.kib_peer_hash_size; - } - - deadline += p * HZ; - } - - kibnal_data.kib_reaper_waketime = jiffies + timeout; - - set_current_state (TASK_INTERRUPTIBLE); - add_wait_queue (&kibnal_data.kib_reaper_waitq, &wait); - - schedule_timeout (timeout); - - set_current_state (TASK_RUNNING); - remove_wait_queue (&kibnal_data.kib_reaper_waitq, &wait); - - spin_lock_irqsave (&kibnal_data.kib_reaper_lock, flags); - } - - spin_unlock_irqrestore (&kibnal_data.kib_reaper_lock, flags); - - kibnal_thread_fini (); - return (0); -} - -int -kibnal_connd (void *arg) -{ - long id = (long)arg; - char name[16]; - wait_queue_t wait; - unsigned long flags; - kib_peer_t *peer; - kib_acceptsock_t *as; - int did_something; - - snprintf(name, sizeof(name), "kibnal_connd_%02ld", id); - cfs_daemonize(name); - cfs_block_allsigs(); - - init_waitqueue_entry (&wait, current); - - spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags); - - while (!kibnal_data.kib_shutdown) { - did_something = 0; - - if (!list_empty (&kibnal_data.kib_connd_acceptq)) { - as = list_entry (kibnal_data.kib_connd_acceptq.next, - kib_acceptsock_t, ibas_list); - list_del (&as->ibas_list); - - spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags); - - kibnal_handle_svcqry(as->ibas_sock); - kibnal_free_acceptsock(as); - - spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags); - did_something = 1; - } - - /* Only handle an outgoing connection request if there is someone left - * to handle an incoming svcqry */ - if (!list_empty (&kibnal_data.kib_connd_peers) && - ((kibnal_data.kib_connd_connecting + 1) < - *kibnal_tunables.kib_n_connd)) { - peer = list_entry (kibnal_data.kib_connd_peers.next, - kib_peer_t, ibp_connd_list); - - list_del_init (&peer->ibp_connd_list); - kibnal_data.kib_connd_connecting++; - spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags); - - kibnal_connect_peer (peer); - kibnal_peer_decref(peer); - - spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags); - did_something = 1; - kibnal_data.kib_connd_connecting--; - } - - if (did_something) - continue; - - set_current_state (TASK_INTERRUPTIBLE); - add_wait_queue_exclusive(&kibnal_data.kib_connd_waitq, &wait); - - spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags); - - schedule(); - - set_current_state (TASK_RUNNING); - remove_wait_queue (&kibnal_data.kib_connd_waitq, &wait); - - spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags); - } - - spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags); - - kibnal_thread_fini (); - return (0); -} - -int -kibnal_scheduler(void *arg) -{ - long id = (long)arg; - char name[16]; - kib_rx_t *rx; - kib_tx_t *tx; - unsigned long flags; - int rc; - int counter = 0; - int did_something; - - snprintf(name, sizeof(name), "kibnal_sd_%02ld", id); - cfs_daemonize(name); - cfs_block_allsigs(); - - spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags); - - while (!kibnal_data.kib_shutdown) { - did_something = 0; - - while (!list_empty(&kibnal_data.kib_sched_txq)) { - tx = list_entry(kibnal_data.kib_sched_txq.next, - kib_tx_t, tx_list); - list_del(&tx->tx_list); - spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, - flags); - kibnal_tx_done(tx); - - spin_lock_irqsave(&kibnal_data.kib_sched_lock, - flags); - } - - if (!list_empty(&kibnal_data.kib_sched_rxq)) { - rx = 
list_entry(kibnal_data.kib_sched_rxq.next, - kib_rx_t, rx_list); - list_del(&rx->rx_list); - spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, - flags); - - kibnal_rx(rx); - - did_something = 1; - spin_lock_irqsave(&kibnal_data.kib_sched_lock, - flags); - } - - /* nothing to do or hogging CPU */ - if (!did_something || counter++ == IBNAL_RESCHED) { - spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, - flags); - counter = 0; - - if (!did_something) { - rc = wait_event_interruptible_exclusive( - kibnal_data.kib_sched_waitq, - !list_empty(&kibnal_data.kib_sched_txq) || - !list_empty(&kibnal_data.kib_sched_rxq) || - kibnal_data.kib_shutdown); - } else { - cfs_cond_resched(); - } - - spin_lock_irqsave(&kibnal_data.kib_sched_lock, - flags); - } - } - - spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags); - - kibnal_thread_fini(); - return (0); -} diff --git a/lnet/klnds/openiblnd/openiblnd_modparams.c b/lnet/klnds/openiblnd/openiblnd_modparams.c deleted file mode 100644 index 5d808f7..0000000 --- a/lnet/klnds/openiblnd/openiblnd_modparams.c +++ /dev/null @@ -1,269 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf - * - * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, - * CA 95054 USA or visit www.sun.com if you need additional information or - * have any questions. - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. 
- * - * lnet/klnds/openiblnd/openiblnd_modparams.c - * - * Author: Eric Barton - */ - -#include "openiblnd.h" - -static char *ipif_basename = "ib"; -CFS_MODULE_PARM(ipif_basename, "s", charp, 0444, - "IPoIB interface base name"); - -static int n_connd = 4; -CFS_MODULE_PARM(n_connd, "i", int, 0444, - "# of connection daemons"); - -static int min_reconnect_interval = 1; -CFS_MODULE_PARM(min_reconnect_interval, "i", int, 0644, - "minimum connection retry interval (seconds)"); - -static int max_reconnect_interval = 60; -CFS_MODULE_PARM(max_reconnect_interval, "i", int, 0644, - "maximum connection retry interval (seconds)"); - -static int concurrent_peers = 1152; -CFS_MODULE_PARM(concurrent_peers, "i", int, 0444, - "maximum number of peers that may connect"); - -static int cksum = 0; -CFS_MODULE_PARM(cksum, "i", int, 0644, - "set non-zero to enable message (not RDMA) checksums"); - -static int timeout = 50; -CFS_MODULE_PARM(timeout, "i", int, 0644, - "timeout (seconds)"); - -static int ntx = 384; -CFS_MODULE_PARM(ntx, "i", int, 0444, - "# of message descriptors"); - -static int credits = 256; -CFS_MODULE_PARM(credits, "i", int, 0444, - "# concurrent sends"); - -static int peer_credits = 16; -CFS_MODULE_PARM(peer_credits, "i", int, 0444, - "# concurrent sends to 1 peer"); - -static int keepalive = 100; -CFS_MODULE_PARM(keepalive, "i", int, 0644, - "Idle time in seconds before sending a keepalive"); - -kib_tunables_t kibnal_tunables = { - .kib_ipif_basename = &ipif_basename, - .kib_n_connd = &n_connd, - .kib_min_reconnect_interval = &min_reconnect_interval, - .kib_max_reconnect_interval = &max_reconnect_interval, - .kib_concurrent_peers = &concurrent_peers, - .kib_cksum = &cksum, - .kib_timeout = &timeout, - .kib_ntx = &ntx, - .kib_credits = &credits, - .kib_peercredits = &peer_credits, - .kib_keepalive = &keepalive, -}; - -#if defined(CONFIG_SYSCTL) && !CFS_SYSFS_MODULE_PARM - -#ifndef HAVE_SYSCTL_UNNUMBERED -enum { - KIBNAL_IPIF_BASENAME = 1, - KIBNAL_N_CONND, - KIBNAL_RECONNECT_MIN, - KIBNAL_RECONNECT_MAX, - KIBNAL_CONCURRENT_PEERS, - KIBNAL_CKSUM, - KIBNAL_TIMEOUT, - KIBNAL_NTX, - KIBNAL_CREDITS, - KIBNAL_PEER_CREDITS, - KIBNAL_KEEPALIVE -}; -#else - -#define KIBNAL_IPIF_BASENAME CTL_UNNUMBERED -#define KIBNAL_N_CONND CTL_UNNUMBERED -#define KIBNAL_RECONNECT_MIN CTL_UNNUMBERED -#define KIBNAL_RECONNECT_MAX CTL_UNNUMBERED -#define KIBNAL_CONCURRENT_PEERS CTL_UNNUMBERED -#define KIBNAL_CKSUM CTL_UNNUMBERED -#define KIBNAL_TIMEOUT CTL_UNNUMBERED -#define KIBNAL_NTX CTL_UNNUMBERED -#define KIBNAL_CREDITS CTL_UNNUMBERED -#define KIBNAL_PEER_CREDITS CTL_UNNUMBERED -#define KIBNAL_KEEPALIVE CTL_UNNUMBERED - -#endif - -static cfs_sysctl_table_t kibnal_ctl_table[] = { - { - .ctl_name = KIBNAL_IPIF_BASENAME, - .procname = "ipif_basename", - .data = &ipif_basename, - .maxlen = 1024, - .mode = 0444, - .proc_handler = &proc_dostring - }, - { - .ctl_name = KIBNAL_N_CONND, - .procname = "n_connd", - .data = &n_connd, - .maxlen = sizeof(int), - .mode = 0444, - .proc_handler = &proc_dointvec - }, - { - .ctl_name = KIBNAL_RECONNECT_MIN, - .procname = "min_reconnect_interval", - .data = &min_reconnect_interval, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec - }, - { - .ctl_name = KIBNAL_RECONNECT_MAX, - .procname = "max_reconnect_interval", - .data = &max_reconnect_interval, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec - }, - { - .ctl_name = KIBNAL_CONCURRENT_PEERS, - .procname = "concurrent_peers", - .data = &concurrent_peers, - .maxlen = sizeof(int), - 
.mode = 0444, - .proc_handler = &proc_dointvec - }, - { - .ctl_name = KIBNAL_CKSUM, - .procname = "cksum", - .data = &cksum, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec - }, - { - .ctl_name = KIBNAL_TIMEOUT, - .procname = "timeout", - .data = &timeout, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec - }, - { - .ctl_name = KIBNAL_NTX, - .procname = "ntx", - .data = &ntx, - .maxlen = sizeof(int), - .mode = 0444, - .proc_handler = &proc_dointvec - }, - { - .ctl_name = KIBNAL_CREDITS, - .procname = "credits", - .data = &credits, - .maxlen = sizeof(int), - .mode = 0444, - .proc_handler = &proc_dointvec - }, - { - .ctl_name = KIBNAL_PEER_CREDITS, - .procname = "peer_credits", - .data = &peer_credits, - .maxlen = sizeof(int), - .mode = 0444, - .proc_handler = &proc_dointvec - }, - { - .ctl_name = KIBNAL_KEEPALIVE, - .procname = "keepalive", - .data = &keepalive, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec - }, - {0} -}; - -static cfs_sysctl_table_t kibnal_top_ctl_table[] = { - { - .ctl_name = CTL_KIBNAL, - .procname = "openibnal", - .data = NULL, - .maxlen = 0, - .mode = 0555, - .child = kibnal_ctl_table - }, - {0} -}; - -int -kibnal_tunables_init () -{ - kibnal_tunables.kib_sysctl = - cfs_register_sysctl_table(kibnal_top_ctl_table, 0); - - if (kibnal_tunables.kib_sysctl == NULL) - CWARN("Can't setup /proc tunables\n"); - - return 0; -} - -void -kibnal_tunables_fini () -{ - if (kibnal_tunables.kib_sysctl != NULL) - cfs_unregister_sysctl_table(kibnal_tunables.kib_sysctl); -} - -#else - -int -kibnal_tunables_init () -{ - return 0; -} - -void -kibnal_tunables_fini () -{ -} - -#endif diff --git a/lnet/klnds/viblnd/.gitignore b/lnet/klnds/viblnd/.gitignore deleted file mode 100644 index 37760ff..0000000 --- a/lnet/klnds/viblnd/.gitignore +++ /dev/null @@ -1,11 +0,0 @@ -/.deps -/Makefile -/.*.cmd -/autoMakefile.in -/autoMakefile -/*.ko -/*.mod.c -/.*.flags -/.tmp_versions -/.depend -/wirecheck diff --git a/lnet/klnds/viblnd/Makefile.in b/lnet/klnds/viblnd/Makefile.in deleted file mode 100644 index 5b5c2db..0000000 --- a/lnet/klnds/viblnd/Makefile.in +++ /dev/null @@ -1,6 +0,0 @@ -MODULES := kviblnd -kviblnd-objs := viblnd.o viblnd_cb.o viblnd_modparams.o - -EXTRA_POST_CFLAGS := @VIBCPPFLAGS@ - -@INCLUDE_RULES@ diff --git a/lnet/klnds/viblnd/autoMakefile.am b/lnet/klnds/viblnd/autoMakefile.am deleted file mode 100644 index 4d0afb3..0000000 --- a/lnet/klnds/viblnd/autoMakefile.am +++ /dev/null @@ -1,44 +0,0 @@ -# -# GPL HEADER START -# -# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License version 2 only, -# as published by the Free Software Foundation. -# -# This program is distributed in the hope that it will be useful, but -# WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# General Public License version 2 for more details (a copy is included -# in the LICENSE file that accompanied this code). -# -# You should have received a copy of the GNU General Public License -# version 2 along with this program; If not, see -# http://www.sun.com/software/products/lustre/docs/GPLv2.pdf -# -# Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, -# CA 95054 USA or visit www.sun.com if you need additional information or -# have any questions. 
-# -# GPL HEADER END -# - -# -# Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. -# Use is subject to license terms. -# - -# -# This file is part of Lustre, http://www.lustre.org/ -# Lustre is a trademark of Sun Microsystems, Inc. -# - -if MODULES -if BUILD_VIBLND -modulenet_DATA = kviblnd$(KMODEXT) -endif -endif - -MOSTLYCLEANFILES = @MOSTLYCLEANFILES@ -DIST_SOURCES = $(kviblnd-objs:%.o=%.c) viblnd.h viblnd_wire.h diff --git a/lnet/klnds/viblnd/viblnd.c b/lnet/klnds/viblnd/viblnd.c deleted file mode 100644 index 9d904c4..0000000 --- a/lnet/klnds/viblnd/viblnd.c +++ /dev/null @@ -1,2032 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf - * - * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, - * CA 95054 USA or visit www.sun.com if you need additional information or - * have any questions. - * - * GPL HEADER END - */ -/* - * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lnet/klnds/viblnd/viblnd.c - * - * Author: Eric Barton - * Author: Frank Zago - */ - -#include "viblnd.h" - -lnd_t the_kiblnd = { - .lnd_type = VIBLND, - .lnd_startup = kibnal_startup, - .lnd_shutdown = kibnal_shutdown, - .lnd_ctl = kibnal_ctl, - .lnd_send = kibnal_send, - .lnd_recv = kibnal_recv, - .lnd_eager_recv = kibnal_eager_recv, -}; - -kib_data_t kibnal_data; - -void vibnal_assert_wire_constants (void) -{ - /* Wire protocol assertions generated by 'wirecheck' - * running on Linux robert 2.6.11-1.27_FC3 #1 Tue May 17 20:27:37 EDT 2005 i686 athlon i386 G - * with gcc version 3.4.3 20050227 (Red Hat 3.4.3-22.fc3) */ - - - /* Constants... 
*/ - CLASSERT (IBNAL_MSG_MAGIC == 0x0be91b91); - CLASSERT (IBNAL_MSG_VERSION == 0x11); - CLASSERT (IBNAL_MSG_CONNREQ == 0xc0); - CLASSERT (IBNAL_MSG_CONNACK == 0xc1); - CLASSERT (IBNAL_MSG_NOOP == 0xd0); - CLASSERT (IBNAL_MSG_IMMEDIATE == 0xd1); - CLASSERT (IBNAL_MSG_PUT_REQ == 0xd2); - CLASSERT (IBNAL_MSG_PUT_NAK == 0xd3); - CLASSERT (IBNAL_MSG_PUT_ACK == 0xd4); - CLASSERT (IBNAL_MSG_PUT_DONE == 0xd5); - CLASSERT (IBNAL_MSG_GET_REQ == 0xd6); - CLASSERT (IBNAL_MSG_GET_DONE == 0xd7); - - /* Checks for struct kib_connparams_t */ - CLASSERT ((int)sizeof(kib_connparams_t) == 12); - CLASSERT ((int)offsetof(kib_connparams_t, ibcp_queue_depth) == 0); - CLASSERT ((int)sizeof(((kib_connparams_t *)0)->ibcp_queue_depth) == 4); - CLASSERT ((int)offsetof(kib_connparams_t, ibcp_max_msg_size) == 4); - CLASSERT ((int)sizeof(((kib_connparams_t *)0)->ibcp_max_msg_size) == 4); - CLASSERT ((int)offsetof(kib_connparams_t, ibcp_max_frags) == 8); - CLASSERT ((int)sizeof(((kib_connparams_t *)0)->ibcp_max_frags) == 4); - - /* Checks for struct kib_immediate_msg_t */ - CLASSERT ((int)sizeof(kib_immediate_msg_t) == 72); - CLASSERT ((int)offsetof(kib_immediate_msg_t, ibim_hdr) == 0); - CLASSERT ((int)sizeof(((kib_immediate_msg_t *)0)->ibim_hdr) == 72); - CLASSERT ((int)offsetof(kib_immediate_msg_t, ibim_payload[13]) == 85); - CLASSERT ((int)sizeof(((kib_immediate_msg_t *)0)->ibim_payload[13]) == 1); - CLASSERT (IBNAL_USE_FMR == 1); - - /* Checks for struct kib_rdma_desc_t */ - CLASSERT ((int)sizeof(kib_rdma_desc_t) == 16); - CLASSERT ((int)offsetof(kib_rdma_desc_t, rd_addr) == 0); - CLASSERT ((int)sizeof(((kib_rdma_desc_t *)0)->rd_addr) == 8); - CLASSERT ((int)offsetof(kib_rdma_desc_t, rd_nob) == 8); - CLASSERT ((int)sizeof(((kib_rdma_desc_t *)0)->rd_nob) == 4); - CLASSERT ((int)offsetof(kib_rdma_desc_t, rd_key) == 12); - CLASSERT ((int)sizeof(((kib_rdma_desc_t *)0)->rd_key) == 4); - - /* Checks for struct kib_putreq_msg_t */ - CLASSERT ((int)sizeof(kib_putreq_msg_t) == 80); - CLASSERT ((int)offsetof(kib_putreq_msg_t, ibprm_hdr) == 0); - CLASSERT ((int)sizeof(((kib_putreq_msg_t *)0)->ibprm_hdr) == 72); - CLASSERT ((int)offsetof(kib_putreq_msg_t, ibprm_cookie) == 72); - CLASSERT ((int)sizeof(((kib_putreq_msg_t *)0)->ibprm_cookie) == 8); - - /* Checks for struct kib_putack_msg_t */ - CLASSERT ((int)sizeof(kib_putack_msg_t) == 32); - CLASSERT ((int)offsetof(kib_putack_msg_t, ibpam_src_cookie) == 0); - CLASSERT ((int)sizeof(((kib_putack_msg_t *)0)->ibpam_src_cookie) == 8); - CLASSERT ((int)offsetof(kib_putack_msg_t, ibpam_dst_cookie) == 8); - CLASSERT ((int)sizeof(((kib_putack_msg_t *)0)->ibpam_dst_cookie) == 8); - CLASSERT ((int)offsetof(kib_putack_msg_t, ibpam_rd) == 16); - CLASSERT ((int)sizeof(((kib_putack_msg_t *)0)->ibpam_rd) == 16); - - /* Checks for struct kib_get_msg_t */ - CLASSERT ((int)sizeof(kib_get_msg_t) == 96); - CLASSERT ((int)offsetof(kib_get_msg_t, ibgm_hdr) == 0); - CLASSERT ((int)sizeof(((kib_get_msg_t *)0)->ibgm_hdr) == 72); - CLASSERT ((int)offsetof(kib_get_msg_t, ibgm_cookie) == 72); - CLASSERT ((int)sizeof(((kib_get_msg_t *)0)->ibgm_cookie) == 8); - CLASSERT ((int)offsetof(kib_get_msg_t, ibgm_rd) == 80); - CLASSERT ((int)sizeof(((kib_get_msg_t *)0)->ibgm_rd) == 16); - - /* Checks for struct kib_completion_msg_t */ - CLASSERT ((int)sizeof(kib_completion_msg_t) == 12); - CLASSERT ((int)offsetof(kib_completion_msg_t, ibcm_cookie) == 0); - CLASSERT ((int)sizeof(((kib_completion_msg_t *)0)->ibcm_cookie) == 8); - CLASSERT ((int)offsetof(kib_completion_msg_t, ibcm_status) == 8); - CLASSERT 
((int)sizeof(((kib_completion_msg_t *)0)->ibcm_status) == 4); - - /* Checks for struct kib_msg_t */ - CLASSERT ((int)sizeof(kib_msg_t) == 152); - CLASSERT ((int)offsetof(kib_msg_t, ibm_magic) == 0); - CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_magic) == 4); - CLASSERT ((int)offsetof(kib_msg_t, ibm_version) == 4); - CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_version) == 2); - CLASSERT ((int)offsetof(kib_msg_t, ibm_type) == 6); - CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_type) == 1); - CLASSERT ((int)offsetof(kib_msg_t, ibm_credits) == 7); - CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_credits) == 1); - CLASSERT ((int)offsetof(kib_msg_t, ibm_nob) == 8); - CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_nob) == 4); - CLASSERT ((int)offsetof(kib_msg_t, ibm_cksum) == 12); - CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_cksum) == 4); - CLASSERT ((int)offsetof(kib_msg_t, ibm_srcnid) == 16); - CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_srcnid) == 8); - CLASSERT ((int)offsetof(kib_msg_t, ibm_srcstamp) == 24); - CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_srcstamp) == 8); - CLASSERT ((int)offsetof(kib_msg_t, ibm_dstnid) == 32); - CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_dstnid) == 8); - CLASSERT ((int)offsetof(kib_msg_t, ibm_dststamp) == 40); - CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_dststamp) == 8); - CLASSERT ((int)offsetof(kib_msg_t, ibm_seq) == 48); - CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_seq) == 8); - CLASSERT ((int)offsetof(kib_msg_t, ibm_u.connparams) == 56); - CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_u.connparams) == 12); - CLASSERT ((int)offsetof(kib_msg_t, ibm_u.immediate) == 56); - CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_u.immediate) == 72); - CLASSERT ((int)offsetof(kib_msg_t, ibm_u.putreq) == 56); - CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_u.putreq) == 80); - CLASSERT ((int)offsetof(kib_msg_t, ibm_u.putack) == 56); - CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_u.putack) == 32); - CLASSERT ((int)offsetof(kib_msg_t, ibm_u.get) == 56); - CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_u.get) == 96); - CLASSERT ((int)offsetof(kib_msg_t, ibm_u.completion) == 56); - CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_u.completion) == 12); -} - -__u32 -kibnal_cksum (void *ptr, int nob) -{ - char *c = ptr; - __u32 sum = 0; - - while (nob-- > 0) - sum = ((sum << 1) | (sum >> 31)) + *c++; - - /* ensure I don't return 0 (== no checksum) */ - return (sum == 0) ? 1 : sum; -} - -void -kibnal_init_msg(kib_msg_t *msg, int type, int body_nob) -{ - msg->ibm_type = type; - msg->ibm_nob = offsetof(kib_msg_t, ibm_u) + body_nob; -} - -void -kibnal_pack_msg(kib_msg_t *msg, __u32 version, int credits, - lnet_nid_t dstnid, __u64 dststamp, __u64 seq) -{ - /* CAVEAT EMPTOR! all message fields not set here should have been - * initialised previously. 
*/ - msg->ibm_magic = IBNAL_MSG_MAGIC; - msg->ibm_version = version; - /* ibm_type */ - msg->ibm_credits = credits; - /* ibm_nob */ - msg->ibm_cksum = 0; - msg->ibm_srcnid = kibnal_data.kib_ni->ni_nid; - msg->ibm_srcstamp = kibnal_data.kib_incarnation; - msg->ibm_dstnid = dstnid; - msg->ibm_dststamp = dststamp; - msg->ibm_seq = seq; - - if (*kibnal_tunables.kib_cksum) { - /* NB ibm_cksum zero while computing cksum */ - msg->ibm_cksum = kibnal_cksum(msg, msg->ibm_nob); - } -} - -int -kibnal_unpack_msg(kib_msg_t *msg, __u32 expected_version, int nob) -{ - const int hdr_size = offsetof(kib_msg_t, ibm_u); - __u32 msg_cksum; - __u32 msg_version; - int flip; - int msg_nob; -#if !IBNAL_USE_FMR - int i; - int n; -#endif - /* 6 bytes are enough to have received magic + version */ - if (nob < 6) { - CERROR("Short message: %d\n", nob); - return -EPROTO; - } - - /* Future protocol version compatibility support! - * If the viblnd-specific protocol changes, or when LNET unifies - * protocols over all LNDs, the initial connection will negotiate a - * protocol version. If I find this, I avoid any console errors. If - * my is doing connection establishment, the reject will tell the peer - * which version I'm running. */ - - if (msg->ibm_magic == IBNAL_MSG_MAGIC) { - flip = 0; - } else if (msg->ibm_magic == __swab32(IBNAL_MSG_MAGIC)) { - flip = 1; - } else { - if (msg->ibm_magic == LNET_PROTO_MAGIC || - msg->ibm_magic == __swab32(LNET_PROTO_MAGIC)) - return -EPROTO; - - /* Completely out to lunch */ - CERROR("Bad magic: %08x\n", msg->ibm_magic); - return -EPROTO; - } - - msg_version = flip ? __swab16(msg->ibm_version) : msg->ibm_version; - if (expected_version == 0) { - if (msg_version != IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD && - msg_version != IBNAL_MSG_VERSION) - return -EPROTO; - } else if (msg_version != expected_version) { - CERROR("Bad version: %x(%x expected)\n", - msg_version, expected_version); - return -EPROTO; - } - - if (nob < hdr_size) { - CERROR("Short message: %d\n", nob); - return -EPROTO; - } - - msg_nob = flip ? __swab32(msg->ibm_nob) : msg->ibm_nob; - if (msg_nob > nob) { - CERROR("Short message: got %d, wanted %d\n", nob, msg_nob); - return -EPROTO; - } - - /* checksum must be computed with ibm_cksum zero and BEFORE anything - * gets flipped */ - msg_cksum = flip ? 
__swab32(msg->ibm_cksum) : msg->ibm_cksum; - msg->ibm_cksum = 0; - if (msg_cksum != 0 && - msg_cksum != kibnal_cksum(msg, msg_nob)) { - CERROR("Bad checksum\n"); - return -EPROTO; - } - msg->ibm_cksum = msg_cksum; - - if (flip) { - /* leave magic unflipped as a clue to peer endianness */ - msg->ibm_version = msg_version; - CLASSERT (sizeof(msg->ibm_type) == 1); - CLASSERT (sizeof(msg->ibm_credits) == 1); - msg->ibm_nob = msg_nob; - __swab64s(&msg->ibm_srcnid); - __swab64s(&msg->ibm_srcstamp); - __swab64s(&msg->ibm_dstnid); - __swab64s(&msg->ibm_dststamp); - __swab64s(&msg->ibm_seq); - } - - if (msg->ibm_srcnid == LNET_NID_ANY) { - CERROR("Bad src nid: %s\n", libcfs_nid2str(msg->ibm_srcnid)); - return -EPROTO; - } - - switch (msg->ibm_type) { - default: - CERROR("Unknown message type %x\n", msg->ibm_type); - return -EPROTO; - - case IBNAL_MSG_NOOP: - break; - - case IBNAL_MSG_IMMEDIATE: - if (msg_nob < offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[0])) { - CERROR("Short IMMEDIATE: %d(%d)\n", msg_nob, - (int)offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[0])); - return -EPROTO; - } - break; - - case IBNAL_MSG_PUT_REQ: - if (msg_nob < hdr_size + sizeof(msg->ibm_u.putreq)) { - CERROR("Short PUT_REQ: %d(%d)\n", msg_nob, - (int)(hdr_size + sizeof(msg->ibm_u.putreq))); - return -EPROTO; - } - break; - - case IBNAL_MSG_PUT_ACK: - if (msg_nob < hdr_size + sizeof(msg->ibm_u.putack)) { - CERROR("Short PUT_ACK: %d(%d)\n", msg_nob, - (int)(hdr_size + sizeof(msg->ibm_u.putack))); - return -EPROTO; - } -#if IBNAL_USE_FMR - if (flip) { - __swab64s(&msg->ibm_u.putack.ibpam_rd.rd_addr); - __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_nob); - __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_key); - } -#else - if (flip) { - __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_key); - __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_nfrag); - } - - n = msg->ibm_u.putack.ibpam_rd.rd_nfrag; - if (n <= 0 || n > IBNAL_MAX_RDMA_FRAGS) { - CERROR("Bad PUT_ACK nfrags: %d, should be 0 < n <= %d\n", - n, IBNAL_MAX_RDMA_FRAGS); - return -EPROTO; - } - - if (msg_nob < offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[n])) { - CERROR("Short PUT_ACK: %d(%d)\n", msg_nob, - (int)offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[n])); - return -EPROTO; - } - - if (flip) { - for (i = 0; i < n; i++) { - __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_frags[i].rf_nob); - __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_frags[i].rf_addr_lo); - __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_frags[i].rf_addr_hi); - } - } -#endif - break; - - case IBNAL_MSG_GET_REQ: - if (msg_nob < hdr_size + sizeof(msg->ibm_u.get)) { - CERROR("Short GET_REQ: %d(%d)\n", msg_nob, - (int)(hdr_size + sizeof(msg->ibm_u.get))); - return -EPROTO; - } -#if IBNAL_USE_FMR - if (flip) { - __swab64s(&msg->ibm_u.get.ibgm_rd.rd_addr); - __swab32s(&msg->ibm_u.get.ibgm_rd.rd_nob); - __swab32s(&msg->ibm_u.get.ibgm_rd.rd_key); - } -#else - if (flip) { - __swab32s(&msg->ibm_u.get.ibgm_rd.rd_key); - __swab32s(&msg->ibm_u.get.ibgm_rd.rd_nfrag); - } - - n = msg->ibm_u.get.ibgm_rd.rd_nfrag; - if (n <= 0 || n > IBNAL_MAX_RDMA_FRAGS) { - CERROR("Bad GET_REQ nfrags: %d, should be 0 < n <= %d\n", - n, IBNAL_MAX_RDMA_FRAGS); - return -EPROTO; - } - - if (msg_nob < offsetof(kib_msg_t, ibm_u.get.ibgm_rd.rd_frags[n])) { - CERROR("Short GET_REQ: %d(%d)\n", msg_nob, - (int)offsetof(kib_msg_t, ibm_u.get.ibgm_rd.rd_frags[n])); - return -EPROTO; - } - - if (flip) - for (i = 0; i < msg->ibm_u.get.ibgm_rd.rd_nfrag; i++) { - __swab32s(&msg->ibm_u.get.ibgm_rd.rd_frags[i].rf_nob); - 
__swab32s(&msg->ibm_u.get.ibgm_rd.rd_frags[i].rf_addr_lo); - __swab32s(&msg->ibm_u.get.ibgm_rd.rd_frags[i].rf_addr_hi); - } -#endif - break; - - case IBNAL_MSG_PUT_NAK: - case IBNAL_MSG_PUT_DONE: - case IBNAL_MSG_GET_DONE: - if (msg_nob < hdr_size + sizeof(msg->ibm_u.completion)) { - CERROR("Short RDMA completion: %d(%d)\n", msg_nob, - (int)(hdr_size + sizeof(msg->ibm_u.completion))); - return -EPROTO; - } - if (flip) - __swab32s(&msg->ibm_u.completion.ibcm_status); - break; - - case IBNAL_MSG_CONNREQ: - case IBNAL_MSG_CONNACK: - if (msg_nob < hdr_size + sizeof(msg->ibm_u.connparams)) { - CERROR("Short connreq/ack: %d(%d)\n", msg_nob, - (int)(hdr_size + sizeof(msg->ibm_u.connparams))); - return -EPROTO; - } - if (flip) { - __swab32s(&msg->ibm_u.connparams.ibcp_queue_depth); - __swab32s(&msg->ibm_u.connparams.ibcp_max_msg_size); - __swab32s(&msg->ibm_u.connparams.ibcp_max_frags); - } - break; - } - return 0; -} - -int -kibnal_start_listener (lnet_ni_t *ni) -{ - static cm_listen_data_t info; - - cm_return_t cmrc; - - LASSERT (kibnal_data.kib_listen_handle == NULL); - - kibnal_data.kib_listen_handle = - cm_create_cep(cm_cep_transp_rc); - if (kibnal_data.kib_listen_handle == NULL) { - CERROR ("Can't create listen CEP\n"); - return -ENOMEM; - } - - CDEBUG(D_NET, "Created CEP %p for listening\n", - kibnal_data.kib_listen_handle); - - memset(&info, 0, sizeof(info)); - info.listen_addr.end_pt.sid = - (__u64)(*kibnal_tunables.kib_service_number); - - cmrc = cm_listen(kibnal_data.kib_listen_handle, &info, - kibnal_listen_callback, NULL); - if (cmrc == cm_stat_success) - return 0; - - CERROR ("cm_listen error: %d\n", cmrc); - - cmrc = cm_destroy_cep(kibnal_data.kib_listen_handle); - LASSERT (cmrc == cm_stat_success); - - kibnal_data.kib_listen_handle = NULL; - return -EINVAL; -} - -void -kibnal_stop_listener(lnet_ni_t *ni) -{ - cm_return_t cmrc; - - LASSERT (kibnal_data.kib_listen_handle != NULL); - - cmrc = cm_cancel(kibnal_data.kib_listen_handle); - if (cmrc != cm_stat_success) - CERROR ("Error %d stopping listener\n", cmrc); - - cfs_pause(cfs_time_seconds(1)/10); /* ensure no more callbacks */ - - cmrc = cm_destroy_cep(kibnal_data.kib_listen_handle); - if (cmrc != vv_return_ok) - CERROR ("Error %d destroying CEP\n", cmrc); - - kibnal_data.kib_listen_handle = NULL; -} - -int -kibnal_create_peer (kib_peer_t **peerp, lnet_nid_t nid) -{ - kib_peer_t *peer; - unsigned long flags; - int rc; - - LASSERT (nid != LNET_NID_ANY); - - LIBCFS_ALLOC(peer, sizeof (*peer)); - if (peer == NULL) { - CERROR("Cannot allocate peer\n"); - return -ENOMEM; - } - - memset(peer, 0, sizeof(*peer)); /* zero flags etc */ - - peer->ibp_nid = nid; - atomic_set (&peer->ibp_refcount, 1); /* 1 ref for caller */ - - INIT_LIST_HEAD (&peer->ibp_list); /* not in the peer table yet */ - INIT_LIST_HEAD (&peer->ibp_conns); - INIT_LIST_HEAD (&peer->ibp_tx_queue); - - peer->ibp_error = 0; - peer->ibp_last_alive = cfs_time_current(); - peer->ibp_reconnect_interval = 0; /* OK to connect at any time */ - - write_lock_irqsave(&kibnal_data.kib_global_lock, flags); - - if (atomic_read(&kibnal_data.kib_npeers) >= - *kibnal_tunables.kib_concurrent_peers) { - rc = -EOVERFLOW; /* !! 
but at least it distinguishes */ - } else if (kibnal_data.kib_listen_handle == NULL) { - rc = -ESHUTDOWN; /* shutdown has started */ - } else { - rc = 0; - /* npeers only grows with the global lock held */ - atomic_inc(&kibnal_data.kib_npeers); - } - - write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); - - if (rc != 0) { - CERROR("Can't create peer: %s\n", - (rc == -ESHUTDOWN) ? "shutting down" : - "too many peers"); - LIBCFS_FREE(peer, sizeof(*peer)); - } else { - *peerp = peer; - } - - return rc; -} - -void -kibnal_destroy_peer (kib_peer_t *peer) -{ - LASSERT (atomic_read (&peer->ibp_refcount) == 0); - LASSERT (peer->ibp_persistence == 0); - LASSERT (!kibnal_peer_active(peer)); - LASSERT (peer->ibp_connecting == 0); - LASSERT (peer->ibp_accepting == 0); - LASSERT (list_empty (&peer->ibp_conns)); - LASSERT (list_empty (&peer->ibp_tx_queue)); - - LIBCFS_FREE (peer, sizeof (*peer)); - - /* NB a peer's connections keep a reference on their peer until - * they are destroyed, so we can be assured that _all_ state to do - * with this peer has been cleaned up when its refcount drops to - * zero. */ - atomic_dec(&kibnal_data.kib_npeers); -} - -kib_peer_t * -kibnal_find_peer_locked (lnet_nid_t nid) -{ - /* the caller is responsible for accounting the additional reference - * that this creates */ - struct list_head *peer_list = kibnal_nid2peerlist (nid); - struct list_head *tmp; - kib_peer_t *peer; - - list_for_each (tmp, peer_list) { - - peer = list_entry (tmp, kib_peer_t, ibp_list); - - LASSERT (peer->ibp_persistence != 0 || /* persistent peer */ - peer->ibp_connecting != 0 || /* creating conns */ - peer->ibp_accepting != 0 || - !list_empty (&peer->ibp_conns)); /* active conn */ - - if (peer->ibp_nid != nid) - continue; - - CDEBUG(D_NET, "got peer [%p] -> %s (%d)\n", - peer, libcfs_nid2str(nid), - atomic_read (&peer->ibp_refcount)); - return (peer); - } - return (NULL); -} - -void -kibnal_unlink_peer_locked (kib_peer_t *peer) -{ - LASSERT (peer->ibp_persistence == 0); - LASSERT (list_empty(&peer->ibp_conns)); - - LASSERT (kibnal_peer_active(peer)); - list_del_init (&peer->ibp_list); - /* lose peerlist's ref */ - kibnal_peer_decref(peer); -} - -int -kibnal_get_peer_info (int index, lnet_nid_t *nidp, __u32 *ipp, - int *persistencep) -{ - kib_peer_t *peer; - struct list_head *ptmp; - int i; - unsigned long flags; - - read_lock_irqsave(&kibnal_data.kib_global_lock, flags); - - for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) { - - list_for_each (ptmp, &kibnal_data.kib_peers[i]) { - - peer = list_entry (ptmp, kib_peer_t, ibp_list); - LASSERT (peer->ibp_persistence != 0 || - peer->ibp_connecting != 0 || - peer->ibp_accepting != 0 || - !list_empty (&peer->ibp_conns)); - - if (index-- > 0) - continue; - - *nidp = peer->ibp_nid; - *ipp = peer->ibp_ip; - *persistencep = peer->ibp_persistence; - - read_unlock_irqrestore(&kibnal_data.kib_global_lock, - flags); - return (0); - } - } - - read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); - return (-ENOENT); -} - -int -kibnal_add_persistent_peer (lnet_nid_t nid, __u32 ip) -{ - kib_peer_t *peer; - kib_peer_t *peer2; - unsigned long flags; - int rc; - - CDEBUG(D_NET, "%s at %u.%u.%u.%u\n", - libcfs_nid2str(nid), HIPQUAD(ip)); - - if (nid == LNET_NID_ANY) - return (-EINVAL); - - rc = kibnal_create_peer(&peer, nid); - if (rc != 0) - return rc; - - write_lock_irqsave(&kibnal_data.kib_global_lock, flags); - - /* I'm always called with a reference on kibnal_data.kib_ni - * so shutdown can't have started */ - LASSERT 
(kibnal_data.kib_listen_handle != NULL); - - peer2 = kibnal_find_peer_locked (nid); - if (peer2 != NULL) { - kibnal_peer_decref (peer); - peer = peer2; - } else { - /* peer table takes existing ref on peer */ - list_add_tail (&peer->ibp_list, - kibnal_nid2peerlist (nid)); - } - - peer->ibp_ip = ip; - peer->ibp_persistence++; - - write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); - return (0); -} - -void -kibnal_del_peer_locked (kib_peer_t *peer) -{ - struct list_head *ctmp; - struct list_head *cnxt; - kib_conn_t *conn; - - peer->ibp_persistence = 0; - - if (list_empty(&peer->ibp_conns)) { - kibnal_unlink_peer_locked(peer); - } else { - list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) { - conn = list_entry(ctmp, kib_conn_t, ibc_list); - - kibnal_close_conn_locked (conn, 0); - } - /* NB peer is no longer persistent; closing its last conn - * unlinked it. */ - } - /* NB peer now unlinked; might even be freed if the peer table had the - * last ref on it. */ -} - -int -kibnal_del_peer (lnet_nid_t nid) -{ - CFS_LIST_HEAD (zombies); - struct list_head *ptmp; - struct list_head *pnxt; - kib_peer_t *peer; - int lo; - int hi; - int i; - unsigned long flags; - int rc = -ENOENT; - - write_lock_irqsave(&kibnal_data.kib_global_lock, flags); - - if (nid != LNET_NID_ANY) - lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers; - else { - lo = 0; - hi = kibnal_data.kib_peer_hash_size - 1; - } - - for (i = lo; i <= hi; i++) { - list_for_each_safe (ptmp, pnxt, &kibnal_data.kib_peers[i]) { - peer = list_entry (ptmp, kib_peer_t, ibp_list); - LASSERT (peer->ibp_persistence != 0 || - peer->ibp_connecting != 0 || - peer->ibp_accepting != 0 || - !list_empty (&peer->ibp_conns)); - - if (!(nid == LNET_NID_ANY || peer->ibp_nid == nid)) - continue; - - if (!list_empty(&peer->ibp_tx_queue)) { - LASSERT (list_empty(&peer->ibp_conns)); - - list_splice_init(&peer->ibp_tx_queue, &zombies); - } - - kibnal_del_peer_locked (peer); - rc = 0; /* matched something */ - } - } - - write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); - - kibnal_txlist_done(&zombies, -EIO); - - return (rc); -} - -kib_conn_t * -kibnal_get_conn_by_idx (int index) -{ - kib_peer_t *peer; - struct list_head *ptmp; - kib_conn_t *conn; - struct list_head *ctmp; - int i; - unsigned long flags; - - read_lock_irqsave(&kibnal_data.kib_global_lock, flags); - - for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) { - list_for_each (ptmp, &kibnal_data.kib_peers[i]) { - - peer = list_entry (ptmp, kib_peer_t, ibp_list); - LASSERT (peer->ibp_persistence > 0 || - peer->ibp_connecting != 0 || - peer->ibp_accepting != 0 || - !list_empty (&peer->ibp_conns)); - - list_for_each (ctmp, &peer->ibp_conns) { - if (index-- > 0) - continue; - - conn = list_entry (ctmp, kib_conn_t, ibc_list); - kibnal_conn_addref(conn); - read_unlock_irqrestore(&kibnal_data.kib_global_lock, - flags); - return (conn); - } - } - } - - read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); - return (NULL); -} - -void -kibnal_debug_rx (kib_rx_t *rx) -{ - CDEBUG(D_CONSOLE, " %p nob %d msg_type %x " - "cred %d seq "LPD64"\n", - rx, rx->rx_nob, rx->rx_msg->ibm_type, - rx->rx_msg->ibm_credits, rx->rx_msg->ibm_seq); -} - -void -kibnal_debug_tx (kib_tx_t *tx) -{ - CDEBUG(D_CONSOLE, " %p snd %d q %d w %d rc %d dl %lx " - "cookie "LPX64" msg %s%s type %x cred %d seq "LPD64"\n", - tx, tx->tx_sending, tx->tx_queued, tx->tx_waiting, - tx->tx_status, tx->tx_deadline, tx->tx_cookie, - tx->tx_lntmsg[0] == NULL ? "-" : "!", - tx->tx_lntmsg[1] == NULL ? 
"-" : "!", - tx->tx_msg->ibm_type, tx->tx_msg->ibm_credits, - tx->tx_msg->ibm_seq); -} - -void -kibnal_debug_conn (kib_conn_t *conn) -{ - struct list_head *tmp; - int i; - - spin_lock(&conn->ibc_lock); - - CDEBUG(D_CONSOLE, "conn[%d] %p -> %s: \n", - atomic_read(&conn->ibc_refcount), conn, - libcfs_nid2str(conn->ibc_peer->ibp_nid)); - CDEBUG(D_CONSOLE, " txseq "LPD64" rxseq "LPD64" state %d \n", - conn->ibc_txseq, conn->ibc_rxseq, conn->ibc_state); - CDEBUG(D_CONSOLE, " nposted %d cred %d o_cred %d r_cred %d\n", - conn->ibc_nsends_posted, conn->ibc_credits, - conn->ibc_outstanding_credits, conn->ibc_reserved_credits); - CDEBUG(D_CONSOLE, " disc %d comms_err %d\n", - conn->ibc_disconnect, conn->ibc_comms_error); - - CDEBUG(D_CONSOLE, " early_rxs:\n"); - list_for_each(tmp, &conn->ibc_early_rxs) - kibnal_debug_rx(list_entry(tmp, kib_rx_t, rx_list)); - - CDEBUG(D_CONSOLE, " tx_queue_nocred:\n"); - list_for_each(tmp, &conn->ibc_tx_queue_nocred) - kibnal_debug_tx(list_entry(tmp, kib_tx_t, tx_list)); - - CDEBUG(D_CONSOLE, " tx_queue_rsrvd:\n"); - list_for_each(tmp, &conn->ibc_tx_queue_rsrvd) - kibnal_debug_tx(list_entry(tmp, kib_tx_t, tx_list)); - - CDEBUG(D_CONSOLE, " tx_queue:\n"); - list_for_each(tmp, &conn->ibc_tx_queue) - kibnal_debug_tx(list_entry(tmp, kib_tx_t, tx_list)); - - CDEBUG(D_CONSOLE, " active_txs:\n"); - list_for_each(tmp, &conn->ibc_active_txs) - kibnal_debug_tx(list_entry(tmp, kib_tx_t, tx_list)); - - CDEBUG(D_CONSOLE, " rxs:\n"); - for (i = 0; i < IBNAL_RX_MSGS; i++) - kibnal_debug_rx(&conn->ibc_rxs[i]); - - spin_unlock(&conn->ibc_lock); -} - -int -kibnal_set_qp_state (kib_conn_t *conn, vv_qp_state_t new_state) -{ - static vv_qp_attr_t attr; - - kib_connvars_t *cv = conn->ibc_connvars; - vv_return_t vvrc; - - /* Only called by connd => static OK */ - LASSERT (!in_interrupt()); - LASSERT (current == kibnal_data.kib_connd); - - memset(&attr, 0, sizeof(attr)); - - switch (new_state) { - default: - LBUG(); - - case vv_qp_state_init: { - struct vv_qp_modify_init_st *init = &attr.modify.params.init; - - init->p_key_indx = cv->cv_pkey_index; - init->phy_port_num = cv->cv_port; - init->q_key = IBNAL_QKEY; /* XXX but VV_QP_AT_Q_KEY not set! */ - init->access_control = vv_acc_r_mem_read | - vv_acc_r_mem_write; /* XXX vv_acc_l_mem_write ? */ - - attr.modify.vv_qp_attr_mask = VV_QP_AT_P_KEY_IX | - VV_QP_AT_PHY_PORT_NUM | - VV_QP_AT_ACCESS_CON_F; - break; - } - case vv_qp_state_rtr: { - struct vv_qp_modify_rtr_st *rtr = &attr.modify.params.rtr; - vv_add_vec_t *av = &rtr->remote_add_vec; - - av->dlid = cv->cv_path.dlid; - av->grh_flag = (!IBNAL_LOCAL_SUB); - av->max_static_rate = IBNAL_R_2_STATIC_RATE(cv->cv_path.rate); - av->service_level = cv->cv_path.sl; - av->source_path_bit = IBNAL_SOURCE_PATH_BIT; - av->pmtu = cv->cv_path.mtu; - av->rnr_retry_count = cv->cv_rnr_count; - av->global_dest.traffic_class = cv->cv_path.traffic_class; - av->global_dest.hope_limit = cv->cv_path.hop_limut; - av->global_dest.flow_lable = cv->cv_path.flow_label; - av->global_dest.s_gid_index = cv->cv_sgid_index; - // XXX other av fields zero? 
- - rtr->destanation_qp = cv->cv_remote_qpn; - rtr->receive_psn = cv->cv_rxpsn; - rtr->responder_rdma_r_atom_num = IBNAL_OUS_DST_RD; - rtr->opt_min_rnr_nak_timer = *kibnal_tunables.kib_rnr_nak_timer; - - - // XXX sdp sets VV_QP_AT_OP_F but no actual optional options - attr.modify.vv_qp_attr_mask = VV_QP_AT_ADD_VEC | - VV_QP_AT_DEST_QP | - VV_QP_AT_R_PSN | - VV_QP_AT_MIN_RNR_NAK_T | - VV_QP_AT_RESP_RDMA_ATOM_OUT_NUM | - VV_QP_AT_OP_F; - break; - } - case vv_qp_state_rts: { - struct vv_qp_modify_rts_st *rts = &attr.modify.params.rts; - - rts->send_psn = cv->cv_txpsn; - rts->local_ack_timeout = *kibnal_tunables.kib_local_ack_timeout; - rts->retry_num = *kibnal_tunables.kib_retry_cnt; - rts->rnr_num = *kibnal_tunables.kib_rnr_cnt; - rts->dest_out_rdma_r_atom_num = IBNAL_OUS_DST_RD; - - attr.modify.vv_qp_attr_mask = VV_QP_AT_S_PSN | - VV_QP_AT_L_ACK_T | - VV_QP_AT_RETRY_NUM | - VV_QP_AT_RNR_NUM | - VV_QP_AT_DEST_RDMA_ATOM_OUT_NUM; - break; - } - case vv_qp_state_error: - case vv_qp_state_reset: - attr.modify.vv_qp_attr_mask = 0; - break; - } - - attr.modify.qp_modify_into_state = new_state; - attr.modify.vv_qp_attr_mask |= VV_QP_AT_STATE; - - vvrc = vv_qp_modify(kibnal_data.kib_hca, conn->ibc_qp, &attr, NULL); - if (vvrc != vv_return_ok) { - CERROR("Can't modify qp -> %s state to %d: %d\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid), - new_state, vvrc); - return -EIO; - } - - return 0; -} - -kib_conn_t * -kibnal_create_conn (cm_cep_handle_t cep) -{ - kib_conn_t *conn; - int i; - int page_offset; - int ipage; - vv_return_t vvrc; - int rc; - - static vv_qp_attr_t reqattr; - static vv_qp_attr_t rspattr; - - /* Only the connd creates conns => single threaded */ - LASSERT(!in_interrupt()); - LASSERT(current == kibnal_data.kib_connd); - - LIBCFS_ALLOC(conn, sizeof (*conn)); - if (conn == NULL) { - CERROR ("Can't allocate connection\n"); - return (NULL); - } - - /* zero flags, NULL pointers etc... 
*/ - memset (conn, 0, sizeof (*conn)); - - conn->ibc_version = IBNAL_MSG_VERSION; /* Use latest version at first */ - - INIT_LIST_HEAD (&conn->ibc_early_rxs); - INIT_LIST_HEAD (&conn->ibc_tx_queue_nocred); - INIT_LIST_HEAD (&conn->ibc_tx_queue); - INIT_LIST_HEAD (&conn->ibc_tx_queue_rsrvd); - INIT_LIST_HEAD (&conn->ibc_active_txs); - spin_lock_init (&conn->ibc_lock); - - atomic_inc (&kibnal_data.kib_nconns); - /* well not really, but I call destroy() on failure, which decrements */ - - conn->ibc_cep = cep; - - LIBCFS_ALLOC(conn->ibc_connvars, sizeof(*conn->ibc_connvars)); - if (conn->ibc_connvars == NULL) { - CERROR("Can't allocate in-progress connection state\n"); - goto failed; - } - memset (conn->ibc_connvars, 0, sizeof(*conn->ibc_connvars)); - /* Random seed for QP sequence number */ - get_random_bytes(&conn->ibc_connvars->cv_rxpsn, - sizeof(conn->ibc_connvars->cv_rxpsn)); - - LIBCFS_ALLOC(conn->ibc_rxs, IBNAL_RX_MSGS * sizeof (kib_rx_t)); - if (conn->ibc_rxs == NULL) { - CERROR("Cannot allocate RX buffers\n"); - goto failed; - } - memset (conn->ibc_rxs, 0, IBNAL_RX_MSGS * sizeof(kib_rx_t)); - - rc = kibnal_alloc_pages(&conn->ibc_rx_pages, IBNAL_RX_MSG_PAGES, 1); - if (rc != 0) - goto failed; - - for (i = ipage = page_offset = 0; i < IBNAL_RX_MSGS; i++) { - struct page *page = conn->ibc_rx_pages->ibp_pages[ipage]; - kib_rx_t *rx = &conn->ibc_rxs[i]; - vv_mem_reg_h_t mem_h; - vv_r_key_t r_key; - - rx->rx_conn = conn; - rx->rx_msg = (kib_msg_t *)(((char *)page_address(page)) + - page_offset); - - vvrc = vv_get_gen_mr_attrib(kibnal_data.kib_hca, - rx->rx_msg, - IBNAL_MSG_SIZE, - &mem_h, - &rx->rx_lkey, - &r_key); - LASSERT (vvrc == vv_return_ok); - - CDEBUG(D_NET, "Rx[%d] %p->%p[%x]\n", i, rx, - rx->rx_msg, rx->rx_lkey); - - page_offset += IBNAL_MSG_SIZE; - LASSERT (page_offset <= PAGE_SIZE); - - if (page_offset == PAGE_SIZE) { - page_offset = 0; - ipage++; - LASSERT (ipage <= IBNAL_RX_MSG_PAGES); - } - } - - memset(&reqattr, 0, sizeof(reqattr)); - - reqattr.create.qp_type = vv_qp_type_r_conn; - reqattr.create.cq_send_h = kibnal_data.kib_cq; - reqattr.create.cq_receive_h = kibnal_data.kib_cq; - reqattr.create.send_max_outstand_wr = (1 + IBNAL_MAX_RDMA_FRAGS) * - (*kibnal_tunables.kib_concurrent_sends); - reqattr.create.receive_max_outstand_wr = IBNAL_RX_MSGS; - reqattr.create.max_scatgat_per_send_wr = 1; - reqattr.create.max_scatgat_per_receive_wr = 1; - reqattr.create.signaling_type = vv_selectable_signaling; - reqattr.create.pd_h = kibnal_data.kib_pd; - reqattr.create.recv_solicited_events = vv_selectable_signaling; // vv_signal_all; - - vvrc = vv_qp_create(kibnal_data.kib_hca, &reqattr, NULL, - &conn->ibc_qp, &rspattr); - if (vvrc != vv_return_ok) { - CERROR ("Failed to create queue pair: %d\n", vvrc); - goto failed; - } - - /* Mark QP created */ - conn->ibc_state = IBNAL_CONN_INIT_QP; - conn->ibc_connvars->cv_local_qpn = rspattr.create_return.qp_num; - - if (rspattr.create_return.receive_max_outstand_wr < - IBNAL_RX_MSGS || - rspattr.create_return.send_max_outstand_wr < - (1 + IBNAL_MAX_RDMA_FRAGS) * (*kibnal_tunables.kib_concurrent_sends)) { - CERROR("Insufficient rx/tx work items: wanted %d/%d got %d/%d\n", - IBNAL_RX_MSGS, - (1 + IBNAL_MAX_RDMA_FRAGS) * - (*kibnal_tunables.kib_concurrent_sends), - rspattr.create_return.receive_max_outstand_wr, - rspattr.create_return.send_max_outstand_wr); - goto failed; - } - - /* Mark init complete */ - conn->ibc_state = IBNAL_CONN_INIT; - - /* 1 ref for caller */ - atomic_set (&conn->ibc_refcount, 1); - return (conn); - - failed: - 
kibnal_destroy_conn (conn); - return (NULL); -} - -void -kibnal_destroy_conn (kib_conn_t *conn) -{ - vv_return_t vvrc; - - /* Only the connd does this (i.e. single threaded) */ - LASSERT (!in_interrupt()); - LASSERT (current == kibnal_data.kib_connd); - - CDEBUG (D_NET, "connection %p\n", conn); - - LASSERT (atomic_read (&conn->ibc_refcount) == 0); - LASSERT (list_empty(&conn->ibc_early_rxs)); - LASSERT (list_empty(&conn->ibc_tx_queue)); - LASSERT (list_empty(&conn->ibc_tx_queue_rsrvd)); - LASSERT (list_empty(&conn->ibc_tx_queue_nocred)); - LASSERT (list_empty(&conn->ibc_active_txs)); - LASSERT (conn->ibc_nsends_posted == 0); - - switch (conn->ibc_state) { - default: - /* conn must be completely disengaged from the network */ - LBUG(); - - case IBNAL_CONN_DISCONNECTED: - /* connvars should have been freed already */ - LASSERT (conn->ibc_connvars == NULL); - /* fall through */ - - case IBNAL_CONN_INIT: - vvrc = cm_destroy_cep(conn->ibc_cep); - LASSERT (vvrc == vv_return_ok); - /* fall through */ - - case IBNAL_CONN_INIT_QP: - kibnal_set_qp_state(conn, vv_qp_state_reset); - vvrc = vv_qp_destroy(kibnal_data.kib_hca, conn->ibc_qp); - if (vvrc != vv_return_ok) - CERROR("Can't destroy QP: %d\n", vvrc); - /* fall through */ - - case IBNAL_CONN_INIT_NOTHING: - break; - } - - if (conn->ibc_rx_pages != NULL) - kibnal_free_pages(conn->ibc_rx_pages); - - if (conn->ibc_rxs != NULL) - LIBCFS_FREE(conn->ibc_rxs, - IBNAL_RX_MSGS * sizeof(kib_rx_t)); - - if (conn->ibc_connvars != NULL) - LIBCFS_FREE(conn->ibc_connvars, sizeof(*conn->ibc_connvars)); - - if (conn->ibc_peer != NULL) - kibnal_peer_decref(conn->ibc_peer); - - LIBCFS_FREE(conn, sizeof (*conn)); - - atomic_dec(&kibnal_data.kib_nconns); -} - -int -kibnal_close_peer_conns_locked (kib_peer_t *peer, int why) -{ - kib_conn_t *conn; - struct list_head *ctmp; - struct list_head *cnxt; - int count = 0; - - list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) { - conn = list_entry (ctmp, kib_conn_t, ibc_list); - - count++; - kibnal_close_conn_locked (conn, why); - } - - return (count); -} - -int -kibnal_close_stale_conns_locked (kib_peer_t *peer, __u64 incarnation) -{ - kib_conn_t *conn; - struct list_head *ctmp; - struct list_head *cnxt; - int count = 0; - - list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) { - conn = list_entry (ctmp, kib_conn_t, ibc_list); - - if (conn->ibc_incarnation == incarnation) - continue; - - CDEBUG(D_NET, "Closing stale conn -> %s incarnation:"LPX64"("LPX64")\n", - libcfs_nid2str(peer->ibp_nid), - conn->ibc_incarnation, incarnation); - - count++; - kibnal_close_conn_locked (conn, -ESTALE); - } - - return (count); -} - -int -kibnal_close_matching_conns (lnet_nid_t nid) -{ - kib_peer_t *peer; - struct list_head *ptmp; - struct list_head *pnxt; - int lo; - int hi; - int i; - unsigned long flags; - int count = 0; - - write_lock_irqsave(&kibnal_data.kib_global_lock, flags); - - if (nid != LNET_NID_ANY) - lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers; - else { - lo = 0; - hi = kibnal_data.kib_peer_hash_size - 1; - } - - for (i = lo; i <= hi; i++) { - list_for_each_safe (ptmp, pnxt, &kibnal_data.kib_peers[i]) { - - peer = list_entry (ptmp, kib_peer_t, ibp_list); - LASSERT (peer->ibp_persistence != 0 || - peer->ibp_connecting != 0 || - peer->ibp_accepting != 0 || - !list_empty (&peer->ibp_conns)); - - if (!(nid == LNET_NID_ANY || nid == peer->ibp_nid)) - continue; - - count += kibnal_close_peer_conns_locked (peer, 0); - } - } - - write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); - - /* wildcards always succeed */ 
- if (nid == LNET_NID_ANY) - return (0); - - return (count == 0 ? -ENOENT : 0); -} - -int -kibnal_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg) -{ - struct libcfs_ioctl_data *data = arg; - int rc = -EINVAL; - - LASSERT (ni == kibnal_data.kib_ni); - - switch(cmd) { - case IOC_LIBCFS_GET_PEER: { - lnet_nid_t nid = 0; - __u32 ip = 0; - int share_count = 0; - - rc = kibnal_get_peer_info(data->ioc_count, - &nid, &ip, &share_count); - data->ioc_nid = nid; - data->ioc_count = share_count; - data->ioc_u32[0] = ip; - data->ioc_u32[1] = *kibnal_tunables.kib_service_number; /* port */ - break; - } - case IOC_LIBCFS_ADD_PEER: { - rc = kibnal_add_persistent_peer (data->ioc_nid, - data->ioc_u32[0]); /* IP */ - break; - } - case IOC_LIBCFS_DEL_PEER: { - rc = kibnal_del_peer (data->ioc_nid); - break; - } - case IOC_LIBCFS_GET_CONN: { - kib_conn_t *conn = kibnal_get_conn_by_idx (data->ioc_count); - - if (conn == NULL) - rc = -ENOENT; - else { - // kibnal_debug_conn(conn); - rc = 0; - data->ioc_nid = conn->ibc_peer->ibp_nid; - kibnal_conn_decref(conn); - } - break; - } - case IOC_LIBCFS_CLOSE_CONNECTION: { - rc = kibnal_close_matching_conns (data->ioc_nid); - break; - } - case IOC_LIBCFS_REGISTER_MYNID: { - if (ni->ni_nid == data->ioc_nid) { - rc = 0; - } else { - CERROR("obsolete IOC_LIBCFS_REGISTER_MYNID: %s(%s)\n", - libcfs_nid2str(data->ioc_nid), - libcfs_nid2str(ni->ni_nid)); - rc = -EINVAL; - } - break; - } - } - - return rc; -} - -void -kibnal_free_pages (kib_pages_t *p) -{ - int npages = p->ibp_npages; - int i; - - for (i = 0; i < npages; i++) - if (p->ibp_pages[i] != NULL) - __free_page(p->ibp_pages[i]); - - LIBCFS_FREE (p, offsetof(kib_pages_t, ibp_pages[npages])); -} - -int -kibnal_alloc_pages (kib_pages_t **pp, int npages, int allow_write) -{ - kib_pages_t *p; - int i; - - LIBCFS_ALLOC(p, offsetof(kib_pages_t, ibp_pages[npages])); - if (p == NULL) { - CERROR ("Can't allocate buffer %d\n", npages); - return (-ENOMEM); - } - - memset (p, 0, offsetof(kib_pages_t, ibp_pages[npages])); - p->ibp_npages = npages; - - for (i = 0; i < npages; i++) { - p->ibp_pages[i] = alloc_page (GFP_KERNEL); - if (p->ibp_pages[i] == NULL) { - CERROR ("Can't allocate page %d of %d\n", i, npages); - kibnal_free_pages(p); - return (-ENOMEM); - } - } - - *pp = p; - return (0); -} - -int -kibnal_alloc_tx_descs (void) -{ - int i; - - LIBCFS_ALLOC (kibnal_data.kib_tx_descs, - IBNAL_TX_MSGS() * sizeof(kib_tx_t)); - if (kibnal_data.kib_tx_descs == NULL) - return -ENOMEM; - - memset(kibnal_data.kib_tx_descs, 0, - IBNAL_TX_MSGS() * sizeof(kib_tx_t)); - - for (i = 0; i < IBNAL_TX_MSGS(); i++) { - kib_tx_t *tx = &kibnal_data.kib_tx_descs[i]; - -#if IBNAL_USE_FMR - LIBCFS_ALLOC(tx->tx_pages, LNET_MAX_IOV * - sizeof(*tx->tx_pages)); - if (tx->tx_pages == NULL) - return -ENOMEM; -#else - LIBCFS_ALLOC(tx->tx_wrq, - (1 + IBNAL_MAX_RDMA_FRAGS) * - sizeof(*tx->tx_wrq)); - if (tx->tx_wrq == NULL) - return -ENOMEM; - - LIBCFS_ALLOC(tx->tx_gl, - (1 + IBNAL_MAX_RDMA_FRAGS) * - sizeof(*tx->tx_gl)); - if (tx->tx_gl == NULL) - return -ENOMEM; - - LIBCFS_ALLOC(tx->tx_rd, - offsetof(kib_rdma_desc_t, - rd_frags[IBNAL_MAX_RDMA_FRAGS])); - if (tx->tx_rd == NULL) - return -ENOMEM; -#endif - } - - return 0; -} - -void -kibnal_free_tx_descs (void) -{ - int i; - - if (kibnal_data.kib_tx_descs == NULL) - return; - - for (i = 0; i < IBNAL_TX_MSGS(); i++) { - kib_tx_t *tx = &kibnal_data.kib_tx_descs[i]; - -#if IBNAL_USE_FMR - if (tx->tx_pages != NULL) - LIBCFS_FREE(tx->tx_pages, LNET_MAX_IOV * - sizeof(*tx->tx_pages)); -#else - if (tx->tx_wrq != NULL) - 
LIBCFS_FREE(tx->tx_wrq, - (1 + IBNAL_MAX_RDMA_FRAGS) * - sizeof(*tx->tx_wrq)); - - if (tx->tx_gl != NULL) - LIBCFS_FREE(tx->tx_gl, - (1 + IBNAL_MAX_RDMA_FRAGS) * - sizeof(*tx->tx_gl)); - - if (tx->tx_rd != NULL) - LIBCFS_FREE(tx->tx_rd, - offsetof(kib_rdma_desc_t, - rd_frags[IBNAL_MAX_RDMA_FRAGS])); -#endif - } - - LIBCFS_FREE(kibnal_data.kib_tx_descs, - IBNAL_TX_MSGS() * sizeof(kib_tx_t)); -} - -#if IBNAL_USE_FMR -void -kibnal_free_fmrs (int n) -{ - int i; - vv_return_t vvrc; - kib_tx_t *tx; - - for (i = 0; i < n; i++) { - tx = &kibnal_data.kib_tx_descs[i]; - - vvrc = vv_free_fmr(kibnal_data.kib_hca, - tx->tx_md.md_fmrhandle); - if (vvrc != vv_return_ok) - CWARN("vv_free_fmr[%d]: %d\n", i, vvrc); - } -} -#endif - -int -kibnal_setup_tx_descs (void) -{ - int ipage = 0; - int page_offset = 0; - struct page *page; - kib_tx_t *tx; - vv_mem_reg_h_t mem_h; - vv_r_key_t rkey; - vv_return_t vvrc; - int i; - int rc; -#if IBNAL_USE_FMR - vv_fmr_t fmr_props; -#endif - - /* pre-mapped messages are not bigger than 1 page */ - CLASSERT (IBNAL_MSG_SIZE <= PAGE_SIZE); - - /* No fancy arithmetic when we do the buffer calculations */ - CLASSERT (PAGE_SIZE % IBNAL_MSG_SIZE == 0); - - rc = kibnal_alloc_pages(&kibnal_data.kib_tx_pages, - IBNAL_TX_MSG_PAGES(), 0); - if (rc != 0) - return (rc); - - for (i = 0; i < IBNAL_TX_MSGS(); i++) { - page = kibnal_data.kib_tx_pages->ibp_pages[ipage]; - tx = &kibnal_data.kib_tx_descs[i]; - -#if IBNAL_USE_FMR - memset(&fmr_props, 0, sizeof(fmr_props)); - fmr_props.pd_hndl = kibnal_data.kib_pd; - fmr_props.acl = (vv_acc_r_mem_write | - vv_acc_l_mem_write); - fmr_props.max_pages = LNET_MAX_IOV; - fmr_props.log2_page_sz = PAGE_SHIFT; - fmr_props.max_outstanding_maps = *kibnal_tunables.kib_fmr_remaps; - - vvrc = vv_alloc_fmr(kibnal_data.kib_hca, - &fmr_props, - &tx->tx_md.md_fmrhandle); - if (vvrc != vv_return_ok) { - CERROR("Can't allocate fmr %d: %d\n", i, vvrc); - kibnal_free_fmrs(i); - kibnal_free_pages (kibnal_data.kib_tx_pages); - return -ENOMEM; - } - - tx->tx_md.md_fmrcount = *kibnal_tunables.kib_fmr_remaps; - tx->tx_md.md_active = 0; -#endif - tx->tx_msg = (kib_msg_t *)(((char *)page_address(page)) + - page_offset); - - vvrc = vv_get_gen_mr_attrib(kibnal_data.kib_hca, - tx->tx_msg, - IBNAL_MSG_SIZE, - &mem_h, - &tx->tx_lkey, - &rkey); - LASSERT (vvrc == vv_return_ok); - - CDEBUG(D_NET, "Tx[%d] %p->%p[%x]\n", i, tx, - tx->tx_msg, tx->tx_lkey); - - list_add (&tx->tx_list, &kibnal_data.kib_idle_txs); - - page_offset += IBNAL_MSG_SIZE; - LASSERT (page_offset <= PAGE_SIZE); - - if (page_offset == PAGE_SIZE) { - page_offset = 0; - ipage++; - LASSERT (ipage <= IBNAL_TX_MSG_PAGES()); - } - } - - return (0); -} - -void -kibnal_shutdown (lnet_ni_t *ni) -{ - int i; - vv_return_t vvrc; - - LASSERT (ni == kibnal_data.kib_ni); - LASSERT (ni->ni_data == &kibnal_data); - - CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n", - atomic_read (&libcfs_kmemory)); - - switch (kibnal_data.kib_init) { - - case IBNAL_INIT_ALL: - /* stop accepting connections and prevent new peers */ - kibnal_stop_listener(ni); - - /* nuke all existing peers */ - kibnal_del_peer(LNET_NID_ANY); - - /* Wait for all peer state to clean up */ - i = 2; - while (atomic_read(&kibnal_data.kib_npeers) != 0) { - i++; - CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* 2**n? 
*/ - "waiting for %d peers to disconnect\n", - atomic_read(&kibnal_data.kib_npeers)); - cfs_pause(cfs_time_seconds(1)); - } - /* fall through */ - - case IBNAL_INIT_CQ: - vvrc = vv_cq_destroy(kibnal_data.kib_hca, kibnal_data.kib_cq); - if (vvrc != vv_return_ok) - CERROR ("Destroy CQ error: %d\n", vvrc); - /* fall through */ - - case IBNAL_INIT_TXD: - kibnal_free_pages (kibnal_data.kib_tx_pages); -#if IBNAL_USE_FMR - kibnal_free_fmrs(IBNAL_TX_MSGS()); -#endif - /* fall through */ - - case IBNAL_INIT_PD: -#if 0 - /* Only deallocate a PD if we actually allocated one */ - vvrc = vv_pd_deallocate(kibnal_data.kib_hca, - kibnal_data.kib_pd); - if (vvrc != vv_return_ok) - CERROR ("Destroy PD error: %d\n", vvrc); -#endif - /* fall through */ - - case IBNAL_INIT_ASYNC: - vvrc = vv_dell_async_event_cb (kibnal_data.kib_hca, - kibnal_async_callback); - if (vvrc != vv_return_ok) - CERROR("vv_dell_async_event_cb error: %d\n", vvrc); - - /* fall through */ - - case IBNAL_INIT_HCA: - vvrc = vv_hca_close(kibnal_data.kib_hca); - if (vvrc != vv_return_ok) - CERROR ("Close HCA error: %d\n", vvrc); - /* fall through */ - - case IBNAL_INIT_DATA: - LASSERT (atomic_read(&kibnal_data.kib_npeers) == 0); - LASSERT (kibnal_data.kib_peers != NULL); - for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) { - LASSERT (list_empty (&kibnal_data.kib_peers[i])); - } - LASSERT (atomic_read (&kibnal_data.kib_nconns) == 0); - LASSERT (list_empty (&kibnal_data.kib_connd_zombies)); - LASSERT (list_empty (&kibnal_data.kib_connd_conns)); - LASSERT (list_empty (&kibnal_data.kib_connd_pcreqs)); - LASSERT (list_empty (&kibnal_data.kib_connd_peers)); - - /* flag threads to terminate; wake and wait for them to die */ - kibnal_data.kib_shutdown = 1; - wake_up_all (&kibnal_data.kib_sched_waitq); - wake_up_all (&kibnal_data.kib_connd_waitq); - - i = 2; - while (atomic_read (&kibnal_data.kib_nthreads) != 0) { - i++; - CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? 
*/ - "Waiting for %d threads to terminate\n", - atomic_read (&kibnal_data.kib_nthreads)); - cfs_pause(cfs_time_seconds(1)); - } - /* fall through */ - - case IBNAL_INIT_NOTHING: - break; - } - - kibnal_free_tx_descs(); - - if (kibnal_data.kib_peers != NULL) - LIBCFS_FREE (kibnal_data.kib_peers, - sizeof (struct list_head) * - kibnal_data.kib_peer_hash_size); - - CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n", - atomic_read (&libcfs_kmemory)); - - kibnal_data.kib_init = IBNAL_INIT_NOTHING; - PORTAL_MODULE_UNUSE; -} - -int -kibnal_startup (lnet_ni_t *ni) -{ - char scratch[32]; - char ipif_name[32]; - char *hca_name; - __u32 ip; - __u32 netmask; - int up; - int nob; - int devno; - struct timeval tv; - int rc; - int i; - vv_request_event_record_t req_er; - vv_return_t vvrc; - - LASSERT (ni->ni_lnd == &the_kiblnd); - - /* Only 1 instance supported */ - if (kibnal_data.kib_init != IBNAL_INIT_NOTHING) { - CERROR ("Only 1 instance supported\n"); - return -EPERM; - } - - if (*kibnal_tunables.kib_credits > *kibnal_tunables.kib_ntx) { - CERROR ("Can't set credits(%d) > ntx(%d)\n", - *kibnal_tunables.kib_credits, - *kibnal_tunables.kib_ntx); - return -EINVAL; - } - - ni->ni_maxtxcredits = *kibnal_tunables.kib_credits; - ni->ni_peertxcredits = *kibnal_tunables.kib_peercredits; - - CLASSERT (LNET_MAX_INTERFACES > 1); - - if (ni->ni_interfaces[0] != NULL) { - /* Use the HCA specified in 'networks=' */ - - if (ni->ni_interfaces[1] != NULL) { - CERROR("Multiple interfaces not supported\n"); - return -EPERM; - } - - /* Parse */ - hca_name = ni->ni_interfaces[0]; - nob = strlen(*kibnal_tunables.kib_hca_basename); - - if (strncmp(hca_name, *kibnal_tunables.kib_hca_basename, nob) || - sscanf(hca_name + nob, "%d%n", &devno, &nob) < 1) { - CERROR("Unrecognised HCA %s\n", hca_name); - return -EINVAL; - } - - } else { - /* Use 0 */ - devno = 0; - - hca_name = scratch; - snprintf(hca_name, sizeof(scratch), "%s%d", - *kibnal_tunables.kib_hca_basename, devno); - if (strlen(hca_name) == sizeof(scratch) - 1) { - CERROR("HCA name %s truncated\n", hca_name); - return -EINVAL; - } - } - - /* Find IP address from */ - snprintf(ipif_name, sizeof(ipif_name), "%s%d", - *kibnal_tunables.kib_ipif_basename, devno); - if (strlen(ipif_name) == sizeof(ipif_name) - 1) { - CERROR("IPoIB interface name %s truncated\n", ipif_name); - return -EINVAL; - } - - rc = libcfs_ipif_query(ipif_name, &up, &ip, &netmask); - if (rc != 0) { - CERROR("Can't query IPoIB interface %s: %d\n", ipif_name, rc); - return -ENETDOWN; - } - - if (!up) { - CERROR("Can't query IPoIB interface %s: it's down\n", ipif_name); - return -ENETDOWN; - } - - ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), ip); - - PORTAL_MODULE_USE; - memset (&kibnal_data, 0, sizeof (kibnal_data)); /* zero pointers, flags etc */ - - kibnal_data.kib_ni = ni; - ni->ni_data = &kibnal_data; - - do_gettimeofday(&tv); - kibnal_data.kib_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec; - - rwlock_init(&kibnal_data.kib_global_lock); - - kibnal_data.kib_peer_hash_size = IBNAL_PEER_HASH_SIZE; - LIBCFS_ALLOC (kibnal_data.kib_peers, - sizeof (struct list_head) * kibnal_data.kib_peer_hash_size); - if (kibnal_data.kib_peers == NULL) { - goto failed; - } - for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) - INIT_LIST_HEAD(&kibnal_data.kib_peers[i]); - - spin_lock_init (&kibnal_data.kib_connd_lock); - INIT_LIST_HEAD (&kibnal_data.kib_connd_peers); - INIT_LIST_HEAD (&kibnal_data.kib_connd_pcreqs); - INIT_LIST_HEAD (&kibnal_data.kib_connd_conns); - INIT_LIST_HEAD 
(&kibnal_data.kib_connd_zombies); - init_waitqueue_head (&kibnal_data.kib_connd_waitq); - - spin_lock_init (&kibnal_data.kib_sched_lock); - init_waitqueue_head (&kibnal_data.kib_sched_waitq); - - spin_lock_init (&kibnal_data.kib_tx_lock); - INIT_LIST_HEAD (&kibnal_data.kib_idle_txs); - - rc = kibnal_alloc_tx_descs(); - if (rc != 0) { - CERROR("Can't allocate tx descs\n"); - goto failed; - } - - /* lists/ptrs/locks initialised */ - kibnal_data.kib_init = IBNAL_INIT_DATA; - /*****************************************************/ - - for (i = 0; i < IBNAL_N_SCHED; i++) { - rc = kibnal_thread_start (kibnal_scheduler, (void *)((long)i)); - if (rc != 0) { - CERROR("Can't spawn vibnal scheduler[%d]: %d\n", - i, rc); - goto failed; - } - } - - rc = kibnal_thread_start (kibnal_connd, NULL); - if (rc != 0) { - CERROR ("Can't spawn vibnal connd: %d\n", rc); - goto failed; - } - - vvrc = vv_hca_open(hca_name, NULL, &kibnal_data.kib_hca); - if (vvrc != vv_return_ok) { - CERROR ("Can't open HCA %s: %d\n", hca_name, vvrc); - goto failed; - } - - /* Channel Adapter opened */ - kibnal_data.kib_init = IBNAL_INIT_HCA; - - /* register to get HCA's asynchronous events. */ - req_er.req_event_type = VV_ASYNC_EVENT_ALL_MASK; - vvrc = vv_set_async_event_cb (kibnal_data.kib_hca, req_er, - kibnal_async_callback); - if (vvrc != vv_return_ok) { - CERROR ("Can't set HCA %s callback: %d\n", hca_name, vvrc); - goto failed; - } - - kibnal_data.kib_init = IBNAL_INIT_ASYNC; - - /*****************************************************/ - - vvrc = vv_hca_query(kibnal_data.kib_hca, &kibnal_data.kib_hca_attrs); - if (vvrc != vv_return_ok) { - CERROR ("Can't size port attrs for %s: %d\n", hca_name, vvrc); - goto failed; - } - - kibnal_data.kib_port = -1; - - for (i = 0; iport_state) { - case vv_state_linkDoun: - CDEBUG(D_NET, "port[%d] Down\n", port_num); - continue; - case vv_state_linkInit: - CDEBUG(D_NET, "port[%d] Init\n", port_num); - continue; - case vv_state_linkArm: - CDEBUG(D_NET, "port[%d] Armed\n", port_num); - continue; - case vv_state_linkActive: - CDEBUG(D_NET, "port[%d] Active\n", port_num); - - /* Found a suitable port. Get its GUID and PKEY. */ - tbl_count = 1; - vvrc = vv_get_port_gid_tbl(kibnal_data.kib_hca, - port_num, &tbl_count, - &kibnal_data.kib_port_gid); - if (vvrc != vv_return_ok) { - CERROR("vv_get_port_gid_tbl failed " - "for %s port %d: %d\n", - hca_name, port_num, vvrc); - continue; - } - - tbl_count = 1; - vvrc = vv_get_port_partition_tbl(kibnal_data.kib_hca, - port_num, &tbl_count, - &kibnal_data.kib_port_pkey); - if (vvrc != vv_return_ok) { - CERROR("vv_get_port_partition_tbl failed " - "for %s port %d: %d\n", - hca_name, port_num, vvrc); - continue; - } - - kibnal_data.kib_port = port_num; - - break; - case vv_state_linkActDefer: /* TODO: correct? 
*/ - case vv_state_linkNoChange: - CERROR("Unexpected %s port[%d] state %d\n", - hca_name, i, pattr->port_state); - continue; - } - break; - } - - if (kibnal_data.kib_port == -1) { - CERROR ("Can't find an active port on %s\n", hca_name); - goto failed; - } - - CDEBUG(D_NET, "Using %s port %d - GID="LPX64":"LPX64"\n", - hca_name, kibnal_data.kib_port, - kibnal_data.kib_port_gid.scope.g.subnet, - kibnal_data.kib_port_gid.scope.g.eui64); - - /*****************************************************/ - -#if 1 - /* We use a pre-allocated PD */ - vvrc = vv_get_gen_pd_h(kibnal_data.kib_hca, &kibnal_data.kib_pd); -#else - vvrc = vv_pd_allocate(kibnal_data.kib_hca, &kibnal_data.kib_pd); -#endif - if (vvrc != vv_return_ok) { - CERROR ("Can't init PD: %d\n", vvrc); - goto failed; - } - - /* flag PD initialised */ - kibnal_data.kib_init = IBNAL_INIT_PD; - /*****************************************************/ - - rc = kibnal_setup_tx_descs(); - if (rc != 0) { - CERROR ("Can't register tx descs: %d\n", rc); - goto failed; - } - - /* flag TX descs initialised */ - kibnal_data.kib_init = IBNAL_INIT_TXD; - /*****************************************************/ - - { - __u32 nentries; - - vvrc = vv_cq_create(kibnal_data.kib_hca, IBNAL_CQ_ENTRIES(), - kibnal_cq_callback, - NULL, /* context */ - &kibnal_data.kib_cq, &nentries); - if (vvrc != 0) { - CERROR ("Can't create RX CQ: %d\n", vvrc); - goto failed; - } - - /* flag CQ initialised */ - kibnal_data.kib_init = IBNAL_INIT_CQ; - - if (nentries < IBNAL_CQ_ENTRIES()) { - CERROR ("CQ only has %d entries, need %d\n", - nentries, IBNAL_CQ_ENTRIES()); - goto failed; - } - - vvrc = vv_request_completion_notification(kibnal_data.kib_hca, - kibnal_data.kib_cq, - vv_next_solicit_unsolicit_event); - if (vvrc != 0) { - CERROR ("Failed to re-arm completion queue: %d\n", rc); - goto failed; - } - } - - rc = kibnal_start_listener(ni); - if (rc != 0) { - CERROR("Can't start listener: %d\n", rc); - goto failed; - } - - /* flag everything initialised */ - kibnal_data.kib_init = IBNAL_INIT_ALL; - /*****************************************************/ - - return (0); - - failed: - CDEBUG(D_NET, "kibnal_startup failed\n"); - kibnal_shutdown (ni); - return (-ENETDOWN); -} - -void __exit -kibnal_module_fini (void) -{ - lnet_unregister_lnd(&the_kiblnd); - kibnal_tunables_fini(); -} - -int __init -kibnal_module_init (void) -{ - int rc; - - vibnal_assert_wire_constants(); - - CLASSERT (offsetof(kib_msg_t, ibm_u) + sizeof(kib_connparams_t) - <= cm_REQ_priv_data_len); - CLASSERT (offsetof(kib_msg_t, ibm_u) + sizeof(kib_connparams_t) - <= cm_REP_priv_data_len); - CLASSERT (sizeof(kib_msg_t) <= IBNAL_MSG_SIZE); -#if !IBNAL_USE_FMR - CLASSERT (offsetof(kib_msg_t, ibm_u.get.ibgm_rd.rd_frags[IBNAL_MAX_RDMA_FRAGS]) - <= IBNAL_MSG_SIZE); - CLASSERT (offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[IBNAL_MAX_RDMA_FRAGS]) - <= IBNAL_MSG_SIZE); -#endif - rc = kibnal_tunables_init(); - if (rc != 0) - return rc; - - lnet_register_lnd(&the_kiblnd); - - return 0; -} - -MODULE_AUTHOR("Sun Microsystems, Inc. 
"); -MODULE_DESCRIPTION("Kernel Voltaire IB LND v1.00"); -MODULE_LICENSE("GPL"); - -module_init(kibnal_module_init); -module_exit(kibnal_module_fini); diff --git a/lnet/klnds/viblnd/viblnd.h b/lnet/klnds/viblnd/viblnd.h deleted file mode 100644 index f69abca..0000000 --- a/lnet/klnds/viblnd/viblnd.h +++ /dev/null @@ -1,693 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf - * - * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, - * CA 95054 USA or visit www.sun.com if you need additional information or - * have any questions. - * - * GPL HEADER END - */ -/* - * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lnet/klnds/viblnd/viblnd.h - * - * Author: Eric Barton - * Author: Frank Zago - */ - -#ifndef EXPORT_SYMTAB -# define EXPORT_SYMTAB -#endif -#ifndef AUTOCONF_INCLUDED -#include -#endif -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#define DEBUG_SUBSYSTEM S_LND - -#include -#include -#include -#include - -/* CPU_{L,B}E #defines needed by Voltaire headers */ -#include -#ifdef __BIG_ENDIAN__ -#define CPU_BE 1 -#define CPU_LE 0 -#endif -#ifdef __LITTLE_ENDIAN__ -#define CPU_BE 0 -#define CPU_LE 1 -#endif - -#include -#include -#include - -/* GCC 3.2.2, miscompiles this driver. - * See http://gcc.gnu.org/bugzilla/show_bug.cgi?id=9853. */ -#define GCC_VERSION ((__GNUC__*100 + __GNUC_MINOR__)*100 + __GNUC_PATCHLEVEL__) -#if (GCC_VERSION >= 30000) && (GCC_VERSION < 30203) -# error Invalid GCC version. 
Must use GCC < 3.0.0 || GCC >= 3.2.3 -#endif - -#ifdef CONFIG_SMP -# define IBNAL_N_SCHED num_online_cpus() /* # schedulers */ -#else -# define IBNAL_N_SCHED 1 /* # schedulers */ -#endif - -#define IBNAL_USE_FMR 1 - -/* tunables fixed at compile time */ -#define IBNAL_PEER_HASH_SIZE 101 /* # peer lists */ -#define IBNAL_RESCHED 100 /* # scheduler loops before reschedule */ -#define IBNAL_MSG_QUEUE_SIZE 8 /* # messages/RDMAs in-flight */ -#define IBNAL_CREDIT_HIGHWATER 7 /* when eagerly to return credits */ -#define IBNAL_MSG_SIZE (4<<10) /* max size of queued messages (inc hdr) */ - -/* constants derived from sdp-connection.c */ -#define IBNAL_QKEY 0 -#define IBNAL_PKEY 0xffff -#define IBNAL_PKEY_IDX 0 -#define IBNAL_SGID_IDX 0 -#define IBNAL_SERVICE_LEVEL 0 -#define IBNAL_STATIC_RATE 0 -#define IBNAL_EE_FLOW_CNT 1 -#define IBNAL_LOCAL_SUB 1 -#define IBNAL_TRAFFIC_CLASS 0 -#define IBNAL_SOURCE_PATH_BIT 0 -#define IBNAL_OUS_DST_RD 1 -#define IBNAL_IB_MTU vv_mtu_1024 - -/* constants derived from sdp-hca-params.h */ -#define PATH_RATE_2_5GB 2 -#define MLX_IPD_1x 1 -#define MLX_IPD_4x 0 -#define IBNAL_R_2_STATIC_RATE(r) ((r) == PATH_RATE_2_5GB ? MLX_IPD_1x : MLX_IPD_4x) - -/* other low-level IB constants */ -#define IBNAL_PKT_LIFETIME 5 -#define IBNAL_ARB_INITIATOR_DEPTH 0 -#define IBNAL_ARB_RESP_RES 0 -#define IBNAL_FAILOVER_ACCEPTED 0 - -/************************/ -/* derived constants... */ - -/* TX messages (shared by all connections) */ -#define IBNAL_TX_MSGS() (*kibnal_tunables.kib_ntx) -#define IBNAL_TX_MSG_BYTES() (IBNAL_TX_MSGS() * IBNAL_MSG_SIZE) -#define IBNAL_TX_MSG_PAGES() ((IBNAL_TX_MSG_BYTES() + PAGE_SIZE - 1)/PAGE_SIZE) - -#if IBNAL_USE_FMR -# define IBNAL_MAX_RDMA_FRAGS 1 -# define IBNAL_CONCURRENT_SENDS IBNAL_RX_MSGS -#else -# define IBNAL_MAX_RDMA_FRAGS LNET_MAX_IOV -# define IBNAL_CONCURRENT_SENDS IBNAL_MSG_QUEUE_SIZE -#endif - -/* RX messages (per connection) */ -#define IBNAL_RX_MSGS (IBNAL_MSG_QUEUE_SIZE*2) -#define IBNAL_RX_MSG_BYTES (IBNAL_RX_MSGS * IBNAL_MSG_SIZE) -#define IBNAL_RX_MSG_PAGES ((IBNAL_RX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE) - -#define IBNAL_CQ_ENTRIES() (IBNAL_TX_MSGS() * (1 + IBNAL_MAX_RDMA_FRAGS) + \ - IBNAL_RX_MSGS * *kibnal_tunables.kib_concurrent_peers) - -typedef struct -{ - unsigned int *kib_service_number; /* IB service number */ - int *kib_min_reconnect_interval; /* first failed connection retry... */ - int *kib_max_reconnect_interval; /* ...exponentially increasing to this */ - int *kib_concurrent_peers; /* max # nodes all talking to me */ - int *kib_cksum; /* checksum kib_msg_t? */ - int *kib_timeout; /* comms timeout (seconds) */ - int *kib_ntx; /* # tx descs */ - int *kib_credits; /* # concurrent sends */ - int *kib_peercredits; /* # concurrent sends to 1 peer */ - int *kib_arp_retries; /* # times to retry ARP */ - char **kib_hca_basename; /* HCA base name */ - char **kib_ipif_basename; /* IPoIB interface base name */ - int *kib_local_ack_timeout; /* IB RC QP ack timeout... */ - int *kib_retry_cnt; /* ...and retry */ - int *kib_rnr_cnt; /* RNR retries... 
*/ - int *kib_rnr_nak_timer; /* ...and interval */ - int *kib_keepalive; /* keepalive interval */ - int *kib_concurrent_sends; /* send work queue sizing */ -#if IBNAL_USE_FMR - int *kib_fmr_remaps; /* # FMR maps before unmap required */ -#endif -#if defined(CONFIG_SYSCTL) && !CFS_SYSFS_MODULE_PARM - cfs_sysctl_table_header_t *kib_sysctl; /* sysctl interface */ -#endif -} kib_tunables_t; - -typedef struct -{ - int ibp_npages; /* # pages */ - struct page *ibp_pages[0]; -} kib_pages_t; - -#if IBNAL_USE_FMR -typedef struct -{ - vv_fmr_h_t md_fmrhandle; /* FMR handle */ - int md_fmrcount; /* # mappings left */ - int md_active; /* mapping in use? */ - __u32 md_lkey; /* local key */ - __u32 md_rkey; /* remote key */ - __u64 md_addr; /* IO VM address */ -} kib_md_t; -#endif - -typedef struct -{ - int kib_init; /* initialisation state */ - __u64 kib_incarnation; /* which one am I */ - int kib_shutdown; /* shut down? */ - atomic_t kib_nthreads; /* # live threads */ - lnet_ni_t *kib_ni; /* _the_ nal instance */ - - vv_gid_t kib_port_gid; /* device/port GID */ - vv_p_key_t kib_port_pkey; /* device/port pkey */ - - cm_cep_handle_t kib_listen_handle; /* IB listen handle */ - - rwlock_t kib_global_lock; /* stabilize peer/conn ops */ - int kib_ready; /* CQ callback fired */ - int kib_checking_cq; /* a scheduler is checking the CQ */ - - struct list_head *kib_peers; /* hash table of all my known peers */ - int kib_peer_hash_size; /* size of kib_peers */ - atomic_t kib_npeers; /* # peers extant */ - atomic_t kib_nconns; /* # connections extant */ - - void *kib_connd; /* the connd task (serialisation assertions) */ - struct list_head kib_connd_peers; /* peers wanting to get connected */ - struct list_head kib_connd_pcreqs; /* passive connection requests */ - struct list_head kib_connd_conns; /* connections to setup/teardown */ - struct list_head kib_connd_zombies; /* connections with zero refcount */ - wait_queue_head_t kib_connd_waitq; /* connection daemon sleeps here */ - spinlock_t kib_connd_lock; /* serialise */ - - wait_queue_head_t kib_sched_waitq; /* schedulers sleep here */ - spinlock_t kib_sched_lock; /* serialise */ - - struct kib_tx *kib_tx_descs; /* all the tx descriptors */ - kib_pages_t *kib_tx_pages; /* premapped tx msg pages */ - - struct list_head kib_idle_txs; /* idle tx descriptors */ - __u64 kib_next_tx_cookie; /* RDMA completion cookie */ - spinlock_t kib_tx_lock; /* serialise */ - - vv_hca_h_t kib_hca; /* The HCA */ - vv_hca_attrib_t kib_hca_attrs; /* its properties */ - int kib_port; /* port on the device */ - vv_port_attrib_t kib_port_attr; /* its properties */ - - vv_pd_h_t kib_pd; /* protection domain */ - vv_cq_h_t kib_cq; /* completion queue */ - -} kib_data_t; - -#define IBNAL_INIT_NOTHING 0 -#define IBNAL_INIT_DATA 1 -#define IBNAL_INIT_LIB 2 -#define IBNAL_INIT_HCA 3 -#define IBNAL_INIT_ASYNC 4 -#define IBNAL_INIT_PD 5 -#define IBNAL_INIT_TXD 6 -#define IBNAL_INIT_CQ 7 -#define IBNAL_INIT_ALL 8 - -#include "viblnd_wire.h" - -/***********************************************************************/ - -typedef struct kib_rx /* receive message */ -{ - struct list_head rx_list; /* queue for attention */ - struct kib_conn *rx_conn; /* owning conn */ - int rx_nob; /* # bytes received (-1 while posted) */ - vv_l_key_t rx_lkey; /* local key */ - kib_msg_t *rx_msg; /* pre-mapped buffer (host vaddr) */ - vv_wr_t rx_wrq; /* receive work item */ - vv_scatgat_t rx_gl; /* and its memory */ -} kib_rx_t; - -typedef struct kib_tx /* transmit message */ -{ - struct list_head tx_list; /* queue on 
idle_txs ibc_tx_queue etc. */ - struct kib_conn *tx_conn; /* owning conn */ - int tx_sending; /* # tx callbacks outstanding */ - int tx_queued; /* queued for sending */ - int tx_waiting; /* waiting for peer */ - int tx_status; /* completion status */ - unsigned long tx_deadline; /* completion deadline */ - __u64 tx_cookie; /* completion cookie */ - lnet_msg_t *tx_lntmsg[2]; /* lnet msgs to finalize on completion */ - vv_l_key_t tx_lkey; /* local key for message buffer */ - kib_msg_t *tx_msg; /* message buffer (host vaddr) */ - int tx_nwrq; /* # send work items */ -#if IBNAL_USE_FMR - vv_wr_t tx_wrq[2]; /* send work items... */ - vv_scatgat_t tx_gl[2]; /* ...and their memory */ - kib_rdma_desc_t tx_rd[1]; /* rdma descriptor */ - kib_md_t tx_md; /* FMR mapping descriptor */ - __u64 *tx_pages; /* page phys addrs */ -#else - vv_wr_t *tx_wrq; /* send work items... */ - vv_scatgat_t *tx_gl; /* ...and their memory */ - kib_rdma_desc_t *tx_rd; /* rdma descriptor (src buffers) */ -#endif -} kib_tx_t; - -/* Passive connection request (listener callback) queued for handling by connd */ -typedef struct kib_pcreq -{ - struct list_head pcr_list; /* queue for handling by connd */ - cm_cep_handle_t pcr_cep; /* listening handle */ - cm_request_data_t pcr_cmreq; /* request data */ -} kib_pcreq_t; - -typedef struct kib_connvars -{ - /* connection-in-progress variables */ - __u32 cv_port; - __u32 cv_pkey_index; - __u32 cv_rnr_count; - __u32 cv_sgid_index; - __u32 cv_remote_qpn; - __u32 cv_local_qpn; - __u32 cv_rxpsn; - __u32 cv_txpsn; - ib_path_record_v2_t cv_path; - ibat_arp_data_t cv_arp; - ibat_stat_t cv_arprc; - cm_conn_data_t cv_conndata; -} kib_connvars_t; - -typedef struct kib_conn -{ - struct kib_peer *ibc_peer; /* owning peer */ - struct list_head ibc_list; /* stash on peer's conn list */ - __u64 ibc_incarnation; /* which instance of the peer */ - __u64 ibc_txseq; /* tx sequence number */ - __u64 ibc_rxseq; /* rx sequence number */ - __u32 ibc_version; /* peer protocol version */ - atomic_t ibc_refcount; /* # users */ - int ibc_state; /* what's happening */ - int ibc_nsends_posted; /* # uncompleted sends */ - int ibc_credits; /* # credits I have */ - int ibc_outstanding_credits; /* # credits to return */ - int ibc_reserved_credits; /* # credits for ACK/DONE msgs */ - int ibc_disconnect; /* some disconnect callback fired */ - int ibc_comms_error; /* set on comms error */ - unsigned long ibc_last_send; /* time of last send */ - struct list_head ibc_early_rxs; /* rxs completed before ESTABLISHED */ - struct list_head ibc_tx_queue_nocred; /* sends that don't need a cred */ - struct list_head ibc_tx_queue_rsrvd; /* sends that need a reserved cred */ - struct list_head ibc_tx_queue; /* send queue */ - struct list_head ibc_active_txs; /* active tx awaiting completion */ - spinlock_t ibc_lock; /* serialise */ - kib_rx_t *ibc_rxs; /* the rx descs */ - kib_pages_t *ibc_rx_pages; /* premapped rx msg pages */ - vv_qp_h_t ibc_qp; /* queue pair */ - cm_cep_handle_t ibc_cep; /* connection endpoint */ - kib_connvars_t *ibc_connvars; /* in-progress connection state */ -} kib_conn_t; - -#define IBNAL_CONN_INIT_NOTHING 0 /* incomplete init */ -#define IBNAL_CONN_INIT_QP 1 /* QP allocated */ -#define IBNAL_CONN_INIT 2 /* completed init */ -#define IBNAL_CONN_ACTIVE_ARP 3 /* active arping */ -#define IBNAL_CONN_ACTIVE_CONNECT 4 /* active sending req */ -#define IBNAL_CONN_ACTIVE_CHECK_REPLY 5 /* active checking reply */ -#define IBNAL_CONN_ACTIVE_RTU 6 /* active sending rtu */ -#define IBNAL_CONN_PASSIVE_WAIT 7 /* 
passive waiting for rtu */ -#define IBNAL_CONN_ESTABLISHED 8 /* connection established */ -#define IBNAL_CONN_DISCONNECT1 9 /* disconnect phase 1 */ -#define IBNAL_CONN_DISCONNECT2 10 /* disconnect phase 2 */ -#define IBNAL_CONN_DISCONNECTED 11 /* disconnect complete */ - -typedef struct kib_peer -{ - struct list_head ibp_list; /* stash on global peer list */ - struct list_head ibp_connd_list; /* schedule on kib_connd_peers */ - lnet_nid_t ibp_nid; /* who's on the other end(s) */ - __u32 ibp_ip; /* IP to query for peer conn params */ - int ibp_port; /* port to qery for peer conn params */ - __u64 ibp_incarnation; /* peer's incarnation */ - atomic_t ibp_refcount; /* # users */ - int ibp_persistence; /* "known" peer refs */ - struct list_head ibp_conns; /* all active connections */ - struct list_head ibp_tx_queue; /* msgs waiting for a conn */ - int ibp_connecting; /* current active connection attempts */ - int ibp_accepting; /* current passive connection attempts */ - int ibp_arp_count; /* # arp attempts */ - unsigned long ibp_reconnect_time; /* when reconnect may be attempted */ - unsigned long ibp_reconnect_interval; /* exponential backoff */ - int ibp_error; /* errno on closing this peer */ - cfs_time_t ibp_last_alive; /* when (in jiffies) I was last alive */ -} kib_peer_t; - - -extern kib_data_t kibnal_data; -extern kib_tunables_t kibnal_tunables; - -int kibnal_startup (lnet_ni_t *ni); -void kibnal_shutdown (lnet_ni_t *ni); -int kibnal_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg); -int kibnal_send (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg); -extern int kibnal_eager_recv (lnet_ni_t *ni, void *private, - lnet_msg_t *lntmsg, void **new_private); -int kibnal_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, - int delayed, unsigned int niov, - struct iovec *iov, lnet_kiov_t *kiov, - unsigned int offset, unsigned int mlen, unsigned int rlen); -extern void kibnal_init_msg(kib_msg_t *msg, int type, int body_nob); -extern void kibnal_pack_msg(kib_msg_t *msg, __u32 version, int credits, - lnet_nid_t dstnid, __u64 dststamp, __u64 seq); -extern int kibnal_unpack_msg(kib_msg_t *msg, __u32 expected_version, int nob); -extern int kibnal_create_peer(kib_peer_t **peerp, lnet_nid_t nid); -extern void kibnal_destroy_peer(kib_peer_t *peer); -extern int kibnal_add_persistent_peer (lnet_nid_t nid, __u32 ip); -extern int kibnal_del_peer(lnet_nid_t nid); -extern kib_peer_t *kibnal_find_peer_locked(lnet_nid_t nid); -extern void kibnal_unlink_peer_locked(kib_peer_t *peer); -extern void kibnal_peer_alive(kib_peer_t *peer); -extern int kibnal_close_stale_conns_locked(kib_peer_t *peer, - __u64 incarnation); -extern kib_conn_t *kibnal_create_conn(cm_cep_handle_t cep); -extern void kibnal_listen_callback(cm_cep_handle_t cep, cm_conn_data_t *info, void *arg); - -extern int kibnal_alloc_pages(kib_pages_t **pp, int npages, int access); -extern void kibnal_free_pages(kib_pages_t *p); - -extern void kibnal_check_sends(kib_conn_t *conn); -extern void kibnal_close_conn_locked(kib_conn_t *conn, int error); -extern void kibnal_destroy_conn(kib_conn_t *conn); -extern int kibnal_thread_start(int (*fn)(void *arg), void *arg); -extern int kibnal_scheduler(void *arg); -extern int kibnal_connd(void *arg); -extern void kibnal_init_tx_msg(kib_tx_t *tx, int type, int body_nob); -extern void kibnal_close_conn(kib_conn_t *conn, int why); -extern int kibnal_set_qp_state(kib_conn_t *conn, vv_qp_state_t new_state); -extern void kibnal_async_callback(vv_event_record_t ev); -extern void kibnal_cq_callback(unsigned long 
context); -extern void kibnal_passive_connreq(kib_pcreq_t *pcr, int reject); -extern void kibnal_txlist_done (struct list_head *txlist, int status); -extern void kibnal_queue_tx(kib_tx_t *tx, kib_conn_t *conn); -extern int kibnal_init_rdma(kib_tx_t *tx, int type, int nob, - kib_rdma_desc_t *dstrd, __u64 dstcookie); -extern int kibnal_tunables_init(void); -extern void kibnal_tunables_fini(void); - -#define kibnal_conn_addref(conn) \ -do { \ - CDEBUG(D_NET, "conn[%p] (%d)++\n", \ - (conn), atomic_read(&(conn)->ibc_refcount)); \ - LASSERT(atomic_read(&(conn)->ibc_refcount) > 0); \ - atomic_inc(&(conn)->ibc_refcount); \ -} while (0) - -#define kibnal_conn_decref(conn) \ -do { \ - unsigned long flags; \ - \ - CDEBUG(D_NET, "conn[%p] (%d)--\n", \ - (conn), atomic_read(&(conn)->ibc_refcount)); \ - LASSERT(atomic_read(&(conn)->ibc_refcount) > 0); \ - if (atomic_dec_and_test(&(conn)->ibc_refcount)) { \ - spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags); \ - list_add_tail(&(conn)->ibc_list, \ - &kibnal_data.kib_connd_zombies); \ - wake_up(&kibnal_data.kib_connd_waitq); \ - spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags); \ - } \ -} while (0) - -#define kibnal_peer_addref(peer) \ -do { \ - CDEBUG(D_NET, "peer[%p] -> %s (%d)++\n", \ - (peer), libcfs_nid2str((peer)->ibp_nid), \ - atomic_read (&(peer)->ibp_refcount)); \ - LASSERT(atomic_read(&(peer)->ibp_refcount) > 0); \ - atomic_inc(&(peer)->ibp_refcount); \ -} while (0) - -#define kibnal_peer_decref(peer) \ -do { \ - CDEBUG(D_NET, "peer[%p] -> %s (%d)--\n", \ - (peer), libcfs_nid2str((peer)->ibp_nid), \ - atomic_read (&(peer)->ibp_refcount)); \ - LASSERT(atomic_read(&(peer)->ibp_refcount) > 0); \ - if (atomic_dec_and_test(&(peer)->ibp_refcount)) \ - kibnal_destroy_peer(peer); \ -} while (0) - -static inline struct list_head * -kibnal_nid2peerlist (lnet_nid_t nid) -{ - unsigned int hash = ((unsigned int)nid) % kibnal_data.kib_peer_hash_size; - - return (&kibnal_data.kib_peers [hash]); -} - -static inline int -kibnal_peer_active (kib_peer_t *peer) -{ - /* Am I in the peer hash table? 
*/ - return (!list_empty(&peer->ibp_list)); -} - -static inline void -kibnal_queue_tx_locked (kib_tx_t *tx, kib_conn_t *conn) -{ - struct list_head *q; - - LASSERT (tx->tx_nwrq > 0); /* work items set up */ - LASSERT (!tx->tx_queued); /* not queued for sending already */ - - tx->tx_queued = 1; - tx->tx_deadline = jiffies + (*kibnal_tunables.kib_timeout * HZ); - - if (tx->tx_conn == NULL) { - kibnal_conn_addref(conn); - tx->tx_conn = conn; - LASSERT (tx->tx_msg->ibm_type != IBNAL_MSG_PUT_DONE); - } else { - LASSERT (tx->tx_conn == conn); - LASSERT (tx->tx_msg->ibm_type == IBNAL_MSG_PUT_DONE); - } - - if (conn->ibc_version == IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD) { - /* All messages have simple credit control */ - q = &conn->ibc_tx_queue; - } else { - LASSERT (conn->ibc_version == IBNAL_MSG_VERSION); - - switch (tx->tx_msg->ibm_type) { - case IBNAL_MSG_PUT_REQ: - case IBNAL_MSG_GET_REQ: - /* RDMA request: reserve a buffer for the RDMA reply - * before sending */ - q = &conn->ibc_tx_queue_rsrvd; - break; - - case IBNAL_MSG_PUT_NAK: - case IBNAL_MSG_PUT_ACK: - case IBNAL_MSG_PUT_DONE: - case IBNAL_MSG_GET_DONE: - /* RDMA reply/completion: no credits; peer has reserved - * a reply buffer */ - q = &conn->ibc_tx_queue_nocred; - break; - - case IBNAL_MSG_NOOP: - case IBNAL_MSG_IMMEDIATE: - /* Otherwise: consume a credit before sending */ - q = &conn->ibc_tx_queue; - break; - - default: - LBUG(); - q = NULL; - } - } - - list_add_tail(&tx->tx_list, q); -} - -static inline int -kibnal_send_keepalive(kib_conn_t *conn) -{ - return (*kibnal_tunables.kib_keepalive > 0) && - time_after(jiffies, conn->ibc_last_send + - *kibnal_tunables.kib_keepalive*HZ); -} - -#ifndef IBNAL_VOIDSTAR_SGADDR -# define IBNAL_VOIDSTAR_SGADDR 0 -#endif - -#if IBNAL_VOIDSTAR_SGADDR -# if defined(CONFIG_HIGHMEM) -# if defined(CONFIG_X86) && defined(CONFIG_HIGHMEM4G) - /* truncation to void* doesn't matter if 0 <= physmem < 4G - * so allow x86 with 32 bit phys addrs */ -# elif defined(CONFIG_IA64) - /* OK anyway on 64-bit arch */ -# else -# error "Can't support HIGHMEM when vv_scatgat_t::v_address is void *" -# endif -# endif -# define KIBNAL_ADDR2SG(a) ((void *)((unsigned long)(a))) -# define KIBNAL_SG2ADDR(a) ((__u64)((unsigned long)(a))) -static inline __u64 kibnal_addr2net (__u64 addr) -{ - void *netaddr; - vv_return_t vvrc = vv_va2advertise_addr(kibnal_data.kib_hca, - KIBNAL_ADDR2SG(addr), - &netaddr); - LASSERT (vvrc == vv_return_ok); - return KIBNAL_SG2ADDR(netaddr); -} -#else -# define KIBNAL_ADDR2SG(a) a -# define KIBNAL_SG2ADDR(a) a -static inline __u64 kibnal_addr2net (__u64 addr) -{ - __u64 netaddr; - vv_return_t vvrc = vv_va2advertise_addr(kibnal_data.kib_hca, - addr, - &netaddr); - LASSERT (vvrc == vv_return_ok); - return netaddr; -} -#endif - -/* CAVEAT EMPTOR: We rely on tx/rx descriptor alignment to allow us to use the - * lowest 2 bits of the work request id to stash the work item type (the op - * field is not valid when the wc completes in error). 
*/ - -#define IBNAL_WID_TX 0 -#define IBNAL_WID_RX 1 -#define IBNAL_WID_RDMA 2 -#define IBNAL_WID_MASK 3UL - -static inline vv_wr_id_t -kibnal_ptr2wreqid (void *ptr, int type) -{ - unsigned long lptr = (unsigned long)ptr; - - LASSERT ((lptr & IBNAL_WID_MASK) == 0); - LASSERT ((type & ~IBNAL_WID_MASK) == 0); - return (vv_wr_id_t)(lptr | type); -} - -static inline void * -kibnal_wreqid2ptr (vv_wr_id_t wreqid) -{ - return (void *)(((unsigned long)wreqid) & ~IBNAL_WID_MASK); -} - -static inline int -kibnal_wreqid2type (vv_wr_id_t wreqid) -{ - return (wreqid & IBNAL_WID_MASK); -} - -static inline void -kibnal_set_conn_state (kib_conn_t *conn, int state) -{ - conn->ibc_state = state; - mb(); -} - -#if IBNAL_USE_FMR - -static inline int -kibnal_rd_size (kib_rdma_desc_t *rd) -{ - return rd->rd_nob; -} - -#else -static inline __u64 -kibnal_rf_addr (kib_rdma_frag_t *rf) -{ - return (((__u64)rf->rf_addr_hi)<<32) | ((__u64)rf->rf_addr_lo); -} - -static inline void -kibnal_rf_set (kib_rdma_frag_t *rf, __u64 addr, int nob) -{ - rf->rf_addr_lo = addr & 0xffffffff; - rf->rf_addr_hi = (addr >> 32) & 0xffffffff; - rf->rf_nob = nob; -} - -static inline int -kibnal_rd_size (kib_rdma_desc_t *rd) -{ - int i; - int size; - - for (i = size = 0; i < rd->rd_nfrag; i++) - size += rd->rd_frags[i].rf_nob; - - return size; -} -#endif diff --git a/lnet/klnds/viblnd/viblnd_cb.c b/lnet/klnds/viblnd/viblnd_cb.c deleted file mode 100644 index ec096aa..0000000 --- a/lnet/klnds/viblnd/viblnd_cb.c +++ /dev/null @@ -1,3694 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf - * - * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, - * CA 95054 USA or visit www.sun.com if you need additional information or - * have any questions. - * - * GPL HEADER END - */ -/* - * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. 
- * - * lnet/klnds/viblnd/viblnd_cb.c - * - * Author: Eric Barton - * Author: Frank Zago - */ - -#include "viblnd.h" - -void -kibnal_tx_done (kib_tx_t *tx) -{ - lnet_msg_t *lntmsg[2]; - int rc = tx->tx_status; - int i; - - LASSERT (!in_interrupt()); - LASSERT (!tx->tx_queued); /* mustn't be queued for sending */ - LASSERT (tx->tx_sending == 0); /* mustn't be awaiting sent callback */ - LASSERT (!tx->tx_waiting); /* mustn't be awaiting peer response */ - -#if IBNAL_USE_FMR - if (tx->tx_md.md_fmrcount == 0 || - (rc != 0 && tx->tx_md.md_active)) { - vv_return_t vvrc; - - /* mapping must be active (it dropped fmrcount to 0) */ - LASSERT (tx->tx_md.md_active); - - vvrc = vv_unmap_fmr(kibnal_data.kib_hca, - 1, &tx->tx_md.md_fmrhandle); - LASSERT (vvrc == vv_return_ok); - - tx->tx_md.md_fmrcount = *kibnal_tunables.kib_fmr_remaps; - } - tx->tx_md.md_active = 0; -#endif - - /* tx may have up to 2 lnet msgs to finalise */ - lntmsg[0] = tx->tx_lntmsg[0]; tx->tx_lntmsg[0] = NULL; - lntmsg[1] = tx->tx_lntmsg[1]; tx->tx_lntmsg[1] = NULL; - - if (tx->tx_conn != NULL) { - kibnal_conn_decref(tx->tx_conn); - tx->tx_conn = NULL; - } - - tx->tx_nwrq = 0; - tx->tx_status = 0; - - spin_lock(&kibnal_data.kib_tx_lock); - - list_add (&tx->tx_list, &kibnal_data.kib_idle_txs); - - spin_unlock(&kibnal_data.kib_tx_lock); - - /* delay finalize until my descs have been freed */ - for (i = 0; i < 2; i++) { - if (lntmsg[i] == NULL) - continue; - - lnet_finalize (kibnal_data.kib_ni, lntmsg[i], rc); - } -} - -void -kibnal_txlist_done (struct list_head *txlist, int status) -{ - kib_tx_t *tx; - - while (!list_empty (txlist)) { - tx = list_entry (txlist->next, kib_tx_t, tx_list); - - list_del (&tx->tx_list); - /* complete now */ - tx->tx_waiting = 0; - tx->tx_status = status; - kibnal_tx_done (tx); - } -} - -kib_tx_t * -kibnal_get_idle_tx (void) -{ - kib_tx_t *tx; - - spin_lock(&kibnal_data.kib_tx_lock); - - if (list_empty (&kibnal_data.kib_idle_txs)) { - spin_unlock(&kibnal_data.kib_tx_lock); - return NULL; - } - - tx = list_entry (kibnal_data.kib_idle_txs.next, kib_tx_t, tx_list); - list_del (&tx->tx_list); - - /* Allocate a new completion cookie. It might not be needed, - * but we've got a lock right now and we're unlikely to - * wrap... 
*/ - tx->tx_cookie = kibnal_data.kib_next_tx_cookie++; - - spin_unlock(&kibnal_data.kib_tx_lock); - - LASSERT (tx->tx_nwrq == 0); - LASSERT (!tx->tx_queued); - LASSERT (tx->tx_sending == 0); - LASSERT (!tx->tx_waiting); - LASSERT (tx->tx_status == 0); - LASSERT (tx->tx_conn == NULL); - LASSERT (tx->tx_lntmsg[0] == NULL); - LASSERT (tx->tx_lntmsg[1] == NULL); - - return tx; -} - -int -kibnal_post_rx (kib_rx_t *rx, int credit, int rsrvd_credit) -{ - kib_conn_t *conn = rx->rx_conn; - int rc = 0; - __u64 addr = (__u64)((unsigned long)((rx)->rx_msg)); - vv_return_t vvrc; - - LASSERT (!in_interrupt()); - /* old peers don't reserve rxs for RDMA replies */ - LASSERT (!rsrvd_credit || - conn->ibc_version != IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD); - - rx->rx_gl = (vv_scatgat_t) { - .v_address = KIBNAL_ADDR2SG(addr), - .l_key = rx->rx_lkey, - .length = IBNAL_MSG_SIZE, - }; - - rx->rx_wrq = (vv_wr_t) { - .wr_id = kibnal_ptr2wreqid(rx, IBNAL_WID_RX), - .completion_notification = 1, - .scatgat_list = &rx->rx_gl, - .num_of_data_segments = 1, - .wr_type = vv_wr_receive, - }; - - LASSERT (conn->ibc_state >= IBNAL_CONN_INIT); - LASSERT (rx->rx_nob >= 0); /* not posted */ - - CDEBUG(D_NET, "posting rx [%d %x "LPX64"]\n", - rx->rx_wrq.scatgat_list->length, - rx->rx_wrq.scatgat_list->l_key, - KIBNAL_SG2ADDR(rx->rx_wrq.scatgat_list->v_address)); - - if (conn->ibc_state > IBNAL_CONN_ESTABLISHED) { - /* No more posts for this rx; so lose its ref */ - kibnal_conn_decref(conn); - return 0; - } - - rx->rx_nob = -1; /* flag posted */ - - spin_lock(&conn->ibc_lock); - /* Serialise vv_post_receive; it's not re-entrant on the same QP */ - vvrc = vv_post_receive(kibnal_data.kib_hca, - conn->ibc_qp, &rx->rx_wrq); - - if (vvrc == vv_return_ok) { - if (credit) - conn->ibc_outstanding_credits++; - if (rsrvd_credit) - conn->ibc_reserved_credits++; - - spin_unlock(&conn->ibc_lock); - - if (credit || rsrvd_credit) - kibnal_check_sends(conn); - - return 0; - } - - spin_unlock(&conn->ibc_lock); - - CERROR ("post rx -> %s failed %d\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid), vvrc); - rc = -EIO; - kibnal_close_conn(conn, rc); - /* No more posts for this rx; so lose its ref */ - kibnal_conn_decref(conn); - return rc; -} - -int -kibnal_post_receives (kib_conn_t *conn) -{ - int i; - int rc; - - LASSERT (conn->ibc_state < IBNAL_CONN_ESTABLISHED); - LASSERT (conn->ibc_comms_error == 0); - - for (i = 0; i < IBNAL_RX_MSGS; i++) { - /* +1 ref for rx desc. This ref remains until kibnal_post_rx - * fails (i.e. actual failure or we're disconnecting) */ - kibnal_conn_addref(conn); - rc = kibnal_post_rx (&conn->ibc_rxs[i], 0, 0); - if (rc != 0) - return rc; - } - - return 0; -} - -kib_tx_t * -kibnal_find_waiting_tx_locked(kib_conn_t *conn, int txtype, __u64 cookie) -{ - struct list_head *tmp; - - list_for_each(tmp, &conn->ibc_active_txs) { - kib_tx_t *tx = list_entry(tmp, kib_tx_t, tx_list); - - LASSERT (!tx->tx_queued); - LASSERT (tx->tx_sending != 0 || tx->tx_waiting); - - if (tx->tx_cookie != cookie) - continue; - - if (tx->tx_waiting && - tx->tx_msg->ibm_type == txtype) - return tx; - - CWARN("Bad completion: %swaiting, type %x (wanted %x)\n", - tx->tx_waiting ? 
"" : "NOT ", - tx->tx_msg->ibm_type, txtype); - } - return NULL; -} - -void -kibnal_handle_completion(kib_conn_t *conn, int txtype, int status, __u64 cookie) -{ - kib_tx_t *tx; - int idle; - - spin_lock(&conn->ibc_lock); - - tx = kibnal_find_waiting_tx_locked(conn, txtype, cookie); - if (tx == NULL) { - spin_unlock(&conn->ibc_lock); - - CWARN("Unmatched completion type %x cookie "LPX64" from %s\n", - txtype, cookie, libcfs_nid2str(conn->ibc_peer->ibp_nid)); - kibnal_close_conn (conn, -EPROTO); - return; - } - - if (tx->tx_status == 0) { /* success so far */ - if (status < 0) { /* failed? */ - tx->tx_status = status; - } else if (txtype == IBNAL_MSG_GET_REQ) { - lnet_set_reply_msg_len(kibnal_data.kib_ni, - tx->tx_lntmsg[1], status); - } - } - - tx->tx_waiting = 0; - - idle = !tx->tx_queued && (tx->tx_sending == 0); - if (idle) - list_del(&tx->tx_list); - - spin_unlock(&conn->ibc_lock); - - if (idle) - kibnal_tx_done(tx); -} - -void -kibnal_send_completion (kib_conn_t *conn, int type, int status, __u64 cookie) -{ - kib_tx_t *tx = kibnal_get_idle_tx(); - - if (tx == NULL) { - CERROR("Can't get tx for completion %x for %s\n", - type, libcfs_nid2str(conn->ibc_peer->ibp_nid)); - return; - } - - tx->tx_msg->ibm_u.completion.ibcm_status = status; - tx->tx_msg->ibm_u.completion.ibcm_cookie = cookie; - kibnal_init_tx_msg(tx, type, sizeof(kib_completion_msg_t)); - - kibnal_queue_tx(tx, conn); -} - -void -kibnal_handle_rx (kib_rx_t *rx) -{ - kib_msg_t *msg = rx->rx_msg; - kib_conn_t *conn = rx->rx_conn; - int credits = msg->ibm_credits; - kib_tx_t *tx; - int rc = 0; - int repost = 1; - int rsrvd_credit = 0; - int rc2; - - LASSERT (conn->ibc_state >= IBNAL_CONN_ESTABLISHED); - - CDEBUG (D_NET, "Received %x[%d] from %s\n", - msg->ibm_type, credits, libcfs_nid2str(conn->ibc_peer->ibp_nid)); - - if (credits != 0) { - /* Have I received credits that will let me send? */ - spin_lock(&conn->ibc_lock); - conn->ibc_credits += credits; - spin_unlock(&conn->ibc_lock); - - kibnal_check_sends(conn); - } - - switch (msg->ibm_type) { - default: - CERROR("Bad IBNAL message type %x from %s\n", - msg->ibm_type, libcfs_nid2str(conn->ibc_peer->ibp_nid)); - rc = -EPROTO; - break; - - case IBNAL_MSG_NOOP: - break; - - case IBNAL_MSG_IMMEDIATE: - rc = lnet_parse(kibnal_data.kib_ni, &msg->ibm_u.immediate.ibim_hdr, - msg->ibm_srcnid, rx, 0); - repost = rc < 0; /* repost on error */ - break; - - case IBNAL_MSG_PUT_REQ: - rc = lnet_parse(kibnal_data.kib_ni, &msg->ibm_u.putreq.ibprm_hdr, - msg->ibm_srcnid, rx, 1); - repost = rc < 0; /* repost on error */ - break; - - case IBNAL_MSG_PUT_NAK: - rsrvd_credit = 1; /* rdma reply (was pre-reserved) */ - - CWARN ("PUT_NACK from %s\n", libcfs_nid2str(conn->ibc_peer->ibp_nid)); - kibnal_handle_completion(conn, IBNAL_MSG_PUT_REQ, - msg->ibm_u.completion.ibcm_status, - msg->ibm_u.completion.ibcm_cookie); - break; - - case IBNAL_MSG_PUT_ACK: - rsrvd_credit = 1; /* rdma reply (was pre-reserved) */ - - spin_lock(&conn->ibc_lock); - tx = kibnal_find_waiting_tx_locked(conn, IBNAL_MSG_PUT_REQ, - msg->ibm_u.putack.ibpam_src_cookie); - if (tx != NULL) - list_del(&tx->tx_list); - spin_unlock(&conn->ibc_lock); - - if (tx == NULL) { - CERROR("Unmatched PUT_ACK from %s\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid)); - rc = -EPROTO; - break; - } - - LASSERT (tx->tx_waiting); - /* CAVEAT EMPTOR: I could be racing with tx_complete, but... - * (a) I can overwrite tx_msg since my peer has received it! - * (b) tx_waiting set tells tx_complete() it's not done. 
*/ - - tx->tx_nwrq = 0; /* overwrite PUT_REQ */ - - rc2 = kibnal_init_rdma(tx, IBNAL_MSG_PUT_DONE, - kibnal_rd_size(&msg->ibm_u.putack.ibpam_rd), - &msg->ibm_u.putack.ibpam_rd, - msg->ibm_u.putack.ibpam_dst_cookie); - if (rc2 < 0) - CERROR("Can't setup rdma for PUT to %s: %d\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid), rc2); - - spin_lock(&conn->ibc_lock); - if (tx->tx_status == 0 && rc2 < 0) - tx->tx_status = rc2; - tx->tx_waiting = 0; /* clear waiting and queue atomically */ - kibnal_queue_tx_locked(tx, conn); - spin_unlock(&conn->ibc_lock); - break; - - case IBNAL_MSG_PUT_DONE: - /* This buffer was pre-reserved by not returning the credit - * when the PUT_REQ's buffer was reposted, so I just return it - * now */ - kibnal_handle_completion(conn, IBNAL_MSG_PUT_ACK, - msg->ibm_u.completion.ibcm_status, - msg->ibm_u.completion.ibcm_cookie); - break; - - case IBNAL_MSG_GET_REQ: - rc = lnet_parse(kibnal_data.kib_ni, &msg->ibm_u.get.ibgm_hdr, - msg->ibm_srcnid, rx, 1); - repost = rc < 0; /* repost on error */ - break; - - case IBNAL_MSG_GET_DONE: - rsrvd_credit = 1; /* rdma reply (was pre-reserved) */ - - kibnal_handle_completion(conn, IBNAL_MSG_GET_REQ, - msg->ibm_u.completion.ibcm_status, - msg->ibm_u.completion.ibcm_cookie); - break; - } - - if (rc < 0) /* protocol error */ - kibnal_close_conn(conn, rc); - - if (repost) { - if (conn->ibc_version == IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD) - rsrvd_credit = 0; /* peer isn't pre-reserving */ - - kibnal_post_rx(rx, !rsrvd_credit, rsrvd_credit); - } -} - -void -kibnal_rx_complete (kib_rx_t *rx, vv_comp_status_t vvrc, int nob, __u64 rxseq) -{ - kib_msg_t *msg = rx->rx_msg; - kib_conn_t *conn = rx->rx_conn; - unsigned long flags; - int rc; - - CDEBUG(D_NET, "rx %p conn %p\n", rx, conn); - LASSERT (rx->rx_nob < 0); /* was posted */ - rx->rx_nob = 0; /* isn't now */ - - if (conn->ibc_state > IBNAL_CONN_ESTABLISHED) - goto ignore; - - if (vvrc != vv_comp_status_success) { - CERROR("Rx from %s failed: %d\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid), vvrc); - goto failed; - } - - rc = kibnal_unpack_msg(msg, conn->ibc_version, nob); - if (rc != 0) { - CERROR ("Error %d unpacking rx from %s\n", - rc, libcfs_nid2str(conn->ibc_peer->ibp_nid)); - goto failed; - } - - rx->rx_nob = nob; /* Can trust 'nob' now */ - - if (conn->ibc_peer->ibp_nid != msg->ibm_srcnid || - kibnal_data.kib_ni->ni_nid != msg->ibm_dstnid || - msg->ibm_srcstamp != conn->ibc_incarnation || - msg->ibm_dststamp != kibnal_data.kib_incarnation) { - CERROR ("Stale rx from %s\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid)); - goto failed; - } - - if (msg->ibm_seq != rxseq) { - CERROR ("Out-of-sequence rx from %s" - ": got "LPD64" but expected "LPD64"\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid), - msg->ibm_seq, rxseq); - goto failed; - } - - /* set time last known alive */ - kibnal_peer_alive(conn->ibc_peer); - - /* racing with connection establishment/teardown! 
*/ - - if (conn->ibc_state < IBNAL_CONN_ESTABLISHED) { - write_lock_irqsave(&kibnal_data.kib_global_lock, flags); - /* must check holding global lock to eliminate race */ - if (conn->ibc_state < IBNAL_CONN_ESTABLISHED) { - list_add_tail(&rx->rx_list, &conn->ibc_early_rxs); - write_unlock_irqrestore(&kibnal_data.kib_global_lock, - flags); - return; - } - write_unlock_irqrestore(&kibnal_data.kib_global_lock, - flags); - } - kibnal_handle_rx(rx); - return; - - failed: - CDEBUG(D_NET, "rx %p conn %p\n", rx, conn); - kibnal_close_conn(conn, -EIO); - ignore: - /* Don't re-post rx & drop its ref on conn */ - kibnal_conn_decref(conn); -} - -struct page * -kibnal_kvaddr_to_page (unsigned long vaddr) -{ - struct page *page; - - if (vaddr >= VMALLOC_START && - vaddr < VMALLOC_END) { - page = vmalloc_to_page ((void *)vaddr); - LASSERT (page != NULL); - return page; - } -#ifdef CONFIG_HIGHMEM - if (vaddr >= PKMAP_BASE && - vaddr < (PKMAP_BASE + LAST_PKMAP * PAGE_SIZE)) { - /* No highmem pages only used for bulk (kiov) I/O */ - CERROR("find page for address in highmem\n"); - LBUG(); - } -#endif - page = virt_to_page (vaddr); - LASSERT (page != NULL); - return page; -} - -#if !IBNAL_USE_FMR -int -kibnal_append_rdfrag(kib_rdma_desc_t *rd, int active, struct page *page, - unsigned long page_offset, unsigned long len) -{ - kib_rdma_frag_t *frag = &rd->rd_frags[rd->rd_nfrag]; - vv_l_key_t l_key; - vv_r_key_t r_key; - __u64 addr; - __u64 frag_addr; - vv_mem_reg_h_t mem_h; - vv_return_t vvrc; - - if (rd->rd_nfrag >= IBNAL_MAX_RDMA_FRAGS) { - CERROR ("Too many RDMA fragments\n"); - return -EMSGSIZE; - } - - /* Try to create an address that adaptor-tavor will munge into a valid - * network address, given how it maps all phys mem into 1 region */ - addr = lnet_page2phys(page) + page_offset + PAGE_OFFSET; - - /* NB this relies entirely on there being a single region for the whole - * of memory, since "high" memory will wrap in the (void *) cast! 
*/ - vvrc = vv_get_gen_mr_attrib(kibnal_data.kib_hca, - (void *)((unsigned long)addr), - len, &mem_h, &l_key, &r_key); - LASSERT (vvrc == vv_return_ok); - - if (active) { - if (rd->rd_nfrag == 0) { - rd->rd_key = l_key; - } else if (l_key != rd->rd_key) { - CERROR ("> 1 key for single RDMA desc\n"); - return -EINVAL; - } - frag_addr = addr; - } else { - if (rd->rd_nfrag == 0) { - rd->rd_key = r_key; - } else if (r_key != rd->rd_key) { - CERROR ("> 1 key for single RDMA desc\n"); - return -EINVAL; - } - - frag_addr = kibnal_addr2net(addr); - } - - kibnal_rf_set(frag, frag_addr, len); - - CDEBUG(D_NET,"map frag [%d][%d %x %08x%08x] "LPX64"\n", - rd->rd_nfrag, frag->rf_nob, rd->rd_key, - frag->rf_addr_hi, frag->rf_addr_lo, frag_addr); - - rd->rd_nfrag++; - return 0; -} - -int -kibnal_setup_rd_iov(kib_tx_t *tx, kib_rdma_desc_t *rd, - vv_access_con_bit_mask_t access, - unsigned int niov, struct iovec *iov, int offset, int nob) -{ - /* active if I'm sending */ - int active = ((access & vv_acc_r_mem_write) == 0); - int fragnob; - int rc; - unsigned long vaddr; - struct page *page; - int page_offset; - - LASSERT (nob > 0); - LASSERT (niov > 0); - LASSERT ((rd != tx->tx_rd) == !active); - - while (offset >= iov->iov_len) { - offset -= iov->iov_len; - niov--; - iov++; - LASSERT (niov > 0); - } - - rd->rd_nfrag = 0; - do { - LASSERT (niov > 0); - - vaddr = ((unsigned long)iov->iov_base) + offset; - page_offset = vaddr & (PAGE_SIZE - 1); - page = kibnal_kvaddr_to_page(vaddr); - if (page == NULL) { - CERROR ("Can't find page\n"); - return -EFAULT; - } - - fragnob = min((int)(iov->iov_len - offset), nob); - fragnob = min(fragnob, (int)PAGE_SIZE - page_offset); - - rc = kibnal_append_rdfrag(rd, active, page, - page_offset, fragnob); - if (rc != 0) - return rc; - - if (offset + fragnob < iov->iov_len) { - offset += fragnob; - } else { - offset = 0; - iov++; - niov--; - } - nob -= fragnob; - } while (nob > 0); - - return 0; -} - -int -kibnal_setup_rd_kiov (kib_tx_t *tx, kib_rdma_desc_t *rd, - vv_access_con_bit_mask_t access, - int nkiov, lnet_kiov_t *kiov, int offset, int nob) -{ - /* active if I'm sending */ - int active = ((access & vv_acc_r_mem_write) == 0); - int fragnob; - int rc; - - CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob); - - LASSERT (nob > 0); - LASSERT (nkiov > 0); - LASSERT ((rd != tx->tx_rd) == !active); - - while (offset >= kiov->kiov_len) { - offset -= kiov->kiov_len; - nkiov--; - kiov++; - LASSERT (nkiov > 0); - } - - rd->rd_nfrag = 0; - do { - LASSERT (nkiov > 0); - fragnob = min((int)(kiov->kiov_len - offset), nob); - - rc = kibnal_append_rdfrag(rd, active, kiov->kiov_page, - kiov->kiov_offset + offset, - fragnob); - if (rc != 0) - return rc; - - offset = 0; - kiov++; - nkiov--; - nob -= fragnob; - } while (nob > 0); - - return 0; -} -#else -int -kibnal_map_tx (kib_tx_t *tx, kib_rdma_desc_t *rd, int active, - int npages, unsigned long page_offset, int nob) -{ - vv_return_t vvrc; - vv_fmr_map_t map_props; - - LASSERT ((rd != tx->tx_rd) == !active); - LASSERT (!tx->tx_md.md_active); - LASSERT (tx->tx_md.md_fmrcount > 0); - LASSERT (page_offset < PAGE_SIZE); - LASSERT (npages >= (1 + ((page_offset + nob - 1)>>PAGE_SHIFT))); - LASSERT (npages <= LNET_MAX_IOV); - - memset(&map_props, 0, sizeof(map_props)); - - map_props.start = (void *)page_offset; - map_props.size = nob; - map_props.page_array_len = npages; - map_props.page_array = tx->tx_pages; - - vvrc = vv_map_fmr(kibnal_data.kib_hca, tx->tx_md.md_fmrhandle, - &map_props, &tx->tx_md.md_lkey, &tx->tx_md.md_rkey); - if 
(vvrc != vv_return_ok) { - CERROR ("Can't map vaddr %p for %d in %d pages: %d\n", - map_props.start, nob, npages, vvrc); - return -EFAULT; - } - - tx->tx_md.md_addr = (unsigned long)map_props.start; - tx->tx_md.md_active = 1; - tx->tx_md.md_fmrcount--; - - rd->rd_key = active ? tx->tx_md.md_lkey : tx->tx_md.md_rkey; - rd->rd_nob = nob; - rd->rd_addr = tx->tx_md.md_addr; - - /* Compensate for adaptor-tavor's munging of gatherlist addresses */ - if (active) - rd->rd_addr += PAGE_OFFSET; - - return 0; -} - -int -kibnal_setup_rd_iov (kib_tx_t *tx, kib_rdma_desc_t *rd, - vv_access_con_bit_mask_t access, - unsigned int niov, struct iovec *iov, int offset, int nob) -{ - /* active if I'm sending */ - int active = ((access & vv_acc_r_mem_write) == 0); - int resid; - int fragnob; - struct page *page; - int npages; - unsigned long page_offset; - unsigned long vaddr; - - LASSERT (nob > 0); - LASSERT (niov > 0); - - while (offset >= iov->iov_len) { - offset -= iov->iov_len; - niov--; - iov++; - LASSERT (niov > 0); - } - - if (nob > iov->iov_len - offset) { - CERROR ("Can't map multiple vaddr fragments\n"); - return (-EMSGSIZE); - } - - vaddr = ((unsigned long)iov->iov_base) + offset; - - page_offset = vaddr & (PAGE_SIZE - 1); - resid = nob; - npages = 0; - - do { - LASSERT (npages < LNET_MAX_IOV); - - page = kibnal_kvaddr_to_page(vaddr); - if (page == NULL) { - CERROR("Can't find page for %lu\n", vaddr); - return -EFAULT; - } - - tx->tx_pages[npages++] = lnet_page2phys(page); - - fragnob = PAGE_SIZE - (vaddr & (PAGE_SIZE - 1)); - vaddr += fragnob; - resid -= fragnob; - - } while (resid > 0); - - return kibnal_map_tx(tx, rd, active, npages, page_offset, nob); -} - -int -kibnal_setup_rd_kiov (kib_tx_t *tx, kib_rdma_desc_t *rd, - vv_access_con_bit_mask_t access, - int nkiov, lnet_kiov_t *kiov, int offset, int nob) -{ - /* active if I'm sending */ - int active = ((access & vv_acc_r_mem_write) == 0); - int resid; - int npages; - unsigned long page_offset; - - CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob); - - LASSERT (nob > 0); - LASSERT (nkiov > 0); - LASSERT (nkiov <= LNET_MAX_IOV); - LASSERT (!tx->tx_md.md_active); - LASSERT ((rd != tx->tx_rd) == !active); - - while (offset >= kiov->kiov_len) { - offset -= kiov->kiov_len; - nkiov--; - kiov++; - LASSERT (nkiov > 0); - } - - page_offset = kiov->kiov_offset + offset; - - resid = offset + nob; - npages = 0; - - do { - LASSERT (npages < LNET_MAX_IOV); - LASSERT (nkiov > 0); - - if ((npages > 0 && kiov->kiov_offset != 0) || - (resid > kiov->kiov_len && - (kiov->kiov_offset + kiov->kiov_len) != PAGE_SIZE)) { - /* Can't have gaps */ - CERROR ("Can't make payload contiguous in I/O VM:" - "page %d, offset %d, len %d \n", - npages, kiov->kiov_offset, kiov->kiov_len); - - return -EINVAL; - } - - tx->tx_pages[npages++] = lnet_page2phys(kiov->kiov_page); - resid -= kiov->kiov_len; - kiov++; - nkiov--; - } while (resid > 0); - - return kibnal_map_tx(tx, rd, active, npages, page_offset, nob); -} -#endif - -kib_conn_t * -kibnal_find_conn_locked (kib_peer_t *peer) -{ - struct list_head *tmp; - - /* just return the first connection */ - list_for_each (tmp, &peer->ibp_conns) { - return (list_entry(tmp, kib_conn_t, ibc_list)); - } - - return (NULL); -} - -void -kibnal_check_sends (kib_conn_t *conn) -{ - kib_tx_t *tx; - vv_return_t vvrc; - int rc; - int consume_cred; - int done; - - /* Don't send anything until after the connection is established */ - if (conn->ibc_state < IBNAL_CONN_ESTABLISHED) { - CDEBUG(D_NET, "%s too soon\n", - 
libcfs_nid2str(conn->ibc_peer->ibp_nid)); - return; - } - - spin_lock(&conn->ibc_lock); - - LASSERT (conn->ibc_nsends_posted <= - *kibnal_tunables.kib_concurrent_sends); - LASSERT (conn->ibc_reserved_credits >= 0); - - while (conn->ibc_reserved_credits > 0 && - !list_empty(&conn->ibc_tx_queue_rsrvd)) { - LASSERT (conn->ibc_version != - IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD); - tx = list_entry(conn->ibc_tx_queue_rsrvd.next, - kib_tx_t, tx_list); - list_del(&tx->tx_list); - list_add_tail(&tx->tx_list, &conn->ibc_tx_queue); - conn->ibc_reserved_credits--; - } - - if (list_empty(&conn->ibc_tx_queue) && - list_empty(&conn->ibc_tx_queue_nocred) && - (conn->ibc_outstanding_credits >= IBNAL_CREDIT_HIGHWATER || - kibnal_send_keepalive(conn))) { - spin_unlock(&conn->ibc_lock); - - tx = kibnal_get_idle_tx(); - if (tx != NULL) - kibnal_init_tx_msg(tx, IBNAL_MSG_NOOP, 0); - - spin_lock(&conn->ibc_lock); - - if (tx != NULL) - kibnal_queue_tx_locked(tx, conn); - } - - for (;;) { - if (!list_empty(&conn->ibc_tx_queue_nocred)) { - LASSERT (conn->ibc_version != - IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD); - tx = list_entry (conn->ibc_tx_queue_nocred.next, - kib_tx_t, tx_list); - consume_cred = 0; - } else if (!list_empty (&conn->ibc_tx_queue)) { - tx = list_entry (conn->ibc_tx_queue.next, - kib_tx_t, tx_list); - consume_cred = 1; - } else { - /* nothing waiting */ - break; - } - - LASSERT (tx->tx_queued); - /* We rely on this for QP sizing */ - LASSERT (tx->tx_nwrq > 0 && tx->tx_nwrq <= 1 + IBNAL_MAX_RDMA_FRAGS); - - LASSERT (conn->ibc_outstanding_credits >= 0); - LASSERT (conn->ibc_outstanding_credits <= IBNAL_MSG_QUEUE_SIZE); - LASSERT (conn->ibc_credits >= 0); - LASSERT (conn->ibc_credits <= IBNAL_MSG_QUEUE_SIZE); - - if (conn->ibc_nsends_posted == - *kibnal_tunables.kib_concurrent_sends) { - /* We've got some tx completions outstanding... */ - CDEBUG(D_NET, "%s: posted enough\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid)); - break; - } - - if (consume_cred) { - if (conn->ibc_credits == 0) { /* no credits */ - CDEBUG(D_NET, "%s: no credits\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid)); - break; - } - - if (conn->ibc_credits == 1 && /* last credit reserved for */ - conn->ibc_outstanding_credits == 0) { /* giving back credits */ - CDEBUG(D_NET, "%s: not using last credit\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid)); - break; - } - } - - list_del (&tx->tx_list); - tx->tx_queued = 0; - - /* NB don't drop ibc_lock before bumping tx_sending */ - - if (tx->tx_msg->ibm_type == IBNAL_MSG_NOOP && - (!list_empty(&conn->ibc_tx_queue) || - !list_empty(&conn->ibc_tx_queue_nocred) || - (conn->ibc_outstanding_credits < IBNAL_CREDIT_HIGHWATER && - !kibnal_send_keepalive(conn)))) { - /* redundant NOOP */ - spin_unlock(&conn->ibc_lock); - kibnal_tx_done(tx); - spin_lock(&conn->ibc_lock); - CDEBUG(D_NET, "%s: redundant noop\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid)); - continue; - } - - kibnal_pack_msg(tx->tx_msg, conn->ibc_version, - conn->ibc_outstanding_credits, - conn->ibc_peer->ibp_nid, conn->ibc_incarnation, - conn->ibc_txseq); - - conn->ibc_txseq++; - conn->ibc_outstanding_credits = 0; - conn->ibc_nsends_posted++; - if (consume_cred) - conn->ibc_credits--; - - /* CAVEAT EMPTOR! This tx could be the PUT_DONE of an RDMA - * PUT. If so, it was first queued here as a PUT_REQ, sent and - * stashed on ibc_active_txs, matched by an incoming PUT_ACK, - * and then re-queued here. It's (just) possible that - * tx_sending is non-zero if we've not done the tx_complete() from - * the first send; hence the ++ rather than = below. 
*/ - tx->tx_sending++; - - list_add (&tx->tx_list, &conn->ibc_active_txs); - - /* Keep holding ibc_lock while posting sends on this - * connection; vv_post_send() isn't re-entrant on the same - * QP!! */ - - LASSERT (tx->tx_nwrq > 0); -#if 0 - if (tx->tx_wrq[0].wr_type == vv_wr_rdma_write) - CDEBUG(D_NET, "WORK[0]: RDMA gl %p for %d k %x -> "LPX64" k %x\n", - tx->tx_wrq[0].scatgat_list->v_address, - tx->tx_wrq[0].scatgat_list->length, - tx->tx_wrq[0].scatgat_list->l_key, - tx->tx_wrq[0].type.send.send_qp_type.rc_type.r_addr, - tx->tx_wrq[0].type.send.send_qp_type.rc_type.r_r_key); - else - CDEBUG(D_NET, "WORK[0]: %s gl %p for %d k %x\n", - tx->tx_wrq[0].wr_type == vv_wr_send ? "SEND" : "????", - tx->tx_wrq[0].scatgat_list->v_address, - tx->tx_wrq[0].scatgat_list->length, - tx->tx_wrq[0].scatgat_list->l_key); - - if (tx->tx_nwrq > 1) { - if (tx->tx_wrq[1].wr_type == vv_wr_rdma_write) - CDEBUG(D_NET, "WORK[1]: RDMA gl %p for %d k %x -> "LPX64" k %x\n", - tx->tx_wrq[1].scatgat_list->v_address, - tx->tx_wrq[1].scatgat_list->length, - tx->tx_wrq[1].scatgat_list->l_key, - tx->tx_wrq[1].type.send.send_qp_type.rc_type.r_addr, - tx->tx_wrq[1].type.send.send_qp_type.rc_type.r_r_key); - else - CDEBUG(D_NET, "WORK[1]: %s gl %p for %d k %x\n", - tx->tx_wrq[1].wr_type == vv_wr_send ? "SEND" : "????", - tx->tx_wrq[1].scatgat_list->v_address, - tx->tx_wrq[1].scatgat_list->length, - tx->tx_wrq[1].scatgat_list->l_key); - } -#endif - rc = -ECONNABORTED; - vvrc = vv_return_ok; - if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) { - tx->tx_status = 0; - vvrc = vv_post_send_list(kibnal_data.kib_hca, - conn->ibc_qp, - tx->tx_nwrq, - tx->tx_wrq, - vv_operation_type_send_rc); - rc = (vvrc == vv_return_ok) ? 0 : -EIO; - } - - conn->ibc_last_send = jiffies; - - if (rc != 0) { - /* NB credits are transferred in the actual - * message, which can only be the last work item */ - conn->ibc_outstanding_credits += tx->tx_msg->ibm_credits; - if (consume_cred) - conn->ibc_credits++; - conn->ibc_nsends_posted--; - - tx->tx_status = rc; - tx->tx_waiting = 0; - tx->tx_sending--; - - done = (tx->tx_sending == 0); - if (done) - list_del (&tx->tx_list); - - spin_unlock(&conn->ibc_lock); - - if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) - CERROR ("Error %d posting transmit to %s\n", - vvrc, libcfs_nid2str(conn->ibc_peer->ibp_nid)); - else - CDEBUG (D_NET, "Error %d posting transmit to %s\n", - rc, libcfs_nid2str(conn->ibc_peer->ibp_nid)); - - kibnal_close_conn (conn, rc); - - if (done) - kibnal_tx_done (tx); - return; - } - } - - spin_unlock(&conn->ibc_lock); -} - -void -kibnal_tx_complete (kib_tx_t *tx, vv_comp_status_t vvrc) -{ - kib_conn_t *conn = tx->tx_conn; - int failed = (vvrc != vv_comp_status_success); - int idle; - - CDEBUG(D_NET, "tx %p conn %p sending %d nwrq %d vvrc %d\n", - tx, conn, tx->tx_sending, tx->tx_nwrq, vvrc); - - LASSERT (tx->tx_sending > 0); - - if (failed && - tx->tx_status == 0 && - conn->ibc_state == IBNAL_CONN_ESTABLISHED) - CDEBUG(D_NETERROR, "tx -> %s type %x cookie "LPX64 - "sending %d waiting %d: failed %d\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid), - tx->tx_msg->ibm_type, tx->tx_cookie, - tx->tx_sending, tx->tx_waiting, vvrc); - - spin_lock(&conn->ibc_lock); - - /* I could be racing with rdma completion. Whoever makes 'tx' idle - * gets to free it, which also drops its ref on 'conn'. 
*/ - - tx->tx_sending--; - conn->ibc_nsends_posted--; - - if (failed) { - tx->tx_waiting = 0; - tx->tx_status = -EIO; - } - - idle = (tx->tx_sending == 0) && /* This is the final callback */ - !tx->tx_waiting && /* Not waiting for peer */ - !tx->tx_queued; /* Not re-queued (PUT_DONE) */ - if (idle) - list_del(&tx->tx_list); - - kibnal_conn_addref(conn); /* 1 ref for me.... */ - - spin_unlock(&conn->ibc_lock); - - if (idle) - kibnal_tx_done (tx); - - if (failed) { - kibnal_close_conn (conn, -EIO); - } else { - kibnal_peer_alive(conn->ibc_peer); - kibnal_check_sends(conn); - } - - kibnal_conn_decref(conn); /* ...until here */ -} - -void -kibnal_init_tx_msg (kib_tx_t *tx, int type, int body_nob) -{ - vv_scatgat_t *gl = &tx->tx_gl[tx->tx_nwrq]; - vv_wr_t *wrq = &tx->tx_wrq[tx->tx_nwrq]; - int nob = offsetof (kib_msg_t, ibm_u) + body_nob; - __u64 addr = (__u64)((unsigned long)((tx)->tx_msg)); - - LASSERT (tx->tx_nwrq >= 0 && - tx->tx_nwrq < (1 + IBNAL_MAX_RDMA_FRAGS)); - LASSERT (nob <= IBNAL_MSG_SIZE); - - kibnal_init_msg(tx->tx_msg, type, body_nob); - - *gl = (vv_scatgat_t) { - .v_address = KIBNAL_ADDR2SG(addr), - .l_key = tx->tx_lkey, - .length = nob, - }; - - memset(wrq, 0, sizeof(*wrq)); - - wrq->wr_id = kibnal_ptr2wreqid(tx, IBNAL_WID_TX); - wrq->wr_type = vv_wr_send; - wrq->scatgat_list = gl; - wrq->num_of_data_segments = 1; - wrq->completion_notification = 1; - wrq->type.send.solicited_event = 1; - wrq->type.send.immidiate_data_indicator = 0; - wrq->type.send.send_qp_type.rc_type.fance_indicator = 0; - - tx->tx_nwrq++; -} - -int -kibnal_init_rdma (kib_tx_t *tx, int type, int nob, - kib_rdma_desc_t *dstrd, __u64 dstcookie) -{ - kib_msg_t *ibmsg = tx->tx_msg; - kib_rdma_desc_t *srcrd = tx->tx_rd; - vv_scatgat_t *gl; - vv_wr_t *wrq; - int rc; - -#if IBNAL_USE_FMR - LASSERT (tx->tx_nwrq == 0); - - gl = &tx->tx_gl[0]; - gl->length = nob; - gl->v_address = KIBNAL_ADDR2SG(srcrd->rd_addr); - gl->l_key = srcrd->rd_key; - - wrq = &tx->tx_wrq[0]; - - wrq->wr_id = kibnal_ptr2wreqid(tx, IBNAL_WID_RDMA); - wrq->completion_notification = 0; - wrq->scatgat_list = gl; - wrq->num_of_data_segments = 1; - wrq->wr_type = vv_wr_rdma_write; - wrq->type.send.solicited_event = 0; - wrq->type.send.send_qp_type.rc_type.fance_indicator = 0; - wrq->type.send.send_qp_type.rc_type.r_addr = dstrd->rd_addr; - wrq->type.send.send_qp_type.rc_type.r_r_key = dstrd->rd_key; - - tx->tx_nwrq = 1; - rc = nob; -#else - /* CAVEAT EMPTOR: this 'consumes' the frags in 'dstrd' */ - int resid = nob; - kib_rdma_frag_t *srcfrag; - int srcidx; - kib_rdma_frag_t *dstfrag; - int dstidx; - int wrknob; - - /* Called by scheduler */ - LASSERT (!in_interrupt()); - - LASSERT (type == IBNAL_MSG_GET_DONE || - type == IBNAL_MSG_PUT_DONE); - - srcidx = dstidx = 0; - srcfrag = &srcrd->rd_frags[0]; - dstfrag = &dstrd->rd_frags[0]; - rc = resid; - - while (resid > 0) { - if (srcidx >= srcrd->rd_nfrag) { - CERROR("Src buffer exhausted: %d frags\n", srcidx); - rc = -EPROTO; - break; - } - - if (dstidx == dstrd->rd_nfrag) { - CERROR("Dst buffer exhausted: %d frags\n", dstidx); - rc = -EPROTO; - break; - } - - if (tx->tx_nwrq == IBNAL_MAX_RDMA_FRAGS) { - CERROR("RDMA too fragmented: %d/%d src %d/%d dst frags\n", - srcidx, srcrd->rd_nfrag, - dstidx, dstrd->rd_nfrag); - rc = -EMSGSIZE; - break; - } - - wrknob = MIN(MIN(srcfrag->rf_nob, dstfrag->rf_nob), resid); - - gl = &tx->tx_gl[tx->tx_nwrq]; - gl->v_address = KIBNAL_ADDR2SG(kibnal_rf_addr(srcfrag)); - gl->length = wrknob; - gl->l_key = srcrd->rd_key; - - wrq = &tx->tx_wrq[tx->tx_nwrq]; - - wrq->wr_id = 
kibnal_ptr2wreqid(tx, IBNAL_WID_RDMA); - wrq->completion_notification = 0; - wrq->scatgat_list = gl; - wrq->num_of_data_segments = 1; - wrq->wr_type = vv_wr_rdma_write; - wrq->type.send.solicited_event = 0; - wrq->type.send.send_qp_type.rc_type.fance_indicator = 0; - wrq->type.send.send_qp_type.rc_type.r_addr = kibnal_rf_addr(dstfrag); - wrq->type.send.send_qp_type.rc_type.r_r_key = dstrd->rd_key; - - resid -= wrknob; - if (wrknob < srcfrag->rf_nob) { - kibnal_rf_set(srcfrag, - kibnal_rf_addr(srcfrag) + wrknob, - srcfrag->rf_nob - wrknob); - } else { - srcfrag++; - srcidx++; - } - - if (wrknob < dstfrag->rf_nob) { - kibnal_rf_set(dstfrag, - kibnal_rf_addr(dstfrag) + wrknob, - dstfrag->rf_nob - wrknob); - } else { - dstfrag++; - dstidx++; - } - - tx->tx_nwrq++; - } - - if (rc < 0) /* no RDMA if completing with failure */ - tx->tx_nwrq = 0; -#endif - - ibmsg->ibm_u.completion.ibcm_status = rc; - ibmsg->ibm_u.completion.ibcm_cookie = dstcookie; - kibnal_init_tx_msg(tx, type, sizeof (kib_completion_msg_t)); - - return rc; -} - -void -kibnal_queue_tx (kib_tx_t *tx, kib_conn_t *conn) -{ - spin_lock(&conn->ibc_lock); - kibnal_queue_tx_locked (tx, conn); - spin_unlock(&conn->ibc_lock); - - kibnal_check_sends(conn); -} - -void -kibnal_schedule_peer_arp (kib_peer_t *peer) -{ - unsigned long flags; - - LASSERT (peer->ibp_connecting != 0); - LASSERT (peer->ibp_arp_count > 0); - - kibnal_peer_addref(peer); /* extra ref for connd */ - - spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags); - - list_add_tail (&peer->ibp_connd_list, &kibnal_data.kib_connd_peers); - wake_up (&kibnal_data.kib_connd_waitq); - - spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags); -} - -void -kibnal_launch_tx (kib_tx_t *tx, lnet_nid_t nid) -{ - kib_peer_t *peer; - kib_conn_t *conn; - unsigned long flags; - rwlock_t *g_lock = &kibnal_data.kib_global_lock; - int retry; - int rc; - - /* If I get here, I've committed to send, so I complete the tx with - * failure on any problems */ - - LASSERT (tx->tx_conn == NULL); /* only set when assigned a conn */ - LASSERT (tx->tx_nwrq > 0); /* work items have been set up */ - - for (retry = 0; ; retry = 1) { - read_lock_irqsave(g_lock, flags); - - peer = kibnal_find_peer_locked (nid); - if (peer != NULL) { - conn = kibnal_find_conn_locked (peer); - if (conn != NULL) { - kibnal_conn_addref(conn); /* 1 ref for me... */ - read_unlock_irqrestore(g_lock, flags); - - kibnal_queue_tx (tx, conn); - kibnal_conn_decref(conn); /* ...to here */ - return; - } - } - - /* Making one or more connections; I'll need a write lock... */ - read_unlock(g_lock); - write_lock(g_lock); - - peer = kibnal_find_peer_locked (nid); - if (peer != NULL) - break; - - write_unlock_irqrestore(g_lock, flags); - - if (retry) { - CERROR("Can't find peer %s\n", libcfs_nid2str(nid)); - - tx->tx_status = -EHOSTUNREACH; - tx->tx_waiting = 0; - kibnal_tx_done (tx); - return; - } - - rc = kibnal_add_persistent_peer(nid, LNET_NIDADDR(nid)); - if (rc != 0) { - CERROR("Can't add peer %s: %d\n", - libcfs_nid2str(nid), rc); - - tx->tx_status = -EHOSTUNREACH; - tx->tx_waiting = 0; - kibnal_tx_done (tx); - return; - } - } - - conn = kibnal_find_conn_locked (peer); - if (conn != NULL) { - /* Connection exists; queue message on it */ - kibnal_conn_addref(conn); /* 1 ref for me... 
*/ - write_unlock_irqrestore(g_lock, flags); - - kibnal_queue_tx (tx, conn); - kibnal_conn_decref(conn); /* ...until here */ - return; - } - - if (peer->ibp_connecting == 0 && - peer->ibp_accepting == 0) { - if (!(peer->ibp_reconnect_interval == 0 || /* first attempt */ - time_after_eq(jiffies, peer->ibp_reconnect_time))) { - write_unlock_irqrestore(g_lock, flags); - tx->tx_status = -EHOSTUNREACH; - tx->tx_waiting = 0; - kibnal_tx_done (tx); - return; - } - - peer->ibp_connecting = 1; - peer->ibp_arp_count = 1 + *kibnal_tunables.kib_arp_retries; - kibnal_schedule_peer_arp(peer); - } - - /* A connection is being established; queue the message... */ - list_add_tail (&tx->tx_list, &peer->ibp_tx_queue); - - write_unlock_irqrestore(g_lock, flags); -} - -int -kibnal_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg) -{ - lnet_hdr_t *hdr = &lntmsg->msg_hdr; - int type = lntmsg->msg_type; - lnet_process_id_t target = lntmsg->msg_target; - int target_is_router = lntmsg->msg_target_is_router; - int routing = lntmsg->msg_routing; - unsigned int payload_niov = lntmsg->msg_niov; - struct iovec *payload_iov = lntmsg->msg_iov; - lnet_kiov_t *payload_kiov = lntmsg->msg_kiov; - unsigned int payload_offset = lntmsg->msg_offset; - unsigned int payload_nob = lntmsg->msg_len; - kib_msg_t *ibmsg; - kib_tx_t *tx; - int nob; - int rc; - - /* NB 'private' is different depending on what we're sending.... */ - - CDEBUG(D_NET, "sending %d bytes in %d frags to %s\n", - payload_nob, payload_niov, libcfs_id2str(target)); - - LASSERT (payload_nob == 0 || payload_niov > 0); - LASSERT (payload_niov <= LNET_MAX_IOV); - - /* Thread context */ - LASSERT (!in_interrupt()); - /* payload is either all vaddrs or all pages */ - LASSERT (!(payload_kiov != NULL && payload_iov != NULL)); - - switch (type) { - default: - LBUG(); - return (-EIO); - - case LNET_MSG_ACK: - LASSERT (payload_nob == 0); - break; - - case LNET_MSG_GET: - if (routing || target_is_router) - break; /* send IMMEDIATE */ - - /* is the REPLY message too small for RDMA? 
*/ - nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[lntmsg->msg_md->md_length]); - if (nob <= IBNAL_MSG_SIZE) - break; /* send IMMEDIATE */ - - tx = kibnal_get_idle_tx(); - if (tx == NULL) { - CERROR("Can allocate txd for GET to %s: \n", - libcfs_nid2str(target.nid)); - return -ENOMEM; - } - - ibmsg = tx->tx_msg; - ibmsg->ibm_u.get.ibgm_hdr = *hdr; - ibmsg->ibm_u.get.ibgm_cookie = tx->tx_cookie; - - if ((lntmsg->msg_md->md_options & LNET_MD_KIOV) == 0) - rc = kibnal_setup_rd_iov(tx, &ibmsg->ibm_u.get.ibgm_rd, - vv_acc_r_mem_write, - lntmsg->msg_md->md_niov, - lntmsg->msg_md->md_iov.iov, - 0, lntmsg->msg_md->md_length); - else - rc = kibnal_setup_rd_kiov(tx, &ibmsg->ibm_u.get.ibgm_rd, - vv_acc_r_mem_write, - lntmsg->msg_md->md_niov, - lntmsg->msg_md->md_iov.kiov, - 0, lntmsg->msg_md->md_length); - if (rc != 0) { - CERROR("Can't setup GET sink for %s: %d\n", - libcfs_nid2str(target.nid), rc); - kibnal_tx_done(tx); - return -EIO; - } - -#if IBNAL_USE_FMR - nob = sizeof(kib_get_msg_t); -#else - { - int n = ibmsg->ibm_u.get.ibgm_rd.rd_nfrag; - - nob = offsetof(kib_get_msg_t, ibgm_rd.rd_frags[n]); - } -#endif - kibnal_init_tx_msg(tx, IBNAL_MSG_GET_REQ, nob); - - tx->tx_lntmsg[1] = lnet_create_reply_msg(kibnal_data.kib_ni, - lntmsg); - if (tx->tx_lntmsg[1] == NULL) { - CERROR("Can't create reply for GET -> %s\n", - libcfs_nid2str(target.nid)); - kibnal_tx_done(tx); - return -EIO; - } - - tx->tx_lntmsg[0] = lntmsg; /* finalise lntmsg[0,1] on completion */ - tx->tx_waiting = 1; /* waiting for GET_DONE */ - kibnal_launch_tx(tx, target.nid); - return 0; - - case LNET_MSG_REPLY: - case LNET_MSG_PUT: - /* Is the payload small enough not to need RDMA? */ - nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]); - if (nob <= IBNAL_MSG_SIZE) - break; /* send IMMEDIATE */ - - tx = kibnal_get_idle_tx(); - if (tx == NULL) { - CERROR("Can't allocate %s txd for %s\n", - type == LNET_MSG_PUT ? 
"PUT" : "REPLY", - libcfs_nid2str(target.nid)); - return -ENOMEM; - } - - if (payload_kiov == NULL) - rc = kibnal_setup_rd_iov(tx, tx->tx_rd, 0, - payload_niov, payload_iov, - payload_offset, payload_nob); - else - rc = kibnal_setup_rd_kiov(tx, tx->tx_rd, 0, - payload_niov, payload_kiov, - payload_offset, payload_nob); - if (rc != 0) { - CERROR("Can't setup PUT src for %s: %d\n", - libcfs_nid2str(target.nid), rc); - kibnal_tx_done(tx); - return -EIO; - } - - ibmsg = tx->tx_msg; - ibmsg->ibm_u.putreq.ibprm_hdr = *hdr; - ibmsg->ibm_u.putreq.ibprm_cookie = tx->tx_cookie; - kibnal_init_tx_msg(tx, IBNAL_MSG_PUT_REQ, sizeof(kib_putreq_msg_t)); - - tx->tx_lntmsg[0] = lntmsg; /* finalise lntmsg on completion */ - tx->tx_waiting = 1; /* waiting for PUT_{ACK,NAK} */ - kibnal_launch_tx(tx, target.nid); - return 0; - } - - /* send IMMEDIATE */ - - LASSERT (offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]) - <= IBNAL_MSG_SIZE); - - tx = kibnal_get_idle_tx(); - if (tx == NULL) { - CERROR ("Can't send %d to %s: tx descs exhausted\n", - type, libcfs_nid2str(target.nid)); - return -ENOMEM; - } - - ibmsg = tx->tx_msg; - ibmsg->ibm_u.immediate.ibim_hdr = *hdr; - - if (payload_kiov != NULL) - lnet_copy_kiov2flat(IBNAL_MSG_SIZE, ibmsg, - offsetof(kib_msg_t, ibm_u.immediate.ibim_payload), - payload_niov, payload_kiov, - payload_offset, payload_nob); - else - lnet_copy_iov2flat(IBNAL_MSG_SIZE, ibmsg, - offsetof(kib_msg_t, ibm_u.immediate.ibim_payload), - payload_niov, payload_iov, - payload_offset, payload_nob); - - nob = offsetof(kib_immediate_msg_t, ibim_payload[payload_nob]); - kibnal_init_tx_msg (tx, IBNAL_MSG_IMMEDIATE, nob); - - tx->tx_lntmsg[0] = lntmsg; /* finalise lntmsg on completion */ - kibnal_launch_tx(tx, target.nid); - return 0; -} - -void -kibnal_reply (lnet_ni_t *ni, kib_rx_t *rx, lnet_msg_t *lntmsg) -{ - lnet_process_id_t target = lntmsg->msg_target; - unsigned int niov = lntmsg->msg_niov; - struct iovec *iov = lntmsg->msg_iov; - lnet_kiov_t *kiov = lntmsg->msg_kiov; - unsigned int offset = lntmsg->msg_offset; - unsigned int nob = lntmsg->msg_len; - kib_tx_t *tx; - int rc; - - tx = kibnal_get_idle_tx(); - if (tx == NULL) { - CERROR("Can't get tx for REPLY to %s\n", - libcfs_nid2str(target.nid)); - goto failed_0; - } - - if (nob == 0) - rc = 0; - else if (kiov == NULL) - rc = kibnal_setup_rd_iov(tx, tx->tx_rd, 0, - niov, iov, offset, nob); - else - rc = kibnal_setup_rd_kiov(tx, tx->tx_rd, 0, - niov, kiov, offset, nob); - - if (rc != 0) { - CERROR("Can't setup GET src for %s: %d\n", - libcfs_nid2str(target.nid), rc); - goto failed_1; - } - - rc = kibnal_init_rdma(tx, IBNAL_MSG_GET_DONE, nob, - &rx->rx_msg->ibm_u.get.ibgm_rd, - rx->rx_msg->ibm_u.get.ibgm_cookie); - if (rc < 0) { - CERROR("Can't setup rdma for GET from %s: %d\n", - libcfs_nid2str(target.nid), rc); - goto failed_1; - } - - if (rc == 0) { - /* No RDMA: local completion may happen now! */ - lnet_finalize(ni, lntmsg, 0); - } else { - /* RDMA: lnet_finalize(lntmsg) when it - * completes */ - tx->tx_lntmsg[0] = lntmsg; - } - - kibnal_queue_tx(tx, rx->rx_conn); - return; - - failed_1: - kibnal_tx_done(tx); - failed_0: - lnet_finalize(ni, lntmsg, -EIO); -} - -int -kibnal_eager_recv (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, - void **new_private) -{ - kib_rx_t *rx = private; - kib_conn_t *conn = rx->rx_conn; - - if (conn->ibc_version == IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD) { - /* Can't block if RDMA completions need normal credits */ - LCONSOLE_ERROR_MSG(0x129, "Dropping message from %s: no buffers" - " free. 
%s is running an old version of LNET " - "that may deadlock if messages wait for " - "buffers\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid), - libcfs_nid2str(conn->ibc_peer->ibp_nid)); - return -EDEADLK; - } - - *new_private = private; - return 0; -} - -int -kibnal_recv (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, int delayed, - unsigned int niov, struct iovec *iov, lnet_kiov_t *kiov, - unsigned int offset, unsigned int mlen, unsigned int rlen) -{ - kib_rx_t *rx = private; - kib_msg_t *rxmsg = rx->rx_msg; - kib_conn_t *conn = rx->rx_conn; - kib_tx_t *tx; - kib_msg_t *txmsg; - int nob; - int post_cred = 1; - int rc = 0; - - LASSERT (mlen <= rlen); - LASSERT (!in_interrupt()); - /* Either all pages or all vaddrs */ - LASSERT (!(kiov != NULL && iov != NULL)); - - switch (rxmsg->ibm_type) { - default: - LBUG(); - - case IBNAL_MSG_IMMEDIATE: - nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[rlen]); - if (nob > rx->rx_nob) { - CERROR ("Immediate message from %s too big: %d(%d)\n", - libcfs_nid2str(rxmsg->ibm_u.immediate.ibim_hdr.src_nid), - nob, rx->rx_nob); - rc = -EPROTO; - break; - } - - if (kiov != NULL) - lnet_copy_flat2kiov(niov, kiov, offset, - IBNAL_MSG_SIZE, rxmsg, - offsetof(kib_msg_t, ibm_u.immediate.ibim_payload), - mlen); - else - lnet_copy_flat2iov(niov, iov, offset, - IBNAL_MSG_SIZE, rxmsg, - offsetof(kib_msg_t, ibm_u.immediate.ibim_payload), - mlen); - lnet_finalize (ni, lntmsg, 0); - break; - - case IBNAL_MSG_PUT_REQ: - if (mlen == 0) { - lnet_finalize(ni, lntmsg, 0); - kibnal_send_completion(conn, IBNAL_MSG_PUT_NAK, 0, - rxmsg->ibm_u.putreq.ibprm_cookie); - break; - } - - tx = kibnal_get_idle_tx(); - if (tx == NULL) { - CERROR("Can't allocate tx for %s\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid)); - /* Not replying will break the connection */ - rc = -ENOMEM; - break; - } - - txmsg = tx->tx_msg; - if (kiov == NULL) - rc = kibnal_setup_rd_iov(tx, - &txmsg->ibm_u.putack.ibpam_rd, - vv_acc_r_mem_write, - niov, iov, offset, mlen); - else - rc = kibnal_setup_rd_kiov(tx, - &txmsg->ibm_u.putack.ibpam_rd, - vv_acc_r_mem_write, - niov, kiov, offset, mlen); - if (rc != 0) { - CERROR("Can't setup PUT sink for %s: %d\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid), rc); - kibnal_tx_done(tx); - /* tell peer it's over */ - kibnal_send_completion(conn, IBNAL_MSG_PUT_NAK, rc, - rxmsg->ibm_u.putreq.ibprm_cookie); - break; - } - - txmsg->ibm_u.putack.ibpam_src_cookie = rxmsg->ibm_u.putreq.ibprm_cookie; - txmsg->ibm_u.putack.ibpam_dst_cookie = tx->tx_cookie; -#if IBNAL_USE_FMR - nob = sizeof(kib_putack_msg_t); -#else - { - int n = tx->tx_msg->ibm_u.putack.ibpam_rd.rd_nfrag; - - nob = offsetof(kib_putack_msg_t, ibpam_rd.rd_frags[n]); - } -#endif - kibnal_init_tx_msg(tx, IBNAL_MSG_PUT_ACK, nob); - - tx->tx_lntmsg[0] = lntmsg; /* finalise lntmsg on completion */ - tx->tx_waiting = 1; /* waiting for PUT_DONE */ - kibnal_queue_tx(tx, conn); - - if (conn->ibc_version != IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD) - post_cred = 0; /* peer still owns 'rx' for sending PUT_DONE */ - break; - - case IBNAL_MSG_GET_REQ: - if (lntmsg != NULL) { - /* Optimized GET; RDMA lntmsg's payload */ - kibnal_reply(ni, rx, lntmsg); - } else { - /* GET didn't match anything */ - kibnal_send_completion(conn, IBNAL_MSG_GET_DONE, -ENODATA, - rxmsg->ibm_u.get.ibgm_cookie); - } - break; - } - - kibnal_post_rx(rx, post_cred, 0); - return rc; -} - -int -kibnal_thread_start (int (*fn)(void *arg), void *arg) -{ - long pid = kernel_thread (fn, arg, 0); - - if (pid < 0) - return ((int)pid); - - atomic_inc 
(&kibnal_data.kib_nthreads); - return (0); -} - -void -kibnal_thread_fini (void) -{ - atomic_dec (&kibnal_data.kib_nthreads); -} - -void -kibnal_peer_alive (kib_peer_t *peer) -{ - /* This is racy, but everyone's only writing cfs_time_current() */ - peer->ibp_last_alive = cfs_time_current(); - mb(); -} - -void -kibnal_peer_notify (kib_peer_t *peer) -{ - time_t last_alive = 0; - int error = 0; - unsigned long flags; - - read_lock_irqsave(&kibnal_data.kib_global_lock, flags); - - if (list_empty(&peer->ibp_conns) && - peer->ibp_accepting == 0 && - peer->ibp_connecting == 0 && - peer->ibp_error != 0) { - error = peer->ibp_error; - peer->ibp_error = 0; - - last_alive = cfs_time_current_sec() - - cfs_duration_sec(cfs_time_current() - - peer->ibp_last_alive); - } - - read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); - - if (error != 0) - lnet_notify(kibnal_data.kib_ni, peer->ibp_nid, 0, last_alive); -} - -void -kibnal_schedule_conn (kib_conn_t *conn) -{ - unsigned long flags; - - kibnal_conn_addref(conn); /* ++ref for connd */ - - spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags); - - list_add_tail (&conn->ibc_list, &kibnal_data.kib_connd_conns); - wake_up (&kibnal_data.kib_connd_waitq); - - spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags); -} - -void -kibnal_close_conn_locked (kib_conn_t *conn, int error) -{ - /* This just does the immediate housekeeping. 'error' is zero for a - * normal shutdown which can happen only after the connection has been - * established. If the connection is established, schedule the - * connection to be finished off by the connd. Otherwise the connd is - * already dealing with it (either to set it up or tear it down). - * Caller holds kib_global_lock exclusively in irq context */ - kib_peer_t *peer = conn->ibc_peer; - - LASSERT (error != 0 || conn->ibc_state >= IBNAL_CONN_ESTABLISHED); - - if (error != 0 && conn->ibc_comms_error == 0) - conn->ibc_comms_error = error; - - if (conn->ibc_state != IBNAL_CONN_ESTABLISHED) - return; /* already being handled */ - - /* NB Can't take ibc_lock here (could be in IRQ context), without - * risking deadlock, so access to ibc_{tx_queue,active_txs} is racey */ - - if (error == 0 && - list_empty(&conn->ibc_tx_queue) && - list_empty(&conn->ibc_tx_queue_rsrvd) && - list_empty(&conn->ibc_tx_queue_nocred) && - list_empty(&conn->ibc_active_txs)) { - CDEBUG(D_NET, "closing conn to %s" - " rx# "LPD64" tx# "LPD64"\n", - libcfs_nid2str(peer->ibp_nid), - conn->ibc_txseq, conn->ibc_rxseq); - } else { - CDEBUG(D_NETERROR, "Closing conn to %s: error %d%s%s%s%s" - " rx# "LPD64" tx# "LPD64"\n", - libcfs_nid2str(peer->ibp_nid), error, - list_empty(&conn->ibc_tx_queue) ? "" : "(sending)", - list_empty(&conn->ibc_tx_queue_rsrvd) ? "" : "(sending_rsrvd)", - list_empty(&conn->ibc_tx_queue_nocred) ? "" : "(sending_nocred)", - list_empty(&conn->ibc_active_txs) ? 
"" : "(waiting)", - conn->ibc_txseq, conn->ibc_rxseq); - } - - list_del (&conn->ibc_list); - - if (list_empty (&peer->ibp_conns)) { /* no more conns */ - if (peer->ibp_persistence == 0 && /* non-persistent peer */ - kibnal_peer_active(peer)) /* still in peer table */ - kibnal_unlink_peer_locked (peer); - - /* set/clear error on last conn */ - peer->ibp_error = conn->ibc_comms_error; - } - - kibnal_set_conn_state(conn, IBNAL_CONN_DISCONNECT1); - - kibnal_schedule_conn(conn); - kibnal_conn_decref(conn); /* lose ibc_list's ref */ -} - -void -kibnal_close_conn (kib_conn_t *conn, int error) -{ - unsigned long flags; - - write_lock_irqsave(&kibnal_data.kib_global_lock, flags); - - kibnal_close_conn_locked (conn, error); - - write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); -} - -void -kibnal_handle_early_rxs(kib_conn_t *conn) -{ - unsigned long flags; - kib_rx_t *rx; - - LASSERT (!in_interrupt()); - LASSERT (conn->ibc_state >= IBNAL_CONN_ESTABLISHED); - - write_lock_irqsave(&kibnal_data.kib_global_lock, flags); - while (!list_empty(&conn->ibc_early_rxs)) { - rx = list_entry(conn->ibc_early_rxs.next, - kib_rx_t, rx_list); - list_del(&rx->rx_list); - write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); - - kibnal_handle_rx(rx); - - write_lock_irqsave(&kibnal_data.kib_global_lock, flags); - } - write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); -} - -void -kibnal_abort_txs(kib_conn_t *conn, struct list_head *txs) -{ - LIST_HEAD (zombies); - struct list_head *tmp; - struct list_head *nxt; - kib_tx_t *tx; - - spin_lock(&conn->ibc_lock); - - list_for_each_safe (tmp, nxt, txs) { - tx = list_entry (tmp, kib_tx_t, tx_list); - - if (txs == &conn->ibc_active_txs) { - LASSERT (!tx->tx_queued); - LASSERT (tx->tx_waiting || tx->tx_sending != 0); - } else { - LASSERT (tx->tx_queued); - } - - tx->tx_status = -ECONNABORTED; - tx->tx_queued = 0; - tx->tx_waiting = 0; - - if (tx->tx_sending == 0) { - list_del (&tx->tx_list); - list_add (&tx->tx_list, &zombies); - } - } - - spin_unlock(&conn->ibc_lock); - - kibnal_txlist_done(&zombies, -ECONNABORTED); -} - -void -kibnal_conn_disconnected(kib_conn_t *conn) -{ - /* I'm the connd */ - LASSERT (!in_interrupt()); - LASSERT (current == kibnal_data.kib_connd); - LASSERT (conn->ibc_state >= IBNAL_CONN_INIT); - - kibnal_set_conn_state(conn, IBNAL_CONN_DISCONNECTED); - - /* move QP to error state to make posted work items complete */ - kibnal_set_qp_state(conn, vv_qp_state_error); - - /* Complete all tx descs not waiting for sends to complete. - * NB we should be safe from RDMA now that the QP has changed state */ - - kibnal_abort_txs(conn, &conn->ibc_tx_queue); - kibnal_abort_txs(conn, &conn->ibc_tx_queue_rsrvd); - kibnal_abort_txs(conn, &conn->ibc_tx_queue_nocred); - kibnal_abort_txs(conn, &conn->ibc_active_txs); - - kibnal_handle_early_rxs(conn); - - kibnal_peer_notify(conn->ibc_peer); -} - -void -kibnal_peer_connect_failed (kib_peer_t *peer, int active, int error) -{ - LIST_HEAD (zombies); - unsigned long flags; - - /* Only the connd creates conns => single threaded */ - LASSERT (error != 0); - LASSERT (!in_interrupt()); - LASSERT (current == kibnal_data.kib_connd); - - write_lock_irqsave(&kibnal_data.kib_global_lock, flags); - - if (active) { - LASSERT (peer->ibp_connecting != 0); - peer->ibp_connecting--; - } else { - LASSERT (peer->ibp_accepting != 0); - peer->ibp_accepting--; - } - - if (peer->ibp_connecting != 0 || - peer->ibp_accepting != 0) { - /* another connection attempt under way (loopback?)... 
*/ - write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); - return; - } - - if (list_empty(&peer->ibp_conns)) { - /* Say when active connection can be re-attempted */ - peer->ibp_reconnect_interval *= 2; - peer->ibp_reconnect_interval = - MAX(peer->ibp_reconnect_interval, - *kibnal_tunables.kib_min_reconnect_interval); - peer->ibp_reconnect_interval = - MIN(peer->ibp_reconnect_interval, - *kibnal_tunables.kib_max_reconnect_interval); - - peer->ibp_reconnect_time = jiffies + - peer->ibp_reconnect_interval * HZ; - - /* Take peer's blocked transmits to complete with error */ - list_add(&zombies, &peer->ibp_tx_queue); - list_del_init(&peer->ibp_tx_queue); - - if (kibnal_peer_active(peer) && - (peer->ibp_persistence == 0)) { - /* failed connection attempt on non-persistent peer */ - kibnal_unlink_peer_locked (peer); - } - - peer->ibp_error = error; - } else { - /* Can't have blocked transmits if there are connections */ - LASSERT (list_empty(&peer->ibp_tx_queue)); - } - - write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); - - kibnal_peer_notify(peer); - - if (list_empty (&zombies)) - return; - - CDEBUG (D_NETERROR, "Deleting messages for %s: connection failed\n", - libcfs_nid2str(peer->ibp_nid)); - - kibnal_txlist_done(&zombies, -EHOSTUNREACH); -} - -void -kibnal_reject(cm_cep_handle_t cep, int why) -{ - static cm_reject_data_t rejs[3]; - cm_reject_data_t *rej = &rejs[why]; - - LASSERT (why >= 0 && why < sizeof(rejs)/sizeof(rejs[0])); - - /* If I wasn't so lazy, I'd initialise this only once; it's effective - * read-only */ - rej->reason = cm_rej_code_usr_rej; - rej->priv_data[0] = (IBNAL_MSG_MAGIC) & 0xff; - rej->priv_data[1] = (IBNAL_MSG_MAGIC >> 8) & 0xff; - rej->priv_data[2] = (IBNAL_MSG_MAGIC >> 16) & 0xff; - rej->priv_data[3] = (IBNAL_MSG_MAGIC >> 24) & 0xff; - rej->priv_data[4] = (IBNAL_MSG_VERSION) & 0xff; - rej->priv_data[5] = (IBNAL_MSG_VERSION >> 8) & 0xff; - rej->priv_data[6] = why; - - cm_reject(cep, rej); -} - -void -kibnal_connreq_done(kib_conn_t *conn, int active, int status) -{ - struct list_head txs; - kib_peer_t *peer = conn->ibc_peer; - unsigned long flags; - kib_tx_t *tx; - - CDEBUG(D_NET,"%d\n", status); - - /* Only the connd creates conns => single threaded */ - LASSERT (!in_interrupt()); - LASSERT (current == kibnal_data.kib_connd); - LASSERT (conn->ibc_state < IBNAL_CONN_ESTABLISHED); - - if (active) { - LASSERT (peer->ibp_connecting > 0); - } else { - LASSERT (peer->ibp_accepting > 0); - } - - LIBCFS_FREE(conn->ibc_connvars, sizeof(*conn->ibc_connvars)); - conn->ibc_connvars = NULL; - - if (status != 0) { - /* failed to establish connection */ - switch (conn->ibc_state) { - default: - LBUG(); - - case IBNAL_CONN_ACTIVE_CHECK_REPLY: - /* got a connection reply but failed checks */ - LASSERT (active); - kibnal_reject(conn->ibc_cep, IBNAL_REJECT_FATAL); - break; - - case IBNAL_CONN_ACTIVE_CONNECT: - LASSERT (active); - cm_cancel(conn->ibc_cep); - cfs_pause(cfs_time_seconds(1)/10); - /* cm_connect() failed immediately or - * callback returned failure */ - break; - - case IBNAL_CONN_ACTIVE_ARP: - LASSERT (active); - /* ibat_get_ib_data() failed immediately - * or callback returned failure */ - break; - - case IBNAL_CONN_INIT: - break; - - case IBNAL_CONN_PASSIVE_WAIT: - LASSERT (!active); - /* cm_accept callback returned failure */ - break; - } - - kibnal_peer_connect_failed(peer, active, status); - kibnal_conn_disconnected(conn); - return; - } - - /* connection established */ - write_lock_irqsave(&kibnal_data.kib_global_lock, flags); - - if (active) { - 
LASSERT(conn->ibc_state == IBNAL_CONN_ACTIVE_RTU); - } else { - LASSERT(conn->ibc_state == IBNAL_CONN_PASSIVE_WAIT); - } - - conn->ibc_last_send = jiffies; - kibnal_set_conn_state(conn, IBNAL_CONN_ESTABLISHED); - kibnal_peer_alive(peer); - - /* Add conn to peer's list and nuke any dangling conns from a different - * peer instance... */ - kibnal_conn_addref(conn); /* +1 ref for ibc_list */ - list_add(&conn->ibc_list, &peer->ibp_conns); - kibnal_close_stale_conns_locked (peer, conn->ibc_incarnation); - - if (!kibnal_peer_active(peer) || /* peer has been deleted */ - conn->ibc_comms_error != 0 || /* comms error */ - conn->ibc_disconnect) { /* need to disconnect */ - - /* start to shut down connection */ - kibnal_close_conn_locked(conn, -ECONNABORTED); - - write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); - kibnal_peer_connect_failed(peer, active, -ECONNABORTED); - return; - } - - if (active) - peer->ibp_connecting--; - else - peer->ibp_accepting--; - - /* grab pending txs while I have the lock */ - list_add(&txs, &peer->ibp_tx_queue); - list_del_init(&peer->ibp_tx_queue); - - peer->ibp_reconnect_interval = 0; /* OK to reconnect at any time */ - - write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); - - /* Schedule blocked txs */ - spin_lock (&conn->ibc_lock); - while (!list_empty (&txs)) { - tx = list_entry (txs.next, kib_tx_t, tx_list); - list_del (&tx->tx_list); - - kibnal_queue_tx_locked (tx, conn); - } - spin_unlock (&conn->ibc_lock); - kibnal_check_sends (conn); - - /* schedule blocked rxs */ - kibnal_handle_early_rxs(conn); -} - -void -kibnal_cm_callback(cm_cep_handle_t cep, cm_conn_data_t *cmdata, void *arg) -{ - static cm_dreply_data_t drep; /* just zeroed space */ - - kib_conn_t *conn = (kib_conn_t *)arg; - unsigned long flags; - - /* CAVEAT EMPTOR: tasklet context */ - - switch (cmdata->status) { - default: - LBUG(); - - case cm_event_disconn_request: - /* IBNAL_CONN_ACTIVE_RTU: gets closed in kibnal_connreq_done - * IBNAL_CONN_ESTABLISHED: I start it closing - * otherwise: it's closing anyway */ - cm_disconnect(conn->ibc_cep, NULL, &drep); - cm_cancel(conn->ibc_cep); - - write_lock_irqsave(&kibnal_data.kib_global_lock, flags); - LASSERT (!conn->ibc_disconnect); - conn->ibc_disconnect = 1; - - switch (conn->ibc_state) { - default: - LBUG(); - - case IBNAL_CONN_ACTIVE_RTU: - /* kibnal_connreq_done is getting there; It'll see - * ibc_disconnect set... */ - break; - - case IBNAL_CONN_ESTABLISHED: - /* kibnal_connreq_done got there already; get - * disconnect going... */ - kibnal_close_conn_locked(conn, 0); - break; - - case IBNAL_CONN_DISCONNECT1: - /* kibnal_disconnect_conn is getting there; It'll see - * ibc_disconnect set... */ - break; - - case IBNAL_CONN_DISCONNECT2: - /* kibnal_disconnect_conn got there already; complete - * the disconnect. */ - kibnal_schedule_conn(conn); - break; - } - write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); - break; - - case cm_event_disconn_timeout: - case cm_event_disconn_reply: - write_lock_irqsave(&kibnal_data.kib_global_lock, flags); - LASSERT (conn->ibc_state == IBNAL_CONN_DISCONNECT2); - LASSERT (!conn->ibc_disconnect); - conn->ibc_disconnect = 1; - - /* kibnal_disconnect_conn sent the disconnect request. 
*/ - kibnal_schedule_conn(conn); - - write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); - break; - - case cm_event_connected: - case cm_event_conn_timeout: - case cm_event_conn_reject: - LASSERT (conn->ibc_state == IBNAL_CONN_PASSIVE_WAIT); - conn->ibc_connvars->cv_conndata = *cmdata; - - kibnal_schedule_conn(conn); - break; - } - - kibnal_conn_decref(conn); /* lose my ref */ -} - -void -kibnal_check_passive_wait(kib_conn_t *conn) -{ - int rc; - - switch (conn->ibc_connvars->cv_conndata.status) { - default: - LBUG(); - - case cm_event_connected: - kibnal_conn_addref(conn); /* ++ ref for CM callback */ - rc = kibnal_set_qp_state(conn, vv_qp_state_rts); - if (rc != 0) - conn->ibc_comms_error = rc; - /* connection _has_ been established; it's just that we've had - * an error immediately... */ - kibnal_connreq_done(conn, 0, 0); - break; - - case cm_event_conn_timeout: - kibnal_connreq_done(conn, 0, -ETIMEDOUT); - break; - - case cm_event_conn_reject: - kibnal_connreq_done(conn, 0, -ECONNRESET); - break; - } -} - -void -kibnal_recv_connreq(cm_cep_handle_t *cep, cm_request_data_t *cmreq) -{ - static kib_msg_t txmsg; - static kib_msg_t rxmsg; - static cm_reply_data_t reply; - - kib_conn_t *conn = NULL; - int rc = 0; - int reason; - int rxmsgnob; - rwlock_t *g_lock = &kibnal_data.kib_global_lock; - kib_peer_t *peer; - kib_peer_t *peer2; - unsigned long flags; - kib_connvars_t *cv; - cm_return_t cmrc; - vv_return_t vvrc; - - /* I'm the connd executing in thread context - * No concurrency problems with static data! */ - LASSERT (!in_interrupt()); - LASSERT (current == kibnal_data.kib_connd); - - if (cmreq->sid != (__u64)(*kibnal_tunables.kib_service_number)) { - CERROR(LPX64" != IBNAL_SERVICE_NUMBER("LPX64")\n", - cmreq->sid, (__u64)(*kibnal_tunables.kib_service_number)); - reason = IBNAL_REJECT_FATAL; - goto reject; - } - - /* copy into rxmsg to avoid alignment issues */ - rxmsgnob = MIN(cm_REQ_priv_data_len, sizeof(rxmsg)); - memcpy(&rxmsg, cmreq->priv_data, rxmsgnob); - - rc = kibnal_unpack_msg(&rxmsg, 0, rxmsgnob); - if (rc != 0) { - /* SILENT! 
kibnal_unpack_msg() complains if required */ - reason = IBNAL_REJECT_FATAL; - goto reject; - } - - if (rxmsg.ibm_version != IBNAL_MSG_VERSION) - CWARN("Connection from %s: old protocol version 0x%x\n", - libcfs_nid2str(rxmsg.ibm_srcnid), rxmsg.ibm_version); - - if (rxmsg.ibm_type != IBNAL_MSG_CONNREQ) { - CERROR("Unexpected connreq msg type: %x from %s\n", - rxmsg.ibm_type, libcfs_nid2str(rxmsg.ibm_srcnid)); - reason = IBNAL_REJECT_FATAL; - goto reject; - } - - if (kibnal_data.kib_ni->ni_nid != rxmsg.ibm_dstnid) { - CERROR("Can't accept %s: bad dst nid %s\n", - libcfs_nid2str(rxmsg.ibm_srcnid), - libcfs_nid2str(rxmsg.ibm_dstnid)); - reason = IBNAL_REJECT_FATAL; - goto reject; - } - - if (rxmsg.ibm_u.connparams.ibcp_queue_depth != IBNAL_MSG_QUEUE_SIZE) { - CERROR("Can't accept %s: incompatible queue depth %d (%d wanted)\n", - libcfs_nid2str(rxmsg.ibm_srcnid), - rxmsg.ibm_u.connparams.ibcp_queue_depth, - IBNAL_MSG_QUEUE_SIZE); - reason = IBNAL_REJECT_FATAL; - goto reject; - } - - if (rxmsg.ibm_u.connparams.ibcp_max_msg_size > IBNAL_MSG_SIZE) { - CERROR("Can't accept %s: message size %d too big (%d max)\n", - libcfs_nid2str(rxmsg.ibm_srcnid), - rxmsg.ibm_u.connparams.ibcp_max_msg_size, - IBNAL_MSG_SIZE); - reason = IBNAL_REJECT_FATAL; - goto reject; - } - - if (rxmsg.ibm_u.connparams.ibcp_max_frags > IBNAL_MAX_RDMA_FRAGS) { - CERROR("Can't accept %s: max frags %d too big (%d max)\n", - libcfs_nid2str(rxmsg.ibm_srcnid), - rxmsg.ibm_u.connparams.ibcp_max_frags, - IBNAL_MAX_RDMA_FRAGS); - reason = IBNAL_REJECT_FATAL; - goto reject; - } - - /* assume 'rxmsg.ibm_srcnid' is a new peer; create */ - rc = kibnal_create_peer (&peer, rxmsg.ibm_srcnid); - if (rc != 0) { - CERROR("Can't create peer for %s\n", - libcfs_nid2str(rxmsg.ibm_srcnid)); - reason = IBNAL_REJECT_NO_RESOURCES; - goto reject; - } - - write_lock_irqsave(g_lock, flags); - - if (kibnal_data.kib_listen_handle == NULL) { - write_unlock_irqrestore(g_lock, flags); - - CWARN ("Shutdown has started, rejecting connreq from %s\n", - libcfs_nid2str(rxmsg.ibm_srcnid)); - kibnal_peer_decref(peer); - reason = IBNAL_REJECT_FATAL; - goto reject; - } - - peer2 = kibnal_find_peer_locked(rxmsg.ibm_srcnid); - if (peer2 != NULL) { - /* tie-break connection race in favour of the higher NID */ - if (peer2->ibp_connecting != 0 && - rxmsg.ibm_srcnid < kibnal_data.kib_ni->ni_nid) { - write_unlock_irqrestore(g_lock, flags); - - CWARN("Conn race %s\n", - libcfs_nid2str(rxmsg.ibm_srcnid)); - - kibnal_peer_decref(peer); - reason = IBNAL_REJECT_CONN_RACE; - goto reject; - } - - peer2->ibp_accepting++; - kibnal_peer_addref(peer2); - - write_unlock_irqrestore(g_lock, flags); - kibnal_peer_decref(peer); - peer = peer2; - } else { - /* Brand new peer */ - LASSERT (peer->ibp_accepting == 0); - peer->ibp_accepting = 1; - - kibnal_peer_addref(peer); - list_add_tail(&peer->ibp_list, kibnal_nid2peerlist(rxmsg.ibm_srcnid)); - - write_unlock_irqrestore(g_lock, flags); - } - - conn = kibnal_create_conn(cep); - if (conn == NULL) { - CERROR("Can't create conn for %s\n", - libcfs_nid2str(rxmsg.ibm_srcnid)); - kibnal_peer_connect_failed(peer, 0, -ENOMEM); - kibnal_peer_decref(peer); - reason = IBNAL_REJECT_NO_RESOURCES; - goto reject; - } - - conn->ibc_version = rxmsg.ibm_version; - - conn->ibc_peer = peer; /* conn takes over my ref */ - conn->ibc_incarnation = rxmsg.ibm_srcstamp; - conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE; - conn->ibc_reserved_credits = IBNAL_MSG_QUEUE_SIZE; - LASSERT (conn->ibc_credits + conn->ibc_reserved_credits - <= IBNAL_RX_MSGS); - - cv = 
conn->ibc_connvars; - - cv->cv_txpsn = cmreq->cep_data.start_psn; - cv->cv_remote_qpn = cmreq->cep_data.qpn; - cv->cv_path = cmreq->path_data.path; - cv->cv_rnr_count = cmreq->cep_data.rtr_retry_cnt; - // XXX cmreq->cep_data.retry_cnt; - cv->cv_port = cmreq->cep_data.local_port_num; - - vvrc = gid2gid_index(kibnal_data.kib_hca, cv->cv_port, - &cv->cv_path.sgid, &cv->cv_sgid_index); - if (vvrc != vv_return_ok) { - CERROR("gid2gid_index failed for %s: %d\n", - libcfs_nid2str(rxmsg.ibm_srcnid), vvrc); - rc = -EIO; - reason = IBNAL_REJECT_FATAL; - goto reject; - } - - vvrc = pkey2pkey_index(kibnal_data.kib_hca, cv->cv_port, - cv->cv_path.pkey, &cv->cv_pkey_index); - if (vvrc != vv_return_ok) { - CERROR("pkey2pkey_index failed for %s: %d\n", - libcfs_nid2str(rxmsg.ibm_srcnid), vvrc); - rc = -EIO; - reason = IBNAL_REJECT_FATAL; - goto reject; - } - - rc = kibnal_set_qp_state(conn, vv_qp_state_init); - if (rc != 0) { - reason = IBNAL_REJECT_FATAL; - goto reject; - } - - rc = kibnal_post_receives(conn); - if (rc != 0) { - CERROR("Can't post receives for %s\n", - libcfs_nid2str(rxmsg.ibm_srcnid)); - reason = IBNAL_REJECT_FATAL; - goto reject; - } - - rc = kibnal_set_qp_state(conn, vv_qp_state_rtr); - if (rc != 0) { - reason = IBNAL_REJECT_FATAL; - goto reject; - } - - memset(&reply, 0, sizeof(reply)); - reply.qpn = cv->cv_local_qpn; - reply.qkey = IBNAL_QKEY; - reply.start_psn = cv->cv_rxpsn; - reply.arb_initiator_depth = IBNAL_ARB_INITIATOR_DEPTH; - reply.arb_resp_res = IBNAL_ARB_RESP_RES; - reply.failover_accepted = IBNAL_FAILOVER_ACCEPTED; - reply.rnr_retry_count = cv->cv_rnr_count; - reply.targ_ack_delay = kibnal_data.kib_hca_attrs.ack_delay; - - /* setup txmsg... */ - memset(&txmsg, 0, sizeof(txmsg)); - kibnal_init_msg(&txmsg, IBNAL_MSG_CONNACK, - sizeof(txmsg.ibm_u.connparams)); - LASSERT (txmsg.ibm_nob <= cm_REP_priv_data_len); - txmsg.ibm_u.connparams.ibcp_queue_depth = IBNAL_MSG_QUEUE_SIZE; - txmsg.ibm_u.connparams.ibcp_max_msg_size = IBNAL_MSG_SIZE; - txmsg.ibm_u.connparams.ibcp_max_frags = IBNAL_MAX_RDMA_FRAGS; - kibnal_pack_msg(&txmsg, conn->ibc_version, - 0, rxmsg.ibm_srcnid, rxmsg.ibm_srcstamp, 0); - - /* ...and copy into reply to avoid alignment issues */ - memcpy(&reply.priv_data, &txmsg, txmsg.ibm_nob); - - kibnal_set_conn_state(conn, IBNAL_CONN_PASSIVE_WAIT); - - cmrc = cm_accept(conn->ibc_cep, &reply, NULL, - kibnal_cm_callback, conn); - - if (cmrc == cm_stat_success) - return; /* callback has got my ref on conn */ - - /* back out state change (no callback happening) */ - kibnal_set_conn_state(conn, IBNAL_CONN_INIT); - rc = -EIO; - reason = IBNAL_REJECT_FATAL; - - reject: - CDEBUG(D_NET, "Rejecting connreq from %s\n", - libcfs_nid2str(rxmsg.ibm_srcnid)); - - kibnal_reject(cep, reason); - - if (conn != NULL) { - LASSERT (rc != 0); - kibnal_connreq_done(conn, 0, rc); - kibnal_conn_decref(conn); - } else { - cm_destroy_cep(cep); - } -} - -void -kibnal_listen_callback(cm_cep_handle_t cep, cm_conn_data_t *data, void *arg) -{ - cm_request_data_t *cmreq = &data->data.request; - kib_pcreq_t *pcr; - unsigned long flags; - - LASSERT (arg == NULL); - - if (data->status != cm_event_conn_request) { - CERROR("status %d is not cm_event_conn_request\n", - data->status); - return; - } - - LIBCFS_ALLOC_ATOMIC(pcr, sizeof(*pcr)); - if (pcr == NULL) { - CERROR("Can't allocate passive connreq\n"); - - kibnal_reject(cep, IBNAL_REJECT_NO_RESOURCES); - cm_destroy_cep(cep); - return; - } - - pcr->pcr_cep = cep; - pcr->pcr_cmreq = *cmreq; - - spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags); - - 
list_add_tail(&pcr->pcr_list, &kibnal_data.kib_connd_pcreqs); - wake_up(&kibnal_data.kib_connd_waitq); -spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags); -} - - -void -kibnal_active_connect_callback (cm_cep_handle_t cep, cm_conn_data_t *cd, - void *arg) -{ - /* CAVEAT EMPTOR: tasklet context */ - kib_conn_t *conn = (kib_conn_t *)arg; - kib_connvars_t *cv = conn->ibc_connvars; - - LASSERT (conn->ibc_state == IBNAL_CONN_ACTIVE_CONNECT); - cv->cv_conndata = *cd; - - kibnal_schedule_conn(conn); - kibnal_conn_decref(conn); -} - -void -kibnal_connect_conn (kib_conn_t *conn) -{ - static cm_request_data_t cmreq; - static kib_msg_t msg; - - kib_connvars_t *cv = conn->ibc_connvars; - kib_peer_t *peer = conn->ibc_peer; - cm_return_t cmrc; - - /* Only called by connd => statics OK */ - LASSERT (!in_interrupt()); - LASSERT (current == kibnal_data.kib_connd); - LASSERT (conn->ibc_state == IBNAL_CONN_ACTIVE_ARP); - - memset(&cmreq, 0, sizeof(cmreq)); - - cmreq.sid = (__u64)(*kibnal_tunables.kib_service_number); - - cmreq.cep_data.ca_guid = kibnal_data.kib_hca_attrs.guid; - cmreq.cep_data.qpn = cv->cv_local_qpn; - cmreq.cep_data.retry_cnt = *kibnal_tunables.kib_retry_cnt; - cmreq.cep_data.rtr_retry_cnt = *kibnal_tunables.kib_rnr_cnt; - cmreq.cep_data.start_psn = cv->cv_rxpsn; - cmreq.cep_data.end_to_end_flow_ctrl = IBNAL_EE_FLOW_CNT; - // XXX ack_timeout? - // offered_resp_res - // offered_initiator_depth - - cmreq.path_data.subn_local = IBNAL_LOCAL_SUB; - cmreq.path_data.path = cv->cv_path; - - /* setup msg... */ - memset(&msg, 0, sizeof(msg)); - kibnal_init_msg(&msg, IBNAL_MSG_CONNREQ, sizeof(msg.ibm_u.connparams)); - LASSERT(msg.ibm_nob <= cm_REQ_priv_data_len); - msg.ibm_u.connparams.ibcp_queue_depth = IBNAL_MSG_QUEUE_SIZE; - msg.ibm_u.connparams.ibcp_max_msg_size = IBNAL_MSG_SIZE; - msg.ibm_u.connparams.ibcp_max_frags = IBNAL_MAX_RDMA_FRAGS; - kibnal_pack_msg(&msg, conn->ibc_version, 0, peer->ibp_nid, 0, 0); - - if (the_lnet.ln_testprotocompat != 0) { - /* single-shot proto check */ - LNET_LOCK(); - if ((the_lnet.ln_testprotocompat & 1) != 0) { - msg.ibm_version++; - the_lnet.ln_testprotocompat &= ~1; - } - if ((the_lnet.ln_testprotocompat & 2) != 0) { - msg.ibm_magic = LNET_PROTO_MAGIC; - the_lnet.ln_testprotocompat &= ~2; - } - LNET_UNLOCK(); - } - - /* ...and copy into cmreq to avoid alignment issues */ - memcpy(&cmreq.priv_data, &msg, msg.ibm_nob); - - CDEBUG(D_NET, "Connecting %p to %s\n", conn, - libcfs_nid2str(peer->ibp_nid)); - - kibnal_conn_addref(conn); /* ++ref for CM callback */ - kibnal_set_conn_state(conn, IBNAL_CONN_ACTIVE_CONNECT); - - cmrc = cm_connect(conn->ibc_cep, &cmreq, - kibnal_active_connect_callback, conn); - if (cmrc == cm_stat_success) { - CDEBUG(D_NET, "connection REQ sent to %s\n", - libcfs_nid2str(peer->ibp_nid)); - return; - } - - CERROR ("Connect %s failed: %d\n", libcfs_nid2str(peer->ibp_nid), cmrc); - kibnal_conn_decref(conn); /* drop callback's ref */ - kibnal_connreq_done(conn, 1, -EHOSTUNREACH); -} - -void -kibnal_reconnect (kib_conn_t *conn, int why) -{ - kib_peer_t *peer = conn->ibc_peer; - int retry; - unsigned long flags; - cm_return_t cmrc; - cm_cep_handle_t cep; - - LASSERT (conn->ibc_state == IBNAL_CONN_ACTIVE_CONNECT); - - read_lock_irqsave(&kibnal_data.kib_global_lock, flags); - - LASSERT (peer->ibp_connecting > 0); /* 'conn' at least */ - - /* retry connection if it's still needed and no other connection - * attempts (active or passive) are in progress. 
- * Immediate reconnect is required, so I don't even look at the - * reconnection timeout etc */ - - retry = (!list_empty(&peer->ibp_tx_queue) && - peer->ibp_connecting == 1 && - peer->ibp_accepting == 0); - - read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); - - if (!retry) { - kibnal_connreq_done(conn, 1, why); - return; - } - - cep = cm_create_cep(cm_cep_transp_rc); - if (cep == NULL) { - CERROR("Can't create new CEP\n"); - kibnal_connreq_done(conn, 1, -ENOMEM); - return; - } - - cmrc = cm_cancel(conn->ibc_cep); - LASSERT (cmrc == cm_stat_success); - cmrc = cm_destroy_cep(conn->ibc_cep); - LASSERT (cmrc == cm_stat_success); - - conn->ibc_cep = cep; - - /* reuse conn; no need to peer->ibp_connecting++ */ - kibnal_set_conn_state(conn, IBNAL_CONN_ACTIVE_ARP); - kibnal_connect_conn(conn); -} - -void -kibnal_check_connreply (kib_conn_t *conn) -{ - static cm_rtu_data_t rtu; - static kib_msg_t msg; - - kib_connvars_t *cv = conn->ibc_connvars; - cm_reply_data_t *reply = &cv->cv_conndata.data.reply; - kib_peer_t *peer = conn->ibc_peer; - int msgnob; - cm_return_t cmrc; - unsigned long flags; - int rc; - - /* Only called by connd => statics OK */ - LASSERT (!in_interrupt()); - LASSERT (current == kibnal_data.kib_connd); - LASSERT (conn->ibc_state == IBNAL_CONN_ACTIVE_CONNECT); - - if (cv->cv_conndata.status == cm_event_conn_reply) { - cv->cv_remote_qpn = reply->qpn; - cv->cv_txpsn = reply->start_psn; - // XXX reply->targ_ack_delay; - cv->cv_rnr_count = reply->rnr_retry_count; - - kibnal_set_conn_state(conn, IBNAL_CONN_ACTIVE_CHECK_REPLY); - - /* copy into msg to avoid alignment issues */ - msgnob = MIN(cm_REP_priv_data_len, sizeof(msg)); - memcpy(&msg, &reply->priv_data, msgnob); - - rc = kibnal_unpack_msg(&msg, conn->ibc_version, msgnob); - if (rc != 0) { - CERROR("Can't unpack reply from %s\n", - libcfs_nid2str(peer->ibp_nid)); - kibnal_connreq_done(conn, 1, rc); - return; - } - - if (msg.ibm_type != IBNAL_MSG_CONNACK ) { - CERROR("Unexpected message type %d from %s\n", - msg.ibm_type, libcfs_nid2str(peer->ibp_nid)); - kibnal_connreq_done(conn, 1, -EPROTO); - return; - } - - if (msg.ibm_u.connparams.ibcp_queue_depth != IBNAL_MSG_QUEUE_SIZE) { - CERROR("%s has incompatible queue depth %d(%d wanted)\n", - libcfs_nid2str(peer->ibp_nid), - msg.ibm_u.connparams.ibcp_queue_depth, - IBNAL_MSG_QUEUE_SIZE); - kibnal_connreq_done(conn, 1, -EPROTO); - return; - } - - if (msg.ibm_u.connparams.ibcp_max_msg_size > IBNAL_MSG_SIZE) { - CERROR("%s max message size %d too big (%d max)\n", - libcfs_nid2str(peer->ibp_nid), - msg.ibm_u.connparams.ibcp_max_msg_size, - IBNAL_MSG_SIZE); - kibnal_connreq_done(conn, 1, -EPROTO); - return; - } - - if (msg.ibm_u.connparams.ibcp_max_frags > IBNAL_MAX_RDMA_FRAGS) { - CERROR("%s max frags %d too big (%d max)\n", - libcfs_nid2str(peer->ibp_nid), - msg.ibm_u.connparams.ibcp_max_frags, - IBNAL_MAX_RDMA_FRAGS); - kibnal_connreq_done(conn, 1, -EPROTO); - return; - } - - read_lock_irqsave(&kibnal_data.kib_global_lock, flags); - if (kibnal_data.kib_ni->ni_nid == msg.ibm_dstnid && - msg.ibm_dststamp == kibnal_data.kib_incarnation) - rc = 0; - else - rc = -ESTALE; - read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); - if (rc != 0) { - CERROR("Stale connection reply from %s\n", - libcfs_nid2str(peer->ibp_nid)); - kibnal_connreq_done(conn, 1, rc); - return; - } - - conn->ibc_incarnation = msg.ibm_srcstamp; - conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE; - conn->ibc_reserved_credits = IBNAL_MSG_QUEUE_SIZE; - LASSERT (conn->ibc_credits + conn->ibc_reserved_credits - <= 
IBNAL_RX_MSGS); - - rc = kibnal_post_receives(conn); - if (rc != 0) { - CERROR("Can't post receives for %s\n", - libcfs_nid2str(peer->ibp_nid)); - kibnal_connreq_done(conn, 1, rc); - return; - } - - rc = kibnal_set_qp_state(conn, vv_qp_state_rtr); - if (rc != 0) { - kibnal_connreq_done(conn, 1, rc); - return; - } - - rc = kibnal_set_qp_state(conn, vv_qp_state_rts); - if (rc != 0) { - kibnal_connreq_done(conn, 1, rc); - return; - } - - kibnal_set_conn_state(conn, IBNAL_CONN_ACTIVE_RTU); - kibnal_conn_addref(conn); /* ++for CM callback */ - - memset(&rtu, 0, sizeof(rtu)); - cmrc = cm_accept(conn->ibc_cep, NULL, &rtu, - kibnal_cm_callback, conn); - if (cmrc == cm_stat_success) { - /* Now I'm racing with disconnect signalled by - * kibnal_cm_callback */ - kibnal_connreq_done(conn, 1, 0); - return; - } - - CERROR("cm_accept %s failed: %d\n", - libcfs_nid2str(peer->ibp_nid), cmrc); - /* Back out of RTU: no callback coming */ - kibnal_set_conn_state(conn, IBNAL_CONN_ACTIVE_CHECK_REPLY); - kibnal_conn_decref(conn); - kibnal_connreq_done(conn, 1, -EIO); - return; - } - - if (cv->cv_conndata.status == cm_event_conn_reject) { - - if (cv->cv_conndata.data.reject.reason == cm_rej_code_usr_rej) { - unsigned char *bytes = - cv->cv_conndata.data.reject.priv_data; - int magic = (bytes[0]) | - (bytes[1] << 8) | - (bytes[2] << 16) | - (bytes[3] << 24); - int version = (bytes[4]) | - (bytes[5] << 8); - int why = (bytes[6]); - - /* Expected proto/version: she just doesn't like me (or - * ran out of resources) */ - if (magic == IBNAL_MSG_MAGIC && - version == conn->ibc_version) { - CERROR("conn -> %s rejected: fatal error %d\n", - libcfs_nid2str(peer->ibp_nid), why); - - if (why == IBNAL_REJECT_CONN_RACE) - kibnal_reconnect(conn, -EALREADY); - else - kibnal_connreq_done(conn, 1, -ECONNREFUSED); - return; - } - - /* Fail unless it's worth retrying with an old proto - * version */ - if (!(magic == IBNAL_MSG_MAGIC && - version == IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD && - conn->ibc_version == IBNAL_MSG_VERSION)) { - CERROR("conn -> %s rejected: bad protocol " - "magic/ver %08x/%x why %d\n", - libcfs_nid2str(peer->ibp_nid), - magic, version, why); - - kibnal_connreq_done(conn, 1, -ECONNREFUSED); - return; - } - - conn->ibc_version = version; - CWARN ("Connection to %s refused: " - "retrying with old protocol version 0x%x\n", - libcfs_nid2str(peer->ibp_nid), version); - - kibnal_reconnect(conn, -ECONNREFUSED); - return; - } else if (cv->cv_conndata.data.reject.reason == - cm_rej_code_stale_conn) { - - CWARN ("conn -> %s stale: retrying\n", - libcfs_nid2str(peer->ibp_nid)); - - kibnal_reconnect(conn, -ESTALE); - return; - } else { - CDEBUG(D_NETERROR, "conn -> %s rejected: reason %d\n", - libcfs_nid2str(peer->ibp_nid), - cv->cv_conndata.data.reject.reason); - kibnal_connreq_done(conn, 1, -ECONNREFUSED); - return; - } - /* NOT REACHED */ - } - - CDEBUG(D_NETERROR, "conn -> %s failed: %d\n", - libcfs_nid2str(peer->ibp_nid), cv->cv_conndata.status); - kibnal_connreq_done(conn, 1, -ECONNABORTED); -} - -void -kibnal_arp_done (kib_conn_t *conn) -{ - kib_peer_t *peer = conn->ibc_peer; - kib_connvars_t *cv = conn->ibc_connvars; - ibat_arp_data_t *arp = &cv->cv_arp; - ib_path_record_v2_t *path = &cv->cv_path; - vv_return_t vvrc; - int rc; - unsigned long flags; - - LASSERT (!in_interrupt()); - LASSERT (current == kibnal_data.kib_connd); - LASSERT (conn->ibc_state == IBNAL_CONN_ACTIVE_ARP); - LASSERT (peer->ibp_arp_count > 0); - - if (cv->cv_arprc != ibat_stat_ok) { - CDEBUG(D_NETERROR, "Arp %s @ %u.%u.%u.%u failed: %d\n", - 
libcfs_nid2str(peer->ibp_nid), HIPQUAD(peer->ibp_ip), - cv->cv_arprc); - goto failed; - } - - if ((arp->mask & IBAT_PRI_PATH_VALID) != 0) { - CDEBUG(D_NET, "Got valid path for %s\n", - libcfs_nid2str(peer->ibp_nid)); - - *path = *arp->primary_path; - - vvrc = base_gid2port_num(kibnal_data.kib_hca, &path->sgid, - &cv->cv_port); - if (vvrc != vv_return_ok) { - CWARN("base_gid2port_num failed for %s @ %u.%u.%u.%u: %d\n", - libcfs_nid2str(peer->ibp_nid), - HIPQUAD(peer->ibp_ip), vvrc); - goto failed; - } - - vvrc = gid2gid_index(kibnal_data.kib_hca, cv->cv_port, - &path->sgid, &cv->cv_sgid_index); - if (vvrc != vv_return_ok) { - CWARN("gid2gid_index failed for %s @ %u.%u.%u.%u: %d\n", - libcfs_nid2str(peer->ibp_nid), - HIPQUAD(peer->ibp_ip), vvrc); - goto failed; - } - - vvrc = pkey2pkey_index(kibnal_data.kib_hca, cv->cv_port, - path->pkey, &cv->cv_pkey_index); - if (vvrc != vv_return_ok) { - CWARN("pkey2pkey_index failed for %s @ %u.%u.%u.%u: %d\n", - libcfs_nid2str(peer->ibp_nid), - HIPQUAD(peer->ibp_ip), vvrc); - goto failed; - } - - path->mtu = IBNAL_IB_MTU; - - } else if ((arp->mask & IBAT_LID_VALID) != 0) { - CWARN("Creating new path record for %s @ %u.%u.%u.%u\n", - libcfs_nid2str(peer->ibp_nid), HIPQUAD(peer->ibp_ip)); - - cv->cv_pkey_index = IBNAL_PKEY_IDX; - cv->cv_sgid_index = IBNAL_SGID_IDX; - cv->cv_port = arp->local_port_num; - - memset(path, 0, sizeof(*path)); - - vvrc = port_num2base_gid(kibnal_data.kib_hca, cv->cv_port, - &path->sgid); - if (vvrc != vv_return_ok) { - CWARN("port_num2base_gid failed for %s @ %u.%u.%u.%u: %d\n", - libcfs_nid2str(peer->ibp_nid), - HIPQUAD(peer->ibp_ip), vvrc); - goto failed; - } - - vvrc = port_num2base_lid(kibnal_data.kib_hca, cv->cv_port, - &path->slid); - if (vvrc != vv_return_ok) { - CWARN("port_num2base_lid failed for %s @ %u.%u.%u.%u: %d\n", - libcfs_nid2str(peer->ibp_nid), - HIPQUAD(peer->ibp_ip), vvrc); - goto failed; - } - - path->dgid = arp->gid; - path->sl = IBNAL_SERVICE_LEVEL; - path->dlid = arp->lid; - path->mtu = IBNAL_IB_MTU; - path->rate = IBNAL_STATIC_RATE; - path->pkt_life_time = IBNAL_PKT_LIFETIME; - path->pkey = IBNAL_PKEY; - path->traffic_class = IBNAL_TRAFFIC_CLASS; - } else { - CWARN("Arp for %s @ %u.%u.%u.%u returned neither PATH nor LID\n", - libcfs_nid2str(peer->ibp_nid), HIPQUAD(peer->ibp_ip)); - goto failed; - } - - rc = kibnal_set_qp_state(conn, vv_qp_state_init); - if (rc != 0) { - kibnal_connreq_done(conn, 1, rc); - } - - /* do the actual connection request */ - kibnal_connect_conn(conn); - return; - - failed: - write_lock_irqsave(&kibnal_data.kib_global_lock, flags); - peer->ibp_arp_count--; - if (peer->ibp_arp_count == 0) { - /* final ARP attempt failed */ - write_unlock_irqrestore(&kibnal_data.kib_global_lock, - flags); - CDEBUG(D_NETERROR, "Arp %s @ %u.%u.%u.%u failed (final attempt)\n", - libcfs_nid2str(peer->ibp_nid), HIPQUAD(peer->ibp_ip)); - } else { - /* Retry ARP: ibp_connecting++ so terminating conn - * doesn't end peer's connection attempt */ - peer->ibp_connecting++; - write_unlock_irqrestore(&kibnal_data.kib_global_lock, - flags); - CDEBUG(D_NETERROR, "Arp %s @ %u.%u.%u.%u failed (%d attempts left)\n", - libcfs_nid2str(peer->ibp_nid), HIPQUAD(peer->ibp_ip), - peer->ibp_arp_count); - - kibnal_schedule_peer_arp(peer); - } - kibnal_connreq_done(conn, 1, -ENETUNREACH); -} - -void -kibnal_arp_callback (ibat_stat_t arprc, ibat_arp_data_t *arp_data, void *arg) -{ - /* CAVEAT EMPTOR: tasklet context */ - kib_peer_t *peer; - kib_conn_t *conn = (kib_conn_t *)arg; - - LASSERT (conn != NULL); - LASSERT 
(conn->ibc_state == IBNAL_CONN_ACTIVE_ARP); - - peer = conn->ibc_peer; - - if (arprc != ibat_stat_ok) - CDEBUG(D_NETERROR, "Arp %s at %u.%u.%u.%u failed: %d\n", - libcfs_nid2str(peer->ibp_nid), HIPQUAD(peer->ibp_ip), arprc); - else - CDEBUG(D_NET, "Arp %s at %u.%u.%u.%u OK: LID %s PATH %s\n", - libcfs_nid2str(peer->ibp_nid), HIPQUAD(peer->ibp_ip), - (arp_data->mask & IBAT_LID_VALID) == 0 ? "invalid" : "valid", - (arp_data->mask & IBAT_PRI_PATH_VALID) == 0 ? "invalid" : "valid"); - - conn->ibc_connvars->cv_arprc = arprc; - if (arprc == ibat_stat_ok) - conn->ibc_connvars->cv_arp = *arp_data; - - kibnal_schedule_conn(conn); - kibnal_conn_decref(conn); -} - -void -kibnal_arp_peer (kib_peer_t *peer) -{ - cm_cep_handle_t cep; - kib_conn_t *conn; - int ibatrc; - - /* Only the connd does this (i.e. single threaded) */ - LASSERT (current == kibnal_data.kib_connd); - LASSERT (peer->ibp_connecting != 0); - LASSERT (peer->ibp_arp_count > 0); - - cep = cm_create_cep(cm_cep_transp_rc); - if (cep == NULL) { - CERROR ("Can't create cep for conn->%s\n", - libcfs_nid2str(peer->ibp_nid)); - kibnal_peer_connect_failed(peer, 1, -ENOMEM); - return; - } - - conn = kibnal_create_conn(cep); - if (conn == NULL) { - CERROR ("Can't allocate conn->%s\n", - libcfs_nid2str(peer->ibp_nid)); - cm_destroy_cep(cep); - kibnal_peer_connect_failed(peer, 1, -ENOMEM); - return; - } - - conn->ibc_peer = peer; - kibnal_peer_addref(peer); - - kibnal_set_conn_state(conn, IBNAL_CONN_ACTIVE_ARP); - - ibatrc = ibat_get_ib_data(htonl(peer->ibp_ip), INADDR_ANY, - ibat_paths_primary, - &conn->ibc_connvars->cv_arp, - kibnal_arp_callback, conn, 0); - CDEBUG(D_NET,"ibatrc %d\n", ibatrc); - switch (ibatrc) { - default: - LBUG(); - - case ibat_stat_pending: - /* NB callback has my ref on conn */ - break; - - case ibat_stat_ok: - case ibat_stat_error: - case ibat_stat_timeout: - case ibat_stat_not_found: - /* Immediate return (ARP cache hit or failure) == no callback. - * Do the next stage directly... */ - conn->ibc_connvars->cv_arprc = ibatrc; - kibnal_arp_done(conn); - kibnal_conn_decref(conn); - break; - } -} - -int -kibnal_check_txs (kib_conn_t *conn, struct list_head *txs) -{ - kib_tx_t *tx; - struct list_head *ttmp; - int timed_out = 0; - - spin_lock(&conn->ibc_lock); - - list_for_each (ttmp, txs) { - tx = list_entry (ttmp, kib_tx_t, tx_list); - - if (txs == &conn->ibc_active_txs) { - LASSERT (!tx->tx_queued); - LASSERT (tx->tx_waiting || tx->tx_sending != 0); - } else { - LASSERT (tx->tx_queued); - } - - if (time_after_eq (jiffies, tx->tx_deadline)) { - timed_out = 1; - break; - } - } - - spin_unlock(&conn->ibc_lock); - return timed_out; -} - -int -kibnal_conn_timed_out (kib_conn_t *conn) -{ - return kibnal_check_txs(conn, &conn->ibc_tx_queue) || - kibnal_check_txs(conn, &conn->ibc_tx_queue_rsrvd) || - kibnal_check_txs(conn, &conn->ibc_tx_queue_nocred) || - kibnal_check_txs(conn, &conn->ibc_active_txs); -} - -void -kibnal_check_conns (int idx) -{ - struct list_head *peers = &kibnal_data.kib_peers[idx]; - struct list_head *ptmp; - kib_peer_t *peer; - kib_conn_t *conn; - struct list_head *ctmp; - unsigned long flags; - - again: - /* NB. We expect to have a look at all the peers and not find any - * rdmas to time out, so we just use a shared lock while we - * take a look... 
*/ - read_lock_irqsave(&kibnal_data.kib_global_lock, flags); - - list_for_each (ptmp, peers) { - peer = list_entry (ptmp, kib_peer_t, ibp_list); - - list_for_each (ctmp, &peer->ibp_conns) { - conn = list_entry (ctmp, kib_conn_t, ibc_list); - - LASSERT (conn->ibc_state == IBNAL_CONN_ESTABLISHED); - - /* In case we have enough credits to return via a - * NOOP, but there were no non-blocking tx descs - * free to do it last time... */ - kibnal_check_sends(conn); - - if (!kibnal_conn_timed_out(conn)) - continue; - - /* Handle timeout by closing the whole connection. We - * can only be sure RDMA activity has ceased once the - * QP has been modified. */ - - kibnal_conn_addref(conn); /* 1 ref for me... */ - - read_unlock_irqrestore(&kibnal_data.kib_global_lock, - flags); - - CERROR("Timed out RDMA with %s\n", - libcfs_nid2str(peer->ibp_nid)); - - kibnal_close_conn (conn, -ETIMEDOUT); - kibnal_conn_decref(conn); /* ...until here */ - - /* start again now I've dropped the lock */ - goto again; - } - } - - read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); -} - -void -kibnal_disconnect_conn (kib_conn_t *conn) -{ - static cm_drequest_data_t dreq; /* just for the space */ - - cm_return_t cmrc; - unsigned long flags; - - LASSERT (!in_interrupt()); - LASSERT (current == kibnal_data.kib_connd); - - write_lock_irqsave(&kibnal_data.kib_global_lock, flags); - - if (conn->ibc_disconnect) { - /* Had the CM callback already */ - write_unlock_irqrestore(&kibnal_data.kib_global_lock, - flags); - kibnal_conn_disconnected(conn); - return; - } - - LASSERT (conn->ibc_state == IBNAL_CONN_DISCONNECT1); - - /* active disconnect */ - cmrc = cm_disconnect(conn->ibc_cep, &dreq, NULL); - if (cmrc == cm_stat_success) { - /* waiting for CM */ - conn->ibc_state = IBNAL_CONN_DISCONNECT2; - write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); - return; - } - - write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); - - cm_cancel(conn->ibc_cep); - cfs_pause(cfs_time_seconds(1)/10); - - if (!conn->ibc_disconnect) /* CM callback will never happen now */ - kibnal_conn_decref(conn); - - LASSERT (atomic_read(&conn->ibc_refcount) > 0); - LASSERT (conn->ibc_state == IBNAL_CONN_DISCONNECT1); - - kibnal_conn_disconnected(conn); -} - -int -kibnal_connd (void *arg) -{ - wait_queue_t wait; - unsigned long flags; - kib_pcreq_t *pcr; - kib_conn_t *conn; - kib_peer_t *peer; - int timeout; - int i; - int dropped_lock; - int peer_index = 0; - unsigned long deadline = jiffies; - - cfs_daemonize ("kibnal_connd"); - cfs_block_allsigs (); - - init_waitqueue_entry (&wait, current); - kibnal_data.kib_connd = current; - - spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags); - - while (!kibnal_data.kib_shutdown) { - - dropped_lock = 0; - - if (!list_empty (&kibnal_data.kib_connd_zombies)) { - conn = list_entry (kibnal_data.kib_connd_zombies.next, - kib_conn_t, ibc_list); - list_del (&conn->ibc_list); - - spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags); - dropped_lock = 1; - - kibnal_destroy_conn(conn); - - spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags); - } - - if (!list_empty (&kibnal_data.kib_connd_pcreqs)) { - pcr = list_entry(kibnal_data.kib_connd_pcreqs.next, - kib_pcreq_t, pcr_list); - list_del(&pcr->pcr_list); - - spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags); - dropped_lock = 1; - - kibnal_recv_connreq(pcr->pcr_cep, &pcr->pcr_cmreq); - LIBCFS_FREE(pcr, sizeof(*pcr)); - - spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags); - } - - if (!list_empty (&kibnal_data.kib_connd_peers)) { - 
peer = list_entry (kibnal_data.kib_connd_peers.next, - kib_peer_t, ibp_connd_list); - - list_del_init (&peer->ibp_connd_list); - spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags); - dropped_lock = 1; - - kibnal_arp_peer (peer); - kibnal_peer_decref (peer); - - spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags); - } - - if (!list_empty (&kibnal_data.kib_connd_conns)) { - conn = list_entry (kibnal_data.kib_connd_conns.next, - kib_conn_t, ibc_list); - list_del (&conn->ibc_list); - - spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags); - dropped_lock = 1; - - switch (conn->ibc_state) { - default: - LBUG(); - - case IBNAL_CONN_ACTIVE_ARP: - kibnal_arp_done(conn); - break; - - case IBNAL_CONN_ACTIVE_CONNECT: - kibnal_check_connreply(conn); - break; - - case IBNAL_CONN_PASSIVE_WAIT: - kibnal_check_passive_wait(conn); - break; - - case IBNAL_CONN_DISCONNECT1: - case IBNAL_CONN_DISCONNECT2: - kibnal_disconnect_conn(conn); - break; - } - kibnal_conn_decref(conn); - - spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags); - } - - /* careful with the jiffy wrap... */ - timeout = (int)(deadline - jiffies); - if (timeout <= 0) { - const int n = 4; - const int p = 1; - int chunk = kibnal_data.kib_peer_hash_size; - - spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags); - dropped_lock = 1; - - /* Time to check for RDMA timeouts on a few more - * peers: I do checks every 'p' seconds on a - * proportion of the peer table and I need to check - * every connection 'n' times within a timeout - * interval, to ensure I detect a timeout on any - * connection within (n+1)/n times the timeout - * interval. */ - - if (*kibnal_tunables.kib_timeout > n * p) - chunk = (chunk * n * p) / - *kibnal_tunables.kib_timeout; - if (chunk == 0) - chunk = 1; - - for (i = 0; i < chunk; i++) { - kibnal_check_conns (peer_index); - peer_index = (peer_index + 1) % - kibnal_data.kib_peer_hash_size; - } - - deadline += p * HZ; - spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags); - } - - if (dropped_lock) - continue; - - /* Nothing to do for 'timeout' */ - set_current_state (TASK_INTERRUPTIBLE); - add_wait_queue (&kibnal_data.kib_connd_waitq, &wait); - spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags); - - schedule_timeout (timeout); - - set_current_state (TASK_RUNNING); - remove_wait_queue (&kibnal_data.kib_connd_waitq, &wait); - spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags); - } - - spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags); - - kibnal_thread_fini (); - return (0); -} - -void -kibnal_async_callback(vv_event_record_t ev) -{ - CERROR("type: %d, port: %d, data: "LPX64"\n", - ev.event_type, ev.port_num, ev.type.data); -} - -void -kibnal_cq_callback (unsigned long unused_context) -{ - unsigned long flags; - - spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags); - kibnal_data.kib_ready = 1; - wake_up(&kibnal_data.kib_sched_waitq); - spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags); -} - -int -kibnal_scheduler(void *arg) -{ - long id = (long)arg; - wait_queue_t wait; - char name[16]; - vv_wc_t wc; - vv_return_t vvrc; - vv_return_t vvrc2; - unsigned long flags; - kib_rx_t *rx; - __u64 rxseq = 0; - int busy_loops = 0; - - snprintf(name, sizeof(name), "kibnal_sd_%02ld", id); - cfs_daemonize(name); - cfs_block_allsigs(); - - init_waitqueue_entry(&wait, current); - - spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags); - - while (!kibnal_data.kib_shutdown) { - if (busy_loops++ >= IBNAL_RESCHED) { - spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, - flags); - - 
cfs_cond_resched(); - busy_loops = 0; - - spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags); - } - - if (kibnal_data.kib_ready && - !kibnal_data.kib_checking_cq) { - /* take ownership of completion polling */ - kibnal_data.kib_checking_cq = 1; - /* Assume I'll exhaust the CQ */ - kibnal_data.kib_ready = 0; - spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, - flags); - - vvrc = vv_poll_for_completion(kibnal_data.kib_hca, - kibnal_data.kib_cq, &wc); - if (vvrc == vv_return_err_cq_empty) { - vvrc2 = vv_request_completion_notification( - kibnal_data.kib_hca, - kibnal_data.kib_cq, - vv_next_solicit_unsolicit_event); - LASSERT (vvrc2 == vv_return_ok); - } - - if (vvrc == vv_return_ok && - kibnal_wreqid2type(wc.wr_id) == IBNAL_WID_RX) { - rx = (kib_rx_t *)kibnal_wreqid2ptr(wc.wr_id); - - /* Grab the RX sequence number NOW before - * anyone else can get an RX completion */ - rxseq = rx->rx_conn->ibc_rxseq++; - } - - spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags); - /* give up ownership of completion polling */ - kibnal_data.kib_checking_cq = 0; - - if (vvrc == vv_return_err_cq_empty) - continue; - - LASSERT (vvrc == vv_return_ok); - /* Assume there's more: get another scheduler to check - * while I handle this completion... */ - - kibnal_data.kib_ready = 1; - wake_up(&kibnal_data.kib_sched_waitq); - - spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, - flags); - - switch (kibnal_wreqid2type(wc.wr_id)) { - case IBNAL_WID_RX: - kibnal_rx_complete( - (kib_rx_t *)kibnal_wreqid2ptr(wc.wr_id), - wc.completion_status, - wc.num_bytes_transfered, - rxseq); - break; - - case IBNAL_WID_TX: - kibnal_tx_complete( - (kib_tx_t *)kibnal_wreqid2ptr(wc.wr_id), - wc.completion_status); - break; - - case IBNAL_WID_RDMA: - /* We only get RDMA completion notification if - * it fails. So we just ignore them completely - * because... - * - * 1) If an RDMA fails, all subsequent work - * items, including the final SEND will fail - * too, so I'm still guaranteed to notice that - * this connection is hosed. - * - * 2) It's positively dangerous to look inside - * the tx descriptor obtained from an RDMA work - * item. As soon as I drop the kib_sched_lock, - * I give a scheduler on another CPU a chance - * to get the final SEND completion, so the tx - * descriptor can get freed as I inspect it. */ - CDEBUG(D_NETERROR, "RDMA failed: %d\n", - wc.completion_status); - break; - - default: - LBUG(); - } - - spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags); - continue; - } - - /* Nothing to do; sleep... */ - - set_current_state(TASK_INTERRUPTIBLE); - add_wait_queue_exclusive(&kibnal_data.kib_sched_waitq, &wait); - spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, - flags); - - schedule(); - - remove_wait_queue(&kibnal_data.kib_sched_waitq, &wait); - set_current_state(TASK_RUNNING); - spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags); - } - - spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags); - - kibnal_thread_fini(); - return (0); -} diff --git a/lnet/klnds/viblnd/viblnd_modparams.c b/lnet/klnds/viblnd/viblnd_modparams.c deleted file mode 100644 index 6f82d7b..0000000 --- a/lnet/klnds/viblnd/viblnd_modparams.c +++ /dev/null @@ -1,416 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
- * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf - * - * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, - * CA 95054 USA or visit www.sun.com if you need additional information or - * have any questions. - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lnet/klnds/viblnd/viblnd_modparams.c - * - * Author: Eric Barton - */ - -#include "viblnd.h" - -static int service_number = 0x11b9a2; -CFS_MODULE_PARM(service_number, "i", int, 0444, - "IB service number"); - -static int min_reconnect_interval = 1; -CFS_MODULE_PARM(min_reconnect_interval, "i", int, 0644, - "minimum connection retry interval (seconds)"); - -static int max_reconnect_interval = 60; -CFS_MODULE_PARM(max_reconnect_interval, "i", int, 0644, - "maximum connection retry interval (seconds)"); - -static int concurrent_peers = 1152; -CFS_MODULE_PARM(concurrent_peers, "i", int, 0444, - "maximum number of peers that may connect"); - -static int cksum = 0; -CFS_MODULE_PARM(cksum, "i", int, 0644, - "set non-zero to enable message (not RDMA) checksums"); - -static int timeout = 50; -CFS_MODULE_PARM(timeout, "i", int, 0644, - "timeout (seconds)"); - -static int ntx = 256; -CFS_MODULE_PARM(ntx, "i", int, 0444, - "# of message descriptors"); - -static int credits = 128; -CFS_MODULE_PARM(credits, "i", int, 0444, - "# concurrent sends"); - -static int peer_credits = 8; -CFS_MODULE_PARM(peer_credits, "i", int, 0444, - "# concurrent sends to 1 peer"); - -static int arp_retries = 3; -CFS_MODULE_PARM(arp_retries, "i", int, 0644, - "# of times to retry ARP"); - -static char *hca_basename = "InfiniHost"; -CFS_MODULE_PARM(hca_basename, "s", charp, 0444, - "HCA base name"); - -static char *ipif_basename = "ipoib"; -CFS_MODULE_PARM(ipif_basename, "s", charp, 0444, - "IPoIB interface base name"); - -static int local_ack_timeout = 0x12; -CFS_MODULE_PARM(local_ack_timeout, "i", int, 0644, - "ACK timeout for low-level 'sends'"); - -static int retry_cnt = 7; -CFS_MODULE_PARM(retry_cnt, "i", int, 0644, - "Retransmissions when no ACK received"); - -static int rnr_cnt = 6; -CFS_MODULE_PARM(rnr_cnt, "i", int, 0644, - "RNR retransmissions"); - -static int rnr_nak_timer = 0x10; -CFS_MODULE_PARM(rnr_nak_timer, "i", int, 0644, - "RNR retransmission interval"); - -static int keepalive = 100; -CFS_MODULE_PARM(keepalive, "i", int, 0644, - "Idle time in seconds before sending a keepalive"); - -static int concurrent_sends = IBNAL_RX_MSGS; -CFS_MODULE_PARM(concurrent_sends, "i", int, 0644, - "send work-queue sizing"); - -#if IBNAL_USE_FMR -static int fmr_remaps = 1000; -CFS_MODULE_PARM(fmr_remaps, "i", int, 0444, - "FMR mappings allowed before unmap"); -#endif - 
-kib_tunables_t kibnal_tunables = { - .kib_service_number = &service_number, - .kib_min_reconnect_interval = &min_reconnect_interval, - .kib_max_reconnect_interval = &max_reconnect_interval, - .kib_concurrent_peers = &concurrent_peers, - .kib_cksum = &cksum, - .kib_timeout = &timeout, - .kib_ntx = &ntx, - .kib_credits = &credits, - .kib_peercredits = &peer_credits, - .kib_arp_retries = &arp_retries, - .kib_hca_basename = &hca_basename, - .kib_ipif_basename = &ipif_basename, - .kib_local_ack_timeout = &local_ack_timeout, - .kib_retry_cnt = &retry_cnt, - .kib_rnr_cnt = &rnr_cnt, - .kib_rnr_nak_timer = &rnr_nak_timer, - .kib_keepalive = &keepalive, - .kib_concurrent_sends = &concurrent_sends, -#if IBNAL_USE_FMR - .kib_fmr_remaps = &fmr_remaps, -#endif -}; - -#ifndef HAVE_SYSCTL_UNNUMBERED - -enum { - VIBLND_SERVICE = 1, - VIBLND_RECONNECT_MIN, - VIBLND_RECONNECT_MAX, - VIBLND_CONCURRENT_PEERS, - VIBLND_CHKSUM, - VIBLND_TIMEOUT, - VIBLND_NTX, - VIBLND_CREDITS, - VIBLND_PEER_CREDITS, - VIBLND_ARP_RETRIES, - VIBLND_HCA_BASENAME, - VIBLND_IPIF_BASENAME, - VIBLND_LOCAL_ACK_TIMEOUT, - VIBLND_RETRY_CNT, - VIBLND_RNR_CNT, - VIBLND_RNR_NAK_TIMER, - VIBLND_KEEPALIVE, - VIBLND_CONCURRENT_SENDS, - VIBLND_FMR_REMAPS -}; -#else - -#define VIBLND_SERVICE CTL_UNNUMBERED -#define VIBLND_RECONNECT_MIN CTL_UNNUMBERED -#define VIBLND_RECONNECT_MAX CTL_UNNUMBERED -#define VIBLND_CONCURRENT_PEERS CTL_UNNUMBERED -#define VIBLND_CHKSUM CTL_UNNUMBERED -#define VIBLND_TIMEOUT CTL_UNNUMBERED -#define VIBLND_NTX CTL_UNNUMBERED -#define VIBLND_CREDITS CTL_UNNUMBERED -#define VIBLND_PEER_CREDITS CTL_UNNUMBERED -#define VIBLND_ARP_RETRIES CTL_UNNUMBERED -#define VIBLND_HCA_BASENAME CTL_UNNUMBERED -#define VIBLND_IPIF_BASENAME CTL_UNNUMBERED -#define VIBLND_LOCAL_ACK_TIMEOUT CTL_UNNUMBERED -#define VIBLND_RETRY_CNT CTL_UNNUMBERED -#define VIBLND_RNR_CNT CTL_UNNUMBERED -#define VIBLND_RNR_NAK_TIMER CTL_UNNUMBERED -#define VIBLND_KEEPALIVE CTL_UNNUMBERED -#define VIBLND_CONCURRENT_SENDS CTL_UNNUMBERED -#define VIBLND_FMR_REMAPS CTL_UNNUMBERED - -#endif - -#if defined(CONFIG_SYSCTL) && !CFS_SYSFS_MODULE_PARM - -static char hca_basename_space[32]; -static char ipif_basename_space[32]; - -static cfs_sysctl_table_t kibnal_ctl_table[] = { - { - .ctl_name = VIBLND_SERVICE, - .procname = "service_number", - .data = &service_number, - .maxlen = sizeof(int), - .mode = 0444, - .proc_handler = &proc_dointvec - }, - { - .ctl_name = VIBLND_RECONNECT_MIN, - .procname = "min_reconnect_interval", - .data = &min_reconnect_interval, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec - }, - { - .ctl_name = VIBLND_RECONNECT_MAX, - .procname = "max_reconnect_interval", - .data = &max_reconnect_interval, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec - }, - { - .ctl_name = VIBLND_CONCURRENT_PEERS, - .procname = "concurrent_peers", - .data = &concurrent_peers, - .maxlen = sizeof(int), - .mode = 0444, - .proc_handler = &proc_dointvec - }, - { - .ctl_name = VIBLND_CHKSUM, - .procname = "cksum", - .data = &cksum, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec - }, - { - .ctl_name = VIBLND_TIMEOUT, - .procname = "timeout", - .data = &timeout, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec - }, - { - .ctl_name = VIBLND_NTX, - .procname = "ntx", - .data = &ntx, - .maxlen = sizeof(int), - .mode = 0444, - .proc_handler = &proc_dointvec - }, - { - .ctl_name = VIBLND_CREDITS, - .procname = "credits", - .data = &credits, - .maxlen = sizeof(int), - .mode = 
0444, - .proc_handler = &proc_dointvec - }, - { - .ctl_name = VIBLND_PEER_CREDITS, - .procname = "peer_credits", - .data = &peer_credits, - .maxlen = sizeof(int), - .mode = 0444, - .proc_handler = &proc_dointvec - }, - { - .ctl_name = VIBLND_ARP_RETRIES, - .procname = "arp_retries", - .data = &arp_retries, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec - }, - { - .ctl_name = VIBLND_HCA_BASENAME, - .procname = "hca_basename", - .data = hca_basename_space, - .maxlen = sizeof(hca_basename_space), - .mode = 0444, - .proc_handler = &proc_dostring - }, - { - .ctl_name = VIBLND_IPIF_BASENAME, - .procname = "ipif_basename", - .data = ipif_basename_space, - .maxlen = sizeof(ipif_basename_space), - .mode = 0444, - .proc_handler = &proc_dostring - }, - { - .ctl_name = VIBLND_LOCAL_ACK_TIMEOUT, - .procname = "local_ack_timeout", - .data = &local_ack_timeout, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec - }, - { - .ctl_name = VIBLND_RETRY_CNT, - .procname = "retry_cnt", - .data = &retry_cnt, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec - }, - { - .ctl_name = VIBLND_RNR_CNT, - .procname = "rnr_cnt", - .data = &rnr_cnt, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec - }, - { - .ctl_name = VIBLND_RNR_NAK_TIMER, - .procname = "rnr_nak_timer", - .data = &rnr_nak_timer, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec - }, - { - .ctl_name = VIBLND_KEEPALIVE, - .procname = "keepalive", - .data = &keepalive, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec - }, - { - .ctl_name = VIBLND_CONCURRENT_SENDS, - .procname = "concurrent_sends", - .data = &concurrent_sends, - .maxlen = sizeof(int), - .mode = 0444, - .proc_handler = &proc_dointvec - }, -#if IBNAL_USE_FMR - { - .ctl_name = VIBLND_FMR_REMAPS, - .procname = "fmr_remaps", - .data = &fmr_remaps, - .maxlen = sizeof(int), - .mode = 0444, - .proc_handler = &proc_dointvec - }, -#endif - {0} -}; - -static cfs_sysctl_table_t kibnal_top_ctl_table[] = { - { - .ctl_name = CTL_VIBLND, - .procname = "vibnal", - .data = NULL, - .maxlen = 0, - .mode = 0555, - .child = kibnal_ctl_table - }, - {0} -}; - -void -kibnal_initstrtunable(char *space, char *str, int size) -{ - strncpy(space, str, size); - space[size-1] = 0; -} - -int -kibnal_tunables_init () -{ - kibnal_initstrtunable(hca_basename_space, hca_basename, - sizeof(hca_basename_space)); - kibnal_initstrtunable(ipif_basename_space, ipif_basename, - sizeof(ipif_basename_space)); - - kibnal_tunables.kib_sysctl = - cfs_register_sysctl_table(kibnal_top_ctl_table, 0); - - if (kibnal_tunables.kib_sysctl == NULL) - CWARN("Can't setup /proc tunables\n"); - - if (*kibnal_tunables.kib_concurrent_sends > IBNAL_RX_MSGS) - *kibnal_tunables.kib_concurrent_sends = IBNAL_RX_MSGS; - if (*kibnal_tunables.kib_concurrent_sends < IBNAL_MSG_QUEUE_SIZE) - *kibnal_tunables.kib_concurrent_sends = IBNAL_MSG_QUEUE_SIZE; - - return 0; -} - -void -kibnal_tunables_fini () -{ - if (kibnal_tunables.kib_sysctl != NULL) - cfs_unregister_sysctl_table(kibnal_tunables.kib_sysctl); -} - -#else - -int -kibnal_tunables_init () -{ - return 0; -} - -void -kibnal_tunables_fini () -{ -} - -#endif diff --git a/lnet/klnds/viblnd/viblnd_wire.h b/lnet/klnds/viblnd/viblnd_wire.h deleted file mode 100644 index d1bc5c9..0000000 --- a/lnet/klnds/viblnd/viblnd_wire.h +++ /dev/null @@ -1,157 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * 
GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf - * - * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, - * CA 95054 USA or visit www.sun.com if you need additional information or - * have any questions. - * - * GPL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lnet/klnds/viblnd/viblnd_wire.h - * - * IB Wire message format. - * These are sent in sender's byte order (i.e. receiver flips). - */ - -typedef struct kib_connparams -{ - __u32 ibcp_queue_depth; - __u32 ibcp_max_msg_size; - __u32 ibcp_max_frags; -} WIRE_ATTR kib_connparams_t; - -typedef struct -{ - lnet_hdr_t ibim_hdr; /* portals header */ - char ibim_payload[0]; /* piggy-backed payload */ -} WIRE_ATTR kib_immediate_msg_t; - -#ifndef IBNAL_USE_FMR -# error "IBNAL_USE_FMR must be defined 1 or 0 before including this file" -#endif - -#if IBNAL_USE_FMR -typedef struct -{ - __u64 rd_addr; /* IO VMA address */ - __u32 rd_nob; /* # of bytes */ - __u32 rd_key; /* remote key */ -} WIRE_ATTR kib_rdma_desc_t; -#else -/* YEUCH! the __u64 address is split into 2 __u32 fields to ensure proper - * packing. Otherwise we can't fit enough frags into an IBNAL message (<= - * smallest page size on any arch). 
*/ -typedef struct -{ - __u32 rf_nob; /* # of bytes */ - __u32 rf_addr_lo; /* lo 4 bytes of vaddr */ - __u32 rf_addr_hi; /* hi 4 bytes of vaddr */ -} WIRE_ATTR kib_rdma_frag_t; - -typedef struct -{ - __u32 rd_key; /* local/remote key */ - __u32 rd_nfrag; /* # fragments */ - kib_rdma_frag_t rd_frags[0]; /* buffer frags */ -} WIRE_ATTR kib_rdma_desc_t; -#endif - -typedef struct -{ - lnet_hdr_t ibprm_hdr; /* portals header */ - __u64 ibprm_cookie; /* opaque completion cookie */ -} WIRE_ATTR kib_putreq_msg_t; - -typedef struct -{ - __u64 ibpam_src_cookie; /* reflected completion cookie */ - __u64 ibpam_dst_cookie; /* opaque completion cookie */ - kib_rdma_desc_t ibpam_rd; /* sender's sink buffer */ -} WIRE_ATTR kib_putack_msg_t; - -typedef struct -{ - lnet_hdr_t ibgm_hdr; /* portals header */ - __u64 ibgm_cookie; /* opaque completion cookie */ - kib_rdma_desc_t ibgm_rd; /* rdma descriptor */ -} WIRE_ATTR kib_get_msg_t; - -typedef struct -{ - __u64 ibcm_cookie; /* opaque completion cookie */ - __s32 ibcm_status; /* < 0 failure: >= 0 length */ -} WIRE_ATTR kib_completion_msg_t; - -typedef struct -{ - /* First 2 fields fixed FOR ALL TIME */ - __u32 ibm_magic; /* I'm an openibnal message */ - __u16 ibm_version; /* this is my version number */ - - __u8 ibm_type; /* msg type */ - __u8 ibm_credits; /* returned credits */ - __u32 ibm_nob; /* # bytes in whole message */ - __u32 ibm_cksum; /* checksum (0 == no checksum) */ - __u64 ibm_srcnid; /* sender's NID */ - __u64 ibm_srcstamp; /* sender's incarnation */ - __u64 ibm_dstnid; /* destination's NID */ - __u64 ibm_dststamp; /* destination's incarnation */ - __u64 ibm_seq; /* sequence number */ - - union { - kib_connparams_t connparams; - kib_immediate_msg_t immediate; - kib_putreq_msg_t putreq; - kib_putack_msg_t putack; - kib_get_msg_t get; - kib_completion_msg_t completion; - } WIRE_ATTR ibm_u; -} WIRE_ATTR kib_msg_t; - -#define IBNAL_MSG_MAGIC LNET_PROTO_VIB_MAGIC /* unique magic */ - -#define IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD 0x10 /* previous version */ - -#define IBNAL_MSG_VERSION 0x11 /* current version */ - -#define IBNAL_MSG_CONNREQ 0xc0 /* connection request */ -#define IBNAL_MSG_CONNACK 0xc1 /* connection acknowledge */ -#define IBNAL_MSG_NOOP 0xd0 /* nothing (just credits) */ -#define IBNAL_MSG_IMMEDIATE 0xd1 /* immediate */ -#define IBNAL_MSG_PUT_REQ 0xd2 /* putreq (src->sink) */ -#define IBNAL_MSG_PUT_NAK 0xd3 /* completion (sink->src) */ -#define IBNAL_MSG_PUT_ACK 0xd4 /* putack (sink->src) */ -#define IBNAL_MSG_PUT_DONE 0xd5 /* completion (src->sink) */ -#define IBNAL_MSG_GET_REQ 0xd6 /* getreq (sink->src) */ -#define IBNAL_MSG_GET_DONE 0xd7 /* completion (src->sink: all OK) */ - -/* connection rejection reasons */ -#define IBNAL_REJECT_CONN_RACE 0 /* You lost connection race */ -#define IBNAL_REJECT_NO_RESOURCES 1 /* Out of memory/conns etc */ -#define IBNAL_REJECT_FATAL 2 /* Anything else */ diff --git a/lnet/klnds/viblnd/wirecheck.c b/lnet/klnds/viblnd/wirecheck.c deleted file mode 100644 index 711b7d8..0000000 --- a/lnet/klnds/viblnd/wirecheck.c +++ /dev/null @@ -1,260 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. 
- * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf - * - * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, - * CA 95054 USA or visit www.sun.com if you need additional information or - * have any questions. - * - * GPL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - */ - -#include -#include -#include -#include - -#include - -/* This ghastly hack to allows me to include lib-types.h It doesn't affect any - * assertions generated here (but fails-safe if it ever does) */ -typedef struct { - int counter; -} atomic_t; - -#include - -#define IBNAL_USE_FMR 1 -#include "viblnd_wire.h" - -#ifndef HAVE_STRNLEN -#define strnlen(s, i) strlen(s) -#endif - -#define BLANK_LINE() \ -do { \ - printf ("\n"); \ -} while (0) - -#define COMMENT(c) \ -do { \ - printf (" /* "c" */\n"); \ -} while (0) - -#undef STRINGIFY -#define STRINGIFY(a) #a - -#define CHECK_DEFINE(a) \ -do { \ - printf (" CLASSERT ("#a" == "STRINGIFY(a)");\n"); \ -} while (0) - -#define CHECK_VALUE(a) \ -do { \ - printf (" CLASSERT ("#a" == %d);\n", a); \ -} while (0) - -#define CHECK_MEMBER_OFFSET(s,m) \ -do { \ - CHECK_VALUE((int)offsetof(s, m)); \ -} while (0) - -#define CHECK_MEMBER_SIZEOF(s,m) \ -do { \ - CHECK_VALUE((int)sizeof(((s *)0)->m)); \ -} while (0) - -#define CHECK_MEMBER(s,m) \ -do { \ - CHECK_MEMBER_OFFSET(s, m); \ - CHECK_MEMBER_SIZEOF(s, m); \ -} while (0) - -#define CHECK_STRUCT(s) \ -do { \ - BLANK_LINE (); \ - COMMENT ("Checks for struct "#s); \ - CHECK_VALUE((int)sizeof(s)); \ -} while (0) - -void -system_string (char *cmdline, char *str, int len) -{ - int fds[2]; - int rc; - pid_t pid; - - rc = pipe (fds); - if (rc != 0) - abort (); - - pid = fork (); - if (pid == 0) { - /* child */ - int fd = fileno(stdout); - - rc = dup2(fds[1], fd); - if (rc != fd) - abort(); - - exit(system(cmdline)); - /* notreached */ - } else if ((int)pid < 0) { - abort(); - } else { - FILE *f = fdopen (fds[0], "r"); - - if (f == NULL) - abort(); - - close(fds[1]); - - if (fgets(str, len, f) == NULL) - abort(); - - if (waitpid(pid, &rc, 0) != pid) - abort(); - - if (!WIFEXITED(rc) || - WEXITSTATUS(rc) != 0) - abort(); - - if (strnlen(str, len) == len) - str[len - 1] = 0; - - if (str[strlen(str) - 1] == '\n') - str[strlen(str) - 1] = 0; - - fclose(f); - } -} - -int -main (int argc, char **argv) -{ - char unameinfo[80]; - char gccinfo[80]; - - system_string("uname -a", unameinfo, sizeof(unameinfo)); - system_string("gcc -v 2>&1 | tail -1", gccinfo, sizeof(gccinfo)); - - printf ("void vibnal_assert_wire_constants (void)\n" - "{\n" - " /* Wire protocol assertions generated by 'wirecheck'\n" - " * running on %s\n" - " * with %s */\n" - "\n", unameinfo, gccinfo); - - BLANK_LINE (); - - COMMENT ("Constants..."); - CHECK_DEFINE (IBNAL_MSG_MAGIC); - CHECK_DEFINE (IBNAL_MSG_VERSION); - - CHECK_DEFINE (IBNAL_MSG_CONNREQ); - CHECK_DEFINE (IBNAL_MSG_CONNACK); - 
CHECK_DEFINE (IBNAL_MSG_NOOP); - CHECK_DEFINE (IBNAL_MSG_IMMEDIATE); - CHECK_DEFINE (IBNAL_MSG_PUT_REQ); - CHECK_DEFINE (IBNAL_MSG_PUT_NAK); - CHECK_DEFINE (IBNAL_MSG_PUT_ACK); - CHECK_DEFINE (IBNAL_MSG_PUT_DONE); - CHECK_DEFINE (IBNAL_MSG_GET_REQ); - CHECK_DEFINE (IBNAL_MSG_GET_DONE); - - CHECK_DEFINE (IBNAL_REJECT_CONN_RACE); - CHECK_DEFINE (IBNAL_REJECT_NO_RESOURCES); - CHECK_DEFINE (IBNAL_REJECT_FATAL); - - CHECK_STRUCT (kib_connparams_t); - CHECK_MEMBER (kib_connparams_t, ibcp_queue_depth); - CHECK_MEMBER (kib_connparams_t, ibcp_max_msg_size); - CHECK_MEMBER (kib_connparams_t, ibcp_max_frags); - - CHECK_STRUCT (kib_immediate_msg_t); - CHECK_MEMBER (kib_immediate_msg_t, ibim_hdr); - CHECK_MEMBER (kib_immediate_msg_t, ibim_payload[13]); - - CHECK_DEFINE (IBNAL_USE_FMR); -#if IBNAL_USE_FMR - CHECK_STRUCT (kib_rdma_desc_t); - CHECK_MEMBER (kib_rdma_desc_t, rd_addr); - CHECK_MEMBER (kib_rdma_desc_t, rd_nob); - CHECK_MEMBER (kib_rdma_desc_t, rd_key); -#else - CHECK_STRUCT (kib_rdma_frag_t); - CHECK_MEMBER (kib_rdma_frag_t, rf_nob); - CHECK_MEMBER (kib_rdma_frag_t, rf_addr_lo); - CHECK_MEMBER (kib_rdma_frag_t, rf_addr_hi); - - CHECK_STRUCT (kib_rdma_desc_t); - CHECK_MEMBER (kib_rdma_desc_t, rd_key); - CHECK_MEMBER (kib_rdma_desc_t, rd_nfrag); - CHECK_MEMBER (kib_rdma_desc_t, rd_frags[13]); -#endif - CHECK_STRUCT (kib_putreq_msg_t); - CHECK_MEMBER (kib_putreq_msg_t, ibprm_hdr); - CHECK_MEMBER (kib_putreq_msg_t, ibprm_cookie); - - CHECK_STRUCT (kib_putack_msg_t); - CHECK_MEMBER (kib_putack_msg_t, ibpam_src_cookie); - CHECK_MEMBER (kib_putack_msg_t, ibpam_dst_cookie); - CHECK_MEMBER (kib_putack_msg_t, ibpam_rd); - - CHECK_STRUCT (kib_get_msg_t); - CHECK_MEMBER (kib_get_msg_t, ibgm_hdr); - CHECK_MEMBER (kib_get_msg_t, ibgm_cookie); - CHECK_MEMBER (kib_get_msg_t, ibgm_rd); - - CHECK_STRUCT (kib_completion_msg_t); - CHECK_MEMBER (kib_completion_msg_t, ibcm_cookie); - CHECK_MEMBER (kib_completion_msg_t, ibcm_status); - - CHECK_STRUCT (kib_msg_t); - CHECK_MEMBER (kib_msg_t, ibm_magic); - CHECK_MEMBER (kib_msg_t, ibm_version); - CHECK_MEMBER (kib_msg_t, ibm_type); - CHECK_MEMBER (kib_msg_t, ibm_credits); - CHECK_MEMBER (kib_msg_t, ibm_nob); - CHECK_MEMBER (kib_msg_t, ibm_cksum); - CHECK_MEMBER (kib_msg_t, ibm_srcnid); - CHECK_MEMBER (kib_msg_t, ibm_srcstamp); - CHECK_MEMBER (kib_msg_t, ibm_dstnid); - CHECK_MEMBER (kib_msg_t, ibm_dststamp); - CHECK_MEMBER (kib_msg_t, ibm_seq); - CHECK_MEMBER (kib_msg_t, ibm_u.connparams); - CHECK_MEMBER (kib_msg_t, ibm_u.immediate); - CHECK_MEMBER (kib_msg_t, ibm_u.putreq); - CHECK_MEMBER (kib_msg_t, ibm_u.putack); - CHECK_MEMBER (kib_msg_t, ibm_u.get); - CHECK_MEMBER (kib_msg_t, ibm_u.completion); - - printf ("}\n\n"); - - return (0); -} diff --git a/lnet/lnet/acceptor.c b/lnet/lnet/acceptor.c index 26d30e6..cf0a56a 100644 --- a/lnet/lnet/acceptor.c +++ b/lnet/lnet/acceptor.c @@ -311,8 +311,6 @@ lnet_accept(cfs_socket_t *sock, __u32 magic) str = "'old' socknal/tcpnal"; else if (lnet_accept_magic(magic, LNET_PROTO_RA_MAGIC)) str = "'old' ranal"; - else if (lnet_accept_magic(magic, LNET_PROTO_OPENIB_MAGIC)) - str = "'old' openibnal"; else str = "unrecognised"; diff --git a/lnet/lnet/api-ni.c b/lnet/lnet/api-ni.c index 3b97ea5..b8ba025 100644 --- a/lnet/lnet/api-ni.c +++ b/lnet/lnet/api-ni.c @@ -1028,6 +1028,15 @@ lnet_startup_lndnis (void) LASSERT (libcfs_isknown_lnd(lnd_type)); + if (lnd_type == CIBLND || + lnd_type == OPENIBLND || + lnd_type == IIBLND || + lnd_type == VIBLND) { + CERROR("LND %s obsoleted\n", + libcfs_lnd2str(lnd_type)); + goto failed; + } + 
LNET_MUTEX_DOWN(&the_lnet.ln_lnd_mutex); lnd = lnet_find_lnd_by_type(lnd_type); diff --git a/lnet/utils/debug.c b/lnet/utils/debug.c index b9a498d..8aca02a 100644 --- a/lnet/utils/debug.c +++ b/lnet/utils/debug.c @@ -853,17 +853,13 @@ static struct mod_paths { } mod_paths[] = { {"libcfs", "libcfs/libcfs"}, {"lnet", "lnet/lnet"}, - {"kciblnd", "lnet/klnds/ciblnd"}, {"kmxlnd", "lnet/klnds/mxlnd"}, - {"kiiblnd", "lnet/klnds/iiblnd"}, {"ko2iblnd", "lnet/klnds/o2iblnd"}, - {"kopeniblnd", "lnet/klnds/openiblnd"}, {"kptllnd", "lnet/klnds/ptllnd"}, {"kqswlnd", "lnet/klnds/qswlnd"}, {"kralnd", "lnet/klnds/ralnd"}, {"ksocklnd", "lnet/klnds/socklnd"}, {"ktdilnd", "lnet/klnds/tdilnd"}, - {"kviblnd", "lnet/klnds/viblnd"}, {"lvfs", "lustre/lvfs"}, {"obdclass", "lustre/obdclass"}, {"llog_test", "lustre/obdclass"}, diff --git a/lnet/utils/portals.c b/lnet/utils/portals.c index 6ee2cef..33e1f1a 100644 --- a/lnet/utils/portals.c +++ b/lnet/utils/portals.c @@ -306,6 +306,14 @@ int jt_ptl_network(int argc, char **argv) return -1; } + if (LNET_NETTYP(net) == CIBLND || + LNET_NETTYP(net) == OPENIBLND || + LNET_NETTYP(net) == IIBLND || + LNET_NETTYP(net) == VIBLND) { + fprintf(stderr, "Net %s obsoleted\n", libcfs_lnd2str(net)); + return -1; + } + g_net_set = 1; g_net = net; return 0; @@ -557,7 +565,7 @@ jt_ptl_print_peers (int argc, char **argv) int rc; if (!g_net_is_compatible (argv[0], SOCKLND, RALND, PTLLND, MXLND, - OPENIBLND, CIBLND, IIBLND, VIBLND, O2IBLND, 0)) + O2IBLND, 0)) return -1; for (index = 0;;index++) { @@ -598,7 +606,7 @@ jt_ptl_print_peers (int argc, char **argv) data.ioc_u32[5] & 0xffff, /* nactiveq */ data.ioc_u32[6] >> 16, /* credits */ data.ioc_u32[6] & 0xffff); /* outstanding_credits */ - } else if (g_net_is_compatible(NULL, RALND, OPENIBLND, CIBLND, VIBLND, 0)) { + } else if (g_net_is_compatible(NULL, RALND, 0)) { printf ("%-20s [%d]@%s:%d\n", libcfs_nid2str(data.ioc_nid), /* peer nid */ data.ioc_count, /* peer persistence */ @@ -631,24 +639,12 @@ jt_ptl_add_peer (int argc, char **argv) int port = 0; int rc; - if (!g_net_is_compatible (argv[0], SOCKLND, RALND, - OPENIBLND, CIBLND, IIBLND, VIBLND, 0)) + if (!g_net_is_compatible (argv[0], SOCKLND, RALND, 0)) return -1; - if (g_net_is_compatible(NULL, SOCKLND, OPENIBLND, CIBLND, RALND, 0)) { - if (argc != 4) { - fprintf (stderr, "usage(tcp,openib,cib,ra): %s nid ipaddr port\n", - argv[0]); - return 0; - } - } else if (g_net_is_compatible(NULL, VIBLND, 0)) { - if (argc != 3) { - fprintf (stderr, "usage(vib): %s nid ipaddr\n", - argv[0]); - return 0; - } - } else if (argc != 2) { - fprintf (stderr, "usage(iib): %s nid\n", argv[0]); + if (argc != 4) { + fprintf (stderr, "usage(tcp,ra): %s nid ipaddr port\n", + argv[0]); return 0; } @@ -658,14 +654,12 @@ jt_ptl_add_peer (int argc, char **argv) return -1; } - if (g_net_is_compatible (NULL, SOCKLND, OPENIBLND, CIBLND, VIBLND, RALND, 0) && - lnet_parse_ipaddr (&ip, argv[2]) != 0) { + if (lnet_parse_ipaddr (&ip, argv[2]) != 0) { fprintf (stderr, "Can't parse ip addr: %s\n", argv[2]); return -1; } - if (g_net_is_compatible (NULL, SOCKLND, OPENIBLND, CIBLND, RALND, 0) && - lnet_parse_port (&port, argv[3]) != 0) { + if (lnet_parse_port (&port, argv[3]) != 0) { fprintf (stderr, "Can't parse port: %s\n", argv[3]); return -1; } @@ -697,7 +691,7 @@ jt_ptl_del_peer (int argc, char **argv) int rc; if (!g_net_is_compatible (argv[0], SOCKLND, RALND, MXLND, PTLLND, - OPENIBLND, CIBLND, IIBLND, VIBLND, O2IBLND, 0)) + O2IBLND, 0)) return -1; if (g_net_is_compatible(NULL, SOCKLND, 0)) { @@ -766,8 +760,7 @@ 
jt_ptl_print_connections (int argc, char **argv) int index; int rc; - if (!g_net_is_compatible (argv[0], SOCKLND, RALND, MXLND, - OPENIBLND, CIBLND, IIBLND, VIBLND, O2IBLND, 0)) + if (!g_net_is_compatible (argv[0], SOCKLND, RALND, MXLND, O2IBLND, 0)) return -1; for (index = 0; ; index++) { @@ -832,8 +825,7 @@ int jt_ptl_disconnect(int argc, char **argv) return 0; } - if (!g_net_is_compatible (NULL, SOCKLND, RALND, MXLND, - OPENIBLND, CIBLND, IIBLND, VIBLND, O2IBLND, 0)) + if (!g_net_is_compatible (NULL, SOCKLND, RALND, MXLND, O2IBLND, 0)) return 0; if (argc >= 2 && diff --git a/lnet/utils/wirecheck.c b/lnet/utils/wirecheck.c index 1e8cdd0..844a8e5 100644 --- a/lnet/utils/wirecheck.c +++ b/lnet/utils/wirecheck.c @@ -227,7 +227,6 @@ main (int argc, char **argv) COMMENT ("Constants..."); - CHECK_DEFINE (LNET_PROTO_OPENIB_MAGIC); CHECK_DEFINE (LNET_PROTO_RA_MAGIC); CHECK_DEFINE (LNET_PROTO_TCP_MAGIC); diff --git a/lustre/scripts/lc_common b/lustre/scripts/lc_common index 2cea2a8..386b380 100644 --- a/lustre/scripts/lc_common +++ b/lustre/scripts/lc_common @@ -313,7 +313,7 @@ nid2hostname() { ptl*) # Portals # FIXME: Convert portal ID to hostname ;; - *) # tcp, o2ib, cib, openib, iib, vib, ra + *) # tcp, o2ib, ra ip_addr=${addr} # Is it IP address or hostname? if [ -n "`echo ${ip_addr} | sed -e 's/\([0-9]\{1,3\}\.\)\{3,3\}[0-9]\{1,3\}//'`" ] @@ -357,7 +357,7 @@ nids2hostname() { case "${nettype}" in lo* | elan* | ptl*) ;; - *) # tcp, o2ib, cib, openib, iib, vib, ra + *) # tcp, o2ib, ra host_name=$(nid2hostname ${nid}) if [ ${PIPESTATUS[0]} -ne 0 ]; then echo "${host_name}" @@ -391,7 +391,7 @@ ip2hostname_single_node() { case "${nettype}" in lo* | elan* | ptl*) ;; - *) # tcp, o2ib, cib, openib, iib, vib, ra + *) # tcp, o2ib, ra host_name=$(nid2hostname ${nid}) if [ ${PIPESTATUS[0]} -ne 0 ]; then echo "${host_name}" diff --git a/lustre/tests/test-framework.sh b/lustre/tests/test-framework.sh index 4ddc243..42a1af7 100644 --- a/lustre/tests/test-framework.sh +++ b/lustre/tests/test-framework.sh @@ -1511,11 +1511,6 @@ h2elan() { } declare -fx h2elan -h2openib() { - h2name_or_ip "$1" "openib" -} -declare -fx h2openib - h2o2ib() { h2name_or_ip "$1" "o2ib" } diff --git a/lustre/utils/gss/lsupport.c b/lustre/utils/gss/lsupport.c index 87ff943..ebbeb1e 100644 --- a/lustre/utils/gss/lsupport.c +++ b/lustre/utils/gss/lsupport.c @@ -301,12 +301,8 @@ static struct convert_struct converter[] = { [GMLND] = { "GMLND", external_nid2hostname}, [PTLLND] = { "PTLLND", external_nid2hostname }, [O2IBLND] = { "O2IBLND", ipv4_nid2hostname }, - [CIBLND] = { "CIBLND", external_nid2hostname }, - [OPENIBLND] = { "OPENIBLND",external_nid2hostname }, - [IIBLND] = { "IIBLND", external_nid2hostname }, [LOLND] = { "LOLND", lolnd_nid2hostname }, [RALND] = { "RALND", external_nid2hostname }, - [VIBLND] = { "VIBLND", external_nid2hostname }, [MXLND] = { "MXLND", external_nid2hostname }, };
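
For reference, the runtime effect of the api-ni.c and portals.c hunks above can be read as a single predicate. The sketch below is illustrative only: the helper name lnd_type_is_obsolete() is hypothetical and does not appear in the patch, but the four LND type constants and the error paths it summarises are exactly the ones added in lnet_startup_lndnis() and jt_ptl_network().

    /* Illustrative consolidation of the checks added above; not part of
     * the patch itself.  CIBLND, OPENIBLND, IIBLND and VIBLND are the LND
     * type constants rejected by both hunks. */
    static int
    lnd_type_is_obsolete(__u32 lnd_type)
    {
            return lnd_type == CIBLND  || lnd_type == OPENIBLND ||
                   lnd_type == IIBLND  || lnd_type == VIBLND;
    }

With these checks in place, configuring one of the removed network types (cib, openib, iib or vib, per the nettype names listed in the lc_common hunks) fails early: LNet startup reports "LND <type> obsoleted" and the network command in lnet/utils/portals.c reports "Net <type> obsoleted", instead of attempting to load an LND that no longer exists in the tree.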