AM_CONDITIONAL(INKERNEL, test x$enable_inkernel = xyes)
# -------- are we building against an external portals? -------
-AC_MSG_CHECKING([if Cray portals should be used])
+AC_MSG_CHECKING([for Cray portals])
AC_ARG_WITH([cray-portals],
AC_HELP_STRING([--with-cray-portals=path],
[path to cray portals]),
[
if test "$with_cray_portals" != no; then
- if test -r $with_cray_portals/include/portals/api.h ; then
- CRAY_PORTALS_PATH=$with_cray_portals
- CRAY_PORTALS_INCLUDE="-I$with_cray_portals/include"
- AC_DEFINE(CRAY_PORTALS, 1, [Building with Cray Portals])
- else
- AC_MSG_ERROR([--with-cray-portals specified badly])
- fi
- fi
+ CRAY_PORTALS_PATH=$with_cray_portals
+ CRAY_PORTALS_INCLUDES="$with_cray_portals/include"
+ CRAY_PORTALS_LIBS="$with_cray_portals"
+ fi
],[with_cray_portals=no])
AC_SUBST(CRAY_PORTALS_PATH)
-AC_MSG_RESULT([$with_cray_portals])
+AC_MSG_RESULT([$CRAY_PORTALS_PATH])
+
+AC_MSG_CHECKING([for Cray portals includes])
+AC_ARG_WITH([cray-portals-includes],
+ AC_HELP_STRING([--with-cray-portals-includes=path],
+ [path to cray portals includes]),
+ [
+ if test "$with_cray_portals_includes" != no; then
+ CRAY_PORTALS_INCLUDES="$with_cray_portals_includes"
+ fi
+ ])
+AC_SUBST(CRAY_PORTALS_INCLUDES)
+AC_MSG_RESULT([$CRAY_PORTALS_INCLUDES])
+
+AC_MSG_CHECKING([for Cray portals libs])
+AC_ARG_WITH([cray-portals-libs],
+ AC_HELP_STRING([--with-cray-portals-libs=path],
+ [path to cray portals libs]),
+ [
+ if test "$with_cray_portals_libs" != no; then
+ CRAY_PORTALS_LIBS="$with_cray_portals_libs"
+ fi
+ ])
+AC_SUBST(CRAY_PORTALS_LIBS)
+AC_MSG_RESULT([$CRAY_PORTALS_LIBS])
+
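+# sanity-check any paths we were handed before committing to cray portals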
+if test x$CRAY_PORTALS_INCLUDES != x ; then
+ if test ! -r $CRAY_PORTALS_INCLUDES/portals/api.h ; then
+ AC_MSG_ERROR([Cray portals headers were not found in $CRAY_PORTALS_INCLUDES. Please check the paths passed to --with-cray-portals or --with-cray-portals-includes.])
+ fi
+fi
+if test x$CRAY_PORTALS_LIBS != x ; then
+ if test ! -r $CRAY_PORTALS_LIBS/libportals.a ; then
+ AC_MSG_ERROR([Cray portals libraries were not found in $CRAY_PORTALS_LIBS. Please check the paths passed to --with-cray-portals or --with-cray-portals-libs.])
+ fi
+fi
+AC_MSG_CHECKING([whether to use Cray portals])
+if test x$CRAY_PORTALS_INCLUDES != x -a x$CRAY_PORTALS_LIBS != x ; then
+ with_cray_portals=yes
+ AC_DEFINE(CRAY_PORTALS, 1, [Building with Cray Portals])
+ CRAY_PORTALS_INCLUDES="-I$CRAY_PORTALS_INCLUDES"
+else
+ with_cray_portals=no
+fi
+AC_MSG_RESULT([$with_cray_portals])
AM_CONDITIONAL(CRAY_PORTALS, test x$with_cray_portals != xno)
+# ----------------------------------------
+# some tests for catamount-like systems
+# ----------------------------------------
+AC_ARG_ENABLE([sysio_init],
+	AC_HELP_STRING([--disable-sysio-init],
+		[do not call sysio init functions when initializing liblustre]),
+ [],[enable_sysio_init=yes])
+AC_MSG_CHECKING([whether to initialize libsysio])
+AC_MSG_RESULT([$enable_sysio_init])
+if test x$enable_sysio_init != xno ; then
+ AC_DEFINE([INIT_SYSIO], 1, [call sysio init functions])
+fi
+
+AC_ARG_ENABLE([urandom],
+ AC_HELP_STRING([--disable-urandom],
+ [disable use of /dev/urandom for liblustre]),
+ [],[enable_urandom=yes])
+AC_MSG_CHECKING([whether to use /dev/urandom for liblustre])
+AC_MSG_RESULT([$enable_urandom])
+if test x$enable_urandom != xno ; then
+ AC_DEFINE([LIBLUSTRE_USE_URANDOM], 1, [use /dev/urandom for random data])
+fi
+
+# -------- check for -lcap and -lpthread ----
+if test x$enable_liblustre = xyes ; then
+ AC_CHECK_LIB([cap], [cap_get_proc],
+ [
+ CAP_LIBS="-lcap"
+ AC_DEFINE([HAVE_LIBCAP], 1, [use libcap])
+ ],
+ [CAP_LIBS=""])
+ AC_SUBST(CAP_LIBS)
+ AC_CHECK_LIB([pthread], [pthread_create],
+ [
+ PTHREAD_LIBS="-lpthread"
+ AC_DEFINE([HAVE_LIBPTHREAD], 1, [use libpthread])
+ ],
+ [PTHREAD_LIBS=""])
+ AC_SUBST(PTHREAD_LIBS)
+fi
+
# -------- enable tests and utils? -------
if test x$enable_tests = xno ; then
AC_MSG_NOTICE([disabling tests])
# ------- Makeflags ------------------
-CPPFLAGS="$CRAY_PORTALS_INCLUDE $CRAY_PORTALS_COMMANDLINE -I\$(top_srcdir)/include -I\$(top_srcdir)/portals/include"
+CPPFLAGS="$CPPFLAGS $CRAY_PORTALS_INCLUDES -I\$(top_srcdir)/include -I\$(top_srcdir)/portals/include"
# liblustre are all the same
LLCPPFLAGS="-D__arch_lib__ -D_LARGEFILE64_SOURCE=1"
AC_DEFINE(CONFIG_LDISKFS_FS_SECURITY, 1, [enable fs security])
fi
-EXTRA_KCFLAGS="-g $CRAY_PORTALS_INCLUDE $CRAY_PORTALS_COMMANDLINE -I$PWD/portals/include -I$PWD/include"
+EXTRA_KCFLAGS="-g $CRAY_PORTALS_INCLUDES -I$PWD/portals/include -I$PWD/include"
# these are like AC_TRY_COMPILE, but try to build modules against the
# kernel, inside the kernel-tests directory
AC_SUBST(OPENIBCPPFLAGS)
AC_SUBST(OPENIBNAL)
+ #### Infinicon IB
+ AC_MSG_CHECKING([if Infinicon IB kernel headers are present])
+	# for now, the only infinicon ib build has headers in /usr/include/iba
+ IIBCPPFLAGS="-I/usr/include -DIN_TREE_BUILD"
+ EXTRA_KCFLAGS_save="$EXTRA_KCFLAGS"
+ EXTRA_KCFLAGS="$EXTRA_KCFLAGS $IIBCPPFLAGS"
+ LUSTRE_MODULE_TRY_COMPILE(
+ [
+ #include <linux/iba/ibt.h>
+ ],[
+ IBT_INTERFACE_UNION interfaces;
+ FSTATUS rc;
+
+ rc = IbtGetInterfaceByVersion(IBT_INTERFACE_VERSION_2,
+ &interfaces);
+
+ return rc == FSUCCESS ? 0 : 1;
+ ],[
+ AC_MSG_RESULT([yes])
+ IIBNAL="iibnal"
+ ],[
+ AC_MSG_RESULT([no])
+ IIBNAL=""
+ IIBCPPFLAGS=""
+ ])
+ EXTRA_KCFLAGS="$EXTRA_KCFLAGS_save"
+ AC_SUBST(IIBCPPFLAGS)
+ AC_SUBST(IIBNAL)
+
# ---------- Red Hat 2.4.18 has iobuf->dovary --------------
# But other kernels don't
AM_CONDITIONAL(BUILD_QSWNAL, test x$QSWNAL = "xqswnal")
AM_CONDITIONAL(BUILD_GMNAL, test x$GMNAL = "xgmnal")
AM_CONDITIONAL(BUILD_OPENIBNAL, test x$OPENIBNAL = "xopenibnal")
+AM_CONDITIONAL(BUILD_IIBNAL, test x$IIBNAL = "xiibnal")
+
+# portals/utils/portals.c
+AC_CHECK_HEADERS([netdb.h netinet/tcp.h asm/types.h])
+AC_CHECK_FUNCS([gethostbyname socket connect])
+
+# portals/utils/debug.c
+AC_CHECK_HEADERS([linux/version.h])
+
+# include/liblustre.h
+AC_CHECK_HEADERS([asm/page.h sys/user.h stdint.h])
+
+# liblustre/llite_lib.h
+AC_CHECK_HEADERS([xtio.h file.h])
+
+# liblustre/dir.c
+AC_CHECK_HEADERS([linux/types.h sys/types.h linux/unistd.h unistd.h])
+
+# liblustre/lutil.c
+AC_CHECK_HEADERS([netinet/in.h arpa/inet.h catamount/data.h])
+AC_CHECK_FUNCS([inet_ntoa])
CPPFLAGS="-include \$(top_builddir)/include/config.h $CPPFLAGS"
EXTRA_KCFLAGS="-include $PWD/include/config.h $EXTRA_KCFLAGS"
AC_SUBST(EXTRA_KCFLAGS)
-#echo "KCPPFLAGS: $KCPPFLAGS"
-#echo "KCFLAGS: $KCFLAGS"
-#echo "LLCPPFLAGS: $LLCPPFLAGS"
-#echo "LLCFLAGS: $LLCFLAGS"
-#echo "MOD_LINK: $MOD_LINK"
-#echo "CFLAGS: $CFLAGS"
-#echo "CPPFLAGS: $CPPFLAGS"
+echo "CPPFLAGS: $CPPFLAGS"
+echo "LLCPPFLAGS: $LLCPPFLAGS"
+echo "CFLAGS: $CFLAGS"
+echo "EXTRA_KCFLAGS: $EXTRA_KCFLAGS"
+echo "LLCFLAGS: $LLCFLAGS"
"gcc version 2.96 20000731 (Mandrake Linux 8.1 2.96-0.62mdk)")
bad_cc
;;
+ # unpatched 'gcc' on rh9. miscompiles a
+ # struct = (type) { .member = value, };
+	# assignment in the iibnal where the struct is a mix
+ # of u64 and u32 bit-fields.
+ "gcc version 3.2.2 20030222 (Red Hat Linux 3.2.2-5)")
+ bad_cc
+ ;;
*)
AC_MSG_RESULT([no known problems])
;;
LIBWRAP=""
fi
AC_SUBST(LIBWRAP)
+
+AC_SUBST(LIBS)
--- /dev/null
+Makefile
+Makefile.in
# include <unistd.h>
# include <time.h>
# include <limits.h>
-# include <asm/types.h>
# ifndef DEBUG_SUBSYSTEM
# define DEBUG_SUBSYSTEM S_UNDEFINED
# endif
printf("%02x:%06x (@%lu %s:%s,l. %d %d %lu): " format, \
(subsys), (mask), (long)time(0), file, fn, line, \
getpid() , stack, ## a);
+
+#undef CWARN
+#undef CERROR
+#define CWARN(format, a...) CDEBUG(D_WARNING, format, ## a)
+#define CERROR(format, a...) CDEBUG(D_ERROR, format, ## a)
#endif
/* support decl needed both by kernel and liblustre */
#define LWT_MEMORY (16<<20)
#if !KLWT_SUPPORT
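+/* liblustre is built without kernel headers, so outside the kernel we
+ * derive BITS_PER_LONG from glibc's __WORDSIZE */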
+# if defined(__KERNEL__)
+# if !defined(BITS_PER_LONG)
+# error "BITS_PER_LONG not defined"
+# endif
+# elif !defined(__WORDSIZE)
+# error "__WORDSIZE not defined"
+# else
+# define BITS_PER_LONG __WORDSIZE
+# endif
+
/* kernel hasn't defined this? */
typedef struct {
long long lwte_when;
data = (struct portal_ioctl_data *)buf;
err = copy_from_user(buf, (void *)arg, sizeof(*hdr));
- if ( err ) {
- EXIT;
- return err;
- }
+ if (err)
+ RETURN(err);
if (hdr->ioc_version != PORTAL_IOCTL_VERSION) {
- CERROR ("PORTALS: version mismatch kernel vs application\n");
- return -EINVAL;
+ CERROR("PORTALS: version mismatch kernel vs application\n");
+ RETURN(-EINVAL);
}
if (hdr->ioc_len + buf >= end) {
- CERROR ("PORTALS: user buffer exceeds kernel buffer\n");
- return -EINVAL;
+ CERROR("PORTALS: user buffer exceeds kernel buffer\n");
+ RETURN(-EINVAL);
}
if (hdr->ioc_len < sizeof(struct portal_ioctl_data)) {
- CERROR ("PORTALS: user buffer too small for ioctl\n");
- return -EINVAL;
+ CERROR("PORTALS: user buffer too small for ioctl\n");
+ RETURN(-EINVAL);
}
err = copy_from_user(buf, (void *)arg, hdr->ioc_len);
- if ( err ) {
- EXIT;
- return err;
- }
+ if (err)
+ RETURN(err);
if (portal_ioctl_is_invalid(data)) {
- CERROR ("PORTALS: ioctl not correctly formatted\n");
- return -EINVAL;
+ CERROR("PORTALS: ioctl not correctly formatted\n");
+ RETURN(-EINVAL);
}
- if (data->ioc_inllen1) {
+ if (data->ioc_inllen1)
data->ioc_inlbuf1 = &data->ioc_bulk[0];
- }
- if (data->ioc_inllen2) {
+ if (data->ioc_inllen2)
data->ioc_inlbuf2 = &data->ioc_bulk[0] +
size_round(data->ioc_inllen1);
- }
- EXIT;
- return 0;
+ RETURN(0);
}
#endif
TCPNAL = 5,
ROUTER = 6,
OPENIBNAL = 7,
+ IIBNAL = 8,
NAL_ENUM_END_MARKER
};
-#define PTL_NALFMT_SIZE 30 /* %u:%u.%u.%u.%u,%u (10+4+4+4+3+4+1) */
+#define PTL_NALFMT_SIZE 32 /* %u:%u.%u.%u.%u,%u (10+4+4+4+3+5+1) */
#define NAL_MAX_NR (NAL_ENUM_END_MARKER - 1)
#ifndef _LIBCFS_H
#define _LIBCFS_H
+#ifdef HAVE_ASM_TYPES_H
#include <asm/types.h>
+#else
+#include "types.h"
+#endif
#ifdef __KERNEL__
# include <linux/time.h>
extern unsigned int portal_debug;
extern unsigned int portal_printk;
-#include <asm/types.h>
struct ptldebug_header {
__u32 ph_len;
__u32 ph_flags;
#define S_GMNAL 0x00080000
#define S_PTLROUTER 0x00100000
#define S_COBD 0x00200000
-#define S_OPENIBNAL 0x00400000
+#define S_IBNAL 0x00400000 /* All IB NALs */
#define S_SM 0x00800000
#define S_ASOBD 0x01000000
#define S_LMV 0x02000000
CDEBUG_STACK, format, ## a); \
} while (0)
-#define CWARN(format, a...) CDEBUG(D_WARNING, format, ## a)
-#define CERROR(format, a...) CDEBUG(D_ERROR, format, ## a)
+#define CDEBUG_MAX_LIMIT 600
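+/* CDEBUG_LIMIT throttles console logging: while a message keeps firing the
+ * delay between prints doubles (capped at CDEBUG_MAX_LIMIT seconds), quiet
+ * spells shrink it again, and suppressed repeats are counted and logged
+ * without the console masks */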
+#define CDEBUG_LIMIT(cdebug_mask, cdebug_format, a...) \
+do { \
+ static unsigned long cdebug_next; \
+ static int cdebug_count, cdebug_delay = 1; \
+ \
+ CHECK_STACK(CDEBUG_STACK); \
+ if (time_after(jiffies, cdebug_next)) { \
+ portals_debug_msg(DEBUG_SUBSYSTEM, cdebug_mask, __FILE__, \
+ __FUNCTION__, __LINE__, CDEBUG_STACK, \
+ cdebug_format, ## a); \
+ if (cdebug_count) { \
+ portals_debug_msg(DEBUG_SUBSYSTEM, cdebug_mask, \
+ __FILE__, __FUNCTION__, __LINE__, \
+ CDEBUG_STACK, cdebug_format, ## a); \
+ cdebug_count = 0; \
+ } \
+ if (time_after(jiffies, cdebug_next+(CDEBUG_MAX_LIMIT+10)*HZ))\
+ cdebug_delay = cdebug_delay > 8 ? cdebug_delay/8 : 1; \
+ else \
+ cdebug_delay = cdebug_delay*2 >= CDEBUG_MAX_LIMIT*HZ ?\
+ CDEBUG_MAX_LIMIT*HZ : cdebug_delay*2; \
+ cdebug_next = jiffies + cdebug_delay; \
+ } else { \
+ portals_debug_msg(DEBUG_SUBSYSTEM, \
+ portal_debug & ~(D_EMERG|D_ERROR|D_WARNING),\
+ __FILE__, __FUNCTION__, __LINE__, \
+ CDEBUG_STACK, cdebug_format, ## a); \
+ cdebug_count++; \
+ } \
+} while (0)
+
+#define CWARN(format, a...) CDEBUG_LIMIT(D_WARNING, format, ## a)
+#define CERROR(format, a...) CDEBUG_LIMIT(D_ERROR, format, ## a)
#define CEMERG(format, a...) CDEBUG(D_EMERG, format, ## a)
#define GOTO(label, rc) \
/* initial pid */
# if CRAY_PORTALS
/*
+ * 1) ptl_pid_t in cray portals is only 16 bits, not 32 bits, therefore this
+ * is too big.
*
- * 1) ptl_pid_t in cray portals is only 16 bits, not 32 bits, therefore this is too
- * big.
- *
- * 2) the implementation of ernal in cray portals further restricts the pid space
- * that may be used to 0 <= pid <= 255 (an 8 bit value). Returns an error at nal
- * init time for any pid outside this range. Other nals in cray portals don't have
- * this restriction.
+ * 2) the implementation of ernal in cray portals further restricts the pid
+ * space that may be used to 0 <= pid <= 255 (an 8 bit value). Returns
+ * an error at nal init time for any pid outside this range. Other nals
+ * in cray portals don't have this restriction.
* */
#define LUSTRE_PTL_PID 9
# else
call_usermodehelper(path, argv, envp, 1)
# define RECALC_SIGPENDING recalc_sigpending()
# define CURRENT_SECONDS get_seconds()
+# define smp_num_cpus NR_CPUS
+
#elif defined(CONFIG_RH_2_4_20) /* RH 2.4.x */
--- /dev/null
+Makefile
+Makefile.in
#ifndef _BUILD_CHECK_H
#define _BUILD_CHECK_H
-#ifdef CRAY_PORTALS
+#if CRAY_PORTALS
#error "an application got to me instead of cray's includes"
#endif
#define PORTALS_DEV_PATH "/dev/portals"
#define OBD_DEV_ID 1
#define OBD_DEV_PATH "/dev/obd"
-#define SMFS_DEV_ID 2
-#define SMFS_DEV_PATH "/dev/snapdev"
int ptl_name2nal(char *str);
int ptl_parse_ipaddr (__u32 *ipaddrp, char *str);
int ptl_initialize(int argc, char **argv);
int jt_ptl_network(int argc, char **argv);
-int jt_ptl_print_autoconnects (int argc, char **argv);
-int jt_ptl_add_autoconnect (int argc, char **argv);
-int jt_ptl_del_autoconnect (int argc, char **argv);
int jt_ptl_print_interfaces(int argc, char **argv);
int jt_ptl_add_interface(int argc, char **argv);
int jt_ptl_del_interface(int argc, char **argv);
int jt_ptl_add_uuid_old(int argc, char **argv); /* backwards compatibility */
int jt_ptl_close_uuid(int argc, char **argv);
int jt_ptl_del_uuid(int argc, char **argv);
-int jt_ptl_rxmem (int argc, char **argv);
-int jt_ptl_txmem (int argc, char **argv);
-int jt_ptl_nagle (int argc, char **argv);
int jt_ptl_add_route (int argc, char **argv);
int jt_ptl_del_route (int argc, char **argv);
int jt_ptl_notify_router (int argc, char **argv);
#define PORTALS_DEV_PATH "/dev/portals"
#define OBD_DEV_ID 1
#define OBD_DEV_PATH "/dev/obd"
-#define SMFS_DEV_ID 2
-#define SMFS_DEV_PATH "/dev/snapdev"
int ptl_name2nal(char *str);
int ptl_parse_ipaddr (__u32 *ipaddrp, char *str);
int ptl_initialize(int argc, char **argv);
int jt_ptl_network(int argc, char **argv);
-int jt_ptl_print_autoconnects (int argc, char **argv);
-int jt_ptl_add_autoconnect (int argc, char **argv);
-int jt_ptl_del_autoconnect (int argc, char **argv);
int jt_ptl_print_interfaces(int argc, char **argv);
int jt_ptl_add_interface(int argc, char **argv);
int jt_ptl_del_interface(int argc, char **argv);
int jt_ptl_add_uuid_old(int argc, char **argv); /* backwards compatibility */
int jt_ptl_close_uuid(int argc, char **argv);
int jt_ptl_del_uuid(int argc, char **argv);
-int jt_ptl_rxmem (int argc, char **argv);
-int jt_ptl_txmem (int argc, char **argv);
-int jt_ptl_nagle (int argc, char **argv);
int jt_ptl_add_route (int argc, char **argv);
int jt_ptl_del_route (int argc, char **argv);
int jt_ptl_notify_router (int argc, char **argv);
@BUILD_GMNAL_TRUE@subdir-m += gmnal
@BUILD_OPENIBNAL_TRUE@subdir-m += openibnal
+@BUILD_IIBNAL_TRUE@subdir-m += iibnal
@BUILD_QSWNAL_TRUE@subdir-m += qswnal
subdir-m += socknal
# This code is issued under the GNU General Public License.
# See the file COPYING in this distribution
-SUBDIRS = gmnal openibnal qswnal socknal
+SUBDIRS = gmnal iibnal openibnal qswnal socknal
--- /dev/null
+.deps
+Makefile
+.*.cmd
+autoMakefile.in
+autoMakefile
+*.ko
+*.mod.c
+.*.flags
+.tmp_versions
+.depend
--- /dev/null
+MODULES := kiibnal
+kiibnal-objs := iibnal.o iibnal_cb.o
+
+EXTRA_POST_CFLAGS := @IIBCPPFLAGS@
+
+@INCLUDE_RULES@
--- /dev/null
+# Copyright (C) 2001 Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+include $(src)/../../Kernelenv
+
+obj-y += kiibnal.o
+kiibnal-objs := iibnal.o iibnal_cb.o
+
--- /dev/null
+# Copyright (C) 2001 Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+if MODULES
+if !CRAY_PORTALS
+if BUILD_IIBNAL
+modulenet_DATA = kiibnal$(KMODEXT)
+endif
+endif
+endif
+
+MOSTLYCLEANFILES = *.o *.ko *.mod.c
+DIST_SOURCES = $(kiibnal-objs:%.o=%.c) iibnal.h
--- /dev/null
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2004 Cluster File Systems, Inc.
+ * Author: Eric Barton <eric@bartonsoftware.com>
+ *
+ * This file is part of Lustre, http://www.lustre.org.
+ *
+ * Lustre is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Lustre is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Lustre; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#include "iibnal.h"
+
+nal_t kibnal_api;
+ptl_handle_ni_t kibnal_ni;
+kib_tunables_t kibnal_tunables;
+
+kib_data_t kibnal_data = {
+ .kib_service_id = IBNAL_SERVICE_NUMBER,
+};
+
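+/* tunables exposed under /proc/sys/iibnal (currently just the I/O timeout) */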
+#ifdef CONFIG_SYSCTL
+#define IBNAL_SYSCTL 202
+
+#define IBNAL_SYSCTL_TIMEOUT 1
+
+static ctl_table kibnal_ctl_table[] = {
+ {IBNAL_SYSCTL_TIMEOUT, "timeout",
+ &kibnal_tunables.kib_io_timeout, sizeof (int),
+ 0644, NULL, &proc_dointvec},
+ { 0 }
+};
+
+static ctl_table kibnal_top_ctl_table[] = {
+ {IBNAL_SYSCTL, "iibnal", NULL, 0, 0555, kibnal_ctl_table},
+ { 0 }
+};
+#endif
+
+#ifdef unused
+void
+print_service(IB_SERVICE_RECORD *service, char *tag, int rc)
+{
+ char name[32];
+
+	if (service == NULL) {
+		CWARN("tag : %s\n"
+		      "status : %d (NULL)\n", tag, rc);
+		return;
+	}
+ strncpy (name, service->ServiceName, sizeof(name)-1);
+ name[sizeof(name)-1] = 0;
+
+ CWARN("tag : %s\n"
+ "status : %d\n"
+ "service id: "LPX64"\n"
+ "name : %s\n"
+ "NID : "LPX64"\n", tag, rc,
+ service->RID.ServiceID, name,
+ *kibnal_service_nid_field(service));
+}
+#endif
+
+static void
+kibnal_service_setunset_done (void *arg, FABRIC_OPERATION_DATA *fod,
+ FSTATUS frc, uint32 madrc)
+{
+ *(FSTATUS *)arg = frc;
+ up (&kibnal_data.kib_nid_signal);
+}
+
+#if IBNAL_CHECK_ADVERT
+static void
+kibnal_service_query_done (void *arg, QUERY *qry,
+ QUERY_RESULT_VALUES *qry_result)
+{
+ FSTATUS frc = qry_result->Status;
+
+ if (frc != FSUCCESS &&
+ qry_result->ResultDataSize == 0)
+ frc = FERROR;
+
+ *(FSTATUS *)arg = frc;
+ up (&kibnal_data.kib_nid_signal);
+}
+
+static void
+kibnal_check_advert (void)
+{
+ QUERY *qry;
+ IB_SERVICE_RECORD *svc;
+ FSTATUS frc;
+ FSTATUS frc2;
+
+ PORTAL_ALLOC(qry, sizeof(*qry));
+ if (qry == NULL)
+ return;
+
+ memset (qry, 0, sizeof(*qry));
+ qry->InputType = InputTypeServiceRecord;
+ qry->OutputType = OutputTypeServiceRecord;
+ qry->InputValue.ServiceRecordValue.ComponentMask = KIBNAL_SERVICE_KEY_MASK;
+ svc = &qry->InputValue.ServiceRecordValue.ServiceRecord;
+ kibnal_set_service_keys(svc, kibnal_data.kib_nid);
+
+ frc = iibt_sd_query_port_fabric_information(kibnal_data.kib_sd,
+ kibnal_data.kib_port_guid,
+ qry,
+ kibnal_service_query_done,
+ NULL, &frc2);
+ if (frc != FSUCCESS && frc != FPENDING) {
+ CERROR ("Immediate error %d checking SM service\n", frc);
+ } else {
+ down (&kibnal_data.kib_nid_signal);
+ frc = frc2;
+
+		if (frc != FSUCCESS)
+			CERROR ("Error %d checking SM service\n", frc);
+	}
+
+	PORTAL_FREE(qry, sizeof(*qry));
+}
+#endif
+
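+/* build the fabric operation descriptor used to set or delete the service
+ * record that advertises our NID on the fabric */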
+static void fill_fod(FABRIC_OPERATION_DATA *fod, FABRIC_OPERATION_TYPE type)
+{
+ IB_SERVICE_RECORD *svc;
+
+ memset (fod, 0, sizeof(*fod));
+ fod->Type = type;
+
+ svc = &fod->Value.ServiceRecordValue.ServiceRecord;
+ svc->RID.ServiceID = kibnal_data.kib_service_id;
+ svc->RID.ServiceGID.Type.Global.InterfaceID = kibnal_data.kib_port_guid;
+ svc->RID.ServiceGID.Type.Global.SubnetPrefix = DEFAULT_SUBNET_PREFIX;
+ svc->RID.ServiceP_Key = kibnal_data.kib_port_pkey;
+ svc->ServiceLease = 0xffffffff;
+
+ kibnal_set_service_keys(svc, kibnal_data.kib_nid);
+}
+
+static int
+kibnal_advertise (void)
+{
+ FABRIC_OPERATION_DATA *fod;
+ IB_SERVICE_RECORD *svc;
+ FSTATUS frc;
+ FSTATUS frc2;
+
+ LASSERT (kibnal_data.kib_nid != PTL_NID_ANY);
+
+ PORTAL_ALLOC(fod, sizeof(*fod));
+ if (fod == NULL)
+ return (-ENOMEM);
+
+ fill_fod(fod, FabOpSetServiceRecord);
+ svc = &fod->Value.ServiceRecordValue.ServiceRecord;
+
+ CDEBUG(D_NET, "Advertising service id "LPX64" %s:"LPX64"\n",
+ svc->RID.ServiceID,
+ svc->ServiceName, *kibnal_service_nid_field(svc));
+
+ frc = iibt_sd_port_fabric_operation(kibnal_data.kib_sd,
+ kibnal_data.kib_port_guid,
+ fod, kibnal_service_setunset_done,
+ NULL, &frc2);
+
+ if (frc != FSUCCESS && frc != FPENDING) {
+ CERROR ("Immediate error %d advertising NID "LPX64"\n",
+ frc, kibnal_data.kib_nid);
+ goto out;
+ }
+
+ down (&kibnal_data.kib_nid_signal);
+
+ frc = frc2;
+ if (frc != FSUCCESS)
+		CERROR ("Error %d advertising NID "LPX64"\n",
+ frc, kibnal_data.kib_nid);
+out:
+ PORTAL_FREE(fod, sizeof(*fod));
+ return (frc == FSUCCESS) ? 0 : -EINVAL;
+}
+
+static void
+kibnal_unadvertise (int expect_success)
+{
+ FABRIC_OPERATION_DATA *fod;
+ IB_SERVICE_RECORD *svc;
+ FSTATUS frc;
+ FSTATUS frc2;
+
+ LASSERT (kibnal_data.kib_nid != PTL_NID_ANY);
+
+ PORTAL_ALLOC(fod, sizeof(*fod));
+ if (fod == NULL)
+ return;
+
+ fill_fod(fod, FabOpDeleteServiceRecord);
+ svc = &fod->Value.ServiceRecordValue.ServiceRecord;
+
+ CDEBUG(D_NET, "Unadvertising service %s:"LPX64"\n",
+ svc->ServiceName, *kibnal_service_nid_field(svc));
+
+ frc = iibt_sd_port_fabric_operation(kibnal_data.kib_sd,
+ kibnal_data.kib_port_guid,
+ fod, kibnal_service_setunset_done,
+ NULL, &frc2);
+
+ if (frc != FSUCCESS && frc != FPENDING) {
+ CERROR ("Immediate error %d unadvertising NID "LPX64"\n",
+ frc, kibnal_data.kib_nid);
+ goto out;
+ }
+
+ down (&kibnal_data.kib_nid_signal);
+
+ if ((frc2 == FSUCCESS) == !!expect_success)
+ goto out;
+
+ if (expect_success)
+ CERROR("Error %d unadvertising NID "LPX64"\n",
+ frc2, kibnal_data.kib_nid);
+ else
+ CWARN("Removed conflicting NID "LPX64"\n",
+ kibnal_data.kib_nid);
+ out:
+ PORTAL_FREE(fod, sizeof(*fod));
+}
+
+static int
+kibnal_set_mynid(ptl_nid_t nid)
+{
+ struct timeval tv;
+ lib_ni_t *ni = &kibnal_lib.libnal_ni;
+ int rc;
+ FSTATUS frc;
+
+ CDEBUG(D_IOCTL, "setting mynid to "LPX64" (old nid="LPX64")\n",
+ nid, ni->ni_pid.nid);
+
+ do_gettimeofday(&tv);
+
+ down (&kibnal_data.kib_nid_mutex);
+
+ if (nid == kibnal_data.kib_nid) {
+ /* no change of NID */
+ up (&kibnal_data.kib_nid_mutex);
+ return (0);
+ }
+
+ CDEBUG(D_NET, "NID "LPX64"("LPX64")\n",
+ kibnal_data.kib_nid, nid);
+
+ if (kibnal_data.kib_nid != PTL_NID_ANY) {
+
+ kibnal_unadvertise (1);
+
+ frc = iibt_cm_cancel(kibnal_data.kib_cep);
+ if (frc != FSUCCESS && frc != FPENDING)
+ CERROR ("Error %d stopping listener\n", frc);
+
+ frc = iibt_cm_destroy_cep(kibnal_data.kib_cep);
+ if (frc != FSUCCESS)
+ CERROR ("Error %d destroying CEP\n", frc);
+
+ kibnal_data.kib_cep = NULL;
+ }
+
+ kibnal_data.kib_nid = ni->ni_pid.nid = nid;
+ kibnal_data.kib_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
+
+ /* Delete all existing peers and their connections after new
+ * NID/incarnation set to ensure no old connections in our brave
+ * new world. */
+ kibnal_del_peer (PTL_NID_ANY, 0);
+
+ if (kibnal_data.kib_nid == PTL_NID_ANY) {
+ /* No new NID to install */
+ up (&kibnal_data.kib_nid_mutex);
+ return (0);
+ }
+
+ /* remove any previous advert (crashed node etc) */
+ kibnal_unadvertise(0);
+
+ kibnal_data.kib_cep = iibt_cm_create_cep(CM_RC_TYPE);
+ if (kibnal_data.kib_cep == NULL) {
+ CERROR ("Can't create CEP\n");
+ rc = -ENOMEM;
+ } else {
+ CM_LISTEN_INFO info;
+ memset (&info, 0, sizeof(info));
+ info.ListenAddr.EndPt.SID = kibnal_data.kib_service_id;
+
+ frc = iibt_cm_listen(kibnal_data.kib_cep, &info,
+ kibnal_listen_callback, NULL);
+ if (frc != FSUCCESS && frc != FPENDING) {
+ CERROR ("iibt_cm_listen error: %d\n", frc);
+ rc = -EINVAL;
+ } else {
+ rc = 0;
+ }
+ }
+
+ if (rc == 0) {
+ rc = kibnal_advertise();
+ if (rc == 0) {
+#if IBNAL_CHECK_ADVERT
+ kibnal_check_advert();
+#endif
+ up (&kibnal_data.kib_nid_mutex);
+ return (0);
+ }
+
+ iibt_cm_cancel (kibnal_data.kib_cep);
+ iibt_cm_destroy_cep (kibnal_data.kib_cep);
+ /* remove any peers that sprung up while I failed to
+ * advertise myself */
+ kibnal_del_peer (PTL_NID_ANY, 0);
+ }
+
+ kibnal_data.kib_nid = PTL_NID_ANY;
+ up (&kibnal_data.kib_nid_mutex);
+ return (rc);
+}
+
+kib_peer_t *
+kibnal_create_peer (ptl_nid_t nid)
+{
+ kib_peer_t *peer;
+
+ LASSERT (nid != PTL_NID_ANY);
+
+ PORTAL_ALLOC (peer, sizeof (*peer));
+ if (peer == NULL)
+ return (NULL);
+
+ memset(peer, 0, sizeof(*peer)); /* zero flags etc */
+
+ peer->ibp_nid = nid;
+ atomic_set (&peer->ibp_refcount, 1); /* 1 ref for caller */
+
+ INIT_LIST_HEAD (&peer->ibp_list); /* not in the peer table yet */
+ INIT_LIST_HEAD (&peer->ibp_conns);
+ INIT_LIST_HEAD (&peer->ibp_tx_queue);
+
+ peer->ibp_reconnect_time = jiffies;
+ peer->ibp_reconnect_interval = IBNAL_MIN_RECONNECT_INTERVAL;
+
+ atomic_inc (&kibnal_data.kib_npeers);
+ return (peer);
+}
+
+void
+kibnal_destroy_peer (kib_peer_t *peer)
+{
+
+ LASSERT (atomic_read (&peer->ibp_refcount) == 0);
+ LASSERT (peer->ibp_persistence == 0);
+ LASSERT (!kibnal_peer_active(peer));
+ LASSERT (peer->ibp_connecting == 0);
+ LASSERT (list_empty (&peer->ibp_conns));
+ LASSERT (list_empty (&peer->ibp_tx_queue));
+
+ PORTAL_FREE (peer, sizeof (*peer));
+
+ /* NB a peer's connections keep a reference on their peer until
+ * they are destroyed, so we can be assured that _all_ state to do
+ * with this peer has been cleaned up when its refcount drops to
+ * zero. */
+ atomic_dec (&kibnal_data.kib_npeers);
+}
+
+/* the caller is responsible for accounting for the additional reference
+ * that this creates */
+kib_peer_t *
+kibnal_find_peer_locked (ptl_nid_t nid)
+{
+ struct list_head *peer_list = kibnal_nid2peerlist (nid);
+ struct list_head *tmp;
+ kib_peer_t *peer;
+
+ list_for_each (tmp, peer_list) {
+
+ peer = list_entry (tmp, kib_peer_t, ibp_list);
+
+ LASSERT (peer->ibp_persistence != 0 || /* persistent peer */
+ peer->ibp_connecting != 0 || /* creating conns */
+ !list_empty (&peer->ibp_conns)); /* active conn */
+
+ if (peer->ibp_nid != nid)
+ continue;
+
+ CDEBUG(D_NET, "got peer [%p] -> "LPX64" (%d)\n",
+ peer, nid, atomic_read (&peer->ibp_refcount));
+ return (peer);
+ }
+ return (NULL);
+}
+
+kib_peer_t *
+kibnal_get_peer (ptl_nid_t nid)
+{
+ kib_peer_t *peer;
+
+ read_lock (&kibnal_data.kib_global_lock);
+ peer = kibnal_find_peer_locked (nid);
+	if (peer != NULL)			/* +1 ref for caller */
+ kib_peer_addref(peer);
+ read_unlock (&kibnal_data.kib_global_lock);
+
+ return (peer);
+}
+
+void
+kibnal_unlink_peer_locked (kib_peer_t *peer)
+{
+ LASSERT (peer->ibp_persistence == 0);
+ LASSERT (list_empty(&peer->ibp_conns));
+
+ LASSERT (kibnal_peer_active(peer));
+ list_del_init (&peer->ibp_list);
+ /* lose peerlist's ref */
+ kib_peer_decref(peer);
+}
+
+static int
+kibnal_get_peer_info (int index, ptl_nid_t *nidp, int *persistencep)
+{
+ kib_peer_t *peer;
+ struct list_head *ptmp;
+ int i;
+
+ read_lock (&kibnal_data.kib_global_lock);
+
+ for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
+
+ list_for_each (ptmp, &kibnal_data.kib_peers[i]) {
+
+ peer = list_entry (ptmp, kib_peer_t, ibp_list);
+ LASSERT (peer->ibp_persistence != 0 ||
+ peer->ibp_connecting != 0 ||
+ !list_empty (&peer->ibp_conns));
+
+ if (index-- > 0)
+ continue;
+
+ *nidp = peer->ibp_nid;
+ *persistencep = peer->ibp_persistence;
+
+ read_unlock (&kibnal_data.kib_global_lock);
+ return (0);
+ }
+ }
+
+ read_unlock (&kibnal_data.kib_global_lock);
+ return (-ENOENT);
+}
+
+static int
+kibnal_add_persistent_peer (ptl_nid_t nid)
+{
+ unsigned long flags;
+ kib_peer_t *peer;
+ kib_peer_t *peer2;
+
+ if (nid == PTL_NID_ANY)
+ return (-EINVAL);
+
+ peer = kibnal_create_peer (nid);
+ if (peer == NULL)
+ return (-ENOMEM);
+
+ write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
+
+ peer2 = kibnal_find_peer_locked (nid);
+ if (peer2 != NULL) {
+ kib_peer_decref (peer);
+ peer = peer2;
+ } else {
+ /* peer table takes existing ref on peer */
+ list_add_tail (&peer->ibp_list,
+ kibnal_nid2peerlist (nid));
+ }
+
+ peer->ibp_persistence++;
+
+ write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
+ return (0);
+}
+
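+/* drop one share (or all persistence) and close the peer's connections once
+ * no shares remain; the peer unlinks itself when its last conn closes */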
+static void
+kibnal_del_peer_locked (kib_peer_t *peer, int single_share)
+{
+ struct list_head *ctmp;
+ struct list_head *cnxt;
+ kib_conn_t *conn;
+
+ if (!single_share)
+ peer->ibp_persistence = 0;
+ else if (peer->ibp_persistence > 0)
+ peer->ibp_persistence--;
+
+ if (peer->ibp_persistence != 0)
+ return;
+
+ list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
+ conn = list_entry(ctmp, kib_conn_t, ibc_list);
+
+ kibnal_close_conn_locked (conn, 0);
+ }
+
+ /* NB peer unlinks itself when last conn is closed */
+}
+
+int
+kibnal_del_peer (ptl_nid_t nid, int single_share)
+{
+ unsigned long flags;
+ struct list_head *ptmp;
+ struct list_head *pnxt;
+ kib_peer_t *peer;
+ int lo;
+ int hi;
+ int i;
+ int rc = -ENOENT;
+
+ write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
+
+ if (nid != PTL_NID_ANY)
+ lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers;
+ else {
+ lo = 0;
+ hi = kibnal_data.kib_peer_hash_size - 1;
+ }
+
+ for (i = lo; i <= hi; i++) {
+ list_for_each_safe (ptmp, pnxt, &kibnal_data.kib_peers[i]) {
+ peer = list_entry (ptmp, kib_peer_t, ibp_list);
+ LASSERT (peer->ibp_persistence != 0 ||
+ peer->ibp_connecting != 0 ||
+ !list_empty (&peer->ibp_conns));
+
+ if (!(nid == PTL_NID_ANY || peer->ibp_nid == nid))
+ continue;
+
+ kibnal_del_peer_locked (peer, single_share);
+ rc = 0; /* matched something */
+
+ if (single_share)
+ goto out;
+ }
+ }
+ out:
+ write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
+
+ return (rc);
+}
+
+static kib_conn_t *
+kibnal_get_conn_by_idx (int index)
+{
+ kib_peer_t *peer;
+ struct list_head *ptmp;
+ kib_conn_t *conn;
+ struct list_head *ctmp;
+ int i;
+
+ read_lock (&kibnal_data.kib_global_lock);
+
+ for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
+ list_for_each (ptmp, &kibnal_data.kib_peers[i]) {
+
+ peer = list_entry (ptmp, kib_peer_t, ibp_list);
+ LASSERT (peer->ibp_persistence > 0 ||
+ peer->ibp_connecting != 0 ||
+ !list_empty (&peer->ibp_conns));
+
+ list_for_each (ctmp, &peer->ibp_conns) {
+ if (index-- > 0)
+ continue;
+
+ conn = list_entry (ctmp, kib_conn_t, ibc_list);
+ CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
+ conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
+ atomic_read (&conn->ibc_refcount));
+ atomic_inc (&conn->ibc_refcount);
+ read_unlock (&kibnal_data.kib_global_lock);
+ return (conn);
+ }
+ }
+ }
+
+ read_unlock (&kibnal_data.kib_global_lock);
+ return (NULL);
+}
+
+kib_conn_t *
+kibnal_create_conn (void)
+{
+ kib_conn_t *conn;
+ int i;
+ __u64 vaddr = 0;
+ __u64 vaddr_base;
+ int page_offset;
+ int ipage;
+ int rc;
+ FSTATUS frc;
+ union {
+ IB_QP_ATTRIBUTES_CREATE qp_create;
+ IB_QP_ATTRIBUTES_MODIFY qp_attr;
+ } params;
+
+ PORTAL_ALLOC (conn, sizeof (*conn));
+ if (conn == NULL) {
+ CERROR ("Can't allocate connection\n");
+ return (NULL);
+ }
+
+ /* zero flags, NULL pointers etc... */
+ memset (conn, 0, sizeof (*conn));
+
+ INIT_LIST_HEAD (&conn->ibc_tx_queue);
+ INIT_LIST_HEAD (&conn->ibc_active_txs);
+ spin_lock_init (&conn->ibc_lock);
+
+ atomic_inc (&kibnal_data.kib_nconns);
+ /* well not really, but I call destroy() on failure, which decrements */
+
+ PORTAL_ALLOC (conn->ibc_rxs, IBNAL_RX_MSGS * sizeof (kib_rx_t));
+ if (conn->ibc_rxs == NULL)
+ goto failed;
+ memset (conn->ibc_rxs, 0, IBNAL_RX_MSGS * sizeof(kib_rx_t));
+
+ rc = kibnal_alloc_pages(&conn->ibc_rx_pages, IBNAL_RX_MSG_PAGES, 1);
+ if (rc != 0)
+ goto failed;
+
+ vaddr_base = vaddr = conn->ibc_rx_pages->ibp_vaddr;
+
+ for (i = ipage = page_offset = 0; i < IBNAL_RX_MSGS; i++) {
+ struct page *page = conn->ibc_rx_pages->ibp_pages[ipage];
+ kib_rx_t *rx = &conn->ibc_rxs[i];
+
+ rx->rx_conn = conn;
+ rx->rx_msg = (kib_msg_t *)(((char *)page_address(page)) +
+ page_offset);
+
+ if (kibnal_whole_mem())
+ rx->rx_vaddr = kibnal_page2phys(page) +
+ page_offset +
+ kibnal_data.kib_md.md_addr;
+ else
+ rx->rx_vaddr = vaddr;
+
+ vaddr += IBNAL_MSG_SIZE;
+ LASSERT (vaddr <= vaddr_base + IBNAL_RX_MSG_BYTES);
+
+ page_offset += IBNAL_MSG_SIZE;
+ LASSERT (page_offset <= PAGE_SIZE);
+
+ if (page_offset == PAGE_SIZE) {
+ page_offset = 0;
+ ipage++;
+ LASSERT (ipage <= IBNAL_RX_MSG_PAGES);
+ }
+ }
+
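+	/* create a reliable-connected QP sized for the message queue, with
+	 * send and receive completions both directed at the single CQ */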
+ params.qp_create = (IB_QP_ATTRIBUTES_CREATE) {
+ .Type = QPTypeReliableConnected,
+ .SendQDepth = IBNAL_TX_MAX_SG *
+ IBNAL_MSG_QUEUE_SIZE,
+ .RecvQDepth = IBNAL_MSG_QUEUE_SIZE,
+ .SendDSListDepth = 1,
+ .RecvDSListDepth = 1,
+ .SendCQHandle = kibnal_data.kib_cq,
+ .RecvCQHandle = kibnal_data.kib_cq,
+ .PDHandle = kibnal_data.kib_pd,
+ .SendSignaledCompletions = TRUE,
+ };
+	frc = iibt_qp_create(kibnal_data.kib_hca, &params.qp_create, NULL,
+	                     &conn->ibc_qp, &conn->ibc_qp_attrs);
+	if (frc != FSUCCESS) {
+		CERROR ("Failed to create queue pair: %d\n", frc);
+		goto failed;
+	}
+
+ /* Mark QP created */
+ conn->ibc_state = IBNAL_CONN_INIT_QP;
+
+ params.qp_attr = (IB_QP_ATTRIBUTES_MODIFY) {
+ .RequestState = QPStateInit,
+ .Attrs = (IB_QP_ATTR_PORTGUID |
+ IB_QP_ATTR_PKEYINDEX |
+ IB_QP_ATTR_ACCESSCONTROL),
+ .PortGUID = kibnal_data.kib_port_guid,
+ .PkeyIndex = 0,
+ .AccessControl = {
+ .s = {
+ .RdmaWrite = 1,
+ .RdmaRead = 1,
+ },
+ },
+ };
+	frc = iibt_qp_modify(conn->ibc_qp, &params.qp_attr, NULL);
+	if (frc != FSUCCESS) {
+		CERROR ("Failed to modify queue pair: %d\n", frc);
+		goto failed;
+	}
+
+ /* 1 ref for caller */
+ atomic_set (&conn->ibc_refcount, 1);
+ return (conn);
+
+ failed:
+ kibnal_destroy_conn (conn);
+ return (NULL);
+}
+
+void
+kibnal_destroy_conn (kib_conn_t *conn)
+{
+ int rc;
+ FSTATUS frc;
+
+ CDEBUG (D_NET, "connection %p\n", conn);
+
+ LASSERT (atomic_read (&conn->ibc_refcount) == 0);
+ LASSERT (list_empty(&conn->ibc_tx_queue));
+ LASSERT (list_empty(&conn->ibc_active_txs));
+ LASSERT (conn->ibc_nsends_posted == 0);
+ LASSERT (conn->ibc_connreq == NULL);
+
+ switch (conn->ibc_state) {
+ case IBNAL_CONN_DISCONNECTED:
+ /* called after connection sequence initiated */
+ /* fall through */
+
+ case IBNAL_CONN_INIT_QP:
+ /* _destroy includes an implicit Reset of the QP which
+ * discards posted work */
+ rc = iibt_qp_destroy(conn->ibc_qp);
+ if (rc != 0)
+ CERROR("Can't destroy QP: %d\n", rc);
+ /* fall through */
+
+ case IBNAL_CONN_INIT_NOTHING:
+ break;
+
+ default:
+ LASSERT (0);
+ }
+
+ if (conn->ibc_cep != NULL) {
+ frc = iibt_cm_destroy_cep(conn->ibc_cep);
+ if (frc != 0)
+ CERROR("Can't destroy CEP %p: %d\n", conn->ibc_cep,
+ frc);
+ }
+
+ if (conn->ibc_rx_pages != NULL)
+ kibnal_free_pages(conn->ibc_rx_pages);
+
+ if (conn->ibc_rxs != NULL)
+ PORTAL_FREE(conn->ibc_rxs,
+ IBNAL_RX_MSGS * sizeof(kib_rx_t));
+
+ if (conn->ibc_peer != NULL)
+ kib_peer_decref(conn->ibc_peer);
+
+ PORTAL_FREE(conn, sizeof (*conn));
+
+ atomic_dec(&kibnal_data.kib_nconns);
+
+ if (atomic_read (&kibnal_data.kib_nconns) == 0 &&
+ kibnal_data.kib_shutdown) {
+ /* I just nuked the last connection on shutdown; wake up
+ * everyone so they can exit. */
+ wake_up_all(&kibnal_data.kib_sched_waitq);
+ wake_up_all(&kibnal_data.kib_connd_waitq);
+ }
+}
+
+void
+kibnal_put_conn (kib_conn_t *conn)
+{
+ unsigned long flags;
+
+ CDEBUG (D_NET, "putting conn[%p] state %d -> "LPX64" (%d)\n",
+ conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
+ atomic_read (&conn->ibc_refcount));
+
+ LASSERT (atomic_read (&conn->ibc_refcount) > 0);
+ if (!atomic_dec_and_test (&conn->ibc_refcount))
+ return;
+
+ /* must disconnect before dropping the final ref */
+ LASSERT (conn->ibc_state == IBNAL_CONN_DISCONNECTED);
+
+ spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
+
+ list_add (&conn->ibc_list, &kibnal_data.kib_connd_conns);
+ wake_up (&kibnal_data.kib_connd_waitq);
+
+ spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
+}
+
+static int
+kibnal_close_peer_conns_locked (kib_peer_t *peer, int why)
+{
+ kib_conn_t *conn;
+ struct list_head *ctmp;
+ struct list_head *cnxt;
+ int count = 0;
+
+ list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
+ conn = list_entry (ctmp, kib_conn_t, ibc_list);
+
+ count++;
+ kibnal_close_conn_locked (conn, why);
+ }
+
+ return (count);
+}
+
+int
+kibnal_close_stale_conns_locked (kib_peer_t *peer, __u64 incarnation)
+{
+ kib_conn_t *conn;
+ struct list_head *ctmp;
+ struct list_head *cnxt;
+ int count = 0;
+
+ list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
+ conn = list_entry (ctmp, kib_conn_t, ibc_list);
+
+ if (conn->ibc_incarnation == incarnation)
+ continue;
+
+ CDEBUG(D_NET, "Closing stale conn nid:"LPX64" incarnation:"LPX64"("LPX64")\n",
+ peer->ibp_nid, conn->ibc_incarnation, incarnation);
+
+ count++;
+ kibnal_close_conn_locked (conn, -ESTALE);
+ }
+
+ return (count);
+}
+
+static int
+kibnal_close_matching_conns (ptl_nid_t nid)
+{
+ unsigned long flags;
+ kib_peer_t *peer;
+ struct list_head *ptmp;
+ struct list_head *pnxt;
+ int lo;
+ int hi;
+ int i;
+ int count = 0;
+
+ write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
+
+ if (nid != PTL_NID_ANY)
+ lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers;
+ else {
+ lo = 0;
+ hi = kibnal_data.kib_peer_hash_size - 1;
+ }
+
+ for (i = lo; i <= hi; i++) {
+ list_for_each_safe (ptmp, pnxt, &kibnal_data.kib_peers[i]) {
+
+ peer = list_entry (ptmp, kib_peer_t, ibp_list);
+ LASSERT (peer->ibp_persistence != 0 ||
+ peer->ibp_connecting != 0 ||
+ !list_empty (&peer->ibp_conns));
+
+ if (!(nid == PTL_NID_ANY || nid == peer->ibp_nid))
+ continue;
+
+ count += kibnal_close_peer_conns_locked (peer, 0);
+ }
+ }
+
+ write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
+
+ /* wildcards always succeed */
+ if (nid == PTL_NID_ANY)
+ return (0);
+
+ return (count == 0 ? -ENOENT : 0);
+}
+
+static int
+kibnal_cmd(struct portals_cfg *pcfg, void * private)
+{
+ int rc = -EINVAL;
+ ENTRY;
+
+ LASSERT (pcfg != NULL);
+
+ switch(pcfg->pcfg_command) {
+ case NAL_CMD_GET_PEER: {
+ ptl_nid_t nid = 0;
+ int share_count = 0;
+
+ rc = kibnal_get_peer_info(pcfg->pcfg_count,
+ &nid, &share_count);
+ pcfg->pcfg_nid = nid;
+ pcfg->pcfg_size = 0;
+ pcfg->pcfg_id = 0;
+ pcfg->pcfg_misc = 0;
+ pcfg->pcfg_count = 0;
+ pcfg->pcfg_wait = share_count;
+ break;
+ }
+ case NAL_CMD_ADD_PEER: {
+ rc = kibnal_add_persistent_peer (pcfg->pcfg_nid);
+ break;
+ }
+ case NAL_CMD_DEL_PEER: {
+ rc = kibnal_del_peer (pcfg->pcfg_nid,
+ /* flags == single_share */
+ pcfg->pcfg_flags != 0);
+ break;
+ }
+ case NAL_CMD_GET_CONN: {
+ kib_conn_t *conn = kibnal_get_conn_by_idx (pcfg->pcfg_count);
+
+ if (conn == NULL)
+ rc = -ENOENT;
+ else {
+ rc = 0;
+ pcfg->pcfg_nid = conn->ibc_peer->ibp_nid;
+ pcfg->pcfg_id = 0;
+ pcfg->pcfg_misc = 0;
+ pcfg->pcfg_flags = 0;
+ kibnal_put_conn (conn);
+ }
+ break;
+ }
+ case NAL_CMD_CLOSE_CONNECTION: {
+ rc = kibnal_close_matching_conns (pcfg->pcfg_nid);
+ break;
+ }
+ case NAL_CMD_REGISTER_MYNID: {
+ if (pcfg->pcfg_nid == PTL_NID_ANY)
+ rc = -EINVAL;
+ else
+ rc = kibnal_set_mynid (pcfg->pcfg_nid);
+ break;
+ }
+ }
+
+ RETURN(rc);
+}
+
+void
+kibnal_free_pages (kib_pages_t *p)
+{
+ int npages = p->ibp_npages;
+ int rc;
+ int i;
+
+ if (p->ibp_mapped) {
+ rc = iibt_deregister_memory(p->ibp_handle);
+ if (rc != 0)
+ CERROR ("Deregister error: %d\n", rc);
+ }
+
+ for (i = 0; i < npages; i++)
+ if (p->ibp_pages[i] != NULL)
+ __free_page(p->ibp_pages[i]);
+
+ PORTAL_FREE (p, offsetof(kib_pages_t, ibp_pages[npages]));
+}
+
+int
+kibnal_alloc_pages (kib_pages_t **pp, int npages, int allow_write)
+{
+ kib_pages_t *p;
+ __u64 *phys_pages;
+ int i;
+ FSTATUS frc;
+ IB_ACCESS_CONTROL access;
+
+ memset(&access, 0, sizeof(access));
+ access.s.MWBindable = 1;
+ access.s.LocalWrite = 1;
+ access.s.RdmaRead = 1;
+ access.s.RdmaWrite = 1;
+
+ PORTAL_ALLOC(p, offsetof(kib_pages_t, ibp_pages[npages]));
+ if (p == NULL) {
+ CERROR ("Can't allocate buffer %d\n", npages);
+ return (-ENOMEM);
+ }
+
+ memset (p, 0, offsetof(kib_pages_t, ibp_pages[npages]));
+ p->ibp_npages = npages;
+
+ for (i = 0; i < npages; i++) {
+ p->ibp_pages[i] = alloc_page (GFP_KERNEL);
+ if (p->ibp_pages[i] == NULL) {
+ CERROR ("Can't allocate page %d of %d\n", i, npages);
+ kibnal_free_pages(p);
+ return (-ENOMEM);
+ }
+ }
+
+ if (kibnal_whole_mem())
+ goto out;
+
+ PORTAL_ALLOC(phys_pages, npages * sizeof(*phys_pages));
+ if (phys_pages == NULL) {
+ CERROR ("Can't allocate physarray for %d pages\n", npages);
+ /* XXX free ibp_pages? */
+ kibnal_free_pages(p);
+ return (-ENOMEM);
+ }
+
+ /* if we were using the _contig_ registration variant we would have
+ * an array of PhysAddr/Length pairs, but the discontiguous variant
+ * just takes the PhysAddr */
+ for (i = 0; i < npages; i++)
+ phys_pages[i] = kibnal_page2phys(p->ibp_pages[i]);
+
+ frc = iibt_register_physical_memory(kibnal_data.kib_hca,
+ 0, /* requested vaddr */
+ phys_pages, npages,
+ 0, /* offset */
+ kibnal_data.kib_pd,
+ access,
+ &p->ibp_handle, &p->ibp_vaddr,
+ &p->ibp_lkey, &p->ibp_rkey);
+
+ PORTAL_FREE(phys_pages, npages * sizeof(*phys_pages));
+
+ if (frc != FSUCCESS) {
+ CERROR ("Error %d mapping %d pages\n", frc, npages);
+ kibnal_free_pages(p);
+ return (-ENOMEM);
+ }
+
+ CDEBUG(D_NET, "registered %d pages; handle: %p vaddr "LPX64" "
+ "lkey %x rkey %x\n", npages, p->ibp_handle,
+ p->ibp_vaddr, p->ibp_lkey, p->ibp_rkey);
+
+ p->ibp_mapped = 1;
+out:
+ *pp = p;
+ return (0);
+}
+
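+/* carve the pre-mapped TX pages into IBNAL_TX_MSGS fixed-size descriptors
+ * and park them on the idle and idle-noblock lists */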
+static int
+kibnal_setup_tx_descs (void)
+{
+ int ipage = 0;
+ int page_offset = 0;
+ __u64 vaddr;
+ __u64 vaddr_base;
+ struct page *page;
+ kib_tx_t *tx;
+ int i;
+ int rc;
+
+ /* pre-mapped messages are not bigger than 1 page */
+ LASSERT (IBNAL_MSG_SIZE <= PAGE_SIZE);
+
+ /* No fancy arithmetic when we do the buffer calculations */
+ LASSERT (PAGE_SIZE % IBNAL_MSG_SIZE == 0);
+
+ rc = kibnal_alloc_pages(&kibnal_data.kib_tx_pages, IBNAL_TX_MSG_PAGES,
+ 0);
+ if (rc != 0)
+ return (rc);
+
+ /* ignored for the whole_mem case */
+ vaddr = vaddr_base = kibnal_data.kib_tx_pages->ibp_vaddr;
+
+ for (i = 0; i < IBNAL_TX_MSGS; i++) {
+ page = kibnal_data.kib_tx_pages->ibp_pages[ipage];
+ tx = &kibnal_data.kib_tx_descs[i];
+
+ memset (tx, 0, sizeof(*tx)); /* zero flags etc */
+
+ tx->tx_msg = (kib_msg_t *)(((char *)page_address(page)) +
+ page_offset);
+
+ if (kibnal_whole_mem())
+ tx->tx_vaddr = kibnal_page2phys(page) +
+ page_offset +
+ kibnal_data.kib_md.md_addr;
+ else
+ tx->tx_vaddr = vaddr;
+
+ tx->tx_isnblk = (i >= IBNAL_NTX);
+ tx->tx_mapped = KIB_TX_UNMAPPED;
+
+ CDEBUG(D_NET, "Tx[%d] %p->%p - "LPX64"\n",
+ i, tx, tx->tx_msg, tx->tx_vaddr);
+
+ if (tx->tx_isnblk)
+ list_add (&tx->tx_list,
+ &kibnal_data.kib_idle_nblk_txs);
+ else
+ list_add (&tx->tx_list,
+ &kibnal_data.kib_idle_txs);
+
+ vaddr += IBNAL_MSG_SIZE;
+ LASSERT (vaddr <= vaddr_base + IBNAL_TX_MSG_BYTES);
+
+ page_offset += IBNAL_MSG_SIZE;
+ LASSERT (page_offset <= PAGE_SIZE);
+
+ if (page_offset == PAGE_SIZE) {
+ page_offset = 0;
+ ipage++;
+ LASSERT (ipage <= IBNAL_TX_MSG_PAGES);
+ }
+ }
+
+ return (0);
+}
+
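+/* unwind in the reverse order of startup; each case in the switch falls
+ * through to undo the earlier initialisation stages */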
+static void
+kibnal_api_shutdown (nal_t *nal)
+{
+ int i;
+ int rc;
+
+ if (nal->nal_refct != 0) {
+ /* This module got the first ref */
+ PORTAL_MODULE_UNUSE;
+ return;
+ }
+
+ CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n",
+ atomic_read (&portal_kmemory));
+
+ LASSERT(nal == &kibnal_api);
+
+ switch (kibnal_data.kib_init) {
+ default:
+ CERROR ("Unexpected state %d\n", kibnal_data.kib_init);
+ LBUG();
+
+ case IBNAL_INIT_ALL:
+ /* stop calls to nal_cmd */
+ libcfs_nal_cmd_unregister(IIBNAL);
+ /* No new peers */
+
+ /* resetting my NID to unadvertises me, removes my
+ * listener and nukes all current peers */
+ kibnal_set_mynid (PTL_NID_ANY);
+
+ /* Wait for all peer state to clean up (crazy) */
+ i = 2;
+ while (atomic_read (&kibnal_data.kib_npeers) != 0) {
+ i++;
+ CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
+ "waiting for %d peers to disconnect (can take a few seconds)\n",
+ atomic_read (&kibnal_data.kib_npeers));
+ set_current_state (TASK_UNINTERRUPTIBLE);
+ schedule_timeout (HZ);
+ }
+ /* fall through */
+
+ case IBNAL_INIT_CQ:
+ rc = iibt_cq_destroy(kibnal_data.kib_cq);
+ if (rc != 0)
+ CERROR ("Destroy CQ error: %d\n", rc);
+ /* fall through */
+
+ case IBNAL_INIT_TXD:
+ kibnal_free_pages (kibnal_data.kib_tx_pages);
+ /* fall through */
+
+ case IBNAL_INIT_MR:
+ if (kibnal_data.kib_md.md_handle != NULL) {
+ rc = iibt_deregister_memory(kibnal_data.kib_md.md_handle);
+ if (rc != FSUCCESS)
+ CERROR ("Deregister memory: %d\n", rc);
+ }
+ /* fall through */
+
+#if IBNAL_FMR
+ case IBNAL_INIT_FMR:
+ rc = ib_fmr_pool_destroy (kibnal_data.kib_fmr_pool);
+ if (rc != 0)
+ CERROR ("Destroy FMR pool error: %d\n", rc);
+ /* fall through */
+#endif
+ case IBNAL_INIT_PD:
+ rc = iibt_pd_free(kibnal_data.kib_pd);
+ if (rc != 0)
+ CERROR ("Destroy PD error: %d\n", rc);
+ /* fall through */
+
+ case IBNAL_INIT_SD:
+ rc = iibt_sd_deregister(kibnal_data.kib_sd);
+ if (rc != 0)
+ CERROR ("Deregister SD error: %d\n", rc);
+ /* fall through */
+
+ case IBNAL_INIT_PORT:
+ /* XXX ??? */
+ /* fall through */
+
+ case IBNAL_INIT_PORTATTRS:
+ PORTAL_FREE(kibnal_data.kib_hca_attrs.PortAttributesList,
+ kibnal_data.kib_hca_attrs.PortAttributesListSize);
+ /* fall through */
+
+ case IBNAL_INIT_HCA:
+ rc = iibt_close_hca(kibnal_data.kib_hca);
+ if (rc != 0)
+ CERROR ("Close HCA error: %d\n", rc);
+ /* fall through */
+
+ case IBNAL_INIT_LIB:
+ lib_fini(&kibnal_lib);
+ /* fall through */
+
+ case IBNAL_INIT_DATA:
+ /* Module refcount only gets to zero when all peers
+ * have been closed so all lists must be empty */
+ LASSERT (atomic_read (&kibnal_data.kib_npeers) == 0);
+ LASSERT (kibnal_data.kib_peers != NULL);
+ for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
+ LASSERT (list_empty (&kibnal_data.kib_peers[i]));
+ }
+ LASSERT (atomic_read (&kibnal_data.kib_nconns) == 0);
+ LASSERT (list_empty (&kibnal_data.kib_sched_rxq));
+ LASSERT (list_empty (&kibnal_data.kib_sched_txq));
+ LASSERT (list_empty (&kibnal_data.kib_connd_conns));
+ LASSERT (list_empty (&kibnal_data.kib_connd_peers));
+
+ /* flag threads to terminate; wake and wait for them to die */
+ kibnal_data.kib_shutdown = 1;
+ wake_up_all (&kibnal_data.kib_sched_waitq);
+ wake_up_all (&kibnal_data.kib_connd_waitq);
+
+ i = 2;
+ while (atomic_read (&kibnal_data.kib_nthreads) != 0) {
+ i++;
+ CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
+ "Waiting for %d threads to terminate\n",
+ atomic_read (&kibnal_data.kib_nthreads));
+ set_current_state (TASK_INTERRUPTIBLE);
+ schedule_timeout (HZ);
+ }
+ /* fall through */
+
+ case IBNAL_INIT_NOTHING:
+ break;
+ }
+
+ if (kibnal_data.kib_tx_descs != NULL)
+ PORTAL_FREE (kibnal_data.kib_tx_descs,
+ IBNAL_TX_MSGS * sizeof(kib_tx_t));
+
+ if (kibnal_data.kib_peers != NULL)
+ PORTAL_FREE (kibnal_data.kib_peers,
+ sizeof (struct list_head) *
+ kibnal_data.kib_peer_hash_size);
+
+ CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n",
+ atomic_read (&portal_kmemory));
+ printk(KERN_INFO "Lustre: Infinicon IB NAL unloaded (final mem %d)\n",
+ atomic_read(&portal_kmemory));
+
+ kibnal_data.kib_init = IBNAL_INIT_NOTHING;
+}
+
+#define roundup_power(val, power) \
+	( ((val) + (__u64)((power) - 1)) & ~((__u64)((power) - 1)) )
+
+/* this isn't very portable or sturdy in the face of funny mem/bus configs */
+static __u64 max_phys_mem(IB_CA_ATTRIBUTES *ca_attr)
+{
+ struct sysinfo si;
+ __u64 ret;
+
+ /* XXX we don't bother with first-gen cards */
+ if (ca_attr->VendorId == 0xd0b7 && ca_attr->DeviceId == 0x3101)
+ return 0ULL;
+
+ si_meminfo(&si);
+ ret = (__u64)max(si.totalram, max_mapnr) * si.mem_unit;
+ return roundup_power(ret, 128 * 1024 * 1024);
+}
+#undef roundup_power
+
+static int
+kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
+ ptl_ni_limits_t *requested_limits,
+ ptl_ni_limits_t *actual_limits)
+{
+ ptl_process_id_t process_id;
+ int pkmem = atomic_read(&portal_kmemory);
+ IB_PORT_ATTRIBUTES *pattr;
+ FSTATUS frc;
+ int rc;
+ int n;
+ int i;
+
+ LASSERT (nal == &kibnal_api);
+
+ if (nal->nal_refct != 0) {
+ if (actual_limits != NULL)
+ *actual_limits = kibnal_lib.libnal_ni.ni_actual_limits;
+ /* This module got the first ref */
+ PORTAL_MODULE_USE;
+ return (PTL_OK);
+ }
+
+ LASSERT (kibnal_data.kib_init == IBNAL_INIT_NOTHING);
+
+ frc = IbtGetInterfaceByVersion(IBT_INTERFACE_VERSION_2,
+ &kibnal_data.kib_interfaces);
+ if (frc != FSUCCESS) {
+ CERROR("IbtGetInterfaceByVersion(IBT_INTERFACE_VERSION_2) = %d\n",
+ frc);
+ return -ENOSYS;
+ }
+
+ init_MUTEX (&kibnal_data.kib_nid_mutex);
+ init_MUTEX_LOCKED (&kibnal_data.kib_nid_signal);
+ kibnal_data.kib_nid = PTL_NID_ANY;
+
+ rwlock_init(&kibnal_data.kib_global_lock);
+
+ kibnal_data.kib_peer_hash_size = IBNAL_PEER_HASH_SIZE;
+ PORTAL_ALLOC (kibnal_data.kib_peers,
+ sizeof (struct list_head) * kibnal_data.kib_peer_hash_size);
+ if (kibnal_data.kib_peers == NULL) {
+ goto failed;
+ }
+ for (i = 0; i < kibnal_data.kib_peer_hash_size; i++)
+ INIT_LIST_HEAD(&kibnal_data.kib_peers[i]);
+
+ spin_lock_init (&kibnal_data.kib_connd_lock);
+ INIT_LIST_HEAD (&kibnal_data.kib_connd_peers);
+ INIT_LIST_HEAD (&kibnal_data.kib_connd_conns);
+ init_waitqueue_head (&kibnal_data.kib_connd_waitq);
+
+ spin_lock_init (&kibnal_data.kib_sched_lock);
+ INIT_LIST_HEAD (&kibnal_data.kib_sched_txq);
+ INIT_LIST_HEAD (&kibnal_data.kib_sched_rxq);
+ init_waitqueue_head (&kibnal_data.kib_sched_waitq);
+
+ spin_lock_init (&kibnal_data.kib_tx_lock);
+ INIT_LIST_HEAD (&kibnal_data.kib_idle_txs);
+ INIT_LIST_HEAD (&kibnal_data.kib_idle_nblk_txs);
+ init_waitqueue_head(&kibnal_data.kib_idle_tx_waitq);
+
+ PORTAL_ALLOC (kibnal_data.kib_tx_descs,
+ IBNAL_TX_MSGS * sizeof(kib_tx_t));
+ if (kibnal_data.kib_tx_descs == NULL) {
+ CERROR ("Can't allocate tx descs\n");
+ goto failed;
+ }
+
+ /* lists/ptrs/locks initialised */
+ kibnal_data.kib_init = IBNAL_INIT_DATA;
+ /*****************************************************/
+
+ process_id.pid = 0;
+ process_id.nid = kibnal_data.kib_nid;
+
+ rc = lib_init(&kibnal_lib, nal, process_id,
+ requested_limits, actual_limits);
+ if (rc != PTL_OK) {
+ CERROR("lib_init failed: error %d\n", rc);
+ goto failed;
+ }
+
+ /* lib interface initialised */
+ kibnal_data.kib_init = IBNAL_INIT_LIB;
+ /*****************************************************/
+
+ for (i = 0; i < IBNAL_N_SCHED; i++) {
+ rc = kibnal_thread_start (kibnal_scheduler, (void *)i);
+ if (rc != 0) {
+ CERROR("Can't spawn iibnal scheduler[%d]: %d\n",
+ i, rc);
+ goto failed;
+ }
+ }
+
+ rc = kibnal_thread_start (kibnal_connd, NULL);
+ if (rc != 0) {
+ CERROR ("Can't spawn iibnal connd: %d\n", rc);
+ goto failed;
+ }
+
+ n = sizeof(kibnal_data.kib_hca_guids) /
+ sizeof(kibnal_data.kib_hca_guids[0]);
+ frc = iibt_get_hca_guids(&n, kibnal_data.kib_hca_guids);
+ if (frc != FSUCCESS) {
+ CERROR ("Can't get channel adapter guids: %d\n", frc);
+ goto failed;
+ }
+ if (n == 0) {
+ CERROR ("No channel adapters found\n");
+ goto failed;
+ }
+
+ /* Infinicon has per-HCA rather than per CQ completion handlers */
+ frc = iibt_open_hca(kibnal_data.kib_hca_guids[0],
+ kibnal_ca_callback,
+ kibnal_ca_async_callback,
+ &kibnal_data.kib_hca,
+ &kibnal_data.kib_hca);
+ if (frc != FSUCCESS) {
+ CERROR ("Can't open CA[0]: %d\n", frc);
+ goto failed;
+ }
+
+ /* Channel Adapter opened */
+ kibnal_data.kib_init = IBNAL_INIT_HCA;
+ /*****************************************************/
+
+ kibnal_data.kib_hca_attrs.PortAttributesList = NULL;
+ kibnal_data.kib_hca_attrs.PortAttributesListSize = 0;
+ frc = iibt_query_hca(kibnal_data.kib_hca,
+ &kibnal_data.kib_hca_attrs, NULL);
+ if (frc != FSUCCESS) {
+ CERROR ("Can't size port attrs: %d\n", frc);
+ goto failed;
+ }
+
+ PORTAL_ALLOC(kibnal_data.kib_hca_attrs.PortAttributesList,
+ kibnal_data.kib_hca_attrs.PortAttributesListSize);
+ if (kibnal_data.kib_hca_attrs.PortAttributesList == NULL)
+ goto failed;
+
+ /* Port attrs allocated */
+ kibnal_data.kib_init = IBNAL_INIT_PORTATTRS;
+ /*****************************************************/
+
+ frc = iibt_query_hca(kibnal_data.kib_hca, &kibnal_data.kib_hca_attrs,
+ NULL);
+ if (frc != FSUCCESS) {
+ CERROR ("Can't get port attrs for CA 0: %d\n", frc);
+ goto failed;
+ }
+
+ for (i = 0, pattr = kibnal_data.kib_hca_attrs.PortAttributesList;
+ pattr != NULL;
+ i++, pattr = pattr->Next) {
+ switch (pattr->PortState) {
+ default:
+ CERROR("Unexpected port[%d] state %d\n",
+ i, pattr->PortState);
+ continue;
+ case PortStateDown:
+ CDEBUG(D_NET, "port[%d] Down\n", i);
+ continue;
+ case PortStateInit:
+ CDEBUG(D_NET, "port[%d] Init\n", i);
+ continue;
+ case PortStateArmed:
+ CDEBUG(D_NET, "port[%d] Armed\n", i);
+ continue;
+
+ case PortStateActive:
+ CDEBUG(D_NET, "port[%d] Active\n", i);
+ kibnal_data.kib_port = i;
+ kibnal_data.kib_port_guid = pattr->GUID;
+ kibnal_data.kib_port_pkey = pattr->PkeyTable[0];
+ break;
+ }
+ break;
+ }
+
+ if (pattr == NULL) {
+ CERROR ("Can't find an active port\n");
+ goto failed;
+ }
+
+ CDEBUG(D_NET, "got guid "LPX64"\n", kibnal_data.kib_port_guid);
+
+ /* Active port found */
+ kibnal_data.kib_init = IBNAL_INIT_PORT;
+ /*****************************************************/
+
+ frc = iibt_sd_register(&kibnal_data.kib_sd, NULL);
+ if (frc != FSUCCESS) {
+ CERROR ("Can't register with SD: %d\n", frc);
+ goto failed;
+ }
+
+ /* Registered with SD OK */
+ kibnal_data.kib_init = IBNAL_INIT_SD;
+ /*****************************************************/
+
+ frc = iibt_pd_allocate(kibnal_data.kib_hca, 0, &kibnal_data.kib_pd);
+ if (frc != FSUCCESS) {
+		CERROR ("Can't create PD: %d\n", frc);
+ goto failed;
+ }
+
+ /* flag PD initialised */
+ kibnal_data.kib_init = IBNAL_INIT_PD;
+ /*****************************************************/
+
+#if IBNAL_FMR
+ {
+ const int pool_size = IBNAL_NTX + IBNAL_NTX_NBLK;
+ struct ib_fmr_pool_param params = {
+ .max_pages_per_fmr = PTL_MTU/PAGE_SIZE,
+ .access = (IB_ACCESS_LOCAL_WRITE |
+ IB_ACCESS_REMOTE_WRITE |
+ IB_ACCESS_REMOTE_READ),
+ .pool_size = pool_size,
+ .dirty_watermark = (pool_size * 3)/4,
+ .flush_function = NULL,
+ .flush_arg = NULL,
+ .cache = 1,
+ };
+		rc = ib_fmr_pool_create(kibnal_data.kib_pd, &params,
+ &kibnal_data.kib_fmr_pool);
+ if (rc != 0) {
+ CERROR ("Can't create FMR pool size %d: %d\n",
+ pool_size, rc);
+ goto failed;
+ }
+ }
+
+ /* flag FMR pool initialised */
+ kibnal_data.kib_init = IBNAL_INIT_FMR;
+#endif
+ /*****************************************************/
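+	/* try to register all of physical memory up front, so RDMA can use
+	 * one lkey/rkey pair instead of registering buffers per transfer */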
+ if (IBNAL_WHOLE_MEM) {
+ IB_MR_PHYS_BUFFER phys;
+ IB_ACCESS_CONTROL access;
+ kib_md_t *md = &kibnal_data.kib_md;
+
+ memset(&access, 0, sizeof(access));
+ access.s.MWBindable = 1;
+ access.s.LocalWrite = 1;
+ access.s.RdmaRead = 1;
+ access.s.RdmaWrite = 1;
+
+ phys.PhysAddr = 0;
+ phys.Length = max_phys_mem(&kibnal_data.kib_hca_attrs);
+ if (phys.Length == 0) {
+ CERROR ("couldn't determine the end of phys mem\n");
+ goto failed;
+ }
+
+ rc = iibt_register_contig_physical_memory(kibnal_data.kib_hca,
+ 0,
+ &phys, 1,
+ 0,
+ kibnal_data.kib_pd,
+ access,
+ &md->md_handle,
+ &md->md_addr,
+ &md->md_lkey,
+ &md->md_rkey);
+ if (rc != FSUCCESS) {
+ CERROR("registering physical memory failed: %d\n",
+ rc);
+ CERROR("falling back to registration per-rdma\n");
+ md->md_handle = NULL;
+ } else {
+ CDEBUG(D_NET, "registered "LPU64" bytes of mem\n",
+ phys.Length);
+ kibnal_data.kib_init = IBNAL_INIT_MR;
+ }
+ }
+
+ /*****************************************************/
+
+ rc = kibnal_setup_tx_descs();
+ if (rc != 0) {
+ CERROR ("Can't register tx descs: %d\n", rc);
+ goto failed;
+ }
+
+ /* flag TX descs initialised */
+ kibnal_data.kib_init = IBNAL_INIT_TXD;
+ /*****************************************************/
+
+ {
+ uint32 nentries;
+
+ frc = iibt_cq_create(kibnal_data.kib_hca, IBNAL_CQ_ENTRIES,
+ &kibnal_data.kib_cq, &kibnal_data.kib_cq,
+ &nentries);
+ if (frc != FSUCCESS) {
+ CERROR ("Can't create RX CQ: %d\n", frc);
+ goto failed;
+ }
+
+ /* flag CQ initialised */
+ kibnal_data.kib_init = IBNAL_INIT_CQ;
+
+ if (nentries < IBNAL_CQ_ENTRIES) {
+ CERROR ("CQ only has %d entries, need %d\n",
+ nentries, IBNAL_CQ_ENTRIES);
+ goto failed;
+ }
+
+ rc = iibt_cq_rearm(kibnal_data.kib_cq, CQEventSelNextWC);
+ if (rc != 0) {
+ CERROR ("Failed to re-arm completion queue: %d\n", rc);
+ goto failed;
+ }
+ }
+
+ /*****************************************************/
+
+ rc = libcfs_nal_cmd_register(IIBNAL, &kibnal_cmd, NULL);
+ if (rc != 0) {
+ CERROR ("Can't initialise command interface (rc = %d)\n", rc);
+ goto failed;
+ }
+
+ /* flag everything initialised */
+ kibnal_data.kib_init = IBNAL_INIT_ALL;
+ /*****************************************************/
+
+ printk(KERN_INFO "Lustre: Infinicon IB NAL loaded "
+ "(initial mem %d)\n", pkmem);
+
+ return (PTL_OK);
+
+ failed:
+ kibnal_api_shutdown (&kibnal_api);
+ return (PTL_FAIL);
+}
+
+void __exit
+kibnal_module_fini (void)
+{
+#ifdef CONFIG_SYSCTL
+ if (kibnal_tunables.kib_sysctl != NULL)
+ unregister_sysctl_table (kibnal_tunables.kib_sysctl);
+#endif
+ PtlNIFini(kibnal_ni);
+
+ ptl_unregister_nal(IIBNAL);
+}
+
+int __init
+kibnal_module_init (void)
+{
+ int rc;
+
+ if (sizeof(kib_wire_connreq_t) > CM_REQUEST_INFO_USER_LEN) {
+ CERROR("sizeof(kib_wire_connreq_t) > CM_REQUEST_INFO_USER_LEN\n");
+ return -EINVAL;
+ }
+
+ /* the following must be sizeof(int) for proc_dointvec() */
+ if (sizeof (kibnal_tunables.kib_io_timeout) != sizeof (int)) {
+ CERROR("sizeof (kibnal_tunables.kib_io_timeout) != sizeof (int)\n");
+ return -EINVAL;
+ }
+
+ kibnal_api.nal_ni_init = kibnal_api_startup;
+ kibnal_api.nal_ni_fini = kibnal_api_shutdown;
+
+ /* Initialise dynamic tunables to defaults once only */
+ kibnal_tunables.kib_io_timeout = IBNAL_IO_TIMEOUT;
+
+ rc = ptl_register_nal(IIBNAL, &kibnal_api);
+ if (rc != PTL_OK) {
+ CERROR("Can't register IBNAL: %d\n", rc);
+ return (-ENOMEM); /* or something... */
+ }
+
+ /* Pure gateways want the NAL started up at module load time... */
+ rc = PtlNIInit(IIBNAL, 0, NULL, NULL, &kibnal_ni);
+ if (rc != PTL_OK && rc != PTL_IFACE_DUP) {
+ ptl_unregister_nal(IIBNAL);
+ return (-ENODEV);
+ }
+
+#ifdef CONFIG_SYSCTL
+ /* Press on regardless even if registering sysctl doesn't work */
+ kibnal_tunables.kib_sysctl =
+ register_sysctl_table (kibnal_top_ctl_table, 0);
+#endif
+ return (0);
+}
+
+MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
+MODULE_DESCRIPTION("Kernel Infinicon IB NAL v0.01");
+MODULE_LICENSE("GPL");
+
+module_init(kibnal_module_init);
+module_exit(kibnal_module_fini);
+
--- /dev/null
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2004 Cluster File Systems, Inc.
+ * Author: Eric Barton <eric@bartonsoftware.com>
+ *
+ * This file is part of Lustre, http://www.lustre.org.
+ *
+ * Lustre is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Lustre is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Lustre; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#ifndef EXPORT_SYMTAB
+# define EXPORT_SYMTAB
+#endif
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/smp_lock.h>
+#include <linux/unistd.h>
+#include <linux/uio.h>
+
+#include <asm/system.h>
+#include <asm/uaccess.h>
+#include <asm/io.h>
+
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/stat.h>
+#include <linux/list.h>
+#include <linux/kmod.h>
+#include <linux/sysctl.h>
+
+#define DEBUG_SUBSYSTEM S_IBNAL
+
+#include <linux/kp30.h>
+#include <portals/p30.h>
+#include <portals/lib-p30.h>
+#include <portals/nal.h>
+
+#include <linux/iba/ibt.h>
+
+#define GCC_VERSION (__GNUC__ * 10000 \
+ + __GNUC_MINOR__ * 100 \
+ + __GNUC_PATCHLEVEL__)
+
+/* Test for GCC > 3.2.2 */
+#if GCC_VERSION <= 30202
+/* GCC 3.2.2, and presumably several versions before it, will
+ * miscompile this driver. See
+ * http://gcc.gnu.org/bugzilla/show_bug.cgi?id=9853. */
+#error Invalid GCC version. Must use GCC >= 3.2.3
+#endif
+
+#define IBNAL_SERVICE_NAME "iibnal"
+#define IBNAL_SERVICE_NUMBER 0x11b9a1
+
+#if CONFIG_SMP
+# define IBNAL_N_SCHED num_online_cpus() /* # schedulers */
+#else
+# define IBNAL_N_SCHED 1 /* # schedulers */
+#endif
+
+#define IBNAL_MIN_RECONNECT_INTERVAL HZ /* first failed connection retry... */
+#define IBNAL_MAX_RECONNECT_INTERVAL (60*HZ) /* ...exponentially increasing to this */
+
+#define IBNAL_MSG_SIZE (4<<10) /* max size of queued messages (inc hdr) */
+
+#define IBNAL_MSG_QUEUE_SIZE 8 /* # messages/RDMAs in-flight */
+#define IBNAL_CREDIT_HIGHWATER 7 /* when to eagerly return credits */
+/* a retry count of 7 means infinite retries; Infinicon recommended 5 */
+#define IBNAL_RETRY                 5          /* # times to retry sends */
+#define IBNAL_RNR_RETRY             5          /* # times to retry on RNR NAK */
+#define IBNAL_CM_RETRY 5 /* # times to retry connection */
+#define IBNAL_FLOW_CONTROL 1
+#define IBNAL_ACK_TIMEOUT           20         /* encodes 4.096us * 2^20 ~= 4.3 secs */
+
+#define IBNAL_NTX 64 /* # tx descs */
+/* this had to be dropped down so that we only register < 255 pages per
+ * region. this will change if we register all memory. */
+#define IBNAL_NTX_NBLK 128 /* # reserved tx descs */
+
+#define IBNAL_PEER_HASH_SIZE 101 /* # peer lists */
+
+#define IBNAL_RESCHED 100 /* # scheduler loops before reschedule */
+
+#define IBNAL_CONCURRENT_PEERS 1000 /* # nodes all talking at once to me */
+
+/* default vals for runtime tunables */
+#define IBNAL_IO_TIMEOUT 50 /* default comms timeout (seconds) */
+
+/************************/
+/* derived constants... */
+
+/* TX messages (shared by all connections) */
+#define IBNAL_TX_MSGS (IBNAL_NTX + IBNAL_NTX_NBLK)
+#define IBNAL_TX_MSG_BYTES (IBNAL_TX_MSGS * IBNAL_MSG_SIZE)
+#define IBNAL_TX_MSG_PAGES ((IBNAL_TX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE)
+
+#define IBNAL_TX_MAX_SG (PTL_MD_MAX_IOV + 1)
+
+/* RX messages (per connection) */
+#define IBNAL_RX_MSGS IBNAL_MSG_QUEUE_SIZE
+#define IBNAL_RX_MSG_BYTES (IBNAL_RX_MSGS * IBNAL_MSG_SIZE)
+#define IBNAL_RX_MSG_PAGES ((IBNAL_RX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE)
+
+
+/* we may have up to 2 completions per transmit +
+ 1 completion per receive, per connection */
+#define IBNAL_CQ_ENTRIES ((2*IBNAL_TX_MSGS) + \
+ (IBNAL_RX_MSGS * IBNAL_CONCURRENT_PEERS))
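+/* e.g. with the defaults above this requests 2*(64 + 128) + (8 * 1000)
+ * == 8384 entries; startup fails if the HCA returns fewer */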
+
+#define IBNAL_RDMA_BASE 0x0eeb0000
+#define IBNAL_FMR 0
+#define IBNAL_WHOLE_MEM 1
+#define IBNAL_CKSUM 0
+//#define IBNAL_CALLBACK_CTXT IB_CQ_CALLBACK_PROCESS
+#define IBNAL_CALLBACK_CTXT IB_CQ_CALLBACK_INTERRUPT
+
+/* XXX I have no idea. */
+#define IBNAL_STARTING_PSN 1
+
+typedef struct
+{
+ int kib_io_timeout; /* comms timeout (seconds) */
+ struct ctl_table_header *kib_sysctl; /* sysctl interface */
+} kib_tunables_t;
+
+/* some of these have specific types in the stack that just map back
+ * to the uFOO types, like IB_{L,R}_KEY. */
+typedef struct
+{
+ int ibp_npages; /* # pages */
+ int ibp_mapped; /* mapped? */
+ __u64 ibp_vaddr; /* mapped region vaddr */
+ __u32 ibp_lkey; /* mapped region lkey */
+ __u32 ibp_rkey; /* mapped region rkey */
+ IB_HANDLE ibp_handle; /* mapped region handle */
+ struct page *ibp_pages[0];
+} kib_pages_t;
+
+typedef struct
+{
+ IB_HANDLE md_handle;
+ __u32 md_lkey;
+ __u32 md_rkey;
+ __u64 md_addr;
+} kib_md_t __attribute__((packed));
+
+typedef struct
+{
+ int kib_init; /* initialisation state */
+ __u64 kib_incarnation; /* which one am I */
+ int kib_shutdown; /* shut down? */
+ atomic_t kib_nthreads; /* # live threads */
+
+ __u64 kib_service_id; /* service number I listen on */
+ __u64 kib_port_guid; /* my GUID (lo 64 of GID)*/
+ __u16 kib_port_pkey; /* my pkey, whatever that is */
+ ptl_nid_t kib_nid; /* my NID */
+ struct semaphore kib_nid_mutex; /* serialise NID ops */
+ struct semaphore kib_nid_signal; /* signal completion */
+ IB_HANDLE kib_cep; /* connection end point */
+
+ rwlock_t kib_global_lock; /* stabilize peer/conn ops */
+
+ struct list_head *kib_peers; /* hash table of all my known peers */
+ int kib_peer_hash_size; /* size of kib_peers */
+ atomic_t kib_npeers; /* # peers extant */
+ atomic_t kib_nconns; /* # connections extant */
+
+ struct list_head kib_connd_conns; /* connections to progress */
+ struct list_head kib_connd_peers; /* peers waiting for a connection */
+ wait_queue_head_t kib_connd_waitq; /* connection daemons sleep here */
+ unsigned long kib_connd_waketime; /* when connd will wake */
+ spinlock_t kib_connd_lock; /* serialise */
+
+ wait_queue_head_t kib_sched_waitq; /* schedulers sleep here */
+ struct list_head kib_sched_txq; /* tx requiring attention */
+ struct list_head kib_sched_rxq; /* rx requiring attention */
+ spinlock_t kib_sched_lock; /* serialise */
+
+ struct kib_tx *kib_tx_descs; /* all the tx descriptors */
+ kib_pages_t *kib_tx_pages; /* premapped tx msg pages */
+
+ struct list_head kib_idle_txs; /* idle tx descriptors */
+ struct list_head kib_idle_nblk_txs; /* idle reserved tx descriptors */
+ wait_queue_head_t kib_idle_tx_waitq; /* block here for tx descriptor */
+ __u64 kib_next_tx_cookie; /* RDMA completion cookie */
+ spinlock_t kib_tx_lock; /* serialise */
+
+ IB_HANDLE kib_hca; /* The HCA */
+ int kib_port; /* port on the device */
+ IB_HANDLE kib_pd; /* protection domain */
+ IB_HANDLE kib_sd; /* SD handle */
+ IB_HANDLE kib_cq; /* completion queue */
+ kib_md_t kib_md; /* full-mem registration */
+
+ void *kib_listen_handle; /* where I listen for connections */
+
+ IBT_INTERFACE_UNION kib_interfaces; /* The Infinicon IBT interface */
+
+ uint64 kib_hca_guids[8]; /* all the HCA guids */
+ IB_CA_ATTRIBUTES kib_hca_attrs; /* where to get HCA attrs */
+ FABRIC_OPERATION_DATA kib_fabopdata; /* (un)advertise service record */
+} kib_data_t;
+
+#define IBNAL_INIT_NOTHING 0
+#define IBNAL_INIT_DATA 1
+#define IBNAL_INIT_LIB 2
+#define IBNAL_INIT_HCA 3
+#define IBNAL_INIT_PORTATTRS 4
+#define IBNAL_INIT_PORT 5
+#define IBNAL_INIT_SD 6
+#define IBNAL_INIT_PD 7
+#define IBNAL_INIT_FMR 8
+#define IBNAL_INIT_MR 9
+#define IBNAL_INIT_TXD 10
+#define IBNAL_INIT_CQ 11
+#define IBNAL_INIT_ALL 12
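+/* kib_init records how far startup got, so that kibnal_api_shutdown()
+ * (the 'failed:' target in kibnal_api_startup()) can unwind exactly the
+ * stages that completed */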
+
+/************************************************************************
+ * Wire message structs.
+ * These are sent in sender's byte order (i.e. receiver flips).
+ * CAVEAT EMPTOR: other structs communicated between nodes (e.g. MAD
+ * private data and SM service info) are LE on the wire.
+ */
+
+/* also kib_md_t above */
+
+typedef struct
+{
+ __u32 rd_key; /* remote key */
+ __u32 rd_nob; /* # of bytes */
+ __u64 rd_addr; /* remote io vaddr */
+} kib_rdma_desc_t __attribute__((packed));
+
+typedef struct
+{
+ ptl_hdr_t ibim_hdr; /* portals header */
+ char ibim_payload[0]; /* piggy-backed payload */
+} kib_immediate_msg_t __attribute__((packed));
+
+/* these arrays serve two purposes during rdma. they are built on the passive
+ * side and sent to the active side as remote arguments. On the active side
+ * the descs are used as a data structure on the way to local gather items.
+ * the different roles result in split local/remote meaning of desc->rd_key */
+typedef struct
+{
+ ptl_hdr_t ibrm_hdr; /* portals header */
+ __u64 ibrm_cookie; /* opaque completion cookie */
+ __u32 ibrm_num_descs; /* how many descs */
+ kib_rdma_desc_t ibrm_desc[0]; /* where to suck/blow */
+} kib_rdma_msg_t __attribute__((packed));
+
+#define kib_rdma_msg_len(num_descs) \
+ offsetof(kib_msg_t, ibm_u.rdma.ibrm_desc[num_descs])
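+/* i.e. the wire size of a kib_rdma_msg_t carrying num_descs fragments;
+ * e.g. kib_rdma_msg_len(0) covers just the header, cookie and count, and
+ * the receive path bounds it by min(nob, IBNAL_MSG_SIZE) */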
+
+typedef struct
+{
+ __u64 ibcm_cookie; /* opaque completion cookie */
+ __u32 ibcm_status; /* completion status */
+} kib_completion_msg_t __attribute__((packed));
+
+typedef struct
+{
+ __u32 ibm_magic; /* I'm an openibnal message */
+ __u16 ibm_version; /* this is my version number */
+ __u8 ibm_type; /* msg type */
+ __u8 ibm_credits; /* returned credits */
+#if IBNAL_CKSUM
+ __u32 ibm_nob;
+ __u32 ibm_cksum;
+#endif
+ union {
+ kib_immediate_msg_t immediate;
+ kib_rdma_msg_t rdma;
+ kib_completion_msg_t completion;
+ } ibm_u __attribute__((packed));
+} kib_msg_t __attribute__((packed));
+
+#define IBNAL_MSG_MAGIC 0x0be91b91 /* unique magic */
+#define IBNAL_MSG_VERSION 1 /* current protocol version */
+
+#define IBNAL_MSG_NOOP 0xd0 /* nothing (just credits) */
+#define IBNAL_MSG_IMMEDIATE 0xd1 /* portals hdr + payload */
+#define IBNAL_MSG_PUT_RDMA 0xd2 /* portals PUT hdr + source rdma desc */
+#define IBNAL_MSG_PUT_DONE 0xd3 /* signal PUT rdma completion */
+#define IBNAL_MSG_GET_RDMA 0xd4 /* portals GET hdr + sink rdma desc */
+#define IBNAL_MSG_GET_DONE 0xd5 /* signal GET rdma completion */
+
+/***********************************************************************/
+
+typedef struct kib_rx /* receive message */
+{
+ struct list_head rx_list; /* queue for attention */
+ struct kib_conn *rx_conn; /* owning conn */
+ int rx_rdma; /* RDMA completion posted? */
+ int rx_posted; /* posted? */
+ __u64 rx_vaddr; /* pre-mapped buffer (hca vaddr) */
+ kib_msg_t *rx_msg; /* pre-mapped buffer (host vaddr) */
+ IB_WORK_REQ rx_wrq;
+        IB_LOCAL_DATASEGMENT      rx_gl;          /* ...and its memory */
+} kib_rx_t;
+
+typedef struct kib_tx /* transmit message */
+{
+ struct list_head tx_list; /* queue on idle_txs ibc_tx_queue etc. */
+ int tx_isnblk; /* I'm reserved for non-blocking sends */
+ struct kib_conn *tx_conn; /* owning conn */
+ int tx_mapped; /* mapped for RDMA? */
+ int tx_sending; /* # tx callbacks outstanding */
+ int tx_status; /* completion status */
+ unsigned long tx_deadline; /* completion deadline */
+ int tx_passive_rdma; /* peer sucks/blows */
+ int tx_passive_rdma_wait; /* waiting for peer to complete */
+ __u64 tx_passive_rdma_cookie; /* completion cookie */
+ lib_msg_t *tx_libmsg[2]; /* lib msgs to finalize on completion */
+ kib_md_t tx_md; /* RDMA mapping (active/passive) */
+ __u64 tx_vaddr; /* pre-mapped buffer (hca vaddr) */
+ kib_msg_t *tx_msg; /* pre-mapped buffer (host vaddr) */
+ int tx_nsp; /* # send work items */
+ IB_WORK_REQ tx_wrq[IBNAL_TX_MAX_SG]; /* send work items... */
+ IB_LOCAL_DATASEGMENT tx_gl[IBNAL_TX_MAX_SG]; /* ...and their memory */
+} kib_tx_t;
+
+#define KIB_TX_UNMAPPED 0
+#define KIB_TX_MAPPED 1
+#define KIB_TX_MAPPED_FMR 2
+
+typedef struct kib_wire_connreq
+{
+ __u32 wcr_magic; /* I'm an openibnal connreq */
+ __u16 wcr_version; /* this is my version number */
+ __u16 wcr_queue_depth; /* this is my receive queue size */
+ __u64 wcr_nid; /* peer's NID */
+ __u64 wcr_incarnation; /* peer's incarnation */
+} kib_wire_connreq_t;
+
+typedef struct kib_gid
+{
+ __u64 hi, lo;
+} kib_gid_t;
+
+typedef struct kib_connreq
+{
+ /* connection-in-progress */
+ struct kib_conn *cr_conn;
+ kib_wire_connreq_t cr_wcr;
+ __u64 cr_tid;
+ IB_SERVICE_RECORD cr_service;
+ kib_gid_t cr_gid;
+ IB_PATH_RECORD cr_path;
+ CM_REQUEST_INFO cr_cmreq;
+ CM_CONN_INFO cr_discarded;
+ CM_REJECT_INFO cr_rej_info;
+} kib_connreq_t;
+
+typedef struct kib_conn
+{
+ struct kib_peer *ibc_peer; /* owning peer */
+ struct list_head ibc_list; /* stash on peer's conn list */
+ __u64 ibc_incarnation; /* which instance of the peer */
+ atomic_t ibc_refcount; /* # users */
+ int ibc_state; /* what's happening */
+ atomic_t ibc_nob; /* # bytes buffered */
+ int ibc_nsends_posted; /* # uncompleted sends */
+ int ibc_credits; /* # credits I have */
+ int ibc_outstanding_credits; /* # credits to return */
+ int ibc_rcvd_disconnect;/* received discon request */
+ int ibc_sent_disconnect;/* sent discon request */
+ struct list_head ibc_tx_queue; /* send queue */
+ struct list_head ibc_active_txs; /* active tx awaiting completion */
+ spinlock_t ibc_lock; /* serialise */
+ kib_rx_t *ibc_rxs; /* the rx descs */
+ kib_pages_t *ibc_rx_pages; /* premapped rx msg pages */
+ IB_HANDLE ibc_qp; /* queue pair */
+ IB_HANDLE ibc_cep; /* connection ID? */
+ IB_QP_ATTRIBUTES_QUERY ibc_qp_attrs; /* QP attrs */
+ kib_connreq_t *ibc_connreq; /* connection request state */
+} kib_conn_t;
+
+#define IBNAL_CONN_INIT_NOTHING 0 /* initial state */
+#define IBNAL_CONN_INIT_QP 1 /* ibc_qp set up */
+#define IBNAL_CONN_CONNECTING 2 /* started to connect */
+#define IBNAL_CONN_ESTABLISHED 3 /* connection established */
+#define IBNAL_CONN_SEND_DREQ 4 /* to send disconnect req */
+#define IBNAL_CONN_DREQ 5 /* sent disconnect req */
+#define IBNAL_CONN_DREP 6 /* sent disconnect rep */
+#define IBNAL_CONN_DISCONNECTED 7 /* no more QP or CM traffic */
+
+#define KIB_ASSERT_CONN_STATE(conn, state) do { \
+ LASSERTF((conn)->ibc_state == state, "%d\n", conn->ibc_state); \
+} while (0)
+
+#define KIB_ASSERT_CONN_STATE_RANGE(conn, low, high) do { \
+ LASSERTF(low <= high, "%d %d\n", low, high); \
+ LASSERTF((conn)->ibc_state >= low && (conn)->ibc_state <= high, \
+ "%d\n", conn->ibc_state); \
+} while (0)
+
+typedef struct kib_peer
+{
+ struct list_head ibp_list; /* stash on global peer list */
+ struct list_head ibp_connd_list; /* schedule on kib_connd_peers */
+ ptl_nid_t ibp_nid; /* who's on the other end(s) */
+ atomic_t ibp_refcount; /* # users */
+ int ibp_persistence; /* "known" peer refs */
+ struct list_head ibp_conns; /* all active connections */
+ struct list_head ibp_tx_queue; /* msgs waiting for a conn */
+ int ibp_connecting; /* connecting+accepting */
+ unsigned long ibp_reconnect_time; /* when reconnect may be attempted */
+ unsigned long ibp_reconnect_interval; /* exponential backoff */
+} kib_peer_t;
+
+
+extern lib_nal_t kibnal_lib;
+extern kib_data_t kibnal_data;
+extern kib_tunables_t kibnal_tunables;
+
+/******************************************************************************/
+/* Infinicon IBT interface wrappers */
+#define IIBT_IF (kibnal_data.kib_interfaces.ver2)
+
+static inline FSTATUS
+iibt_get_hca_guids(uint32 *hca_count, EUI64 *hca_guid_list)
+{
+ return IIBT_IF.GetCaGuids(hca_count, hca_guid_list);
+}
+
+static inline FSTATUS
+iibt_open_hca(EUI64 hca_guid,
+ IB_COMPLETION_CALLBACK completion_callback,
+ IB_ASYNC_EVENT_CALLBACK async_event_callback,
+ void *arg,
+ IB_HANDLE *handle)
+{
+ return IIBT_IF.Vpi.OpenCA(hca_guid, completion_callback,
+ async_event_callback, arg, handle);
+}
+
+static inline FSTATUS
+iibt_query_hca(IB_HANDLE hca_handle, IB_CA_ATTRIBUTES *attrs, void **argp)
+{
+ return IIBT_IF.Vpi.QueryCA(hca_handle, attrs, argp);
+}
+
+static inline FSTATUS
+iibt_close_hca(IB_HANDLE hca_handle)
+{
+ return IIBT_IF.Vpi.CloseCA(hca_handle);
+}
+
+static inline FSTATUS
+iibt_pd_allocate(IB_HANDLE hca_handle, __u32 max_avs, IB_HANDLE *pd_handle)
+{
+ return IIBT_IF.Vpi.AllocatePD(hca_handle, max_avs, pd_handle);
+}
+
+static inline FSTATUS
+iibt_pd_free(IB_HANDLE pd_handle)
+{
+ return IIBT_IF.Vpi.FreePD(pd_handle);
+}
+
+static inline FSTATUS
+iibt_register_physical_memory(IB_HANDLE hca_handle,
+ IB_VIRT_ADDR requested_io_va,
+ void *phys_buffers, uint64 nphys_buffers,
+ uint32 io_va_offset, IB_HANDLE pd_handle,
+ IB_ACCESS_CONTROL access,
+ IB_HANDLE *mem_handle,
+ IB_VIRT_ADDR *actual_io_va,
+ IB_L_KEY *lkey, IB_R_KEY *rkey)
+{
+ return IIBT_IF.Vpi.RegisterPhysMemRegion(hca_handle, requested_io_va,
+ phys_buffers, nphys_buffers,
+ io_va_offset, pd_handle,
+ access,
+ mem_handle, actual_io_va,
+ lkey, rkey);
+}
+
+static inline FSTATUS
+iibt_register_contig_physical_memory(IB_HANDLE hca_handle,
+ IB_VIRT_ADDR requested_io_va,
+ IB_MR_PHYS_BUFFER *phys_buffers,
+ uint64 nphys_buffers,
+ uint32 io_va_offset, IB_HANDLE pd_handle,
+ IB_ACCESS_CONTROL access,
+ IB_HANDLE *mem_handle,
+ IB_VIRT_ADDR *actual_io_va,
+ IB_L_KEY *lkey, IB_R_KEY *rkey)
+{
+ return IIBT_IF.Vpi.RegisterContigPhysMemRegion(hca_handle,
+ requested_io_va,
+ phys_buffers,
+ nphys_buffers,
+ io_va_offset, pd_handle,
+ access,
+ mem_handle, actual_io_va,
+ lkey, rkey);
+}
+
+static inline FSTATUS
+iibt_register_memory(IB_HANDLE hca_handle,
+ void *virt_addr, unsigned int length,
+ IB_HANDLE pd_handle,
+ IB_ACCESS_CONTROL access,
+ IB_HANDLE *mem_handle,
+ IB_L_KEY *lkey, IB_R_KEY *rkey)
+{
+ return IIBT_IF.Vpi.RegisterMemRegion(hca_handle,
+ virt_addr, length,
+ pd_handle,
+ access,
+ mem_handle,
+ lkey, rkey);
+}
+
+static inline FSTATUS
+iibt_deregister_memory(IB_HANDLE mem_handle)
+{
+ return IIBT_IF.Vpi.DeregisterMemRegion(mem_handle);
+}
+
+static inline FSTATUS
+iibt_cq_create(IB_HANDLE hca_handle, uint32 requested_size,
+ void *arg, IB_HANDLE *cq_handle, uint32 *actual_size)
+{
+ return IIBT_IF.Vpi.CreateCQ(hca_handle, requested_size,
+ arg, cq_handle, actual_size);
+}
+
+static inline FSTATUS
+iibt_cq_poll(IB_HANDLE cq_handle, IB_WORK_COMPLETION *wc)
+{
+ return IIBT_IF.Vpi.PollCQ(cq_handle, wc);
+}
+
+static inline FSTATUS
+iibt_cq_rearm(IB_HANDLE cq_handle, IB_CQ_EVENT_SELECT select)
+{
+ return IIBT_IF.Vpi.RearmCQ(cq_handle, select);
+}
+
+static inline FSTATUS
+iibt_cq_destroy(IB_HANDLE cq_handle)
+{
+ return IIBT_IF.Vpi.DestroyCQ(cq_handle);
+}
+
+static inline FSTATUS
+iibt_qp_create(IB_HANDLE hca_handle, IB_QP_ATTRIBUTES_CREATE *create_attr,
+ void *arg, IB_HANDLE *cq_handle,
+ IB_QP_ATTRIBUTES_QUERY *query_attr)
+{
+ return IIBT_IF.Vpi.CreateQP(hca_handle, create_attr, arg, cq_handle,
+ query_attr);
+}
+
+static inline FSTATUS
+iibt_qp_query(IB_HANDLE qp_handle, IB_QP_ATTRIBUTES_QUERY *query_attr,
+ void **arg_ptr)
+{
+ return IIBT_IF.Vpi.QueryQP(qp_handle, query_attr, arg_ptr);
+}
+
+static inline FSTATUS
+iibt_qp_modify(IB_HANDLE qp_handle, IB_QP_ATTRIBUTES_MODIFY *modify_attr,
+ IB_QP_ATTRIBUTES_QUERY *query_attr)
+{
+ return IIBT_IF.Vpi.ModifyQP(qp_handle, modify_attr, query_attr);
+}
+
+static inline FSTATUS
+iibt_qp_destroy(IB_HANDLE qp_handle)
+{
+ return IIBT_IF.Vpi.DestroyQP(qp_handle);
+}
+
+static inline FSTATUS
+iibt_postrecv(IB_HANDLE qp_handle, IB_WORK_REQ *work_req)
+{
+ return IIBT_IF.Vpi.PostRecv(qp_handle, work_req);
+}
+
+static inline FSTATUS
+iibt_postsend(IB_HANDLE qp_handle, IB_WORK_REQ *work_req)
+{
+ return IIBT_IF.Vpi.PostSend(qp_handle, work_req);
+}
+
+static inline FSTATUS
+iibt_sd_register(IB_HANDLE *sd_handle, CLIENT_CONTROL_PARAMETERS *p)
+{
+ return IIBT_IF.Sdi.Register(sd_handle, p);
+}
+
+static inline FSTATUS
+iibt_sd_deregister(IB_HANDLE sd_handle)
+{
+ return IIBT_IF.Sdi.Deregister(sd_handle);
+}
+
+static inline FSTATUS
+iibt_sd_port_fabric_operation(IB_HANDLE sd_handle, EUI64 port_guid,
+ FABRIC_OPERATION_DATA *fod,
+ PFABRIC_OPERATION_CALLBACK callback,
+ COMMAND_CONTROL_PARAMETERS *p, void *arg)
+{
+ return IIBT_IF.Sdi.PortFabricOperation(sd_handle, port_guid,
+ fod, callback, p, arg);
+}
+
+static inline FSTATUS
+iibt_sd_query_port_fabric_information(IB_HANDLE sd_handle, EUI64 port_guid,
+ QUERY *qry,
+ PQUERY_CALLBACK callback,
+ COMMAND_CONTROL_PARAMETERS *p, void *arg)
+{
+ return IIBT_IF.Sdi.QueryPortFabricInformation(sd_handle, port_guid,
+ qry, callback, p, arg);
+}
+
+static inline IB_HANDLE
+iibt_cm_create_cep(CM_CEP_TYPE type)
+{
+ return IIBT_IF.Cmi.CmCreateCEP(type);
+}
+
+static inline FSTATUS
+iibt_cm_modify_cep(IB_HANDLE cep, uint32 attr, char* value, uint32 len,
+ uint32 offset)
+{
+ return IIBT_IF.Cmi.CmModifyCEP(cep, attr, value, len, offset);
+}
+
+static inline FSTATUS
+iibt_cm_destroy_cep(IB_HANDLE cep_handle)
+{
+ return IIBT_IF.Cmi.CmDestroyCEP(cep_handle);
+}
+
+static inline FSTATUS
+iibt_cm_listen(IB_HANDLE cep, CM_LISTEN_INFO *info,
+ PFN_CM_CALLBACK callback, void *arg)
+{
+ return IIBT_IF.Cmi.CmListen(cep, info, callback, arg);
+}
+
+static inline FSTATUS
+iibt_cm_cancel(IB_HANDLE cep)
+{
+ return IIBT_IF.Cmi.CmCancel(cep);
+}
+
+static inline FSTATUS
+iibt_cm_accept(IB_HANDLE cep,
+ CM_CONN_INFO *send_info, CM_CONN_INFO *recv_info,
+ PFN_CM_CALLBACK callback, void *arg,
+ IB_HANDLE *new_cep)
+{
+ return IIBT_IF.Cmi.CmAccept(cep,
+ send_info, recv_info,
+ callback, arg, new_cep);
+}
+
+static inline FSTATUS
+iibt_cm_reject(IB_HANDLE cep, CM_REJECT_INFO *rej)
+{
+ return IIBT_IF.Cmi.CmReject(cep, rej);
+}
+
+static inline FSTATUS
+iibt_cm_disconnect(IB_HANDLE cep, CM_DREQUEST_INFO *req,
+ CM_DREPLY_INFO *reply)
+{
+ return IIBT_IF.Cmi.CmDisconnect(cep, req, reply);
+}
+
+static inline FSTATUS
+iibt_cm_connect (IB_HANDLE cep, CM_REQUEST_INFO *req,
+ PFN_CM_CALLBACK callback, void *arg)
+{
+ return IIBT_IF.Cmi.CmConnect (cep, req, callback, arg);
+}
+
+static inline int wrq_signals_completion(IB_WORK_REQ *wrq)
+{
+ return wrq->Req.SendRC.Options.s.SignaledCompletion == 1;
+}
+
+
+/******************************************************************************/
+
+/* these are purposely avoiding using local vars so they don't increase
+ * stack consumption. */
+
+#define kib_peer_addref(peer) do { \
+ LASSERTF(atomic_read(&peer->ibp_refcount) > 0, "%d\n", \
+ atomic_read(&peer->ibp_refcount)); \
+ CDEBUG(D_NET, "++peer[%p] -> "LPX64" (%d)\n", \
+ peer, peer->ibp_nid, atomic_read (&peer->ibp_refcount)); \
+ atomic_inc(&peer->ibp_refcount); \
+} while (0)
+
+#define kib_peer_decref(peer) do { \
+ LASSERTF(atomic_read(&peer->ibp_refcount) > 0, "%d\n", \
+ atomic_read(&peer->ibp_refcount)); \
+ CDEBUG(D_NET, "--peer[%p] -> "LPX64" (%d)\n", \
+ peer, peer->ibp_nid, atomic_read (&peer->ibp_refcount)); \
+ if (atomic_dec_and_test (&peer->ibp_refcount)) { \
+ CDEBUG (D_NET, "destroying peer "LPX64" %p\n", \
+ peer->ibp_nid, peer); \
+ kibnal_destroy_peer (peer); \
+ } \
+} while (0)
+
+/******************************************************************************/
+
+static inline struct list_head *
+kibnal_nid2peerlist (ptl_nid_t nid)
+{
+ unsigned int hash = ((unsigned int)nid) % kibnal_data.kib_peer_hash_size;
+
+ return (&kibnal_data.kib_peers [hash]);
+}
+
+static inline int
+kibnal_peer_active(kib_peer_t *peer)
+{
+ /* Am I in the peer hash table? */
+ return (!list_empty(&peer->ibp_list));
+}
+
+static inline void
+kibnal_queue_tx_locked (kib_tx_t *tx, kib_conn_t *conn)
+{
+ /* CAVEAT EMPTOR: tx takes caller's ref on conn */
+
+ LASSERT (tx->tx_nsp > 0); /* work items set up */
+ LASSERT (tx->tx_conn == NULL); /* only set here */
+
+ tx->tx_conn = conn;
+ tx->tx_deadline = jiffies + kibnal_tunables.kib_io_timeout * HZ;
+ list_add_tail(&tx->tx_list, &conn->ibc_tx_queue);
+}
+
+#define KIBNAL_SERVICE_KEY_MASK (IB_SERVICE_RECORD_COMP_SERVICENAME | \
+ IB_SERVICE_RECORD_COMP_SERVICEDATA8_1 | \
+ IB_SERVICE_RECORD_COMP_SERVICEDATA8_2 | \
+ IB_SERVICE_RECORD_COMP_SERVICEDATA8_3 | \
+ IB_SERVICE_RECORD_COMP_SERVICEDATA8_4 | \
+ IB_SERVICE_RECORD_COMP_SERVICEDATA8_5 | \
+ IB_SERVICE_RECORD_COMP_SERVICEDATA8_6 | \
+ IB_SERVICE_RECORD_COMP_SERVICEDATA8_7 | \
+ IB_SERVICE_RECORD_COMP_SERVICEDATA8_8)
+
+static inline __u64*
+kibnal_service_nid_field(IB_SERVICE_RECORD *srv)
+{
+ /* must be consistent with KIBNAL_SERVICE_KEY_MASK */
+ return (__u64 *)srv->ServiceData8;
+}
+
+
+static inline void
+kibnal_set_service_keys(IB_SERVICE_RECORD *srv, ptl_nid_t nid)
+{
+ LASSERT (strlen(IBNAL_SERVICE_NAME) < sizeof(srv->ServiceName));
+ memset (srv->ServiceName, 0, sizeof(srv->ServiceName));
+ strcpy (srv->ServiceName, IBNAL_SERVICE_NAME);
+
+ *kibnal_service_nid_field(srv) = cpu_to_le64(nid);
+}
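+
+/* NB a service record "matches" when all the components selected by
+ * KIBNAL_SERVICE_KEY_MASK compare equal, i.e. the service name plus the
+ * 8 ServiceData8 bytes holding the little-endian NID set above */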
+
+#if 0
+static inline void
+kibnal_show_rdma_attr (kib_conn_t *conn)
+{
+ struct ib_qp_attribute qp_attr;
+ int rc;
+
+ memset (&qp_attr, 0, sizeof(qp_attr));
+ rc = ib_qp_query(conn->ibc_qp, &qp_attr);
+ if (rc != 0) {
+ CERROR ("Can't get qp attrs: %d\n", rc);
+ return;
+ }
+
+ CWARN ("RDMA CAPABILITY: write %s read %s\n",
+ (qp_attr.valid_fields & TS_IB_QP_ATTRIBUTE_RDMA_ATOMIC_ENABLE) ?
+ (qp_attr.enable_rdma_write ? "enabled" : "disabled") : "invalid",
+ (qp_attr.valid_fields & TS_IB_QP_ATTRIBUTE_RDMA_ATOMIC_ENABLE) ?
+ (qp_attr.enable_rdma_read ? "enabled" : "disabled") : "invalid");
+}
+#endif
+
+#if CONFIG_X86
+static inline __u64
+kibnal_page2phys (struct page *p)
+{
+ __u64 page_number = p - mem_map;
+
+ return (page_number << PAGE_SHIFT);
+}
+#else
+# error "no page->phys"
+#endif
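+/* NB (p - mem_map) << PAGE_SHIFT assumes a single flat mem_map, hence
+ * the #error on other architectures */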
+
+/* CAVEAT EMPTOR:
+ * We rely on tx/rx descriptor alignment to allow us to use the lowest bit
+ * of the work request id as a flag to determine if the completion is for a
+ * transmit or a receive.  It seems that the CQ entry's 'op' field
+ * isn't always set correctly on completions that occur after QP teardown. */
+
+static inline __u64
+kibnal_ptr2wreqid (void *ptr, int isrx)
+{
+ unsigned long lptr = (unsigned long)ptr;
+
+ LASSERT ((lptr & 1) == 0);
+ return (__u64)(lptr | (isrx ? 1 : 0));
+}
+
+static inline void *
+kibnal_wreqid2ptr (__u64 wreqid)
+{
+ return (void *)(((unsigned long)wreqid) & ~1UL);
+}
+
+static inline int
+kibnal_wreqid_is_rx (__u64 wreqid)
+{
+ return (wreqid & 1) != 0;
+}
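+
+/* illustrative round trip (the LASSERT above guarantees bit 0 of a
+ * descriptor address is free):
+ *   id = kibnal_ptr2wreqid(rx, 1);
+ *   kibnal_wreqid_is_rx(id) && kibnal_wreqid2ptr(id) == rx
+ */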
+
+static inline int
+kibnal_whole_mem(void)
+{
+ return kibnal_data.kib_md.md_handle != NULL;
+}
+
+extern kib_peer_t *kibnal_create_peer (ptl_nid_t nid);
+extern void kibnal_destroy_peer (kib_peer_t *peer);
+extern int kibnal_del_peer (ptl_nid_t nid, int single_share);
+extern kib_peer_t *kibnal_find_peer_locked (ptl_nid_t nid);
+extern void kibnal_unlink_peer_locked (kib_peer_t *peer);
+extern int kibnal_close_stale_conns_locked (kib_peer_t *peer,
+ __u64 incarnation);
+extern kib_conn_t *kibnal_create_conn (void);
+extern void kibnal_put_conn (kib_conn_t *conn);
+extern void kibnal_destroy_conn (kib_conn_t *conn);
+void kibnal_listen_callback(IB_HANDLE cep, CM_CONN_INFO *info, void *arg);
+
+extern int kibnal_alloc_pages (kib_pages_t **pp, int npages, int access);
+extern void kibnal_free_pages (kib_pages_t *p);
+
+extern void kibnal_check_sends (kib_conn_t *conn);
+extern void kibnal_close_conn_locked (kib_conn_t *conn, int error);
+extern void kibnal_destroy_conn (kib_conn_t *conn);
+extern int kibnal_thread_start (int (*fn)(void *arg), void *arg);
+extern int kibnal_scheduler(void *arg);
+extern int kibnal_connd (void *arg);
+extern void kibnal_init_tx_msg (kib_tx_t *tx, int type, int body_nob);
+extern void kibnal_close_conn (kib_conn_t *conn, int why);
+extern void kibnal_start_active_rdma (int type, int status,
+ kib_rx_t *rx, lib_msg_t *libmsg,
+ unsigned int niov,
+ struct iovec *iov, ptl_kiov_t *kiov,
+ size_t offset, size_t nob);
+
+void kibnal_ca_async_callback (void *ca_arg, IB_EVENT_RECORD *ev);
+void kibnal_ca_callback (void *ca_arg, void *cq_arg);
--- /dev/null
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2004 Cluster File Systems, Inc.
+ * Author: Eric Barton <eric@bartonsoftware.com>
+ *
+ * This file is part of Lustre, http://www.lustre.org.
+ *
+ * Lustre is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Lustre is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Lustre; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#include "iibnal.h"
+
+/*
+ * LIB functions follow
+ *
+ */
+static void
+kibnal_schedule_tx_done (kib_tx_t *tx)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave (&kibnal_data.kib_sched_lock, flags);
+
+ list_add_tail(&tx->tx_list, &kibnal_data.kib_sched_txq);
+ wake_up (&kibnal_data.kib_sched_waitq);
+
+ spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags);
+}
+
+static void
+kibnal_tx_done (kib_tx_t *tx)
+{
+ ptl_err_t ptlrc = (tx->tx_status == 0) ? PTL_OK : PTL_FAIL;
+ unsigned long flags;
+ int i;
+ FSTATUS frc;
+
+ LASSERT (tx->tx_sending == 0); /* mustn't be awaiting callback */
+ LASSERT (!tx->tx_passive_rdma_wait); /* mustn't be awaiting RDMA */
+
+ switch (tx->tx_mapped) {
+ default:
+ LBUG();
+
+ case KIB_TX_UNMAPPED:
+ break;
+
+ case KIB_TX_MAPPED:
+ if (in_interrupt()) {
+ /* can't deregister memory in IRQ context... */
+ kibnal_schedule_tx_done(tx);
+ return;
+ }
+ frc = iibt_deregister_memory(tx->tx_md.md_handle);
+ LASSERT (frc == FSUCCESS);
+ tx->tx_mapped = KIB_TX_UNMAPPED;
+ break;
+
+#if IBNAL_FMR
+        /* NB dead code: iibnal's FMR path is unported (see the #error in
+         * kibnal_map_kiov()) */
+        case KIB_TX_MAPPED_FMR: {
+                int rc;
+
+                if (in_interrupt() && tx->tx_status != 0) {
+                        /* can't flush FMRs in IRQ context... */
+                        kibnal_schedule_tx_done(tx);
+                        return;
+                }
+
+                rc = ib_fmr_deregister(tx->tx_md.md_handle.fmr);
+                LASSERT (rc == 0);
+
+                if (tx->tx_status != 0)
+                        ib_fmr_pool_force_flush(kibnal_data.kib_fmr_pool);
+                tx->tx_mapped = KIB_TX_UNMAPPED;
+                break;
+        }
+#endif
+ }
+
+ for (i = 0; i < 2; i++) {
+ /* tx may have up to 2 libmsgs to finalise */
+ if (tx->tx_libmsg[i] == NULL)
+ continue;
+
+ lib_finalize (&kibnal_lib, NULL, tx->tx_libmsg[i], ptlrc);
+ tx->tx_libmsg[i] = NULL;
+ }
+
+ if (tx->tx_conn != NULL) {
+ kibnal_put_conn (tx->tx_conn);
+ tx->tx_conn = NULL;
+ }
+
+ tx->tx_nsp = 0;
+ tx->tx_passive_rdma = 0;
+ tx->tx_status = 0;
+
+ spin_lock_irqsave (&kibnal_data.kib_tx_lock, flags);
+
+ if (tx->tx_isnblk) {
+ list_add_tail (&tx->tx_list, &kibnal_data.kib_idle_nblk_txs);
+ } else {
+ list_add_tail (&tx->tx_list, &kibnal_data.kib_idle_txs);
+ wake_up (&kibnal_data.kib_idle_tx_waitq);
+ }
+
+ spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags);
+}
+
+static kib_tx_t *
+kibnal_get_idle_tx (int may_block)
+{
+ unsigned long flags;
+ kib_tx_t *tx = NULL;
+ ENTRY;
+
+ for (;;) {
+ spin_lock_irqsave (&kibnal_data.kib_tx_lock, flags);
+
+ /* "normal" descriptor is free */
+ if (!list_empty (&kibnal_data.kib_idle_txs)) {
+ tx = list_entry (kibnal_data.kib_idle_txs.next,
+ kib_tx_t, tx_list);
+ break;
+ }
+
+ if (!may_block) {
+ /* may dip into reserve pool */
+ if (list_empty (&kibnal_data.kib_idle_nblk_txs)) {
+ CERROR ("reserved tx desc pool exhausted\n");
+ break;
+ }
+
+ tx = list_entry (kibnal_data.kib_idle_nblk_txs.next,
+ kib_tx_t, tx_list);
+ break;
+ }
+
+ /* block for idle tx */
+ spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags);
+
+ wait_event (kibnal_data.kib_idle_tx_waitq,
+ !list_empty (&kibnal_data.kib_idle_txs) ||
+ kibnal_data.kib_shutdown);
+ }
+
+ if (tx != NULL) {
+ list_del (&tx->tx_list);
+
+ /* Allocate a new passive RDMA completion cookie. It might
+ * not be needed, but we've got a lock right now and we're
+ * unlikely to wrap... */
+ tx->tx_passive_rdma_cookie = kibnal_data.kib_next_tx_cookie++;
+
+ LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED);
+ LASSERT (tx->tx_nsp == 0);
+ LASSERT (tx->tx_sending == 0);
+ LASSERT (tx->tx_status == 0);
+ LASSERT (tx->tx_conn == NULL);
+ LASSERT (!tx->tx_passive_rdma);
+ LASSERT (!tx->tx_passive_rdma_wait);
+ LASSERT (tx->tx_libmsg[0] == NULL);
+ LASSERT (tx->tx_libmsg[1] == NULL);
+ }
+
+ spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags);
+
+ RETURN(tx);
+}
+
+static int
+kibnal_dist(lib_nal_t *nal, ptl_nid_t nid, unsigned long *dist)
+{
+ /* I would guess that if kibnal_get_peer (nid) == NULL,
+ and we're not routing, then 'nid' is very distant :) */
+ if ( nal->libnal_ni.ni_pid.nid == nid ) {
+ *dist = 0;
+ } else {
+ *dist = 1;
+ }
+
+ return 0;
+}
+
+static void
+kibnal_complete_passive_rdma(kib_conn_t *conn, __u64 cookie, int status)
+{
+ struct list_head *ttmp;
+ unsigned long flags;
+ int idle;
+
+ spin_lock_irqsave (&conn->ibc_lock, flags);
+
+ list_for_each (ttmp, &conn->ibc_active_txs) {
+ kib_tx_t *tx = list_entry(ttmp, kib_tx_t, tx_list);
+
+ LASSERT (tx->tx_passive_rdma ||
+ !tx->tx_passive_rdma_wait);
+
+ LASSERT (tx->tx_passive_rdma_wait ||
+ tx->tx_sending != 0);
+
+ if (!tx->tx_passive_rdma_wait ||
+ tx->tx_passive_rdma_cookie != cookie)
+ continue;
+
+ CDEBUG(D_NET, "Complete %p "LPD64": %d\n", tx, cookie, status);
+
+ tx->tx_status = status;
+ tx->tx_passive_rdma_wait = 0;
+ idle = (tx->tx_sending == 0);
+
+ if (idle)
+ list_del (&tx->tx_list);
+
+ spin_unlock_irqrestore (&conn->ibc_lock, flags);
+
+ /* I could be racing with tx callbacks. It's whoever
+ * _makes_ tx idle that frees it */
+ if (idle)
+ kibnal_tx_done (tx);
+ return;
+ }
+
+ spin_unlock_irqrestore (&conn->ibc_lock, flags);
+
+ CERROR ("Unmatched (late?) RDMA completion "LPX64" from "LPX64"\n",
+ cookie, conn->ibc_peer->ibp_nid);
+}
+
+static __u32
+kibnal_lkey(kib_pages_t *ibp)
+{
+ if (kibnal_whole_mem())
+ return kibnal_data.kib_md.md_lkey;
+
+ return ibp->ibp_lkey;
+}
+
+static void
+kibnal_post_rx (kib_rx_t *rx, int do_credits)
+{
+ kib_conn_t *conn = rx->rx_conn;
+ int rc = 0;
+ unsigned long flags;
+ FSTATUS frc;
+ ENTRY;
+
+ rx->rx_gl = (IB_LOCAL_DATASEGMENT) {
+ .Address = rx->rx_vaddr,
+ .Length = IBNAL_MSG_SIZE,
+ .Lkey = kibnal_lkey(conn->ibc_rx_pages),
+ };
+
+ rx->rx_wrq = (IB_WORK_REQ) {
+ .Operation = WROpRecv,
+ .DSListDepth = 1,
+ .MessageLen = IBNAL_MSG_SIZE,
+ .WorkReqId = kibnal_ptr2wreqid(rx, 1),
+ .DSList = &rx->rx_gl,
+ };
+
+ KIB_ASSERT_CONN_STATE_RANGE(conn, IBNAL_CONN_ESTABLISHED,
+ IBNAL_CONN_DREP);
+ LASSERT (!rx->rx_posted);
+ rx->rx_posted = 1;
+ mb();
+
+ if (conn->ibc_state != IBNAL_CONN_ESTABLISHED)
+ rc = -ECONNABORTED;
+ else {
+ frc = iibt_postrecv(conn->ibc_qp, &rx->rx_wrq);
+ if (frc != FSUCCESS) {
+ CDEBUG(D_NET, "post failed %d\n", frc);
+ rc = -EINVAL;
+ }
+ CDEBUG(D_NET, "posted rx %p\n", &rx->rx_wrq);
+ }
+
+ if (rc == 0) {
+ if (do_credits) {
+ spin_lock_irqsave(&conn->ibc_lock, flags);
+ conn->ibc_outstanding_credits++;
+ spin_unlock_irqrestore(&conn->ibc_lock, flags);
+
+ kibnal_check_sends(conn);
+ }
+ EXIT;
+ return;
+ }
+
+ if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) {
+ CERROR ("Error posting receive -> "LPX64": %d\n",
+ conn->ibc_peer->ibp_nid, rc);
+ kibnal_close_conn (rx->rx_conn, rc);
+ } else {
+ CDEBUG (D_NET, "Error posting receive -> "LPX64": %d\n",
+ conn->ibc_peer->ibp_nid, rc);
+ }
+
+ /* Drop rx's ref */
+ kibnal_put_conn (conn);
+ EXIT;
+}
+
+#if IBNAL_CKSUM
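+/* rotate-left-by-1 then add each byte; e.g. bytes {1, 1} checksum to
+ * ((0 rotl 1) + 1) == 1, then ((1 rotl 1) + 1) == 3 */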
+static inline __u32 kibnal_cksum (void *ptr, int nob)
+{
+ char *c = ptr;
+ __u32 sum = 0;
+
+ while (nob-- > 0)
+ sum = ((sum << 1) | (sum >> 31)) + *c++;
+
+ return (sum);
+}
+#endif
+
+static void hexdump(char *string, void *ptr, int len)
+{
+ unsigned char *c = ptr;
+ int i;
+
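+        /* debug aid, disabled by default; remove this return to dump */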
+ return;
+
+ if (len < 0 || len > 2048) {
+ printk("XXX what the hell? %d\n",len);
+ return;
+ }
+
+ printk("%d bytes of '%s' from 0x%p\n", len, string, ptr);
+
+ for (i = 0; i < len;) {
+ printk("%02x",*(c++));
+ i++;
+ if (!(i & 15)) {
+ printk("\n");
+ } else if (!(i&1)) {
+ printk(" ");
+ }
+ }
+
+ if(len & 15) {
+ printk("\n");
+ }
+}
+
+static void
+kibnal_rx_callback (IB_WORK_COMPLETION *wc)
+{
+ kib_rx_t *rx = (kib_rx_t *)kibnal_wreqid2ptr(wc->WorkReqId);
+ kib_msg_t *msg = rx->rx_msg;
+ kib_conn_t *conn = rx->rx_conn;
+ int nob = wc->Length;
+ const int base_nob = offsetof(kib_msg_t, ibm_u);
+ int credits;
+ int flipped;
+ unsigned long flags;
+ __u32 i;
+#if IBNAL_CKSUM
+ __u32 msg_cksum;
+ __u32 computed_cksum;
+#endif
+
+ /* we set the QP to erroring after we've finished disconnecting,
+ * maybe we should do so sooner. */
+ KIB_ASSERT_CONN_STATE_RANGE(conn, IBNAL_CONN_ESTABLISHED,
+ IBNAL_CONN_DISCONNECTED);
+
+ CDEBUG(D_NET, "rx %p conn %p\n", rx, conn);
+ LASSERT (rx->rx_posted);
+ rx->rx_posted = 0;
+ mb();
+
+ /* receives complete with error in any case after we've started
+ * disconnecting */
+ if (conn->ibc_state > IBNAL_CONN_ESTABLISHED)
+ goto failed;
+
+ if (wc->Status != WRStatusSuccess) {
+ CERROR("Rx from "LPX64" failed: %d\n",
+ conn->ibc_peer->ibp_nid, wc->Status);
+ goto failed;
+ }
+
+ if (nob < base_nob) {
+ CERROR ("Short rx from "LPX64": %d < expected %d\n",
+ conn->ibc_peer->ibp_nid, nob, base_nob);
+ goto failed;
+ }
+
+ hexdump("rx", rx->rx_msg, sizeof(kib_msg_t));
+
+ /* Receiver does any byte flipping if necessary... */
+
+ if (msg->ibm_magic == IBNAL_MSG_MAGIC) {
+ flipped = 0;
+ } else {
+ if (msg->ibm_magic != __swab32(IBNAL_MSG_MAGIC)) {
+ CERROR ("Unrecognised magic: %08x from "LPX64"\n",
+ msg->ibm_magic, conn->ibc_peer->ibp_nid);
+ goto failed;
+ }
+ flipped = 1;
+ __swab16s (&msg->ibm_version);
+ LASSERT (sizeof(msg->ibm_type) == 1);
+ LASSERT (sizeof(msg->ibm_credits) == 1);
+ }
+
+ if (msg->ibm_version != IBNAL_MSG_VERSION) {
+ CERROR ("Incompatible msg version %d (%d expected)\n",
+ msg->ibm_version, IBNAL_MSG_VERSION);
+ goto failed;
+ }
+
+#if IBNAL_CKSUM
+ if (nob != msg->ibm_nob) {
+ CERROR ("Unexpected # bytes %d (%d expected)\n", nob, msg->ibm_nob);
+ goto failed;
+ }
+
+ msg_cksum = le32_to_cpu(msg->ibm_cksum);
+ msg->ibm_cksum = 0;
+ computed_cksum = kibnal_cksum (msg, nob);
+
+ if (msg_cksum != computed_cksum) {
+ CERROR ("Checksum failure %d: (%d expected)\n",
+ computed_cksum, msg_cksum);
+// goto failed;
+ }
+ CDEBUG(D_NET, "cksum %x, nob %d\n", computed_cksum, nob);
+#endif
+
+ /* Have I received credits that will let me send? */
+ credits = msg->ibm_credits;
+ if (credits != 0) {
+ spin_lock_irqsave(&conn->ibc_lock, flags);
+ conn->ibc_credits += credits;
+ spin_unlock_irqrestore(&conn->ibc_lock, flags);
+
+ kibnal_check_sends(conn);
+ }
+
+ switch (msg->ibm_type) {
+ case IBNAL_MSG_NOOP:
+ kibnal_post_rx (rx, 1);
+ return;
+
+ case IBNAL_MSG_IMMEDIATE:
+ if (nob < base_nob + sizeof (kib_immediate_msg_t)) {
+ CERROR ("Short IMMEDIATE from "LPX64": %d\n",
+ conn->ibc_peer->ibp_nid, nob);
+ goto failed;
+ }
+ break;
+
+ case IBNAL_MSG_PUT_RDMA:
+ case IBNAL_MSG_GET_RDMA:
+ if (nob < base_nob + sizeof (kib_rdma_msg_t)) {
+ CERROR ("Short RDMA msg from "LPX64": %d\n",
+ conn->ibc_peer->ibp_nid, nob);
+ goto failed;
+ }
+ if (flipped)
+                        __swab32s(&msg->ibm_u.rdma.ibrm_num_descs);
+
+ CDEBUG(D_NET, "%d RDMA: cookie "LPX64":\n",
+ msg->ibm_type, msg->ibm_u.rdma.ibrm_cookie);
+
+ if ((msg->ibm_u.rdma.ibrm_num_descs > PTL_MD_MAX_IOV) ||
+ (kib_rdma_msg_len(msg->ibm_u.rdma.ibrm_num_descs) >
+ min(nob, IBNAL_MSG_SIZE))) {
+ CERROR ("num_descs %d too large\n",
+ msg->ibm_u.rdma.ibrm_num_descs);
+ goto failed;
+ }
+
+ for(i = 0; i < msg->ibm_u.rdma.ibrm_num_descs; i++) {
+ kib_rdma_desc_t *desc = &msg->ibm_u.rdma.ibrm_desc[i];
+
+ if (flipped) {
+                                __swab32s(&desc->rd_key);
+                                __swab32s(&desc->rd_nob);
+                                __swab64s(&desc->rd_addr);
+ }
+
+ CDEBUG(D_NET, " key %x, " "addr "LPX64", nob %u\n",
+ desc->rd_key, desc->rd_addr, desc->rd_nob);
+ }
+ break;
+
+ case IBNAL_MSG_PUT_DONE:
+ case IBNAL_MSG_GET_DONE:
+ if (nob < base_nob + sizeof (kib_completion_msg_t)) {
+ CERROR ("Short COMPLETION msg from "LPX64": %d\n",
+ conn->ibc_peer->ibp_nid, nob);
+ goto failed;
+ }
+ if (flipped)
+ __swab32s(&msg->ibm_u.completion.ibcm_status);
+
+ CDEBUG(D_NET, "%d DONE: cookie "LPX64", status %d\n",
+ msg->ibm_type, msg->ibm_u.completion.ibcm_cookie,
+ msg->ibm_u.completion.ibcm_status);
+
+ kibnal_complete_passive_rdma (conn,
+ msg->ibm_u.completion.ibcm_cookie,
+ msg->ibm_u.completion.ibcm_status);
+ kibnal_post_rx (rx, 1);
+ return;
+
+ default:
+ CERROR ("Can't parse type from "LPX64": %d\n",
+ conn->ibc_peer->ibp_nid, msg->ibm_type);
+ goto failed;
+ }
+
+ /* schedule for kibnal_rx() in thread context */
+ spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
+
+ list_add_tail (&rx->rx_list, &kibnal_data.kib_sched_rxq);
+ wake_up (&kibnal_data.kib_sched_waitq);
+
+ spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags);
+ return;
+
+ failed:
+ CDEBUG(D_NET, "rx %p conn %p\n", rx, conn);
+ kibnal_close_conn(conn, -ECONNABORTED);
+
+ /* Don't re-post rx & drop its ref on conn */
+ kibnal_put_conn(conn);
+}
+
+void
+kibnal_rx (kib_rx_t *rx)
+{
+ kib_msg_t *msg = rx->rx_msg;
+
+ /* Clear flag so I can detect if I've sent an RDMA completion */
+ rx->rx_rdma = 0;
+
+ switch (msg->ibm_type) {
+ case IBNAL_MSG_GET_RDMA:
+ lib_parse(&kibnal_lib, &msg->ibm_u.rdma.ibrm_hdr, rx);
+ /* If the incoming get was matched, I'll have initiated the
+ * RDMA and the completion message... */
+ if (rx->rx_rdma)
+ break;
+
+ /* Otherwise, I'll send a failed completion now to prevent
+ * the peer's GET blocking for the full timeout. */
+ CERROR ("Completing unmatched RDMA GET from "LPX64"\n",
+ rx->rx_conn->ibc_peer->ibp_nid);
+ kibnal_start_active_rdma (IBNAL_MSG_GET_DONE, -EIO,
+ rx, NULL, 0, NULL, NULL, 0, 0);
+ break;
+
+ case IBNAL_MSG_PUT_RDMA:
+ lib_parse(&kibnal_lib, &msg->ibm_u.rdma.ibrm_hdr, rx);
+ if (rx->rx_rdma)
+ break;
+ /* This is most unusual, since even if lib_parse() didn't
+ * match anything, it should have asked us to read (and
+ * discard) the payload. The portals header must be
+ * inconsistent with this message type, so it's the
+ * sender's fault for sending garbage and she can time
+ * herself out... */
+                CERROR ("Uncompleted RDMA PUT from "LPX64"\n",
+ rx->rx_conn->ibc_peer->ibp_nid);
+ break;
+
+ case IBNAL_MSG_IMMEDIATE:
+ lib_parse(&kibnal_lib, &msg->ibm_u.immediate.ibim_hdr, rx);
+ LASSERT (!rx->rx_rdma);
+ break;
+
+ default:
+ LBUG();
+ break;
+ }
+
+ kibnal_post_rx (rx, 1);
+}
+
+static struct page *
+kibnal_kvaddr_to_page (unsigned long vaddr)
+{
+ struct page *page;
+
+ if (vaddr >= VMALLOC_START &&
+ vaddr < VMALLOC_END)
+ page = vmalloc_to_page ((void *)vaddr);
+#if CONFIG_HIGHMEM
+ else if (vaddr >= PKMAP_BASE &&
+ vaddr < (PKMAP_BASE + LAST_PKMAP * PAGE_SIZE))
+ page = vmalloc_to_page ((void *)vaddr);
+ /* in 2.4 ^ just walks the page tables */
+#endif
+ else
+ page = virt_to_page (vaddr);
+
+ if (!VALID_PAGE (page))
+ page = NULL;
+
+ return page;
+}
+
+static void
+kibnal_fill_ibrm(kib_tx_t *tx, struct page *page, unsigned long page_offset,
+ unsigned long len, int active)
+{
+ kib_rdma_msg_t *ibrm = &tx->tx_msg->ibm_u.rdma;
+ kib_rdma_desc_t *desc;
+
+ LASSERTF(ibrm->ibrm_num_descs < PTL_MD_MAX_IOV, "%u\n",
+ ibrm->ibrm_num_descs);
+
+ desc = &ibrm->ibrm_desc[ibrm->ibrm_num_descs];
+ if (active)
+ desc->rd_key = kibnal_data.kib_md.md_lkey;
+ else
+ desc->rd_key = kibnal_data.kib_md.md_rkey;
+        desc->rd_nob = len;
+ desc->rd_addr = kibnal_page2phys(page) + page_offset +
+ kibnal_data.kib_md.md_addr;
+
+ ibrm->ibrm_num_descs++;
+}
+
+static int
+kibnal_map_rdma_iov(kib_tx_t *tx, unsigned long vaddr, int nob, int active)
+{
+ struct page *page;
+ int page_offset, len;
+
+ while (nob > 0) {
+ page = kibnal_kvaddr_to_page(vaddr);
+ if (page == NULL)
+ return -EFAULT;
+
+ page_offset = vaddr & (PAGE_SIZE - 1);
+ len = min(nob, (int)PAGE_SIZE - page_offset);
+
+ kibnal_fill_ibrm(tx, page, page_offset, len, active);
+ nob -= len;
+ vaddr += len;
+ }
+ return 0;
+}
+
+static int
+kibnal_map_iov (kib_tx_t *tx, IB_ACCESS_CONTROL access,
+ int niov, struct iovec *iov, int offset, int nob, int active)
+
+{
+ void *vaddr;
+ FSTATUS frc;
+
+ LASSERT (nob > 0);
+ LASSERT (niov > 0);
+ LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED);
+
+ while (offset >= iov->iov_len) {
+ offset -= iov->iov_len;
+ niov--;
+ iov++;
+ LASSERT (niov > 0);
+ }
+
+ if (nob > iov->iov_len - offset) {
+ CERROR ("Can't map multiple vaddr fragments\n");
+ return (-EMSGSIZE);
+ }
+
+ /* our large contiguous iov could be backed by multiple physical
+ * pages. */
+ if (kibnal_whole_mem()) {
+ int rc;
+ tx->tx_msg->ibm_u.rdma.ibrm_num_descs = 0;
+ rc = kibnal_map_rdma_iov(tx, (unsigned long)iov->iov_base +
+ offset, nob, active);
+ if (rc != 0) {
+ CERROR ("Can't map iov: %d\n", rc);
+ return rc;
+ }
+ return 0;
+ }
+
+ vaddr = (void *)(((unsigned long)iov->iov_base) + offset);
+ tx->tx_md.md_addr = (__u64)((unsigned long)vaddr);
+
+ frc = iibt_register_memory(kibnal_data.kib_hca, vaddr, nob,
+ kibnal_data.kib_pd, access,
+ &tx->tx_md.md_handle, &tx->tx_md.md_lkey,
+ &tx->tx_md.md_rkey);
+        if (frc != FSUCCESS) {
+ CERROR ("Can't map vaddr %p: %d\n", vaddr, frc);
+ return -EINVAL;
+ }
+
+ tx->tx_mapped = KIB_TX_MAPPED;
+ return (0);
+}
+
+static int
+kibnal_map_kiov (kib_tx_t *tx, IB_ACCESS_CONTROL access,
+ int nkiov, ptl_kiov_t *kiov,
+ int offset, int nob, int active)
+{
+ __u64 *phys = NULL;
+ int page_offset;
+ int nphys;
+ int resid;
+ int phys_size = 0;
+ FSTATUS frc;
+ int i, rc = 0;
+
+ CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob);
+
+ LASSERT (nob > 0);
+ LASSERT (nkiov > 0);
+ LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED);
+
+ while (offset >= kiov->kiov_len) {
+ offset -= kiov->kiov_len;
+ nkiov--;
+ kiov++;
+ LASSERT (nkiov > 0);
+ }
+
+ page_offset = kiov->kiov_offset + offset;
+ nphys = 1;
+
+ if (!kibnal_whole_mem()) {
+ phys_size = nkiov * sizeof (*phys);
+ PORTAL_ALLOC(phys, phys_size);
+ if (phys == NULL) {
+ CERROR ("Can't allocate tmp phys\n");
+ return (-ENOMEM);
+ }
+
+ phys[0] = kibnal_page2phys(kiov->kiov_page);
+ } else {
+ tx->tx_msg->ibm_u.rdma.ibrm_num_descs = 0;
+ kibnal_fill_ibrm(tx, kiov->kiov_page, kiov->kiov_offset,
+ kiov->kiov_len, active);
+ }
+
+ resid = nob - (kiov->kiov_len - offset);
+
+ while (resid > 0) {
+ kiov++;
+ nkiov--;
+ LASSERT (nkiov > 0);
+
+ if (kiov->kiov_offset != 0 ||
+ ((resid > PAGE_SIZE) &&
+ kiov->kiov_len < PAGE_SIZE)) {
+ /* Can't have gaps */
+                        CERROR ("Can't make payload contiguous in I/O VM: "
+                                "page %d, offset %d, len %d\n", nphys,
+                                kiov->kiov_offset, kiov->kiov_len);
+
+ for (i = -nphys; i < nkiov; i++)
+ {
+ CERROR("kiov[%d] %p +%d for %d\n",
+ i, kiov[i].kiov_page, kiov[i].kiov_offset, kiov[i].kiov_len);
+ }
+
+ rc = -EINVAL;
+ goto out;
+ }
+
+ if (nphys == PTL_MD_MAX_IOV) {
+ CERROR ("payload too big (%d)\n", nphys);
+ rc = -EMSGSIZE;
+ goto out;
+ }
+
+ if (!kibnal_whole_mem()) {
+ LASSERT (nphys * sizeof (*phys) < phys_size);
+ phys[nphys] = kibnal_page2phys(kiov->kiov_page);
+ } else {
+ if (kib_rdma_msg_len(nphys) > IBNAL_MSG_SIZE) {
+ CERROR ("payload too big (%d)\n", nphys);
+ rc = -EMSGSIZE;
+ goto out;
+ }
+ kibnal_fill_ibrm(tx, kiov->kiov_page,
+ kiov->kiov_offset, kiov->kiov_len,
+ active);
+ }
+
+ nphys ++;
+ resid -= PAGE_SIZE;
+ }
+
+ if (kibnal_whole_mem())
+ goto out;
+
+#if 0
+ CWARN ("nphys %d, nob %d, page_offset %d\n", nphys, nob, page_offset);
+ for (i = 0; i < nphys; i++)
+ CWARN (" [%d] "LPX64"\n", i, phys[i]);
+#endif
+
+#if IBNAL_FMR
+#error "iibnal hasn't learned about FMR yet"
+ rc = ib_fmr_register_physical (kibnal_data.kib_fmr_pool,
+ phys, nphys,
+ &tx->tx_md.md_addr,
+ page_offset,
+ &tx->tx_md.md_handle.fmr,
+ &tx->tx_md.md_lkey,
+ &tx->tx_md.md_rkey);
+#else
+ frc = iibt_register_physical_memory(kibnal_data.kib_hca,
+ IBNAL_RDMA_BASE,
+ phys, nphys,
+ 0, /* offset */
+ kibnal_data.kib_pd,
+ access,
+ &tx->tx_md.md_handle,
+ &tx->tx_md.md_addr,
+ &tx->tx_md.md_lkey,
+ &tx->tx_md.md_rkey);
+#endif
+ if (frc == FSUCCESS) {
+ CDEBUG(D_NET, "Mapped %d pages %d bytes @ offset %d: lkey %x, rkey %x\n",
+ nphys, nob, page_offset, tx->tx_md.md_lkey, tx->tx_md.md_rkey);
+#if IBNAL_FMR
+ tx->tx_mapped = KIB_TX_MAPPED_FMR;
+#else
+ tx->tx_mapped = KIB_TX_MAPPED;
+#endif
+ } else {
+                CERROR ("Can't map phys: %d\n", frc);
+ rc = -EFAULT;
+ }
+
+ out:
+ if (phys != NULL)
+ PORTAL_FREE(phys, phys_size);
+ return (rc);
+}
+
+static kib_conn_t *
+kibnal_find_conn_locked (kib_peer_t *peer)
+{
+ struct list_head *tmp;
+
+ /* just return the first connection */
+ list_for_each (tmp, &peer->ibp_conns) {
+ return (list_entry(tmp, kib_conn_t, ibc_list));
+ }
+
+ return (NULL);
+}
+
+void
+kibnal_check_sends (kib_conn_t *conn)
+{
+ unsigned long flags;
+ kib_tx_t *tx;
+ int rc;
+ int i;
+ int done;
+ int nwork;
+ ENTRY;
+
+ spin_lock_irqsave (&conn->ibc_lock, flags);
+
+ LASSERT (conn->ibc_nsends_posted <= IBNAL_MSG_QUEUE_SIZE);
+
+ if (list_empty(&conn->ibc_tx_queue) &&
+ conn->ibc_outstanding_credits >= IBNAL_CREDIT_HIGHWATER) {
+ spin_unlock_irqrestore(&conn->ibc_lock, flags);
+
+ tx = kibnal_get_idle_tx(0); /* don't block */
+ if (tx != NULL)
+ kibnal_init_tx_msg(tx, IBNAL_MSG_NOOP, 0);
+
+ spin_lock_irqsave(&conn->ibc_lock, flags);
+
+ if (tx != NULL) {
+ atomic_inc(&conn->ibc_refcount);
+ kibnal_queue_tx_locked(tx, conn);
+ }
+ }
+
+ while (!list_empty (&conn->ibc_tx_queue)) {
+ tx = list_entry (conn->ibc_tx_queue.next, kib_tx_t, tx_list);
+
+ /* We rely on this for QP sizing */
+ LASSERT (tx->tx_nsp > 0 && tx->tx_nsp <= IBNAL_TX_MAX_SG);
+
+ LASSERT (conn->ibc_outstanding_credits >= 0);
+ LASSERT (conn->ibc_outstanding_credits <= IBNAL_MSG_QUEUE_SIZE);
+ LASSERT (conn->ibc_credits >= 0);
+ LASSERT (conn->ibc_credits <= IBNAL_MSG_QUEUE_SIZE);
+
+ /* Not on ibc_rdma_queue */
+ LASSERT (!tx->tx_passive_rdma_wait);
+
+ if (conn->ibc_nsends_posted == IBNAL_MSG_QUEUE_SIZE)
+ GOTO(out, 0);
+
+ if (conn->ibc_credits == 0) /* no credits */
+ GOTO(out, 1);
+
+ if (conn->ibc_credits == 1 && /* last credit reserved for */
+ conn->ibc_outstanding_credits == 0) /* giving back credits */
+ GOTO(out, 2);
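+                /* (reserving the last credit for a message that returns
+                 * credits stops the connection deadlocking with both
+                 * peers out of credits) */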
+
+ list_del (&tx->tx_list);
+
+ if (tx->tx_msg->ibm_type == IBNAL_MSG_NOOP &&
+ (!list_empty(&conn->ibc_tx_queue) ||
+ conn->ibc_outstanding_credits < IBNAL_CREDIT_HIGHWATER)) {
+ /* redundant NOOP */
+ spin_unlock_irqrestore(&conn->ibc_lock, flags);
+ kibnal_tx_done(tx);
+ spin_lock_irqsave(&conn->ibc_lock, flags);
+ continue;
+ }
+
+ tx->tx_msg->ibm_credits = conn->ibc_outstanding_credits;
+ conn->ibc_outstanding_credits = 0;
+
+ conn->ibc_nsends_posted++;
+ conn->ibc_credits--;
+
+ /* we only get a tx completion for the final rdma op */
+ tx->tx_sending = min(tx->tx_nsp, 2);
+ tx->tx_passive_rdma_wait = tx->tx_passive_rdma;
+ list_add (&tx->tx_list, &conn->ibc_active_txs);
+#if IBNAL_CKSUM
+ tx->tx_msg->ibm_cksum = 0;
+ tx->tx_msg->ibm_cksum = kibnal_cksum(tx->tx_msg, tx->tx_msg->ibm_nob);
+ CDEBUG(D_NET, "cksum %x, nob %d\n", tx->tx_msg->ibm_cksum, tx->tx_msg->ibm_nob);
+#endif
+ spin_unlock_irqrestore (&conn->ibc_lock, flags);
+
+ /* NB the gap between removing tx from the queue and sending it
+ * allows message re-ordering to occur */
+
+ LASSERT (tx->tx_nsp > 0);
+
+ rc = -ECONNABORTED;
+ nwork = 0;
+ if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) {
+ tx->tx_status = 0;
+ /* Driver only accepts 1 item at a time */
+ for (i = 0; i < tx->tx_nsp; i++) {
+ hexdump("tx", tx->tx_msg, sizeof(kib_msg_t));
+ rc = iibt_postsend(conn->ibc_qp,
+ &tx->tx_wrq[i]);
+ if (rc != 0)
+ break;
+ if (wrq_signals_completion(&tx->tx_wrq[i]))
+ nwork++;
+ CDEBUG(D_NET, "posted tx wrq %p\n",
+ &tx->tx_wrq[i]);
+ }
+ }
+
+ spin_lock_irqsave (&conn->ibc_lock, flags);
+ if (rc != 0) {
+ /* NB credits are transferred in the actual
+ * message, which can only be the last work item */
+ conn->ibc_outstanding_credits += tx->tx_msg->ibm_credits;
+ conn->ibc_credits++;
+ conn->ibc_nsends_posted--;
+
+ tx->tx_status = rc;
+ tx->tx_passive_rdma_wait = 0;
+ tx->tx_sending -= tx->tx_nsp - nwork;
+
+ done = (tx->tx_sending == 0);
+ if (done)
+ list_del (&tx->tx_list);
+
+ spin_unlock_irqrestore (&conn->ibc_lock, flags);
+
+ if (conn->ibc_state == IBNAL_CONN_ESTABLISHED)
+ CERROR ("Error %d posting transmit to "LPX64"\n",
+ rc, conn->ibc_peer->ibp_nid);
+ else
+ CDEBUG (D_NET, "Error %d posting transmit to "
+ LPX64"\n", rc, conn->ibc_peer->ibp_nid);
+
+ kibnal_close_conn (conn, rc);
+
+ if (done)
+ kibnal_tx_done (tx);
+ return;
+ }
+
+ }
+
+ EXIT;
+out:
+ spin_unlock_irqrestore (&conn->ibc_lock, flags);
+}
+
+static void
+kibnal_tx_callback (IB_WORK_COMPLETION *wc)
+{
+ kib_tx_t *tx = (kib_tx_t *)kibnal_wreqid2ptr(wc->WorkReqId);
+ kib_conn_t *conn;
+ unsigned long flags;
+ int idle;
+
+ conn = tx->tx_conn;
+ LASSERT (conn != NULL);
+ LASSERT (tx->tx_sending != 0);
+
+ spin_lock_irqsave(&conn->ibc_lock, flags);
+
+ CDEBUG(D_NET, "conn %p tx %p [%d/%d]: %d\n", conn, tx,
+ tx->tx_sending, tx->tx_nsp, wc->Status);
+
+ /* I could be racing with rdma completion. Whoever makes 'tx' idle
+ * gets to free it, which also drops its ref on 'conn'. If it's
+ * not me, then I take an extra ref on conn so it can't disappear
+ * under me. */
+
+ tx->tx_sending--;
+ idle = (tx->tx_sending == 0) && /* This is the final callback */
+ (!tx->tx_passive_rdma_wait); /* Not waiting for RDMA completion */
+ if (idle)
+ list_del(&tx->tx_list);
+
+ CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
+ conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
+ atomic_read (&conn->ibc_refcount));
+ atomic_inc (&conn->ibc_refcount);
+
+ if (tx->tx_sending == 0)
+ conn->ibc_nsends_posted--;
+
+ if (wc->Status != WRStatusSuccess &&
+ tx->tx_status == 0)
+ tx->tx_status = -ECONNABORTED;
+
+ spin_unlock_irqrestore(&conn->ibc_lock, flags);
+
+ if (idle)
+ kibnal_tx_done (tx);
+
+ if (wc->Status != WRStatusSuccess) {
+ CERROR ("Tx completion to "LPX64" failed: %d\n",
+ conn->ibc_peer->ibp_nid, wc->Status);
+ kibnal_close_conn (conn, -ENETDOWN);
+ } else {
+ /* can I shovel some more sends out the door? */
+ kibnal_check_sends(conn);
+ }
+
+ kibnal_put_conn (conn);
+}
+
+void
+kibnal_ca_async_callback (void *ca_arg, IB_EVENT_RECORD *ev)
+{
+ /* XXX flesh out. this seems largely for async errors */
+ CERROR("type: %d code: %u\n", ev->EventType, ev->EventCode);
+}
+
+void
+kibnal_ca_callback (void *ca_arg, void *cq_arg)
+{
+ IB_HANDLE cq = *(IB_HANDLE *)cq_arg;
+ IB_HANDLE ca = *(IB_HANDLE *)ca_arg;
+ IB_WORK_COMPLETION wc;
+ int armed = 0;
+
+ CDEBUG(D_NET, "ca %p cq %p\n", ca, cq);
+
+ for(;;) {
+ while (iibt_cq_poll(cq, &wc) == FSUCCESS) {
+ if (kibnal_wreqid_is_rx(wc.WorkReqId))
+ kibnal_rx_callback(&wc);
+ else
+ kibnal_tx_callback(&wc);
+ }
+ if (armed)
+ return;
+ if (iibt_cq_rearm(cq, CQEventSelNextWC) != FSUCCESS) {
+ CERROR("rearm failed?\n");
+ return;
+ }
+ armed = 1;
+ }
+}
+
+void
+kibnal_init_tx_msg (kib_tx_t *tx, int type, int body_nob)
+{
+ IB_LOCAL_DATASEGMENT *gl = &tx->tx_gl[tx->tx_nsp];
+ IB_WORK_REQ *wrq = &tx->tx_wrq[tx->tx_nsp];
+ int fence;
+ int nob = offsetof (kib_msg_t, ibm_u) + body_nob;
+
+ LASSERT (tx->tx_nsp >= 0 &&
+ tx->tx_nsp < sizeof(tx->tx_wrq)/sizeof(tx->tx_wrq[0]));
+ LASSERT (nob <= IBNAL_MSG_SIZE);
+
+ tx->tx_msg->ibm_magic = IBNAL_MSG_MAGIC;
+ tx->tx_msg->ibm_version = IBNAL_MSG_VERSION;
+ tx->tx_msg->ibm_type = type;
+#if IBNAL_CKSUM
+ tx->tx_msg->ibm_nob = nob;
+#endif
+ /* Fence the message if it's bundled with an RDMA read */
+ fence = (tx->tx_nsp > 0) &&
+ (type == IBNAL_MSG_PUT_DONE);
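+        /* (The IB fence bit makes this send wait for all RDMA reads
+         * posted earlier on the QP: PUT_DONE follows WROpRdmaRead work
+         * requests set up in kibnal_start_active_rdma(), and without
+         * the fence the completion could be sent before the read data
+         * has actually arrived.) */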
+
+ *gl = (IB_LOCAL_DATASEGMENT) {
+ .Address = tx->tx_vaddr,
+ .Length = IBNAL_MSG_SIZE,
+ .Lkey = kibnal_lkey(kibnal_data.kib_tx_pages),
+ };
+
+ wrq->WorkReqId = kibnal_ptr2wreqid(tx, 0);
+ wrq->Operation = WROpSend;
+ wrq->DSList = gl;
+ wrq->DSListDepth = 1;
+ wrq->MessageLen = nob;
+ wrq->Req.SendRC.ImmediateData = 0;
+ wrq->Req.SendRC.Options.s.SolicitedEvent = 1;
+ wrq->Req.SendRC.Options.s.SignaledCompletion = 1;
+ wrq->Req.SendRC.Options.s.ImmediateData = 0;
+ wrq->Req.SendRC.Options.s.Fence = fence;
+
+ tx->tx_nsp++;
+}
+
+static void
+kibnal_queue_tx (kib_tx_t *tx, kib_conn_t *conn)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&conn->ibc_lock, flags);
+
+ kibnal_queue_tx_locked (tx, conn);
+
+ spin_unlock_irqrestore(&conn->ibc_lock, flags);
+
+ kibnal_check_sends(conn);
+}
+
+static void
+kibnal_launch_tx (kib_tx_t *tx, ptl_nid_t nid)
+{
+ unsigned long flags;
+ kib_peer_t *peer;
+ kib_conn_t *conn;
+ rwlock_t *g_lock = &kibnal_data.kib_global_lock;
+
+ /* If I get here, I've committed to send, so I complete the tx with
+ * failure on any problems */
+
+ LASSERT (tx->tx_conn == NULL); /* only set when assigned a conn */
+ LASSERT (tx->tx_nsp > 0); /* work items have been set up */
+
+ read_lock (g_lock);
+
+ peer = kibnal_find_peer_locked (nid);
+ if (peer == NULL) {
+ read_unlock (g_lock);
+ tx->tx_status = -EHOSTUNREACH;
+ kibnal_tx_done (tx);
+ return;
+ }
+
+ conn = kibnal_find_conn_locked (peer);
+ if (conn != NULL) {
+ CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
+ conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
+ atomic_read (&conn->ibc_refcount));
+ atomic_inc (&conn->ibc_refcount); /* 1 ref for the tx */
+ read_unlock (g_lock);
+
+ kibnal_queue_tx (tx, conn);
+ return;
+ }
+
+ /* Making one or more connections; I'll need a write lock... */
+ read_unlock (g_lock);
+ write_lock_irqsave (g_lock, flags);
+
+ peer = kibnal_find_peer_locked (nid);
+ if (peer == NULL) {
+ write_unlock_irqrestore (g_lock, flags);
+ tx->tx_status = -EHOSTUNREACH;
+ kibnal_tx_done (tx);
+ return;
+ }
+
+ conn = kibnal_find_conn_locked (peer);
+ if (conn != NULL) {
+ /* Connection exists; queue message on it */
+ CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
+ conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
+ atomic_read (&conn->ibc_refcount));
+ atomic_inc (&conn->ibc_refcount); /* 1 ref for the tx */
+ write_unlock_irqrestore (g_lock, flags);
+
+ kibnal_queue_tx (tx, conn);
+ return;
+ }
+
+ if (peer->ibp_connecting == 0) {
+ if (!time_after_eq(jiffies, peer->ibp_reconnect_time)) {
+ write_unlock_irqrestore (g_lock, flags);
+ tx->tx_status = -EHOSTUNREACH;
+ kibnal_tx_done (tx);
+ return;
+ }
+
+ peer->ibp_connecting = 1;
+ kib_peer_addref(peer); /* extra ref for connd */
+
+ spin_lock (&kibnal_data.kib_connd_lock);
+
+ list_add_tail (&peer->ibp_connd_list,
+ &kibnal_data.kib_connd_peers);
+ wake_up (&kibnal_data.kib_connd_waitq);
+
+ spin_unlock (&kibnal_data.kib_connd_lock);
+ }
+
+ /* A connection is being established; queue the message... */
+ list_add_tail (&tx->tx_list, &peer->ibp_tx_queue);
+
+ write_unlock_irqrestore (g_lock, flags);
+}
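+/* NB the peer/conn lookup is repeated after upgrading to the write
+ * lock: the peer table can change between read_unlock() and
+ * write_lock_irqsave(), so nothing found under the read lock can be
+ * trusted across that window.  This is the usual optimistic read-path,
+ * pessimistic write-path locking idiom. */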
+
+static ptl_err_t
+kibnal_start_passive_rdma (int type, ptl_nid_t nid,
+ lib_msg_t *libmsg, ptl_hdr_t *hdr)
+{
+ int nob = libmsg->md->length;
+ kib_tx_t *tx;
+ kib_msg_t *ibmsg;
+ int rc;
+ IB_ACCESS_CONTROL access = {0,};
+
+ LASSERT (type == IBNAL_MSG_PUT_RDMA || type == IBNAL_MSG_GET_RDMA);
+ LASSERT (nob > 0);
+ LASSERT (!in_interrupt()); /* Mapping could block */
+
+ access.s.MWBindable = 1;
+ access.s.LocalWrite = 1;
+ access.s.RdmaRead = 1;
+ access.s.RdmaWrite = 1;
+
+ tx = kibnal_get_idle_tx (1); /* May block; caller is an app thread */
+ LASSERT (tx != NULL);
+
+ if ((libmsg->md->options & PTL_MD_KIOV) == 0)
+ rc = kibnal_map_iov (tx, access,
+ libmsg->md->md_niov,
+ libmsg->md->md_iov.iov,
+ 0, nob, 0);
+ else
+ rc = kibnal_map_kiov (tx, access,
+ libmsg->md->md_niov,
+ libmsg->md->md_iov.kiov,
+ 0, nob, 0);
+
+ if (rc != 0) {
+ CERROR ("Can't map RDMA for "LPX64": %d\n", nid, rc);
+ goto failed;
+ }
+
+ if (type == IBNAL_MSG_GET_RDMA) {
+ /* reply gets finalized when tx completes */
+ tx->tx_libmsg[1] = lib_create_reply_msg(&kibnal_lib,
+ nid, libmsg);
+ if (tx->tx_libmsg[1] == NULL) {
+ CERROR ("Can't create reply for GET -> "LPX64"\n",
+ nid);
+ rc = -ENOMEM;
+ goto failed;
+ }
+ }
+
+ tx->tx_passive_rdma = 1;
+
+ ibmsg = tx->tx_msg;
+
+ ibmsg->ibm_u.rdma.ibrm_hdr = *hdr;
+ ibmsg->ibm_u.rdma.ibrm_cookie = tx->tx_passive_rdma_cookie;
+        /* map_kiov already filled the rdma descs for the whole_mem case */
+ if (!kibnal_whole_mem()) {
+ ibmsg->ibm_u.rdma.ibrm_desc[0].rd_key = tx->tx_md.md_rkey;
+ ibmsg->ibm_u.rdma.ibrm_desc[0].rd_addr = tx->tx_md.md_addr;
+ ibmsg->ibm_u.rdma.ibrm_desc[0].rd_nob = nob;
+ ibmsg->ibm_u.rdma.ibrm_num_descs = 1;
+ }
+
+ kibnal_init_tx_msg (tx, type,
+ kib_rdma_msg_len(ibmsg->ibm_u.rdma.ibrm_num_descs));
+
+ CDEBUG(D_NET, "Passive: %p cookie "LPX64", key %x, addr "
+ LPX64", nob %d\n",
+ tx, tx->tx_passive_rdma_cookie, tx->tx_md.md_rkey,
+ tx->tx_md.md_addr, nob);
+
+ /* libmsg gets finalized when tx completes. */
+ tx->tx_libmsg[0] = libmsg;
+
+ kibnal_launch_tx(tx, nid);
+ return (PTL_OK);
+
+ failed:
+ tx->tx_status = rc;
+ kibnal_tx_done (tx);
+ return (PTL_FAIL);
+}
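+/* Passive RDMA wire exchange, as set up above and completed by the
+ * peer's kibnal_start_active_rdma() (the peer moves the data):
+ *
+ *      initiator                                  peer
+ *      PUT_RDMA { hdr, cookie, rkey/addr/nob } -->
+ *                           <-- RDMA READ from the initiator's buffer
+ *                           <-- PUT_DONE { cookie, status }
+ *      GET_RDMA { hdr, cookie, rkey/addr/nob } -->
+ *                           <-- RDMA WRITE into the initiator's buffer
+ *                           <-- GET_DONE { cookie, status }
+ *
+ * The cookie matches the DONE message back to the tx blocked in
+ * tx_passive_rdma_wait. */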
+
+void
+kibnal_start_active_rdma (int type, int status,
+ kib_rx_t *rx, lib_msg_t *libmsg,
+ unsigned int niov,
+ struct iovec *iov, ptl_kiov_t *kiov,
+ size_t offset, size_t nob)
+{
+ kib_msg_t *rxmsg = rx->rx_msg;
+ kib_msg_t *txmsg;
+ kib_tx_t *tx;
+ IB_ACCESS_CONTROL access = {0,};
+ IB_WR_OP rdma_op;
+ int rc;
+ __u32 i;
+
+ CDEBUG(D_NET, "type %d, status %d, niov %d, offset %d, nob %d\n",
+ type, status, niov, offset, nob);
+
+ /* Called by scheduler */
+ LASSERT (!in_interrupt ());
+
+ /* Either all pages or all vaddrs */
+ LASSERT (!(kiov != NULL && iov != NULL));
+
+ /* No data if we're completing with failure */
+ LASSERT (status == 0 || nob == 0);
+
+ LASSERT (type == IBNAL_MSG_GET_DONE ||
+ type == IBNAL_MSG_PUT_DONE);
+
+ /* Flag I'm completing the RDMA. Even if I fail to send the
+ * completion message, I will have tried my best so further
+ * attempts shouldn't be tried. */
+ LASSERT (!rx->rx_rdma);
+ rx->rx_rdma = 1;
+
+ if (type == IBNAL_MSG_GET_DONE) {
+ rdma_op = WROpRdmaWrite;
+ LASSERT (rxmsg->ibm_type == IBNAL_MSG_GET_RDMA);
+ } else {
+ access.s.LocalWrite = 1;
+ rdma_op = WROpRdmaRead;
+ LASSERT (rxmsg->ibm_type == IBNAL_MSG_PUT_RDMA);
+ }
+
+ tx = kibnal_get_idle_tx (0); /* Mustn't block */
+ if (tx == NULL) {
+ CERROR ("tx descs exhausted on RDMA from "LPX64
+ " completing locally with failure\n",
+ rx->rx_conn->ibc_peer->ibp_nid);
+ lib_finalize (&kibnal_lib, NULL, libmsg, PTL_NO_SPACE);
+ return;
+ }
+ LASSERT (tx->tx_nsp == 0);
+
+ if (nob == 0)
+ GOTO(init_tx, 0);
+
+ /* We actually need to transfer some data (the transfer
+ * size could get truncated to zero when the incoming
+ * message is matched) */
+ if (kiov != NULL)
+ rc = kibnal_map_kiov (tx, access, niov, kiov, offset, nob, 1);
+ else
+ rc = kibnal_map_iov (tx, access, niov, iov, offset, nob, 1);
+
+ if (rc != 0) {
+ CERROR ("Can't map RDMA -> "LPX64": %d\n",
+ rx->rx_conn->ibc_peer->ibp_nid, rc);
+ /* We'll skip the RDMA and complete with failure. */
+ status = rc;
+ nob = 0;
+ GOTO(init_tx, rc);
+ }
+
+ if (!kibnal_whole_mem()) {
+ tx->tx_msg->ibm_u.rdma.ibrm_desc[0].rd_key = tx->tx_md.md_lkey;
+ tx->tx_msg->ibm_u.rdma.ibrm_desc[0].rd_addr = tx->tx_md.md_addr;
+ tx->tx_msg->ibm_u.rdma.ibrm_desc[0].rd_nob = nob;
+ tx->tx_msg->ibm_u.rdma.ibrm_num_descs = 1;
+ }
+
+ /* XXX ugh. different page-sized hosts. */
+ if (tx->tx_msg->ibm_u.rdma.ibrm_num_descs !=
+ rxmsg->ibm_u.rdma.ibrm_num_descs) {
+ CERROR("tx descs (%u) != rx descs (%u)\n",
+ tx->tx_msg->ibm_u.rdma.ibrm_num_descs,
+ rxmsg->ibm_u.rdma.ibrm_num_descs);
+                /* We'll skip the RDMA and complete with failure
+                 * (NB rc is 0 here: the mapping itself succeeded). */
+                status = -EPROTO;
+ nob = 0;
+ GOTO(init_tx, rc);
+ }
+
+ /* map_kiov filled in the rdma descs which describe our side of the
+ * rdma transfer. */
+ /* ibrm_num_descs was verified in rx_callback */
+ for(i = 0; i < rxmsg->ibm_u.rdma.ibrm_num_descs; i++) {
+ kib_rdma_desc_t *ldesc, *rdesc; /* local, remote */
+ IB_LOCAL_DATASEGMENT *ds = &tx->tx_gl[i];
+ IB_WORK_REQ *wrq = &tx->tx_wrq[i];
+
+ ldesc = &tx->tx_msg->ibm_u.rdma.ibrm_desc[i];
+ rdesc = &rxmsg->ibm_u.rdma.ibrm_desc[i];
+
+ ds->Address = ldesc->rd_addr;
+ ds->Length = ldesc->rd_nob;
+ ds->Lkey = ldesc->rd_key;
+
+ memset(wrq, 0, sizeof(*wrq));
+ wrq->WorkReqId = kibnal_ptr2wreqid(tx, 0);
+ wrq->Operation = rdma_op;
+ wrq->DSList = ds;
+ wrq->DSListDepth = 1;
+ wrq->MessageLen = ds->Length;
+ wrq->Req.SendRC.ImmediateData = 0;
+ wrq->Req.SendRC.Options.s.SolicitedEvent = 0;
+ wrq->Req.SendRC.Options.s.SignaledCompletion = 0;
+ wrq->Req.SendRC.Options.s.ImmediateData = 0;
+ wrq->Req.SendRC.Options.s.Fence = 0;
+ wrq->Req.SendRC.RemoteDS.Address = rdesc->rd_addr;
+ wrq->Req.SendRC.RemoteDS.Rkey = rdesc->rd_key;
+
+ /* only the last rdma post triggers tx completion */
+ if (i == rxmsg->ibm_u.rdma.ibrm_num_descs - 1)
+ wrq->Req.SendRC.Options.s.SignaledCompletion = 1;
+
+ tx->tx_nsp++;
+ }
+
+init_tx:
+ txmsg = tx->tx_msg;
+
+ txmsg->ibm_u.completion.ibcm_cookie = rxmsg->ibm_u.rdma.ibrm_cookie;
+ txmsg->ibm_u.completion.ibcm_status = status;
+
+ kibnal_init_tx_msg(tx, type, sizeof (kib_completion_msg_t));
+
+ if (status == 0 && nob != 0) {
+ LASSERT (tx->tx_nsp > 1);
+ /* RDMA: libmsg gets finalized when the tx completes. This
+ * is after the completion message has been sent, which in
+ * turn is after the RDMA has finished. */
+ tx->tx_libmsg[0] = libmsg;
+ } else {
+ LASSERT (tx->tx_nsp == 1);
+ /* No RDMA: local completion happens now! */
+                CDEBUG(D_WARNING, "No data: immediate completion\n");
+ lib_finalize (&kibnal_lib, NULL, libmsg,
+ status == 0 ? PTL_OK : PTL_FAIL);
+ }
+
+ /* +1 ref for this tx... */
+ CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
+ rx->rx_conn, rx->rx_conn->ibc_state,
+ rx->rx_conn->ibc_peer->ibp_nid,
+ atomic_read (&rx->rx_conn->ibc_refcount));
+ atomic_inc (&rx->rx_conn->ibc_refcount);
+ /* ...and queue it up */
+ kibnal_queue_tx(tx, rx->rx_conn);
+}
+
+static ptl_err_t
+kibnal_sendmsg(lib_nal_t *nal,
+ void *private,
+ lib_msg_t *libmsg,
+ ptl_hdr_t *hdr,
+ int type,
+ ptl_nid_t nid,
+ ptl_pid_t pid,
+ unsigned int payload_niov,
+ struct iovec *payload_iov,
+ ptl_kiov_t *payload_kiov,
+ size_t payload_offset,
+ size_t payload_nob)
+{
+ kib_msg_t *ibmsg;
+ kib_tx_t *tx;
+ int nob;
+
+ /* NB 'private' is different depending on what we're sending.... */
+
+ CDEBUG(D_NET, "sending "LPSZ" bytes in %d frags to nid:"LPX64
+ " pid %d\n", payload_nob, payload_niov, nid , pid);
+
+ LASSERT (payload_nob == 0 || payload_niov > 0);
+ LASSERT (payload_niov <= PTL_MD_MAX_IOV);
+
+ /* Thread context if we're sending payload */
+ LASSERT (!in_interrupt() || payload_niov == 0);
+ /* payload is either all vaddrs or all pages */
+ LASSERT (!(payload_kiov != NULL && payload_iov != NULL));
+
+ switch (type) {
+ default:
+ LBUG();
+ return (PTL_FAIL);
+
+ case PTL_MSG_REPLY: {
+ /* reply's 'private' is the incoming receive */
+ kib_rx_t *rx = private;
+
+ /* RDMA reply expected? */
+ if (rx->rx_msg->ibm_type == IBNAL_MSG_GET_RDMA) {
+ kibnal_start_active_rdma(IBNAL_MSG_GET_DONE, 0,
+ rx, libmsg, payload_niov,
+ payload_iov, payload_kiov,
+ payload_offset, payload_nob);
+ return (PTL_OK);
+ }
+
+ /* Incoming message consistent with immediate reply? */
+ if (rx->rx_msg->ibm_type != IBNAL_MSG_IMMEDIATE) {
+ CERROR ("REPLY to "LPX64" bad opbm type %d!!!\n",
+ nid, rx->rx_msg->ibm_type);
+ return (PTL_FAIL);
+ }
+
+ /* Will it fit in a message? */
+ nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]);
+ if (nob >= IBNAL_MSG_SIZE) {
+ CERROR("REPLY for "LPX64" too big (RDMA not requested): %d\n",
+                               nid, (int)payload_nob);
+ return (PTL_FAIL);
+ }
+ break;
+ }
+
+ case PTL_MSG_GET:
+ /* might the REPLY message be big enough to need RDMA? */
+ nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[libmsg->md->length]);
+ if (nob > IBNAL_MSG_SIZE)
+ return (kibnal_start_passive_rdma(IBNAL_MSG_GET_RDMA,
+ nid, libmsg, hdr));
+ break;
+
+ case PTL_MSG_ACK:
+ LASSERT (payload_nob == 0);
+ break;
+
+ case PTL_MSG_PUT:
+ /* Is the payload big enough to need RDMA? */
+ nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]);
+ if (nob > IBNAL_MSG_SIZE)
+ return (kibnal_start_passive_rdma(IBNAL_MSG_PUT_RDMA,
+ nid, libmsg, hdr));
+
+ break;
+ }
+
+ tx = kibnal_get_idle_tx(!(type == PTL_MSG_ACK ||
+ type == PTL_MSG_REPLY ||
+ in_interrupt()));
+ if (tx == NULL) {
+ CERROR ("Can't send %d to "LPX64": tx descs exhausted%s\n",
+ type, nid, in_interrupt() ? " (intr)" : "");
+ return (PTL_NO_SPACE);
+ }
+
+ ibmsg = tx->tx_msg;
+ ibmsg->ibm_u.immediate.ibim_hdr = *hdr;
+
+ if (payload_nob > 0) {
+ if (payload_kiov != NULL)
+ lib_copy_kiov2buf(ibmsg->ibm_u.immediate.ibim_payload,
+ payload_niov, payload_kiov,
+ payload_offset, payload_nob);
+ else
+ lib_copy_iov2buf(ibmsg->ibm_u.immediate.ibim_payload,
+ payload_niov, payload_iov,
+ payload_offset, payload_nob);
+ }
+
+ kibnal_init_tx_msg (tx, IBNAL_MSG_IMMEDIATE,
+ offsetof(kib_immediate_msg_t,
+ ibim_payload[payload_nob]));
+
+ /* libmsg gets finalized when tx completes */
+ tx->tx_libmsg[0] = libmsg;
+
+ kibnal_launch_tx(tx, nid);
+ return (PTL_OK);
+}
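+/* The inline-or-RDMA decision above in a worked example (message size
+ * illustrative): if IBNAL_MSG_SIZE were 4096, a PUT for which
+ *
+ *      nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob])
+ *
+ * exceeds 4096 goes via kibnal_start_passive_rdma(); anything smaller
+ * is copied into the message buffer and sent as IBNAL_MSG_IMMEDIATE. */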
+
+static ptl_err_t
+kibnal_send (lib_nal_t *nal, void *private, lib_msg_t *cookie,
+ ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
+ unsigned int payload_niov, struct iovec *payload_iov,
+ size_t payload_offset, size_t payload_len)
+{
+ return (kibnal_sendmsg(nal, private, cookie,
+ hdr, type, nid, pid,
+ payload_niov, payload_iov, NULL,
+ payload_offset, payload_len));
+}
+
+static ptl_err_t
+kibnal_send_pages (lib_nal_t *nal, void *private, lib_msg_t *cookie,
+ ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
+ unsigned int payload_niov, ptl_kiov_t *payload_kiov,
+ size_t payload_offset, size_t payload_len)
+{
+ return (kibnal_sendmsg(nal, private, cookie,
+ hdr, type, nid, pid,
+ payload_niov, NULL, payload_kiov,
+ payload_offset, payload_len));
+}
+
+static ptl_err_t
+kibnal_recvmsg (lib_nal_t *nal, void *private, lib_msg_t *libmsg,
+ unsigned int niov, struct iovec *iov, ptl_kiov_t *kiov,
+ size_t offset, size_t mlen, size_t rlen)
+{
+ kib_rx_t *rx = private;
+ kib_msg_t *rxmsg = rx->rx_msg;
+ int msg_nob;
+
+ LASSERT (mlen <= rlen);
+ LASSERT (!in_interrupt ());
+ /* Either all pages or all vaddrs */
+ LASSERT (!(kiov != NULL && iov != NULL));
+
+ switch (rxmsg->ibm_type) {
+ default:
+ LBUG();
+ return (PTL_FAIL);
+
+ case IBNAL_MSG_IMMEDIATE:
+ msg_nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[rlen]);
+ if (msg_nob > IBNAL_MSG_SIZE) {
+ CERROR ("Immediate message from "LPX64" too big: %d\n",
+                        rxmsg->ibm_u.immediate.ibim_hdr.src_nid, (int)rlen);
+ return (PTL_FAIL);
+ }
+
+ if (kiov != NULL)
+ lib_copy_buf2kiov(niov, kiov, offset,
+ rxmsg->ibm_u.immediate.ibim_payload,
+ mlen);
+ else
+ lib_copy_buf2iov(niov, iov, offset,
+ rxmsg->ibm_u.immediate.ibim_payload,
+ mlen);
+
+ lib_finalize (nal, NULL, libmsg, PTL_OK);
+ return (PTL_OK);
+
+ case IBNAL_MSG_GET_RDMA:
+ /* We get called here just to discard any junk after the
+ * GET hdr. */
+ LASSERT (libmsg == NULL);
+ lib_finalize (nal, NULL, libmsg, PTL_OK);
+ return (PTL_OK);
+
+ case IBNAL_MSG_PUT_RDMA:
+ kibnal_start_active_rdma (IBNAL_MSG_PUT_DONE, 0,
+ rx, libmsg,
+ niov, iov, kiov, offset, mlen);
+ return (PTL_OK);
+ }
+}
+
+static ptl_err_t
+kibnal_recv (lib_nal_t *nal, void *private, lib_msg_t *msg,
+ unsigned int niov, struct iovec *iov,
+ size_t offset, size_t mlen, size_t rlen)
+{
+ return (kibnal_recvmsg (nal, private, msg, niov, iov, NULL,
+ offset, mlen, rlen));
+}
+
+static ptl_err_t
+kibnal_recv_pages (lib_nal_t *nal, void *private, lib_msg_t *msg,
+ unsigned int niov, ptl_kiov_t *kiov,
+ size_t offset, size_t mlen, size_t rlen)
+{
+ return (kibnal_recvmsg (nal, private, msg, niov, NULL, kiov,
+ offset, mlen, rlen));
+}
+
+/*****************************************************************************
+ * the rest of this file concerns connection management. active connections
+ * start with connect_peer, passive connections start with listen_callback.
+ * active disconnects start with close_conn, cm_callback starts passive
+ * disconnects and contains the guts of how the disconnect state machine
+ * progresses.
+ *****************************************************************************/
+
+int
+kibnal_thread_start (int (*fn)(void *arg), void *arg)
+{
+ long pid = kernel_thread (fn, arg, 0);
+
+ if (pid < 0)
+ return ((int)pid);
+
+ atomic_inc (&kibnal_data.kib_nthreads);
+ return (0);
+}
+
+static void
+kibnal_thread_fini (void)
+{
+ atomic_dec (&kibnal_data.kib_nthreads);
+}
+
+/* this can be called by anyone at any time to close a connection. if
+ * the connection is still established it heads to the connd to start
+ * the disconnection in a safe context. It has no effect if called
+ * on a connection that is already disconnecting */
+void
+kibnal_close_conn_locked (kib_conn_t *conn, int error)
+{
+        /* This just does the immediate housekeeping, and schedules the
+ * connection for the connd to finish off.
+ * Caller holds kib_global_lock exclusively in irq context */
+ kib_peer_t *peer = conn->ibc_peer;
+
+ KIB_ASSERT_CONN_STATE_RANGE(conn, IBNAL_CONN_CONNECTING,
+ IBNAL_CONN_DISCONNECTED);
+
+ if (conn->ibc_state > IBNAL_CONN_ESTABLISHED)
+ return; /* already disconnecting */
+
+ CDEBUG (error == 0 ? D_NET : D_ERROR,
+ "closing conn to "LPX64": error %d\n", peer->ibp_nid, error);
+
+ if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) {
+ /* kib_connd_conns takes ibc_list's ref */
+ list_del (&conn->ibc_list);
+ } else {
+ /* new ref for kib_connd_conns */
+ CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
+ conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
+ atomic_read (&conn->ibc_refcount));
+ atomic_inc (&conn->ibc_refcount);
+ }
+
+ if (list_empty (&peer->ibp_conns) &&
+ peer->ibp_persistence == 0) {
+ /* Non-persistent peer with no more conns... */
+ kibnal_unlink_peer_locked (peer);
+ }
+
+ conn->ibc_state = IBNAL_CONN_SEND_DREQ;
+
+ spin_lock (&kibnal_data.kib_connd_lock);
+
+ list_add_tail (&conn->ibc_list, &kibnal_data.kib_connd_conns);
+ wake_up (&kibnal_data.kib_connd_waitq);
+
+ spin_unlock (&kibnal_data.kib_connd_lock);
+}
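+/* Disconnect state machine, driven here, by kib_connd_handle_state()
+ * and by kibnal_cm_callback() below:
+ *
+ *      ESTABLISHED --close_conn_locked-->  SEND_DREQ
+ *      SEND_DREQ   --connd sends DREQ-->   DREQ
+ *      any         --peer's DREQ-->        DREP (reply sent in cm_callback)
+ *      DREQ/DREP   --FCM_DISCONNECTED or FCM_DISCONNECT_REPLY--> DISCONNECTED
+ *
+ * in DISCONNECTED the pending txs are flushed and the conn is freed by
+ * the connd once the last ref is dropped. */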
+
+void
+kibnal_close_conn (kib_conn_t *conn, int error)
+{
+ unsigned long flags;
+
+ write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
+
+ kibnal_close_conn_locked (conn, error);
+
+ write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
+}
+
+static void
+kibnal_peer_connect_failed (kib_peer_t *peer, int active, int rc)
+{
+ LIST_HEAD (zombies);
+ kib_tx_t *tx;
+ unsigned long flags;
+
+ LASSERT (rc != 0);
+ LASSERT (peer->ibp_reconnect_interval >= IBNAL_MIN_RECONNECT_INTERVAL);
+
+ write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
+
+ LASSERT (peer->ibp_connecting != 0);
+ peer->ibp_connecting--;
+
+ if (peer->ibp_connecting != 0) {
+ /* another connection attempt under way (loopback?)... */
+ write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
+ return;
+ }
+
+ if (list_empty(&peer->ibp_conns)) {
+ /* Say when active connection can be re-attempted */
+ peer->ibp_reconnect_time = jiffies + peer->ibp_reconnect_interval;
+ /* Increase reconnection interval */
+ peer->ibp_reconnect_interval = MIN (peer->ibp_reconnect_interval * 2,
+ IBNAL_MAX_RECONNECT_INTERVAL);
+
+                /* Take the peer's blocked transmits; I'll complete
+                 * them with error */
+ while (!list_empty (&peer->ibp_tx_queue)) {
+ tx = list_entry (peer->ibp_tx_queue.next,
+ kib_tx_t, tx_list);
+
+ list_del (&tx->tx_list);
+ list_add_tail (&tx->tx_list, &zombies);
+ }
+
+ if (kibnal_peer_active(peer) &&
+ (peer->ibp_persistence == 0)) {
+ /* failed connection attempt on non-persistent peer */
+ kibnal_unlink_peer_locked (peer);
+ }
+ } else {
+ /* Can't have blocked transmits if there are connections */
+ LASSERT (list_empty(&peer->ibp_tx_queue));
+ }
+
+ write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
+
+ if (!list_empty (&zombies))
+ CERROR ("Deleting messages for "LPX64": connection failed\n",
+ peer->ibp_nid);
+
+ while (!list_empty (&zombies)) {
+ tx = list_entry (zombies.next, kib_tx_t, tx_list);
+
+ list_del (&tx->tx_list);
+ /* complete now */
+ tx->tx_status = -EHOSTUNREACH;
+ kibnal_tx_done (tx);
+ }
+}
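+/* Reconnect backoff: the interval doubles on every failed attempt, is
+ * capped at IBNAL_MAX_RECONNECT_INTERVAL, and is reset to the minimum
+ * by a successful connect in kibnal_connreq_done().  E.g. with a 1s
+ * minimum and a 60s cap (illustrative values) retries are permitted
+ * after 1, 2, 4, ..., 32, 60, 60, ... seconds. */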
+
+static void
+kibnal_connreq_done (kib_conn_t *conn, int active, int status)
+{
+ int state = conn->ibc_state;
+ kib_peer_t *peer = conn->ibc_peer;
+ kib_tx_t *tx;
+ unsigned long flags;
+ int i;
+
+ /* passive connection has no connreq & vice versa */
+ LASSERTF(!active == !(conn->ibc_connreq != NULL),
+ "%d %p\n", active, conn->ibc_connreq);
+ if (active) {
+ PORTAL_FREE (conn->ibc_connreq, sizeof (*conn->ibc_connreq));
+ conn->ibc_connreq = NULL;
+ }
+
+ write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
+
+ LASSERT (peer->ibp_connecting != 0);
+
+ if (status == 0) {
+ /* connection established... */
+ KIB_ASSERT_CONN_STATE(conn, IBNAL_CONN_CONNECTING);
+ conn->ibc_state = IBNAL_CONN_ESTABLISHED;
+
+ if (!kibnal_peer_active(peer)) {
+ /* ...but peer deleted meantime */
+ status = -ECONNABORTED;
+ }
+ } else {
+ KIB_ASSERT_CONN_STATE_RANGE(conn, IBNAL_CONN_INIT_QP,
+ IBNAL_CONN_CONNECTING);
+ }
+
+ if (status == 0) {
+ /* Everything worked! */
+
+ peer->ibp_connecting--;
+
+ /* +1 ref for ibc_list; caller(== CM)'s ref remains until
+ * the IB_CM_IDLE callback */
+ CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
+ conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
+ atomic_read (&conn->ibc_refcount));
+ atomic_inc (&conn->ibc_refcount);
+ list_add (&conn->ibc_list, &peer->ibp_conns);
+
+ /* reset reconnect interval for next attempt */
+ peer->ibp_reconnect_interval = IBNAL_MIN_RECONNECT_INTERVAL;
+
+ /* post blocked sends to the new connection */
+ spin_lock (&conn->ibc_lock);
+
+ while (!list_empty (&peer->ibp_tx_queue)) {
+ tx = list_entry (peer->ibp_tx_queue.next,
+ kib_tx_t, tx_list);
+
+ list_del (&tx->tx_list);
+
+ /* +1 ref for each tx */
+ CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
+ conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
+ atomic_read (&conn->ibc_refcount));
+ atomic_inc (&conn->ibc_refcount);
+ kibnal_queue_tx_locked (tx, conn);
+ }
+
+ spin_unlock (&conn->ibc_lock);
+
+ /* Nuke any dangling conns from a different peer instance... */
+ kibnal_close_stale_conns_locked (conn->ibc_peer,
+ conn->ibc_incarnation);
+
+ write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
+
+ /* queue up all the receives */
+ for (i = 0; i < IBNAL_RX_MSGS; i++) {
+ /* +1 ref for rx desc */
+ CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
+ conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
+ atomic_read (&conn->ibc_refcount));
+ atomic_inc (&conn->ibc_refcount);
+
+ CDEBUG(D_NET, "RX[%d] %p->%p - "LPX64"\n",
+ i, &conn->ibc_rxs[i], conn->ibc_rxs[i].rx_msg,
+ conn->ibc_rxs[i].rx_vaddr);
+
+ kibnal_post_rx (&conn->ibc_rxs[i], 0);
+ }
+
+ kibnal_check_sends (conn);
+ return;
+ }
+
+ /* connection failed */
+ if (state == IBNAL_CONN_CONNECTING) {
+ /* schedule for connd to close */
+ kibnal_close_conn_locked (conn, status);
+ } else {
+ /* Don't have a CM comm_id; just wait for refs to drain */
+ conn->ibc_state = IBNAL_CONN_DISCONNECTED;
+ }
+
+ write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
+
+ kibnal_peer_connect_failed (conn->ibc_peer, active, status);
+
+ /* If we didn't establish the connection we don't have to pass
+ * through the disconnect protocol before dropping the CM ref */
+ if (state < IBNAL_CONN_CONNECTING)
+ kibnal_put_conn (conn);
+}
+
+static int
+kibnal_accept (kib_conn_t **connp, IB_HANDLE *cep,
+ ptl_nid_t nid, __u64 incarnation, int queue_depth)
+{
+ kib_conn_t *conn = kibnal_create_conn();
+ kib_peer_t *peer;
+ kib_peer_t *peer2;
+ unsigned long flags;
+
+ if (conn == NULL)
+ return (-ENOMEM);
+
+ if (queue_depth != IBNAL_MSG_QUEUE_SIZE) {
+ CERROR("Can't accept "LPX64": bad queue depth %d (%d expected)\n",
+ nid, queue_depth, IBNAL_MSG_QUEUE_SIZE);
+ atomic_dec (&conn->ibc_refcount);
+ kibnal_destroy_conn(conn);
+ return (-EPROTO);
+ }
+
+ /* assume 'nid' is a new peer */
+ peer = kibnal_create_peer (nid);
+ if (peer == NULL) {
+ CDEBUG(D_NET, "--conn[%p] state %d -> "LPX64" (%d)\n",
+ conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
+ atomic_read (&conn->ibc_refcount));
+ atomic_dec (&conn->ibc_refcount);
+ kibnal_destroy_conn(conn);
+ return (-ENOMEM);
+ }
+
+ write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
+
+ peer2 = kibnal_find_peer_locked(nid);
+ if (peer2 == NULL) {
+ /* peer table takes my ref on peer */
+ list_add_tail (&peer->ibp_list, kibnal_nid2peerlist(nid));
+ } else {
+ kib_peer_decref (peer);
+ peer = peer2;
+ }
+
+ kib_peer_addref(peer); /* +1 ref for conn */
+ peer->ibp_connecting++;
+
+ write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
+
+ conn->ibc_peer = peer;
+ conn->ibc_state = IBNAL_CONN_CONNECTING;
+ /* conn->ibc_cep is set when cm_accept is called */
+ conn->ibc_incarnation = incarnation;
+ conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE;
+
+ *connp = conn;
+ return (0);
+}
+
+static void kibnal_set_qp_state(IB_HANDLE qp, IB_QP_STATE state)
+{
+ IB_QP_ATTRIBUTES_MODIFY modify_attr = {0,};
+ FSTATUS frc;
+
+ modify_attr.RequestState = state;
+
+ frc = iibt_qp_modify(qp, &modify_attr, NULL);
+ if (frc != FSUCCESS)
+ CERROR("couldn't set qp state to %d, error %d\n", state, frc);
+}
+
+static void kibnal_flush_pending(kib_conn_t *conn)
+{
+ LIST_HEAD (zombies);
+ struct list_head *tmp;
+ struct list_head *nxt;
+ kib_tx_t *tx;
+ unsigned long flags;
+ int done;
+
+ /* NB we wait until the connection has closed before completing
+ * outstanding passive RDMAs so we can be sure the network can't
+ * touch the mapped memory any more. */
+ KIB_ASSERT_CONN_STATE(conn, IBNAL_CONN_DISCONNECTED);
+
+ /* set the QP to the error state so that we get flush callbacks
+ * on our posted receives which can then drop their conn refs */
+ kibnal_set_qp_state(conn->ibc_qp, QPStateError);
+
+ spin_lock_irqsave (&conn->ibc_lock, flags);
+
+ /* grab passive RDMAs not waiting for the tx callback */
+ list_for_each_safe (tmp, nxt, &conn->ibc_active_txs) {
+ tx = list_entry (tmp, kib_tx_t, tx_list);
+
+ LASSERT (tx->tx_passive_rdma ||
+ !tx->tx_passive_rdma_wait);
+
+ LASSERT (tx->tx_passive_rdma_wait ||
+ tx->tx_sending != 0);
+
+ /* still waiting for tx callback? */
+ if (!tx->tx_passive_rdma_wait)
+ continue;
+
+ tx->tx_status = -ECONNABORTED;
+ tx->tx_passive_rdma_wait = 0;
+ done = (tx->tx_sending == 0);
+
+ if (!done)
+ continue;
+
+ list_del (&tx->tx_list);
+ list_add (&tx->tx_list, &zombies);
+ }
+
+ /* grab all blocked transmits */
+ list_for_each_safe (tmp, nxt, &conn->ibc_tx_queue) {
+ tx = list_entry (tmp, kib_tx_t, tx_list);
+
+ list_del (&tx->tx_list);
+ list_add (&tx->tx_list, &zombies);
+ }
+
+ spin_unlock_irqrestore (&conn->ibc_lock, flags);
+
+ while (!list_empty(&zombies)) {
+ tx = list_entry (zombies.next, kib_tx_t, tx_list);
+
+ list_del(&tx->tx_list);
+ kibnal_tx_done (tx);
+ }
+}
+
+static void
+kibnal_reject (IB_HANDLE cep, uint16_t reason)
+{
+ CM_REJECT_INFO *rej;
+
+ PORTAL_ALLOC(rej, sizeof(*rej));
+ if (rej == NULL) /* PORTAL_ALLOC() will CERROR on failure */
+ return;
+
+ rej->Reason = reason;
+ iibt_cm_reject(cep, rej);
+ PORTAL_FREE(rej, sizeof(*rej));
+}
+
+static FSTATUS
+kibnal_qp_rts(IB_HANDLE qp_handle, __u32 qpn, __u8 resp_res,
+ IB_PATH_RECORD *path, __u8 init_depth, __u32 send_psn)
+{
+ IB_QP_ATTRIBUTES_MODIFY modify_attr;
+ FSTATUS frc;
+ ENTRY;
+
+ modify_attr = (IB_QP_ATTRIBUTES_MODIFY) {
+ .RequestState = QPStateReadyToRecv,
+ .RecvPSN = IBNAL_STARTING_PSN,
+ .DestQPNumber = qpn,
+ .ResponderResources = resp_res,
+                .MinRnrTimer            = UsecToRnrNakTimer(2000), /* 2 ms */
+ .Attrs = (IB_QP_ATTR_RECVPSN |
+ IB_QP_ATTR_DESTQPNUMBER |
+ IB_QP_ATTR_RESPONDERRESOURCES |
+ IB_QP_ATTR_DESTAV |
+ IB_QP_ATTR_PATHMTU |
+ IB_QP_ATTR_MINRNRTIMER),
+ };
+ GetAVFromPath(0, path, &modify_attr.PathMTU, NULL,
+ &modify_attr.DestAV);
+
+ frc = iibt_qp_modify(qp_handle, &modify_attr, NULL);
+ if (frc != FSUCCESS)
+ RETURN(frc);
+
+ modify_attr = (IB_QP_ATTRIBUTES_MODIFY) {
+ .RequestState = QPStateReadyToSend,
+ .FlowControl = TRUE,
+ .InitiatorDepth = init_depth,
+ .SendPSN = send_psn,
+ .LocalAckTimeout = path->PktLifeTime + 2, /* 2 or 1? */
+ .RetryCount = IBNAL_RETRY,
+ .RnrRetryCount = IBNAL_RNR_RETRY,
+ .Attrs = (IB_QP_ATTR_FLOWCONTROL |
+ IB_QP_ATTR_INITIATORDEPTH |
+ IB_QP_ATTR_SENDPSN |
+ IB_QP_ATTR_LOCALACKTIMEOUT |
+ IB_QP_ATTR_RETRYCOUNT |
+ IB_QP_ATTR_RNRRETRYCOUNT),
+ };
+
+ frc = iibt_qp_modify(qp_handle, &modify_attr, NULL);
+ RETURN(frc);
+}
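+/* NB the two modifies above follow the mandatory IB QP state sequence:
+ * the first transition (to ReadyToRecv) installs the remote side --
+ * dest QPN, receive PSN, path/AV, MTU, RNR timer -- and the second (to
+ * ReadyToSend) installs the send side -- send PSN, ack timeout and
+ * retry counts.  A QP can't be taken to RTS in a single step. */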
+
+static void
+kibnal_connect_reply (IB_HANDLE cep, CM_CONN_INFO *info, void *arg)
+{
+ IB_CA_ATTRIBUTES *ca_attr = &kibnal_data.kib_hca_attrs;
+ kib_conn_t *conn = arg;
+ kib_wire_connreq_t *wcr;
+ CM_REPLY_INFO *rep = &info->Info.Reply;
+ uint16_t reason;
+ FSTATUS frc;
+
+ wcr = (kib_wire_connreq_t *)info->Info.Reply.PrivateData;
+
+ if (wcr->wcr_magic != cpu_to_le32(IBNAL_MSG_MAGIC)) {
+ CERROR ("Can't connect "LPX64": bad magic %08x\n",
+ conn->ibc_peer->ibp_nid, le32_to_cpu(wcr->wcr_magic));
+ GOTO(reject, reason = RC_USER_REJ);
+ }
+
+ if (wcr->wcr_version != cpu_to_le16(IBNAL_MSG_VERSION)) {
+ CERROR ("Can't connect "LPX64": bad version %d\n",
+                        conn->ibc_peer->ibp_nid, le16_to_cpu(wcr->wcr_version));
+ GOTO(reject, reason = RC_USER_REJ);
+ }
+
+ if (wcr->wcr_queue_depth != cpu_to_le16(IBNAL_MSG_QUEUE_SIZE)) {
+ CERROR ("Can't connect "LPX64": bad queue depth %d\n",
+ conn->ibc_peer->ibp_nid,
+ le16_to_cpu(wcr->wcr_queue_depth));
+ GOTO(reject, reason = RC_USER_REJ);
+ }
+
+ if (le64_to_cpu(wcr->wcr_nid) != conn->ibc_peer->ibp_nid) {
+ CERROR ("Unexpected NID "LPX64" from "LPX64"\n",
+ le64_to_cpu(wcr->wcr_nid), conn->ibc_peer->ibp_nid);
+ GOTO(reject, reason = RC_USER_REJ);
+ }
+
+ CDEBUG(D_NET, "Connection %p -> "LPX64" REP_RECEIVED.\n",
+ conn, conn->ibc_peer->ibp_nid);
+
+ conn->ibc_incarnation = le64_to_cpu(wcr->wcr_incarnation);
+ conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE;
+
+ frc = kibnal_qp_rts(conn->ibc_qp, rep->QPN,
+ min_t(__u8, rep->ArbInitiatorDepth,
+ ca_attr->MaxQPResponderResources),
+ &conn->ibc_connreq->cr_path,
+ min_t(__u8, rep->ArbResponderResources,
+ ca_attr->MaxQPInitiatorDepth),
+ rep->StartingPSN);
+ if (frc != FSUCCESS) {
+ CERROR("Connection %p -> "LPX64" QP RTS/RTR failed: %d\n",
+ conn, conn->ibc_peer->ibp_nid, frc);
+ GOTO(reject, reason = RC_NO_QP);
+ }
+
+ /* the callback arguments are ignored for an active accept */
+ conn->ibc_connreq->cr_discarded.Status = FSUCCESS;
+ frc = iibt_cm_accept(cep, &conn->ibc_connreq->cr_discarded,
+ NULL, NULL, NULL, NULL);
+ if (frc != FCM_CONNECT_ESTABLISHED) {
+ CERROR("Connection %p -> "LPX64" CMAccept failed: %d\n",
+ conn, conn->ibc_peer->ibp_nid, frc);
+ kibnal_connreq_done (conn, 1, -ECONNABORTED);
+ /* XXX don't call reject after accept fails? */
+ return;
+ }
+
+ CDEBUG(D_NET, "Connection %p -> "LPX64" Established\n",
+ conn, conn->ibc_peer->ibp_nid);
+
+ kibnal_connreq_done (conn, 1, 0);
+ return;
+
+reject:
+ kibnal_reject(cep, reason);
+ kibnal_connreq_done (conn, 1, -EPROTO);
+}
+
+/* ib_cm.h has a wealth of information on the CM procedures */
+static void
+kibnal_cm_callback(IB_HANDLE cep, CM_CONN_INFO *info, void *arg)
+{
+ kib_conn_t *conn = arg;
+
+ CDEBUG(D_NET, "status 0x%x\n", info->Status);
+
+ /* Established Connection Notifier */
+ switch (info->Status) {
+ default:
+ CERROR("unknown status %d on Connection %p -> "LPX64"\n",
+ info->Status, conn, conn->ibc_peer->ibp_nid);
+ LBUG();
+ break;
+
+ case FCM_CONNECT_REPLY:
+ kibnal_connect_reply(cep, info, arg);
+ break;
+
+ case FCM_DISCONNECT_REQUEST:
+ /* XXX lock around these state management bits? */
+ if (conn->ibc_state == IBNAL_CONN_ESTABLISHED)
+ kibnal_close_conn (conn, 0);
+ conn->ibc_state = IBNAL_CONN_DREP;
+ iibt_cm_disconnect(conn->ibc_cep, NULL, NULL);
+ break;
+
+ /* these both guarantee that no more cm callbacks will occur */
+ case FCM_DISCONNECTED: /* aka FCM_DISCONNECT_TIMEOUT */
+ case FCM_DISCONNECT_REPLY:
+ CDEBUG(D_NET, "Connection %p -> "LPX64" disconnect done.\n",
+ conn, conn->ibc_peer->ibp_nid);
+
+ conn->ibc_state = IBNAL_CONN_DISCONNECTED;
+ kibnal_flush_pending(conn);
+ kibnal_put_conn(conn); /* Lose CM's ref */
+ break;
+ }
+
+ return;
+}
+
+static int
+kibnal_set_cm_flags(IB_HANDLE cep)
+{
+ FSTATUS frc;
+ uint32 value = 1;
+
+ frc = iibt_cm_modify_cep(cep, CM_FLAG_TIMEWAIT_CALLBACK,
+ (char *)&value, sizeof(value), 0);
+ if (frc != FSUCCESS) {
+ CERROR("error setting timeout callback: %d\n", frc);
+ return -1;
+ }
+
+#if 0
+ frc = iibt_cm_modify_cep(cep, CM_FLAG_ASYNC_ACCEPT, (char *)&value,
+ sizeof(value), 0);
+ if (frc != FSUCCESS) {
+ CERROR("error setting async accept: %d\n", frc);
+ return -1;
+ }
+#endif
+
+ return 0;
+}
+
+void
+kibnal_listen_callback(IB_HANDLE cep, CM_CONN_INFO *info, void *arg)
+{
+ IB_CA_ATTRIBUTES *ca_attr = &kibnal_data.kib_hca_attrs;
+ IB_QP_ATTRIBUTES_QUERY *query;
+ CM_REQUEST_INFO *req;
+ CM_CONN_INFO *rep = NULL, *rcv = NULL;
+ kib_wire_connreq_t *wcr;
+ kib_conn_t *conn = NULL;
+ uint16_t reason = 0;
+ FSTATUS frc;
+ int rc = 0;
+
+ LASSERT(cep);
+ LASSERT(info);
+ LASSERT(arg == NULL); /* no conn yet for passive */
+
+ CDEBUG(D_NET, "status 0x%x\n", info->Status);
+
+ req = &info->Info.Request;
+ wcr = (kib_wire_connreq_t *)req->PrivateData;
+
+ CDEBUG(D_NET, "%d from "LPX64"\n", info->Status,
+ le64_to_cpu(wcr->wcr_nid));
+
+ if (info->Status == FCM_CONNECT_CANCEL)
+ return;
+
+ LASSERT (info->Status == FCM_CONNECT_REQUEST);
+
+ if (wcr->wcr_magic != cpu_to_le32(IBNAL_MSG_MAGIC)) {
+ CERROR ("Can't accept: bad magic %08x\n",
+ le32_to_cpu(wcr->wcr_magic));
+ GOTO(out, reason = RC_USER_REJ);
+ }
+
+ if (wcr->wcr_version != cpu_to_le16(IBNAL_MSG_VERSION)) {
+ CERROR ("Can't accept: bad version %d\n",
+                        le16_to_cpu(wcr->wcr_version));
+ GOTO(out, reason = RC_USER_REJ);
+ }
+
+ rc = kibnal_accept(&conn, cep,
+ le64_to_cpu(wcr->wcr_nid),
+ le64_to_cpu(wcr->wcr_incarnation),
+ le16_to_cpu(wcr->wcr_queue_depth));
+ if (rc != 0) {
+ CERROR ("Can't accept "LPX64": %d\n",
+ le64_to_cpu(wcr->wcr_nid), rc);
+ GOTO(out, reason = RC_NO_RESOURCES);
+ }
+
+ frc = kibnal_qp_rts(conn->ibc_qp, req->CEPInfo.QPN,
+ min_t(__u8, req->CEPInfo.OfferedInitiatorDepth,
+ ca_attr->MaxQPResponderResources),
+ &req->PathInfo.Path,
+ min_t(__u8, req->CEPInfo.OfferedResponderResources,
+ ca_attr->MaxQPInitiatorDepth),
+ req->CEPInfo.StartingPSN);
+
+ if (frc != FSUCCESS) {
+ CERROR ("Can't mark QP RTS/RTR "LPX64": %d\n",
+ le64_to_cpu(wcr->wcr_nid), frc);
+ GOTO(out, reason = RC_NO_QP);
+ }
+
+ frc = iibt_qp_query(conn->ibc_qp, &conn->ibc_qp_attrs, NULL);
+ if (frc != FSUCCESS) {
+ CERROR ("Couldn't query qp attributes "LPX64": %d\n",
+ le64_to_cpu(wcr->wcr_nid), frc);
+ GOTO(out, reason = RC_NO_QP);
+ }
+ query = &conn->ibc_qp_attrs;
+
+ PORTAL_ALLOC(rep, sizeof(*rep));
+ PORTAL_ALLOC(rcv, sizeof(*rcv));
+ if (rep == NULL || rcv == NULL) {
+ CERROR ("can't reply and receive buffers\n");
+ GOTO(out, reason = RC_INSUFFICIENT_RESP_RES);
+ }
+
+ /* don't try to deref this into the incoming wcr :) */
+ wcr = (kib_wire_connreq_t *)rep->Info.Reply.PrivateData;
+
+ rep->Info.Reply = (CM_REPLY_INFO) {
+ .QPN = query->QPNumber,
+ .QKey = query->Qkey,
+ .StartingPSN = query->RecvPSN,
+ .EndToEndFlowControl = query->FlowControl,
+ /* XXX Hmm. */
+ .ArbInitiatorDepth = query->InitiatorDepth,
+ .ArbResponderResources = query->ResponderResources,
+ .TargetAckDelay = 0,
+ .FailoverAccepted = 0,
+ .RnRRetryCount = req->CEPInfo.RnrRetryCount,
+ };
+
+ *wcr = (kib_wire_connreq_t) {
+ .wcr_magic = cpu_to_le32(IBNAL_MSG_MAGIC),
+ .wcr_version = cpu_to_le16(IBNAL_MSG_VERSION),
+                .wcr_queue_depth = cpu_to_le16(IBNAL_MSG_QUEUE_SIZE),
+ .wcr_nid = cpu_to_le64(kibnal_data.kib_nid),
+ .wcr_incarnation = cpu_to_le64(kibnal_data.kib_incarnation),
+ };
+
+ frc = iibt_cm_accept(cep, rep, rcv, kibnal_cm_callback, conn,
+ &conn->ibc_cep);
+
+ PORTAL_FREE(rep, sizeof(*rep));
+ PORTAL_FREE(rcv, sizeof(*rcv));
+
+ if (frc != FCM_CONNECT_ESTABLISHED) {
+ /* XXX it seems we don't call reject after this point? */
+ CERROR("iibt_cm_accept() failed: %d, aborting\n", frc);
+ rc = -ECONNABORTED;
+ goto out;
+ }
+
+ if (kibnal_set_cm_flags(conn->ibc_cep)) {
+ rc = -ECONNABORTED;
+ goto out;
+ }
+
+ CDEBUG(D_WARNING, "Connection %p -> "LPX64" ESTABLISHED.\n",
+ conn, conn->ibc_peer->ibp_nid);
+
+out:
+ if (reason) {
+ kibnal_reject(cep, reason);
+ rc = -ECONNABORTED;
+ }
+ if (conn != NULL)
+ kibnal_connreq_done(conn, 0, rc);
+
+ return;
+}
+
+static void
+dump_path_records(PATH_RESULTS *results)
+{
+ IB_PATH_RECORD *path;
+ int i;
+
+ for(i = 0; i < results->NumPathRecords; i++) {
+ path = &results->PathRecords[i];
+ CDEBUG(D_NET, "%d: sgid "LPX64":"LPX64" dgid "
+ LPX64":"LPX64" pkey %x\n",
+ i,
+ path->SGID.Type.Global.SubnetPrefix,
+ path->SGID.Type.Global.InterfaceID,
+ path->DGID.Type.Global.SubnetPrefix,
+ path->DGID.Type.Global.InterfaceID,
+ path->P_Key);
+ }
+}
+
+static void
+kibnal_pathreq_callback (void *arg, QUERY *query,
+ QUERY_RESULT_VALUES *query_res)
+{
+ IB_CA_ATTRIBUTES *ca_attr = &kibnal_data.kib_hca_attrs;
+ kib_conn_t *conn = arg;
+ PATH_RESULTS *path;
+ FSTATUS frc;
+
+ if (query_res->Status != FSUCCESS || query_res->ResultDataSize == 0) {
+ CERROR ("status %d data size %d\n", query_res->Status,
+ query_res->ResultDataSize);
+ kibnal_connreq_done (conn, 1, -EINVAL);
+ return;
+ }
+
+ path = (PATH_RESULTS *)query_res->QueryResult;
+
+ if (path->NumPathRecords < 1) {
+ CERROR ("expected path records: %d\n", path->NumPathRecords);
+ kibnal_connreq_done (conn, 1, -EINVAL);
+ return;
+ }
+
+ dump_path_records(path);
+
+ /* just using the first. this is probably a horrible idea. */
+ conn->ibc_connreq->cr_path = path->PathRecords[0];
+
+ conn->ibc_cep = iibt_cm_create_cep(CM_RC_TYPE);
+ if (conn->ibc_cep == NULL) {
+ CERROR ("Can't create CEP\n");
+ kibnal_connreq_done (conn, 1, -EINVAL);
+ return;
+ }
+
+ if (kibnal_set_cm_flags(conn->ibc_cep)) {
+ kibnal_connreq_done (conn, 1, -EINVAL);
+ return;
+ }
+
+ conn->ibc_connreq->cr_wcr = (kib_wire_connreq_t) {
+ .wcr_magic = cpu_to_le32(IBNAL_MSG_MAGIC),
+ .wcr_version = cpu_to_le16(IBNAL_MSG_VERSION),
+ .wcr_queue_depth = cpu_to_le16(IBNAL_MSG_QUEUE_SIZE),
+ .wcr_nid = cpu_to_le64(kibnal_data.kib_nid),
+ .wcr_incarnation = cpu_to_le64(kibnal_data.kib_incarnation),
+ };
+
+ conn->ibc_connreq->cr_cmreq = (CM_REQUEST_INFO) {
+ .SID = conn->ibc_connreq->cr_service.RID.ServiceID,
+ .CEPInfo = (CM_CEP_INFO) {
+ .CaGUID = kibnal_data.kib_hca_guids[0],
+ .EndToEndFlowControl = FALSE,
+ .PortGUID = conn->ibc_connreq->cr_path.SGID.Type.Global.InterfaceID,
+ .RetryCount = IBNAL_RETRY,
+ .RnrRetryCount = IBNAL_RNR_RETRY,
+ .AckTimeout = IBNAL_ACK_TIMEOUT,
+ .StartingPSN = IBNAL_STARTING_PSN,
+ .QPN = conn->ibc_qp_attrs.QPNumber,
+ .QKey = conn->ibc_qp_attrs.Qkey,
+ .OfferedResponderResources = ca_attr->MaxQPResponderResources,
+ .OfferedInitiatorDepth = ca_attr->MaxQPInitiatorDepth,
+ },
+ .PathInfo = (CM_CEP_PATHINFO) {
+ .bSubnetLocal = TRUE,
+ .Path = conn->ibc_connreq->cr_path,
+ },
+ };
+
+#if 0
+ /* XXX set timeout just like SDP!!!*/
+ conn->ibc_connreq->cr_path.packet_life = 13;
+#endif
+ /* Flag I'm getting involved with the CM... */
+ conn->ibc_state = IBNAL_CONN_CONNECTING;
+
+ CDEBUG(D_NET, "Connecting to, service id "LPX64", on "LPX64"\n",
+ conn->ibc_connreq->cr_service.RID.ServiceID,
+ *kibnal_service_nid_field(&conn->ibc_connreq->cr_service));
+
+ memset(conn->ibc_connreq->cr_cmreq.PrivateData, 0,
+ CM_REQUEST_INFO_USER_LEN);
+ memcpy(conn->ibc_connreq->cr_cmreq.PrivateData,
+ &conn->ibc_connreq->cr_wcr, sizeof(conn->ibc_connreq->cr_wcr));
+
+ /* kibnal_cm_callback gets my conn ref */
+ frc = iibt_cm_connect(conn->ibc_cep, &conn->ibc_connreq->cr_cmreq,
+ kibnal_cm_callback, conn);
+ if (frc != FPENDING && frc != FSUCCESS) {
+ CERROR ("Connect: %d\n", frc);
+ /* Back out state change as connect failed */
+ conn->ibc_state = IBNAL_CONN_INIT_QP;
+ kibnal_connreq_done (conn, 1, -EINVAL);
+ }
+}
+
+static void
+dump_service_records(SERVICE_RECORD_RESULTS *results)
+{
+ IB_SERVICE_RECORD *svc;
+ int i;
+
+ for(i = 0; i < results->NumServiceRecords; i++) {
+ svc = &results->ServiceRecords[i];
+ CDEBUG(D_NET, "%d: sid "LPX64" gid "LPX64":"LPX64" pkey %x\n",
+ i,
+ svc->RID.ServiceID,
+ svc->RID.ServiceGID.Type.Global.SubnetPrefix,
+ svc->RID.ServiceGID.Type.Global.InterfaceID,
+ svc->RID.ServiceP_Key);
+ }
+}
+
+
+static void
+kibnal_service_get_callback (void *arg, QUERY *query,
+ QUERY_RESULT_VALUES *query_res)
+{
+ kib_conn_t *conn = arg;
+ SERVICE_RECORD_RESULTS *svc;
+ COMMAND_CONTROL_PARAMETERS sd_params;
+ QUERY path_query;
+ FSTATUS frc;
+
+ if (query_res->Status != FSUCCESS || query_res->ResultDataSize == 0) {
+ CERROR ("status %d data size %d\n", query_res->Status,
+ query_res->ResultDataSize);
+ kibnal_connreq_done (conn, 1, -EINVAL);
+ return;
+ }
+
+ svc = (SERVICE_RECORD_RESULTS *)query_res->QueryResult;
+
+ if (svc->NumServiceRecords < 1) {
+ CERROR ("%d service records\n", svc->NumServiceRecords);
+ kibnal_connreq_done (conn, 1, -EINVAL);
+ return;
+ }
+
+ dump_service_records(svc);
+
+ conn->ibc_connreq->cr_service = svc->ServiceRecords[0];
+
+ CDEBUG(D_NET, "Got status %d, service id "LPX64", on "LPX64"\n",
+ query_res->Status , conn->ibc_connreq->cr_service.RID.ServiceID,
+ *kibnal_service_nid_field(&conn->ibc_connreq->cr_service));
+
+ memset(&path_query, 0, sizeof(path_query));
+ path_query.InputType = InputTypePortGuidPair;
+ path_query.OutputType = OutputTypePathRecord;
+ path_query.InputValue.PortGuidPair.SourcePortGuid = kibnal_data.kib_port_guid;
+ path_query.InputValue.PortGuidPair.DestPortGuid = conn->ibc_connreq->cr_service.RID.ServiceGID.Type.Global.InterfaceID;
+
+ memset(&sd_params, 0, sizeof(sd_params));
+ sd_params.RetryCount = IBNAL_RETRY;
+ sd_params.Timeout = 10 * 1000; /* wait 10 seconds */
+
+ /* kibnal_service_get_callback gets my conn ref */
+
+ frc = iibt_sd_query_port_fabric_information(kibnal_data.kib_sd,
+ kibnal_data.kib_port_guid,
+ &path_query,
+ kibnal_pathreq_callback,
+ &sd_params, conn);
+ if (frc == FPENDING)
+ return;
+
+ CERROR ("Path record request failed: %d\n", frc);
+ kibnal_connreq_done (conn, 1, -EINVAL);
+}
+
+static void
+kibnal_connect_peer (kib_peer_t *peer)
+{
+ COMMAND_CONTROL_PARAMETERS sd_params;
+ QUERY query;
+ FSTATUS frc;
+ kib_conn_t *conn = kibnal_create_conn();
+
+ LASSERT (peer->ibp_connecting != 0);
+
+ if (conn == NULL) {
+ CERROR ("Can't allocate conn\n");
+ kibnal_peer_connect_failed (peer, 1, -ENOMEM);
+ return;
+ }
+
+ conn->ibc_peer = peer;
+ kib_peer_addref(peer);
+
+ PORTAL_ALLOC (conn->ibc_connreq, sizeof (*conn->ibc_connreq));
+ if (conn->ibc_connreq == NULL) {
+ CERROR ("Can't allocate connreq\n");
+ kibnal_connreq_done (conn, 1, -ENOMEM);
+ return;
+ }
+
+ memset(conn->ibc_connreq, 0, sizeof (*conn->ibc_connreq));
+
+ kibnal_set_service_keys(&conn->ibc_connreq->cr_service, peer->ibp_nid);
+
+ memset(&query, 0, sizeof(query));
+ query.InputType = InputTypeServiceRecord;
+ query.OutputType = OutputTypeServiceRecord;
+ query.InputValue.ServiceRecordValue.ServiceRecord = conn->ibc_connreq->cr_service;
+ query.InputValue.ServiceRecordValue.ComponentMask = KIBNAL_SERVICE_KEY_MASK;
+
+ memset(&sd_params, 0, sizeof(sd_params));
+ sd_params.RetryCount = IBNAL_RETRY;
+ sd_params.Timeout = 10 * 1000; /* wait 10 seconds */
+
+ /* kibnal_service_get_callback gets my conn ref */
+ frc = iibt_sd_query_port_fabric_information(kibnal_data.kib_sd,
+ kibnal_data.kib_port_guid,
+ &query,
+ kibnal_service_get_callback,
+ &sd_params, conn);
+ if (frc == FPENDING)
+ return;
+
+ CERROR ("iibt_sd_query_port_fabric_information(): %d\n", frc);
+ kibnal_connreq_done (conn, 1, frc);
+}
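+/* Active connect pipeline (each SA/CM callback carries the conn ref
+ * forward):
+ *
+ *      kibnal_connect_peer:         NID -> service record query (SA)
+ *      kibnal_service_get_callback: service GID -> path record query
+ *      kibnal_pathreq_callback:     create CEP, send the CM REQ
+ *      kibnal_connect_reply:        check wire connreq, QP -> RTS, accept
+ *      kibnal_connreq_done:         post rxs and launch any queued txs
+ */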
+
+static int
+kibnal_conn_timed_out (kib_conn_t *conn)
+{
+ kib_tx_t *tx;
+ struct list_head *ttmp;
+ unsigned long flags;
+
+ spin_lock_irqsave (&conn->ibc_lock, flags);
+
+ list_for_each (ttmp, &conn->ibc_tx_queue) {
+ tx = list_entry (ttmp, kib_tx_t, tx_list);
+
+ LASSERT (!tx->tx_passive_rdma_wait);
+ LASSERT (tx->tx_sending == 0);
+
+ if (time_after_eq (jiffies, tx->tx_deadline)) {
+ spin_unlock_irqrestore (&conn->ibc_lock, flags);
+ return 1;
+ }
+ }
+
+ list_for_each (ttmp, &conn->ibc_active_txs) {
+ tx = list_entry (ttmp, kib_tx_t, tx_list);
+
+ LASSERT (tx->tx_passive_rdma ||
+ !tx->tx_passive_rdma_wait);
+
+ LASSERT (tx->tx_passive_rdma_wait ||
+ tx->tx_sending != 0);
+
+ if (time_after_eq (jiffies, tx->tx_deadline)) {
+ spin_unlock_irqrestore (&conn->ibc_lock, flags);
+ return 1;
+ }
+ }
+
+ spin_unlock_irqrestore (&conn->ibc_lock, flags);
+
+ return 0;
+}
+
+static void
+kibnal_check_conns (int idx)
+{
+ struct list_head *peers = &kibnal_data.kib_peers[idx];
+ struct list_head *ptmp;
+ kib_peer_t *peer;
+ kib_conn_t *conn;
+ struct list_head *ctmp;
+
+ again:
+ /* NB. We expect to have a look at all the peers and not find any
+ * rdmas to time out, so we just use a shared lock while we
+ * take a look... */
+ read_lock (&kibnal_data.kib_global_lock);
+
+ list_for_each (ptmp, peers) {
+ peer = list_entry (ptmp, kib_peer_t, ibp_list);
+
+ list_for_each (ctmp, &peer->ibp_conns) {
+ conn = list_entry (ctmp, kib_conn_t, ibc_list);
+
+ KIB_ASSERT_CONN_STATE(conn, IBNAL_CONN_ESTABLISHED);
+
+ /* In case we have enough credits to return via a
+ * NOOP, but there were no non-blocking tx descs
+ * free to do it last time... */
+ kibnal_check_sends(conn);
+
+ if (!kibnal_conn_timed_out(conn))
+ continue;
+
+ CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
+ conn, conn->ibc_state, peer->ibp_nid,
+ atomic_read (&conn->ibc_refcount));
+
+ atomic_inc (&conn->ibc_refcount);
+ read_unlock (&kibnal_data.kib_global_lock);
+
+ CERROR("Timed out RDMA with "LPX64"\n",
+ peer->ibp_nid);
+
+ kibnal_close_conn (conn, -ETIMEDOUT);
+ kibnal_put_conn (conn);
+
+ /* start again now I've dropped the lock */
+ goto again;
+ }
+ }
+
+ read_unlock (&kibnal_data.kib_global_lock);
+}
+
+static void
+kib_connd_handle_state(kib_conn_t *conn)
+{
+ FSTATUS frc;
+
+ switch (conn->ibc_state) {
+ /* all refs have gone, free and be done with it */
+ case IBNAL_CONN_DISCONNECTED:
+ kibnal_destroy_conn (conn);
+ return; /* avoid put_conn */
+
+ case IBNAL_CONN_SEND_DREQ:
+ frc = iibt_cm_disconnect(conn->ibc_cep, NULL, NULL);
+ if (frc != FSUCCESS) /* XXX do real things */
+ CERROR("disconnect failed: %d\n", frc);
+ conn->ibc_state = IBNAL_CONN_DREQ;
+ break;
+
+ /* a callback got to the conn before we did */
+ case IBNAL_CONN_DREP:
+ break;
+
+ default:
+ CERROR ("Bad conn %p state: %d\n", conn,
+ conn->ibc_state);
+ LBUG();
+ break;
+ }
+
+ /* drop ref from close_conn */
+ kibnal_put_conn(conn);
+}
+
+int
+kibnal_connd (void *arg)
+{
+ wait_queue_t wait;
+ unsigned long flags;
+ kib_conn_t *conn;
+ kib_peer_t *peer;
+ int timeout;
+ int i;
+ int peer_index = 0;
+ unsigned long deadline = jiffies;
+
+ kportal_daemonize ("kibnal_connd");
+ kportal_blockallsigs ();
+
+ init_waitqueue_entry (&wait, current);
+
+ spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
+
+ for (;;) {
+ if (!list_empty (&kibnal_data.kib_connd_conns)) {
+ conn = list_entry (kibnal_data.kib_connd_conns.next,
+ kib_conn_t, ibc_list);
+ list_del (&conn->ibc_list);
+
+ spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
+ kib_connd_handle_state(conn);
+
+ spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
+ continue;
+ }
+
+ if (!list_empty (&kibnal_data.kib_connd_peers)) {
+ peer = list_entry (kibnal_data.kib_connd_peers.next,
+ kib_peer_t, ibp_connd_list);
+
+ list_del_init (&peer->ibp_connd_list);
+ spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
+
+ kibnal_connect_peer (peer);
+ kib_peer_decref (peer);
+
+ spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
+ }
+
+ /* shut down and nobody left to reap... */
+ if (kibnal_data.kib_shutdown &&
+ atomic_read(&kibnal_data.kib_nconns) == 0)
+ break;
+
+ spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
+
+ /* careful with the jiffy wrap... */
+ while ((timeout = (int)(deadline - jiffies)) <= 0) {
+ const int n = 4;
+ const int p = 1;
+ int chunk = kibnal_data.kib_peer_hash_size;
+
+ /* Time to check for RDMA timeouts on a few more
+ * peers: I do checks every 'p' seconds on a
+ * proportion of the peer table and I need to check
+ * every connection 'n' times within a timeout
+ * interval, to ensure I detect a timeout on any
+ * connection within (n+1)/n times the timeout
+ * interval. */
+
+ if (kibnal_tunables.kib_io_timeout > n * p)
+ chunk = (chunk * n * p) /
+ kibnal_tunables.kib_io_timeout;
+ if (chunk == 0)
+ chunk = 1;
+
+ for (i = 0; i < chunk; i++) {
+ kibnal_check_conns (peer_index);
+ peer_index = (peer_index + 1) %
+ kibnal_data.kib_peer_hash_size;
+ }
+
+ deadline += p * HZ;
+ }
+
+ kibnal_data.kib_connd_waketime = jiffies + timeout;
+
+ set_current_state (TASK_INTERRUPTIBLE);
+ add_wait_queue (&kibnal_data.kib_connd_waitq, &wait);
+
+ if (!kibnal_data.kib_shutdown &&
+ list_empty (&kibnal_data.kib_connd_conns) &&
+ list_empty (&kibnal_data.kib_connd_peers))
+ schedule_timeout (timeout);
+
+ set_current_state (TASK_RUNNING);
+ remove_wait_queue (&kibnal_data.kib_connd_waitq, &wait);
+
+ spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
+ }
+
+ spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
+
+ kibnal_thread_fini ();
+ return (0);
+}
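+/* Worked example of the timeout sweep above (values illustrative): with
+ * a peer hash of 128 buckets, n = 4, p = 1 and kib_io_timeout = 64s,
+ * chunk = 128 * 4 * 1 / 64 = 8, so 8 buckets are checked each second
+ * and the whole table is swept every 16s: 4 full sweeps per timeout
+ * interval, as the comment in the loop requires. */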
+
+int
+kibnal_scheduler(void *arg)
+{
+ long id = (long)arg;
+ char name[16];
+ kib_rx_t *rx;
+ kib_tx_t *tx;
+ unsigned long flags;
+ int rc;
+ int counter = 0;
+ int did_something;
+
+ snprintf(name, sizeof(name), "kibnal_sd_%02ld", id);
+ kportal_daemonize(name);
+ kportal_blockallsigs();
+
+ spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
+
+ for (;;) {
+ did_something = 0;
+
+ while (!list_empty(&kibnal_data.kib_sched_txq)) {
+ tx = list_entry(kibnal_data.kib_sched_txq.next,
+ kib_tx_t, tx_list);
+ list_del(&tx->tx_list);
+ spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
+ flags);
+ kibnal_tx_done(tx);
+
+ spin_lock_irqsave(&kibnal_data.kib_sched_lock,
+ flags);
+ }
+
+ if (!list_empty(&kibnal_data.kib_sched_rxq)) {
+ rx = list_entry(kibnal_data.kib_sched_rxq.next,
+ kib_rx_t, rx_list);
+ list_del(&rx->rx_list);
+ spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
+ flags);
+
+ kibnal_rx(rx);
+
+ did_something = 1;
+ spin_lock_irqsave(&kibnal_data.kib_sched_lock,
+ flags);
+ }
+
+ /* shut down and no receives to complete... */
+ if (kibnal_data.kib_shutdown &&
+ atomic_read(&kibnal_data.kib_nconns) == 0)
+ break;
+
+ /* nothing to do or hogging CPU */
+ if (!did_something || counter++ == IBNAL_RESCHED) {
+ spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
+ flags);
+ counter = 0;
+
+ if (!did_something) {
+ rc = wait_event_interruptible(
+ kibnal_data.kib_sched_waitq,
+ !list_empty(&kibnal_data.kib_sched_txq) ||
+ !list_empty(&kibnal_data.kib_sched_rxq) ||
+ (kibnal_data.kib_shutdown &&
+ atomic_read (&kibnal_data.kib_nconns) == 0));
+ } else {
+ our_cond_resched();
+ }
+
+ spin_lock_irqsave(&kibnal_data.kib_sched_lock,
+ flags);
+ }
+ }
+
+ spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags);
+
+ kibnal_thread_fini();
+ return (0);
+}
+
+
+lib_nal_t kibnal_lib = {
+ libnal_data: &kibnal_data, /* NAL private data */
+ libnal_send: kibnal_send,
+ libnal_send_pages: kibnal_send_pages,
+ libnal_recv: kibnal_recv,
+ libnal_recv_pages: kibnal_recv_pages,
+ libnal_dist: kibnal_dist
+};
--- /dev/null
+.deps
+Makefile
+.*.cmd
+autoMakefile.in
+autoMakefile
+*.ko
+*.mod.c
+.*.flags
+.tmp_versions
+.depend
#include "openibnal.h"
-nal_t koibnal_api;
-ptl_handle_ni_t koibnal_ni;
-koib_data_t koibnal_data;
-koib_tunables_t koibnal_tunables;
+nal_t kibnal_api;
+ptl_handle_ni_t kibnal_ni;
+kib_data_t kibnal_data;
+kib_tunables_t kibnal_tunables;
#ifdef CONFIG_SYSCTL
-#define OPENIBNAL_SYSCTL 202
+#define IBNAL_SYSCTL 202
-#define OPENIBNAL_SYSCTL_TIMEOUT 1
-#define OPENIBNAL_SYSCTL_ZERO_COPY 2
+#define IBNAL_SYSCTL_TIMEOUT 1
-static ctl_table koibnal_ctl_table[] = {
- {OPENIBNAL_SYSCTL_TIMEOUT, "timeout",
- &koibnal_tunables.koib_io_timeout, sizeof (int),
+static ctl_table kibnal_ctl_table[] = {
+ {IBNAL_SYSCTL_TIMEOUT, "timeout",
+ &kibnal_tunables.kib_io_timeout, sizeof (int),
0644, NULL, &proc_dointvec},
{ 0 }
};
-static ctl_table koibnal_top_ctl_table[] = {
- {OPENIBNAL_SYSCTL, "openibnal", NULL, 0, 0555, koibnal_ctl_table},
+static ctl_table kibnal_top_ctl_table[] = {
+ {IBNAL_SYSCTL, "openibnal", NULL, 0, 0555, kibnal_ctl_table},
{ 0 }
};
#endif
"service id: "LPX64"\n"
"name : %s\n"
"NID : "LPX64"\n", tag, rc,
- service->service_id, name, service->service_data64[0]);
+ service->service_id, name,
+ *kibnal_service_nid_field(service));
}
void
-koibnal_service_setunset_done (tTS_IB_CLIENT_QUERY_TID tid, int status,
+kibnal_service_setunset_done (tTS_IB_CLIENT_QUERY_TID tid, int status,
struct ib_common_attrib_service *service, void *arg)
{
*(int *)arg = status;
- up (&koibnal_data.koib_nid_signal);
+ up (&kibnal_data.kib_nid_signal);
}
+#if IBNAL_CHECK_ADVERT
+void
+kibnal_check_advert (void)
+{
+ struct ib_common_attrib_service *svc;
+ __u64 tid;
+ int rc;
+ int rc2;
+
+ PORTAL_ALLOC(svc, sizeof(*svc));
+ if (svc == NULL)
+ return;
+
+ memset (svc, 0, sizeof (*svc));
+ kibnal_set_service_keys(svc, kibnal_data.kib_nid);
+
+ rc = ib_service_get (kibnal_data.kib_device,
+ kibnal_data.kib_port,
+ svc,
+ KIBNAL_SERVICE_KEY_MASK,
+ kibnal_tunables.kib_io_timeout * HZ,
+ kibnal_service_setunset_done, &rc2,
+ &tid);
+
+ if (rc != 0) {
+ CERROR ("Immediate error %d checking SM service\n", rc);
+ } else {
+ down (&kibnal_data.kib_nid_signal);
+ rc = rc2;
+
+ if (rc != 0)
+ CERROR ("Error %d checking SM service\n", rc);
+ }
+
+ PORTAL_FREE(svc, sizeof(*svc));
+}
+#endif
+
int
-koibnal_advertise (void)
+kibnal_advertise (void)
{
+ struct ib_common_attrib_service *svc;
__u64 tid;
int rc;
int rc2;
- LASSERT (koibnal_data.koib_nid != PTL_NID_ANY);
+ LASSERT (kibnal_data.kib_nid != PTL_NID_ANY);
+
+ PORTAL_ALLOC(svc, sizeof(*svc));
+ if (svc == NULL)
+ return (-ENOMEM);
- memset (&koibnal_data.koib_service, 0,
- sizeof (koibnal_data.koib_service));
+ memset (svc, 0, sizeof (*svc));
- koibnal_data.koib_service.service_id
- = koibnal_data.koib_cm_service_id;
+ svc->service_id = kibnal_data.kib_service_id;
- rc = ib_cached_gid_get(koibnal_data.koib_device,
- koibnal_data.koib_port,
+ rc = ib_cached_gid_get(kibnal_data.kib_device,
+ kibnal_data.kib_port,
0,
- koibnal_data.koib_service.service_gid);
+ svc->service_gid);
if (rc != 0) {
CERROR ("Can't get port %d GID: %d\n",
- koibnal_data.koib_port, rc);
- return (rc);
+ kibnal_data.kib_port, rc);
+ goto out;
}
- rc = ib_cached_pkey_get(koibnal_data.koib_device,
- koibnal_data.koib_port,
+ rc = ib_cached_pkey_get(kibnal_data.kib_device,
+ kibnal_data.kib_port,
0,
- &koibnal_data.koib_service.service_pkey);
+ &svc->service_pkey);
if (rc != 0) {
CERROR ("Can't get port %d PKEY: %d\n",
- koibnal_data.koib_port, rc);
- return (rc);
+ kibnal_data.kib_port, rc);
+ goto out;
}
- koibnal_data.koib_service.service_lease = 0xffffffff;
+ svc->service_lease = 0xffffffff;
- koibnal_set_service_keys(&koibnal_data.koib_service, koibnal_data.koib_nid);
+ kibnal_set_service_keys(svc, kibnal_data.kib_nid);
CDEBUG(D_NET, "Advertising service id "LPX64" %s:"LPX64"\n",
- koibnal_data.koib_service.service_id,
- koibnal_data.koib_service.service_name,
- *koibnal_service_nid_field(&koibnal_data.koib_service));
+ svc->service_id,
+ svc->service_name, *kibnal_service_nid_field(svc));
- rc = ib_service_set (koibnal_data.koib_device,
- koibnal_data.koib_port,
- &koibnal_data.koib_service,
+ rc = ib_service_set (kibnal_data.kib_device,
+ kibnal_data.kib_port,
+ svc,
IB_SA_SERVICE_COMP_MASK_ID |
IB_SA_SERVICE_COMP_MASK_GID |
IB_SA_SERVICE_COMP_MASK_PKEY |
IB_SA_SERVICE_COMP_MASK_LEASE |
- KOIBNAL_SERVICE_KEY_MASK,
- koibnal_tunables.koib_io_timeout * HZ,
- koibnal_service_setunset_done, &rc2, &tid);
+ KIBNAL_SERVICE_KEY_MASK,
+ kibnal_tunables.kib_io_timeout * HZ,
+ kibnal_service_setunset_done, &rc2, &tid);
- if (rc == 0) {
- down (&koibnal_data.koib_nid_signal);
- rc = rc2;
+ if (rc != 0) {
+ CERROR ("Immediate error %d advertising NID "LPX64"\n",
+ rc, kibnal_data.kib_nid);
+ goto out;
}
-
- if (rc != 0)
- CERROR ("Error %d advertising SM service\n", rc);
+ down (&kibnal_data.kib_nid_signal);
+
+ rc = rc2;
+ if (rc != 0)
+ CERROR ("Error %d advertising NID "LPX64"\n",
+ rc, kibnal_data.kib_nid);
+ out:
+ PORTAL_FREE(svc, sizeof(*svc));
return (rc);
}
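+/* NB NID resolution is driven entirely by SA service records: the
+ * advertise above publishes (service id, GID, PKEY, lease) plus the NID
+ * keys set by kibnal_set_service_keys(), and peers locate this node by
+ * querying with the matching KIBNAL_SERVICE_KEY_MASK. */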
-int
-koibnal_unadvertise (int expect_success)
+void
+kibnal_unadvertise (int expect_success)
{
+ struct ib_common_attrib_service *svc;
__u64 tid;
int rc;
int rc2;
- LASSERT (koibnal_data.koib_nid != PTL_NID_ANY);
+ LASSERT (kibnal_data.kib_nid != PTL_NID_ANY);
- memset (&koibnal_data.koib_service, 0,
- sizeof (koibnal_data.koib_service));
+ PORTAL_ALLOC(svc, sizeof(*svc));
+ if (svc == NULL)
+ return;
- koibnal_set_service_keys(&koibnal_data.koib_service, koibnal_data.koib_nid);
+ memset (svc, 0, sizeof(*svc));
+
+ kibnal_set_service_keys(svc, kibnal_data.kib_nid);
CDEBUG(D_NET, "Unadvertising service %s:"LPX64"\n",
- koibnal_data.koib_service.service_name,
- *koibnal_service_nid_field(&koibnal_data.koib_service));
-
- rc = ib_service_delete (koibnal_data.koib_device,
- koibnal_data.koib_port,
- &koibnal_data.koib_service,
- KOIBNAL_SERVICE_KEY_MASK,
- koibnal_tunables.koib_io_timeout * HZ,
- koibnal_service_setunset_done, &rc2, &tid);
+ svc->service_name, *kibnal_service_nid_field(svc));
+
+ rc = ib_service_delete (kibnal_data.kib_device,
+ kibnal_data.kib_port,
+ svc,
+ KIBNAL_SERVICE_KEY_MASK,
+ kibnal_tunables.kib_io_timeout * HZ,
+ kibnal_service_setunset_done, &rc2, &tid);
if (rc != 0) {
CERROR ("Immediate error %d unadvertising NID "LPX64"\n",
- rc, koibnal_data.koib_nid);
- return (rc);
+ rc, kibnal_data.kib_nid);
+ goto out;
}
- down (&koibnal_data.koib_nid_signal);
+ down (&kibnal_data.kib_nid_signal);
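+ /* (rc2 == 0) matching expect_success means the outcome was the expected
+ * one: the deletion worked when it should have, or there was simply no
+ * stale advert to remove. */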
if ((rc2 == 0) == !!expect_success)
- return (0);
+ goto out; /* outcome matched expectation */
if (expect_success)
CERROR("Error %d unadvertising NID "LPX64"\n",
- rc, koibnal_data.koib_nid);
+ rc2, kibnal_data.kib_nid);
else
CWARN("Removed conflicting NID "LPX64"\n",
- koibnal_data.koib_nid);
-
- return (rc);
-}
-
-int
-koibnal_check_advert (void)
-{
- __u64 tid;
- int rc;
- int rc2;
-
- static struct ib_common_attrib_service srv;
-
- memset (&srv, 0, sizeof (srv));
-
- koibnal_set_service_keys(&srv, koibnal_data.koib_nid);
-
- rc = ib_service_get (koibnal_data.koib_device,
- koibnal_data.koib_port,
- &srv,
- KOIBNAL_SERVICE_KEY_MASK,
- koibnal_tunables.koib_io_timeout * HZ,
- koibnal_service_setunset_done, &rc2,
- &tid);
-
- if (rc != 0) {
- CERROR ("Immediate error %d checking SM service\n", rc);
- } else {
- down (&koibnal_data.koib_nid_signal);
- rc = rc2;
-
- if (rc != 0)
- CERROR ("Error %d checking SM service\n", rc);
- }
-
- return (rc);
+ kibnal_data.kib_nid);
+ out:
+ PORTAL_FREE(svc, sizeof(*svc));
}
int
-koibnal_set_mynid(ptl_nid_t nid)
+kibnal_set_mynid(ptl_nid_t nid)
{
struct timeval tv;
- lib_ni_t *ni = &koibnal_lib.libnal_ni;
+ lib_ni_t *ni = &kibnal_lib.libnal_ni;
int rc;
CDEBUG(D_IOCTL, "setting mynid to "LPX64" (old nid="LPX64")\n",
do_gettimeofday(&tv);
- down (&koibnal_data.koib_nid_mutex);
+ down (&kibnal_data.kib_nid_mutex);
- if (nid == koibnal_data.koib_nid) {
+ if (nid == kibnal_data.kib_nid) {
/* no change of NID */
- up (&koibnal_data.koib_nid_mutex);
+ up (&kibnal_data.kib_nid_mutex);
return (0);
}
CDEBUG(D_NET, "NID "LPX64"("LPX64")\n",
- koibnal_data.koib_nid, nid);
+ kibnal_data.kib_nid, nid);
- if (koibnal_data.koib_nid != PTL_NID_ANY) {
+ if (kibnal_data.kib_nid != PTL_NID_ANY) {
- koibnal_unadvertise (1);
+ kibnal_unadvertise (1);
- rc = ib_cm_listen_stop (koibnal_data.koib_listen_handle);
+ rc = ib_cm_listen_stop (kibnal_data.kib_listen_handle);
if (rc != 0)
CERROR ("Error %d stopping listener\n", rc);
}
- koibnal_data.koib_nid = ni->ni_pid.nid = nid;
- koibnal_data.koib_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
+ kibnal_data.kib_nid = ni->ni_pid.nid = nid;
+ kibnal_data.kib_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
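+ /* The incarnation stamps this NID instance (usecs since epoch) so that
+ * connections from before a restart can be detected and closed; see
+ * kibnal_close_stale_conns_locked(). */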
/* Delete all existing peers and their connections after new
* NID/incarnation set to ensure no old connections in our brave
* new world. */
- koibnal_del_peer (PTL_NID_ANY, 0);
-
- rc = 0;
- if (koibnal_data.koib_nid != PTL_NID_ANY) {
- /* New NID installed */
+ kibnal_del_peer (PTL_NID_ANY, 0);
- /* remove any previous advert (crashed node etc) */
- koibnal_unadvertise(0);
+ if (kibnal_data.kib_nid == PTL_NID_ANY) {
+ /* No new NID to install */
+ up (&kibnal_data.kib_nid_mutex);
+ return (0);
+ }
+
+ /* remove any previous advert (crashed node etc) */
+ kibnal_unadvertise(0);
- /* Assign new service number */
- koibnal_data.koib_cm_service_id = ib_cm_service_assign();
- CDEBUG(D_NET, "service_id "LPX64"\n", koibnal_data.koib_cm_service_id);
+ /* Assign new service number */
+ kibnal_data.kib_service_id = ib_cm_service_assign();
+ CDEBUG(D_NET, "service_id "LPX64"\n", kibnal_data.kib_service_id);
- rc = ib_cm_listen(koibnal_data.koib_cm_service_id,
- TS_IB_CM_SERVICE_EXACT_MASK,
- koibnal_passive_conn_callback, NULL,
- &koibnal_data.koib_listen_handle);
- if (rc != 0) {
- CERROR ("ib_cm_listen error: %d\n", rc);
- goto out;
+ rc = ib_cm_listen(kibnal_data.kib_service_id,
+ TS_IB_CM_SERVICE_EXACT_MASK,
+ kibnal_passive_conn_callback, NULL,
+ &kibnal_data.kib_listen_handle);
+ if (rc == 0) {
+ rc = kibnal_advertise();
+ if (rc == 0) {
+#if IBNAL_CHECK_ADVERT
+ kibnal_check_advert();
+#endif
+ up (&kibnal_data.kib_nid_mutex);
+ return (0);
}
- rc = koibnal_advertise();
-
- koibnal_check_advert();
- }
-
- out:
- if (rc != 0) {
- koibnal_data.koib_nid = PTL_NID_ANY;
+ ib_cm_listen_stop(kibnal_data.kib_listen_handle);
/* remove any peers that sprung up while I failed to
* advertise myself */
- koibnal_del_peer (PTL_NID_ANY, 0);
+ kibnal_del_peer (PTL_NID_ANY, 0);
}
-
- up (&koibnal_data.koib_nid_mutex);
- return (0);
+
+ kibnal_data.kib_nid = PTL_NID_ANY;
+ up (&kibnal_data.kib_nid_mutex);
+ return (rc);
}
-koib_peer_t *
-koibnal_create_peer (ptl_nid_t nid)
+kib_peer_t *
+kibnal_create_peer (ptl_nid_t nid)
{
- koib_peer_t *peer;
+ kib_peer_t *peer;
LASSERT (nid != PTL_NID_ANY);
INIT_LIST_HEAD (&peer->ibp_tx_queue);
peer->ibp_reconnect_time = jiffies;
- peer->ibp_reconnect_interval = OPENIBNAL_MIN_RECONNECT_INTERVAL;
+ peer->ibp_reconnect_interval = IBNAL_MIN_RECONNECT_INTERVAL;
- atomic_inc (&koibnal_data.koib_npeers);
+ atomic_inc (&kibnal_data.kib_npeers);
return (peer);
}
void
-koibnal_destroy_peer (koib_peer_t *peer)
+kibnal_destroy_peer (kib_peer_t *peer)
{
CDEBUG (D_NET, "peer "LPX64" %p deleted\n", peer->ibp_nid, peer);
LASSERT (atomic_read (&peer->ibp_refcount) == 0);
LASSERT (peer->ibp_persistence == 0);
- LASSERT (!koibnal_peer_active(peer));
+ LASSERT (!kibnal_peer_active(peer));
LASSERT (peer->ibp_connecting == 0);
LASSERT (list_empty (&peer->ibp_conns));
LASSERT (list_empty (&peer->ibp_tx_queue));
* they are destroyed, so we can be assured that _all_ state to do
* with this peer has been cleaned up when its refcount drops to
* zero. */
- atomic_dec (&koibnal_data.koib_npeers);
+ atomic_dec (&kibnal_data.kib_npeers);
}
void
-koibnal_put_peer (koib_peer_t *peer)
+kibnal_put_peer (kib_peer_t *peer)
{
CDEBUG (D_OTHER, "putting peer[%p] -> "LPX64" (%d)\n",
peer, peer->ibp_nid,
if (!atomic_dec_and_test (&peer->ibp_refcount))
return;
- koibnal_destroy_peer (peer);
+ kibnal_destroy_peer (peer);
}
-koib_peer_t *
-koibnal_find_peer_locked (ptl_nid_t nid)
+kib_peer_t *
+kibnal_find_peer_locked (ptl_nid_t nid)
{
- struct list_head *peer_list = koibnal_nid2peerlist (nid);
+ struct list_head *peer_list = kibnal_nid2peerlist (nid);
struct list_head *tmp;
- koib_peer_t *peer;
+ kib_peer_t *peer;
list_for_each (tmp, peer_list) {
- peer = list_entry (tmp, koib_peer_t, ibp_list);
+ peer = list_entry (tmp, kib_peer_t, ibp_list);
LASSERT (peer->ibp_persistence != 0 || /* persistent peer */
peer->ibp_connecting != 0 || /* creating conns */
return (NULL);
}
-koib_peer_t *
-koibnal_get_peer (ptl_nid_t nid)
+kib_peer_t *
+kibnal_get_peer (ptl_nid_t nid)
{
- koib_peer_t *peer;
+ kib_peer_t *peer;
- read_lock (&koibnal_data.koib_global_lock);
- peer = koibnal_find_peer_locked (nid);
+ read_lock (&kibnal_data.kib_global_lock);
+ peer = kibnal_find_peer_locked (nid);
if (peer != NULL) /* +1 ref for caller? */
atomic_inc (&peer->ibp_refcount);
- read_unlock (&koibnal_data.koib_global_lock);
+ read_unlock (&kibnal_data.kib_global_lock);
return (peer);
}
void
-koibnal_unlink_peer_locked (koib_peer_t *peer)
+kibnal_unlink_peer_locked (kib_peer_t *peer)
{
LASSERT (peer->ibp_persistence == 0);
LASSERT (list_empty(&peer->ibp_conns));
- LASSERT (koibnal_peer_active(peer));
+ LASSERT (kibnal_peer_active(peer));
list_del_init (&peer->ibp_list);
/* lose peerlist's ref */
- koibnal_put_peer (peer);
+ kibnal_put_peer (peer);
}
int
-koibnal_get_peer_info (int index, ptl_nid_t *nidp, int *persistencep)
+kibnal_get_peer_info (int index, ptl_nid_t *nidp, int *persistencep)
{
- koib_peer_t *peer;
+ kib_peer_t *peer;
struct list_head *ptmp;
int i;
- read_lock (&koibnal_data.koib_global_lock);
+ read_lock (&kibnal_data.kib_global_lock);
- for (i = 0; i < koibnal_data.koib_peer_hash_size; i++) {
+ for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
- list_for_each (ptmp, &koibnal_data.koib_peers[i]) {
+ list_for_each (ptmp, &kibnal_data.kib_peers[i]) {
- peer = list_entry (ptmp, koib_peer_t, ibp_list);
+ peer = list_entry (ptmp, kib_peer_t, ibp_list);
LASSERT (peer->ibp_persistence != 0 ||
peer->ibp_connecting != 0 ||
!list_empty (&peer->ibp_conns));
*nidp = peer->ibp_nid;
*persistencep = peer->ibp_persistence;
- read_unlock (&koibnal_data.koib_global_lock);
+ read_unlock (&kibnal_data.kib_global_lock);
return (0);
}
}
- read_unlock (&koibnal_data.koib_global_lock);
+ read_unlock (&kibnal_data.kib_global_lock);
return (-ENOENT);
}
int
-koibnal_add_persistent_peer (ptl_nid_t nid)
+kibnal_add_persistent_peer (ptl_nid_t nid)
{
unsigned long flags;
- koib_peer_t *peer;
- koib_peer_t *peer2;
+ kib_peer_t *peer;
+ kib_peer_t *peer2;
if (nid == PTL_NID_ANY)
return (-EINVAL);
- peer = koibnal_create_peer (nid);
+ peer = kibnal_create_peer (nid);
if (peer == NULL)
return (-ENOMEM);
- write_lock_irqsave (&koibnal_data.koib_global_lock, flags);
+ write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
- peer2 = koibnal_find_peer_locked (nid);
+ peer2 = kibnal_find_peer_locked (nid);
if (peer2 != NULL) {
- koibnal_put_peer (peer);
+ kibnal_put_peer (peer);
peer = peer2;
} else {
/* peer table takes existing ref on peer */
list_add_tail (&peer->ibp_list,
- koibnal_nid2peerlist (nid));
+ kibnal_nid2peerlist (nid));
}
peer->ibp_persistence++;
- write_unlock_irqrestore (&koibnal_data.koib_global_lock, flags);
+ write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
return (0);
}
void
-koibnal_del_peer_locked (koib_peer_t *peer, int single_share)
+kibnal_del_peer_locked (kib_peer_t *peer, int single_share)
{
struct list_head *ctmp;
struct list_head *cnxt;
- koib_conn_t *conn;
+ kib_conn_t *conn;
if (!single_share)
peer->ibp_persistence = 0;
return;
list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
- conn = list_entry(ctmp, koib_conn_t, ibc_list);
+ conn = list_entry(ctmp, kib_conn_t, ibc_list);
- koibnal_close_conn_locked (conn, 0);
+ kibnal_close_conn_locked (conn, 0);
}
/* NB peer unlinks itself when last conn is closed */
}
int
-koibnal_del_peer (ptl_nid_t nid, int single_share)
+kibnal_del_peer (ptl_nid_t nid, int single_share)
{
unsigned long flags;
struct list_head *ptmp;
struct list_head *pnxt;
- koib_peer_t *peer;
+ kib_peer_t *peer;
int lo;
int hi;
int i;
int rc = -ENOENT;
- write_lock_irqsave (&koibnal_data.koib_global_lock, flags);
+ write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
if (nid != PTL_NID_ANY)
- lo = hi = koibnal_nid2peerlist(nid) - koibnal_data.koib_peers;
+ lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers;
else {
lo = 0;
- hi = koibnal_data.koib_peer_hash_size - 1;
+ hi = kibnal_data.kib_peer_hash_size - 1;
}
for (i = lo; i <= hi; i++) {
- list_for_each_safe (ptmp, pnxt, &koibnal_data.koib_peers[i]) {
- peer = list_entry (ptmp, koib_peer_t, ibp_list);
+ list_for_each_safe (ptmp, pnxt, &kibnal_data.kib_peers[i]) {
+ peer = list_entry (ptmp, kib_peer_t, ibp_list);
LASSERT (peer->ibp_persistence != 0 ||
peer->ibp_connecting != 0 ||
!list_empty (&peer->ibp_conns));
if (!(nid == PTL_NID_ANY || peer->ibp_nid == nid))
continue;
- koibnal_del_peer_locked (peer, single_share);
+ kibnal_del_peer_locked (peer, single_share);
rc = 0; /* matched something */
if (single_share)
}
}
out:
- write_unlock_irqrestore (&koibnal_data.koib_global_lock, flags);
+ write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
return (rc);
}
-koib_conn_t *
-koibnal_get_conn_by_idx (int index)
+kib_conn_t *
+kibnal_get_conn_by_idx (int index)
{
- koib_peer_t *peer;
+ kib_peer_t *peer;
struct list_head *ptmp;
- koib_conn_t *conn;
+ kib_conn_t *conn;
struct list_head *ctmp;
int i;
- read_lock (&koibnal_data.koib_global_lock);
+ read_lock (&kibnal_data.kib_global_lock);
- for (i = 0; i < koibnal_data.koib_peer_hash_size; i++) {
- list_for_each (ptmp, &koibnal_data.koib_peers[i]) {
+ for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
+ list_for_each (ptmp, &kibnal_data.kib_peers[i]) {
- peer = list_entry (ptmp, koib_peer_t, ibp_list);
+ peer = list_entry (ptmp, kib_peer_t, ibp_list);
LASSERT (peer->ibp_persistence > 0 ||
peer->ibp_connecting != 0 ||
!list_empty (&peer->ibp_conns));
if (index-- > 0)
continue;
- conn = list_entry (ctmp, koib_conn_t, ibc_list);
+ conn = list_entry (ctmp, kib_conn_t, ibc_list);
CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
atomic_read (&conn->ibc_refcount));
atomic_inc (&conn->ibc_refcount);
- read_unlock (&koibnal_data.koib_global_lock);
+ read_unlock (&kibnal_data.kib_global_lock);
return (conn);
}
}
}
- read_unlock (&koibnal_data.koib_global_lock);
+ read_unlock (&kibnal_data.kib_global_lock);
return (NULL);
}
-koib_conn_t *
-koibnal_create_conn (void)
+kib_conn_t *
+kibnal_create_conn (void)
{
- koib_conn_t *conn;
+ kib_conn_t *conn;
int i;
__u64 vaddr = 0;
__u64 vaddr_base;
memset (conn, 0, sizeof (*conn));
INIT_LIST_HEAD (&conn->ibc_tx_queue);
- INIT_LIST_HEAD (&conn->ibc_rdma_queue);
+ INIT_LIST_HEAD (&conn->ibc_active_txs);
spin_lock_init (&conn->ibc_lock);
- atomic_inc (&koibnal_data.koib_nconns);
+ atomic_inc (&kibnal_data.kib_nconns);
/* well not really, but I call destroy() on failure, which decrements */
- PORTAL_ALLOC (conn->ibc_rxs, OPENIBNAL_RX_MSGS * sizeof (koib_rx_t));
+ PORTAL_ALLOC (conn->ibc_rxs, IBNAL_RX_MSGS * sizeof (kib_rx_t));
if (conn->ibc_rxs == NULL)
goto failed;
- memset (conn->ibc_rxs, 0, OPENIBNAL_RX_MSGS * sizeof(koib_rx_t));
+ memset (conn->ibc_rxs, 0, IBNAL_RX_MSGS * sizeof(kib_rx_t));
- rc = koibnal_alloc_pages(&conn->ibc_rx_pages,
- OPENIBNAL_RX_MSG_PAGES,
- IB_ACCESS_LOCAL_WRITE);
+ rc = kibnal_alloc_pages(&conn->ibc_rx_pages,
+ IBNAL_RX_MSG_PAGES,
+ IB_ACCESS_LOCAL_WRITE);
if (rc != 0)
goto failed;
- vaddr_base = vaddr = conn->ibc_rx_pages->oibp_vaddr;
+ vaddr_base = vaddr = conn->ibc_rx_pages->ibp_vaddr;
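+ /* Carve the pre-mapped pages into IBNAL_MSG_SIZE rx buffers; the HCA
+ * vaddr and the host pointer advance in lock-step, so rx_vaddr always
+ * names the mapping of rx_msg. */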
- for (i = ipage = page_offset = 0; i < OPENIBNAL_RX_MSGS; i++) {
- struct page *page = conn->ibc_rx_pages->oibp_pages[ipage];
- koib_rx_t *rx = &conn->ibc_rxs[i];
+ for (i = ipage = page_offset = 0; i < IBNAL_RX_MSGS; i++) {
+ struct page *page = conn->ibc_rx_pages->ibp_pages[ipage];
+ kib_rx_t *rx = &conn->ibc_rxs[i];
rx->rx_conn = conn;
rx->rx_vaddr = vaddr;
- rx->rx_msg = (koib_msg_t *)(((char *)page_address(page)) + page_offset);
+ rx->rx_msg = (kib_msg_t *)(((char *)page_address(page)) + page_offset);
- vaddr += OPENIBNAL_MSG_SIZE;
- LASSERT (vaddr <= vaddr_base + OPENIBNAL_RX_MSG_BYTES);
+ vaddr += IBNAL_MSG_SIZE;
+ LASSERT (vaddr <= vaddr_base + IBNAL_RX_MSG_BYTES);
- page_offset += OPENIBNAL_MSG_SIZE;
+ page_offset += IBNAL_MSG_SIZE;
LASSERT (page_offset <= PAGE_SIZE);
if (page_offset == PAGE_SIZE) {
page_offset = 0;
ipage++;
- LASSERT (ipage <= OPENIBNAL_RX_MSG_PAGES);
+ LASSERT (ipage <= IBNAL_RX_MSG_PAGES);
}
}
params.qp_create = (struct ib_qp_create_param) {
.limit = {
/* Sends have an optional RDMA */
- .max_outstanding_send_request = 2 * OPENIBNAL_MSG_QUEUE_SIZE,
- .max_outstanding_receive_request = OPENIBNAL_MSG_QUEUE_SIZE,
+ .max_outstanding_send_request = 2 * IBNAL_MSG_QUEUE_SIZE,
+ .max_outstanding_receive_request = IBNAL_MSG_QUEUE_SIZE,
.max_send_gather_element = 1,
.max_receive_scatter_element = 1,
},
- .pd = koibnal_data.koib_pd,
- .send_queue = koibnal_data.koib_tx_cq,
- .receive_queue = koibnal_data.koib_rx_cq,
+ .pd = kibnal_data.kib_pd,
+ .send_queue = kibnal_data.kib_cq,
+ .receive_queue = kibnal_data.kib_cq,
.send_policy = IB_WQ_SIGNAL_SELECTABLE,
.receive_policy = IB_WQ_SIGNAL_SELECTABLE,
.rd_domain = 0,
}
/* Mark QP created */
- conn->ibc_state = OPENIBNAL_CONN_INIT_QP;
+ conn->ibc_state = IBNAL_CONN_INIT_QP;
params.qp_attr = (struct ib_qp_attribute) {
.state = IB_QP_STATE_INIT,
- .port = koibnal_data.koib_port,
+ .port = kibnal_data.kib_port,
.enable_rdma_read = 1,
.enable_rdma_write = 1,
.valid_fields = (IB_QP_ATTRIBUTE_STATE |
return (conn);
failed:
- koibnal_destroy_conn (conn);
+ kibnal_destroy_conn (conn);
return (NULL);
}
void
-koibnal_destroy_conn (koib_conn_t *conn)
+kibnal_destroy_conn (kib_conn_t *conn)
{
int rc;
LASSERT (atomic_read (&conn->ibc_refcount) == 0);
LASSERT (list_empty(&conn->ibc_tx_queue));
- LASSERT (list_empty(&conn->ibc_rdma_queue));
+ LASSERT (list_empty(&conn->ibc_active_txs));
LASSERT (conn->ibc_nsends_posted == 0);
LASSERT (conn->ibc_connreq == NULL);
switch (conn->ibc_state) {
- case OPENIBNAL_CONN_ZOMBIE:
+ case IBNAL_CONN_ZOMBIE:
/* called after connection sequence initiated */
- case OPENIBNAL_CONN_INIT_QP:
+ case IBNAL_CONN_INIT_QP:
rc = ib_qp_destroy(conn->ibc_qp);
if (rc != 0)
CERROR("Can't destroy QP: %d\n", rc);
/* fall through */
- case OPENIBNAL_CONN_INIT_NOTHING:
+ case IBNAL_CONN_INIT_NOTHING:
break;
default:
}
if (conn->ibc_rx_pages != NULL)
- koibnal_free_pages(conn->ibc_rx_pages);
+ kibnal_free_pages(conn->ibc_rx_pages);
if (conn->ibc_rxs != NULL)
PORTAL_FREE(conn->ibc_rxs,
- OPENIBNAL_RX_MSGS * sizeof(koib_rx_t));
+ IBNAL_RX_MSGS * sizeof(kib_rx_t));
if (conn->ibc_peer != NULL)
- koibnal_put_peer(conn->ibc_peer);
+ kibnal_put_peer(conn->ibc_peer);
PORTAL_FREE(conn, sizeof (*conn));
- atomic_dec(&koibnal_data.koib_nconns);
+ atomic_dec(&kibnal_data.kib_nconns);
- if (atomic_read (&koibnal_data.koib_nconns) == 0 &&
- koibnal_data.koib_shutdown) {
+ if (atomic_read (&kibnal_data.kib_nconns) == 0 &&
+ kibnal_data.kib_shutdown) {
/* I just nuked the last connection on shutdown; wake up
* everyone so they can exit. */
- wake_up_all(&koibnal_data.koib_sched_waitq);
- wake_up_all(&koibnal_data.koib_connd_waitq);
+ wake_up_all(&kibnal_data.kib_sched_waitq);
+ wake_up_all(&kibnal_data.kib_connd_waitq);
}
}
void
-koibnal_put_conn (koib_conn_t *conn)
+kibnal_put_conn (kib_conn_t *conn)
{
unsigned long flags;
return;
/* last ref only goes on zombies */
- LASSERT (conn->ibc_state == OPENIBNAL_CONN_ZOMBIE);
+ LASSERT (conn->ibc_state == IBNAL_CONN_ZOMBIE);
- spin_lock_irqsave (&koibnal_data.koib_connd_lock, flags);
+ spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
- list_add (&conn->ibc_list, &koibnal_data.koib_connd_conns);
- wake_up (&koibnal_data.koib_connd_waitq);
+ list_add (&conn->ibc_list, &kibnal_data.kib_connd_conns);
+ wake_up (&kibnal_data.kib_connd_waitq);
- spin_unlock_irqrestore (&koibnal_data.koib_connd_lock, flags);
+ spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
}
int
-koibnal_close_peer_conns_locked (koib_peer_t *peer, int why)
+kibnal_close_peer_conns_locked (kib_peer_t *peer, int why)
{
- koib_conn_t *conn;
+ kib_conn_t *conn;
struct list_head *ctmp;
struct list_head *cnxt;
int count = 0;
list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
- conn = list_entry (ctmp, koib_conn_t, ibc_list);
+ conn = list_entry (ctmp, kib_conn_t, ibc_list);
count++;
- koibnal_close_conn_locked (conn, why);
+ kibnal_close_conn_locked (conn, why);
}
return (count);
}
int
-koibnal_close_stale_conns_locked (koib_peer_t *peer, __u64 incarnation)
+kibnal_close_stale_conns_locked (kib_peer_t *peer, __u64 incarnation)
{
- koib_conn_t *conn;
+ kib_conn_t *conn;
struct list_head *ctmp;
struct list_head *cnxt;
int count = 0;
list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
- conn = list_entry (ctmp, koib_conn_t, ibc_list);
+ conn = list_entry (ctmp, kib_conn_t, ibc_list);
if (conn->ibc_incarnation == incarnation)
continue;
peer->ibp_nid, conn->ibc_incarnation, incarnation);
count++;
- koibnal_close_conn_locked (conn, -ESTALE);
+ kibnal_close_conn_locked (conn, -ESTALE);
}
return (count);
}
int
-koibnal_close_matching_conns (ptl_nid_t nid)
+kibnal_close_matching_conns (ptl_nid_t nid)
{
unsigned long flags;
- koib_peer_t *peer;
+ kib_peer_t *peer;
struct list_head *ptmp;
struct list_head *pnxt;
int lo;
int i;
int count = 0;
- write_lock_irqsave (&koibnal_data.koib_global_lock, flags);
+ write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
if (nid != PTL_NID_ANY)
- lo = hi = koibnal_nid2peerlist(nid) - koibnal_data.koib_peers;
+ lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers;
else {
lo = 0;
- hi = koibnal_data.koib_peer_hash_size - 1;
+ hi = kibnal_data.kib_peer_hash_size - 1;
}
for (i = lo; i <= hi; i++) {
- list_for_each_safe (ptmp, pnxt, &koibnal_data.koib_peers[i]) {
+ list_for_each_safe (ptmp, pnxt, &kibnal_data.kib_peers[i]) {
- peer = list_entry (ptmp, koib_peer_t, ibp_list);
+ peer = list_entry (ptmp, kib_peer_t, ibp_list);
LASSERT (peer->ibp_persistence != 0 ||
peer->ibp_connecting != 0 ||
!list_empty (&peer->ibp_conns));
if (!(nid == PTL_NID_ANY || nid == peer->ibp_nid))
continue;
- count += koibnal_close_peer_conns_locked (peer, 0);
+ count += kibnal_close_peer_conns_locked (peer, 0);
}
}
- write_unlock_irqrestore (&koibnal_data.koib_global_lock, flags);
+ write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
/* wildcards always succeed */
if (nid == PTL_NID_ANY)
}
int
-koibnal_cmd(struct portals_cfg *pcfg, void * private)
+kibnal_cmd(struct portals_cfg *pcfg, void * private)
{
int rc = -EINVAL;
ptl_nid_t nid = 0;
int share_count = 0;
- rc = koibnal_get_peer_info(pcfg->pcfg_count,
- &nid, &share_count);
+ rc = kibnal_get_peer_info(pcfg->pcfg_count,
+ &nid, &share_count);
pcfg->pcfg_nid = nid;
pcfg->pcfg_size = 0;
pcfg->pcfg_id = 0;
break;
}
case NAL_CMD_ADD_PEER: {
- rc = koibnal_add_persistent_peer (pcfg->pcfg_nid);
+ rc = kibnal_add_persistent_peer (pcfg->pcfg_nid);
break;
}
case NAL_CMD_DEL_PEER: {
- rc = koibnal_del_peer (pcfg->pcfg_nid,
+ rc = kibnal_del_peer (pcfg->pcfg_nid,
/* flags == single_share */
pcfg->pcfg_flags != 0);
break;
}
case NAL_CMD_GET_CONN: {
- koib_conn_t *conn = koibnal_get_conn_by_idx (pcfg->pcfg_count);
+ kib_conn_t *conn = kibnal_get_conn_by_idx (pcfg->pcfg_count);
if (conn == NULL)
rc = -ENOENT;
pcfg->pcfg_id = 0;
pcfg->pcfg_misc = 0;
pcfg->pcfg_flags = 0;
- koibnal_put_conn (conn);
+ kibnal_put_conn (conn);
}
break;
}
case NAL_CMD_CLOSE_CONNECTION: {
- rc = koibnal_close_matching_conns (pcfg->pcfg_nid);
+ rc = kibnal_close_matching_conns (pcfg->pcfg_nid);
break;
}
case NAL_CMD_REGISTER_MYNID: {
if (pcfg->pcfg_nid == PTL_NID_ANY)
rc = -EINVAL;
else
- rc = koibnal_set_mynid (pcfg->pcfg_nid);
+ rc = kibnal_set_mynid (pcfg->pcfg_nid);
break;
}
}
}
void
-koibnal_free_pages (koib_pages_t *p)
+kibnal_free_pages (kib_pages_t *p)
{
- int npages = p->oibp_npages;
+ int npages = p->ibp_npages;
int rc;
int i;
- if (p->oibp_mapped) {
- rc = ib_memory_deregister(p->oibp_handle);
+ if (p->ibp_mapped) {
+ rc = ib_memory_deregister(p->ibp_handle);
if (rc != 0)
CERROR ("Deregister error: %d\n", rc);
}
for (i = 0; i < npages; i++)
- if (p->oibp_pages[i] != NULL)
- __free_page(p->oibp_pages[i]);
+ if (p->ibp_pages[i] != NULL)
+ __free_page(p->ibp_pages[i]);
- PORTAL_FREE (p, offsetof(koib_pages_t, oibp_pages[npages]));
+ PORTAL_FREE (p, offsetof(kib_pages_t, ibp_pages[npages]));
}
int
-koibnal_alloc_pages (koib_pages_t **pp, int npages, int access)
+kibnal_alloc_pages (kib_pages_t **pp, int npages, int access)
{
- koib_pages_t *p;
+ kib_pages_t *p;
struct ib_physical_buffer *phys_pages;
int i;
int rc;
- PORTAL_ALLOC(p, offsetof(koib_pages_t, oibp_pages[npages]));
+ PORTAL_ALLOC(p, offsetof(kib_pages_t, ibp_pages[npages]));
if (p == NULL) {
CERROR ("Can't allocate buffer %d\n", npages);
return (-ENOMEM);
}
- memset (p, 0, offsetof(koib_pages_t, oibp_pages[npages]));
- p->oibp_npages = npages;
+ memset (p, 0, offsetof(kib_pages_t, ibp_pages[npages]));
+ p->ibp_npages = npages;
for (i = 0; i < npages; i++) {
- p->oibp_pages[i] = alloc_page (GFP_KERNEL);
- if (p->oibp_pages[i] == NULL) {
+ p->ibp_pages[i] = alloc_page (GFP_KERNEL);
+ if (p->ibp_pages[i] == NULL) {
CERROR ("Can't allocate page %d of %d\n", i, npages);
- koibnal_free_pages(p);
+ kibnal_free_pages(p);
return (-ENOMEM);
}
}
PORTAL_ALLOC(phys_pages, npages * sizeof(*phys_pages));
if (phys_pages == NULL) {
CERROR ("Can't allocate physarray for %d pages\n", npages);
- koibnal_free_pages(p);
+ kibnal_free_pages(p);
return (-ENOMEM);
}
for (i = 0; i < npages; i++) {
phys_pages[i].size = PAGE_SIZE;
phys_pages[i].address =
- koibnal_page2phys(p->oibp_pages[i]);
+ kibnal_page2phys(p->ibp_pages[i]);
}
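+ /* Register the physical page list with the HCA; ibp_vaddr is zeroed
+ * first and (presumably) returned as the HCA-assigned IO virtual base,
+ * with ibp_lkey/ibp_rkey granting local/remote access. */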
- p->oibp_vaddr = 0;
- rc = ib_memory_register_physical(koibnal_data.koib_pd,
+ p->ibp_vaddr = 0;
+ rc = ib_memory_register_physical(kibnal_data.kib_pd,
phys_pages, npages,
- &p->oibp_vaddr,
+ &p->ibp_vaddr,
npages * PAGE_SIZE, 0,
access,
- &p->oibp_handle,
- &p->oibp_lkey,
- &p->oibp_rkey);
+ &p->ibp_handle,
+ &p->ibp_lkey,
+ &p->ibp_rkey);
PORTAL_FREE(phys_pages, npages * sizeof(*phys_pages));
if (rc != 0) {
CERROR ("Error %d mapping %d pages\n", rc, npages);
- koibnal_free_pages(p);
+ kibnal_free_pages(p);
return (rc);
}
- p->oibp_mapped = 1;
+ p->ibp_mapped = 1;
*pp = p;
return (0);
}
int
-koibnal_setup_tx_descs (void)
+kibnal_setup_tx_descs (void)
{
int ipage = 0;
int page_offset = 0;
__u64 vaddr;
__u64 vaddr_base;
struct page *page;
- koib_tx_t *tx;
+ kib_tx_t *tx;
int i;
int rc;
/* pre-mapped messages are not bigger than 1 page */
- LASSERT (OPENIBNAL_MSG_SIZE <= PAGE_SIZE);
+ LASSERT (IBNAL_MSG_SIZE <= PAGE_SIZE);
/* No fancy arithmetic when we do the buffer calculations */
- LASSERT (PAGE_SIZE % OPENIBNAL_MSG_SIZE == 0);
+ LASSERT (PAGE_SIZE % IBNAL_MSG_SIZE == 0);
- rc = koibnal_alloc_pages(&koibnal_data.koib_tx_pages,
- OPENIBNAL_TX_MSG_PAGES,
- 0); /* local read access only */
+ rc = kibnal_alloc_pages(&kibnal_data.kib_tx_pages,
+ IBNAL_TX_MSG_PAGES,
+ 0); /* local read access only */
if (rc != 0)
return (rc);
- vaddr = vaddr_base = koibnal_data.koib_tx_pages->oibp_vaddr;
+ vaddr = vaddr_base = kibnal_data.kib_tx_pages->ibp_vaddr;
- for (i = 0; i < OPENIBNAL_TX_MSGS; i++) {
- page = koibnal_data.koib_tx_pages->oibp_pages[ipage];
- tx = &koibnal_data.koib_tx_descs[i];
+ for (i = 0; i < IBNAL_TX_MSGS; i++) {
+ page = kibnal_data.kib_tx_pages->ibp_pages[ipage];
+ tx = &kibnal_data.kib_tx_descs[i];
memset (tx, 0, sizeof(*tx)); /* zero flags etc */
- tx->tx_msg = (koib_msg_t *)(((char *)page_address(page)) + page_offset);
+ tx->tx_msg = (kib_msg_t *)(((char *)page_address(page)) + page_offset);
tx->tx_vaddr = vaddr;
- tx->tx_isnblk = (i >= OPENIBNAL_NTX);
- tx->tx_mapped = KOIB_TX_UNMAPPED;
+ tx->tx_isnblk = (i >= IBNAL_NTX);
+ tx->tx_mapped = KIB_TX_UNMAPPED;
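+ /* Descriptors beyond IBNAL_NTX are reserved for non-blocking sends
+ * and live on their own idle list (kib_idle_nblk_txs). */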
CDEBUG(D_NET, "Tx[%d] %p->%p - "LPX64"\n",
i, tx, tx->tx_msg, tx->tx_vaddr);
if (tx->tx_isnblk)
list_add (&tx->tx_list,
- &koibnal_data.koib_idle_nblk_txs);
+ &kibnal_data.kib_idle_nblk_txs);
else
list_add (&tx->tx_list,
- &koibnal_data.koib_idle_txs);
+ &kibnal_data.kib_idle_txs);
- vaddr += OPENIBNAL_MSG_SIZE;
- LASSERT (vaddr <= vaddr_base + OPENIBNAL_TX_MSG_BYTES);
+ vaddr += IBNAL_MSG_SIZE;
+ LASSERT (vaddr <= vaddr_base + IBNAL_TX_MSG_BYTES);
- page_offset += OPENIBNAL_MSG_SIZE;
+ page_offset += IBNAL_MSG_SIZE;
LASSERT (page_offset <= PAGE_SIZE);
if (page_offset == PAGE_SIZE) {
page_offset = 0;
ipage++;
- LASSERT (ipage <= OPENIBNAL_TX_MSG_PAGES);
+ LASSERT (ipage <= IBNAL_TX_MSG_PAGES);
}
}
}
void
-koibnal_api_shutdown (nal_t *nal)
+kibnal_api_shutdown (nal_t *nal)
{
int i;
int rc;
CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n",
atomic_read (&portal_kmemory));
- LASSERT(nal == &koibnal_api);
+ LASSERT(nal == &kibnal_api);
- switch (koibnal_data.koib_init) {
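+ /* Teardown runs the setup stages in reverse: each kib_init level
+ * falls through to undo everything initialised before it. */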
+ switch (kibnal_data.kib_init) {
default:
- CERROR ("Unexpected state %d\n", koibnal_data.koib_init);
+ CERROR ("Unexpected state %d\n", kibnal_data.kib_init);
LBUG();
- case OPENIBNAL_INIT_ALL:
+ case IBNAL_INIT_ALL:
/* stop calls to nal_cmd */
libcfs_nal_cmd_unregister(OPENIBNAL);
/* No new peers */
/* resetting my NID to unadvertises me, removes my
* listener and nukes all current peers */
- koibnal_set_mynid (PTL_NID_ANY);
+ kibnal_set_mynid (PTL_NID_ANY);
/* Wait for all peer state to clean up */
i = 2;
- while (atomic_read (&koibnal_data.koib_npeers) != 0) {
+ while (atomic_read (&kibnal_data.kib_npeers) != 0) {
i++;
CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
"waiting for %d peers to close down\n",
- atomic_read (&koibnal_data.koib_npeers));
+ atomic_read (&kibnal_data.kib_npeers));
set_current_state (TASK_INTERRUPTIBLE);
schedule_timeout (HZ);
}
/* fall through */
- case OPENIBNAL_INIT_TX_CQ:
- rc = ib_cq_destroy (koibnal_data.koib_tx_cq);
- if (rc != 0)
- CERROR ("Destroy tx CQ error: %d\n", rc);
- /* fall through */
-
- case OPENIBNAL_INIT_RX_CQ:
- rc = ib_cq_destroy (koibnal_data.koib_rx_cq);
+ case IBNAL_INIT_CQ:
+ rc = ib_cq_destroy (kibnal_data.kib_cq);
if (rc != 0)
- CERROR ("Destroy rx CQ error: %d\n", rc);
+ CERROR ("Destroy CQ error: %d\n", rc);
/* fall through */
- case OPENIBNAL_INIT_TXD:
- koibnal_free_pages (koibnal_data.koib_tx_pages);
+ case IBNAL_INIT_TXD:
+ kibnal_free_pages (kibnal_data.kib_tx_pages);
/* fall through */
-#if OPENIBNAL_FMR
- case OPENIBNAL_INIT_FMR:
- rc = ib_fmr_pool_destroy (koibnal_data.koib_fmr_pool);
+#if IBNAL_FMR
+ case IBNAL_INIT_FMR:
+ rc = ib_fmr_pool_destroy (kibnal_data.kib_fmr_pool);
if (rc != 0)
CERROR ("Destroy FMR pool error: %d\n", rc);
/* fall through */
#endif
- case OPENIBNAL_INIT_PD:
- rc = ib_pd_destroy(koibnal_data.koib_pd);
+ case IBNAL_INIT_PD:
+ rc = ib_pd_destroy(kibnal_data.kib_pd);
if (rc != 0)
CERROR ("Destroy PD error: %d\n", rc);
/* fall through */
- case OPENIBNAL_INIT_LIB:
- lib_fini(&koibnal_lib);
+ case IBNAL_INIT_LIB:
+ lib_fini(&kibnal_lib);
/* fall through */
- case OPENIBNAL_INIT_DATA:
+ case IBNAL_INIT_DATA:
/* Module refcount only gets to zero when all peers
* have been closed so all lists must be empty */
- LASSERT (atomic_read (&koibnal_data.koib_npeers) == 0);
- LASSERT (koibnal_data.koib_peers != NULL);
- for (i = 0; i < koibnal_data.koib_peer_hash_size; i++) {
- LASSERT (list_empty (&koibnal_data.koib_peers[i]));
+ LASSERT (atomic_read (&kibnal_data.kib_npeers) == 0);
+ LASSERT (kibnal_data.kib_peers != NULL);
+ for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
+ LASSERT (list_empty (&kibnal_data.kib_peers[i]));
}
- LASSERT (atomic_read (&koibnal_data.koib_nconns) == 0);
- LASSERT (list_empty (&koibnal_data.koib_sched_rxq));
- LASSERT (list_empty (&koibnal_data.koib_sched_txq));
- LASSERT (list_empty (&koibnal_data.koib_connd_conns));
- LASSERT (list_empty (&koibnal_data.koib_connd_peers));
+ LASSERT (atomic_read (&kibnal_data.kib_nconns) == 0);
+ LASSERT (list_empty (&kibnal_data.kib_sched_rxq));
+ LASSERT (list_empty (&kibnal_data.kib_sched_txq));
+ LASSERT (list_empty (&kibnal_data.kib_connd_conns));
+ LASSERT (list_empty (&kibnal_data.kib_connd_peers));
/* flag threads to terminate; wake and wait for them to die */
- koibnal_data.koib_shutdown = 1;
- wake_up_all (&koibnal_data.koib_sched_waitq);
- wake_up_all (&koibnal_data.koib_connd_waitq);
+ kibnal_data.kib_shutdown = 1;
+ wake_up_all (&kibnal_data.kib_sched_waitq);
+ wake_up_all (&kibnal_data.kib_connd_waitq);
i = 2;
- while (atomic_read (&koibnal_data.koib_nthreads) != 0) {
+ while (atomic_read (&kibnal_data.kib_nthreads) != 0) {
i++;
CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
"Waiting for %d threads to terminate\n",
- atomic_read (&koibnal_data.koib_nthreads));
+ atomic_read (&kibnal_data.kib_nthreads));
set_current_state (TASK_INTERRUPTIBLE);
schedule_timeout (HZ);
}
/* fall through */
- case OPENIBNAL_INIT_NOTHING:
+ case IBNAL_INIT_NOTHING:
break;
}
- if (koibnal_data.koib_tx_descs != NULL)
- PORTAL_FREE (koibnal_data.koib_tx_descs,
- OPENIBNAL_TX_MSGS * sizeof(koib_tx_t));
+ if (kibnal_data.kib_tx_descs != NULL)
+ PORTAL_FREE (kibnal_data.kib_tx_descs,
+ IBNAL_TX_MSGS * sizeof(kib_tx_t));
- if (koibnal_data.koib_peers != NULL)
- PORTAL_FREE (koibnal_data.koib_peers,
+ if (kibnal_data.kib_peers != NULL)
+ PORTAL_FREE (kibnal_data.kib_peers,
sizeof (struct list_head) *
- koibnal_data.koib_peer_hash_size);
+ kibnal_data.kib_peer_hash_size);
CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n",
atomic_read (&portal_kmemory));
printk(KERN_INFO "Lustre: OpenIB NAL unloaded (final mem %d)\n",
atomic_read(&portal_kmemory));
- koibnal_data.koib_init = OPENIBNAL_INIT_NOTHING;
+ kibnal_data.kib_init = IBNAL_INIT_NOTHING;
}
int
-koibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
+kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
ptl_ni_limits_t *requested_limits,
ptl_ni_limits_t *actual_limits)
{
int rc;
int i;
- LASSERT (nal == &koibnal_api);
+ LASSERT (nal == &kibnal_api);
if (nal->nal_refct != 0) {
if (actual_limits != NULL)
- *actual_limits = koibnal_lib.libnal_ni.ni_actual_limits;
+ *actual_limits = kibnal_lib.libnal_ni.ni_actual_limits;
/* This module got the first ref */
PORTAL_MODULE_USE;
return (PTL_OK);
}
- LASSERT (koibnal_data.koib_init == OPENIBNAL_INIT_NOTHING);
+ LASSERT (kibnal_data.kib_init == IBNAL_INIT_NOTHING);
- memset (&koibnal_data, 0, sizeof (koibnal_data)); /* zero pointers, flags etc */
+ memset (&kibnal_data, 0, sizeof (kibnal_data)); /* zero pointers, flags etc */
- init_MUTEX (&koibnal_data.koib_nid_mutex);
- init_MUTEX_LOCKED (&koibnal_data.koib_nid_signal);
- koibnal_data.koib_nid = PTL_NID_ANY;
+ init_MUTEX (&kibnal_data.kib_nid_mutex);
+ init_MUTEX_LOCKED (&kibnal_data.kib_nid_signal);
+ kibnal_data.kib_nid = PTL_NID_ANY;
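+ /* kib_nid_signal starts locked: the down() in advertise/unadvertise
+ * blocks until the SM completion callback ups it. */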
- rwlock_init(&koibnal_data.koib_global_lock);
+ rwlock_init(&kibnal_data.kib_global_lock);
- koibnal_data.koib_peer_hash_size = OPENIBNAL_PEER_HASH_SIZE;
- PORTAL_ALLOC (koibnal_data.koib_peers,
- sizeof (struct list_head) * koibnal_data.koib_peer_hash_size);
- if (koibnal_data.koib_peers == NULL) {
+ kibnal_data.kib_peer_hash_size = IBNAL_PEER_HASH_SIZE;
+ PORTAL_ALLOC (kibnal_data.kib_peers,
+ sizeof (struct list_head) * kibnal_data.kib_peer_hash_size);
+ if (kibnal_data.kib_peers == NULL) {
goto failed;
}
- for (i = 0; i < koibnal_data.koib_peer_hash_size; i++)
- INIT_LIST_HEAD(&koibnal_data.koib_peers[i]);
-
- spin_lock_init (&koibnal_data.koib_connd_lock);
- INIT_LIST_HEAD (&koibnal_data.koib_connd_peers);
- INIT_LIST_HEAD (&koibnal_data.koib_connd_conns);
- init_waitqueue_head (&koibnal_data.koib_connd_waitq);
-
- spin_lock_init (&koibnal_data.koib_sched_lock);
- INIT_LIST_HEAD (&koibnal_data.koib_sched_txq);
- INIT_LIST_HEAD (&koibnal_data.koib_sched_rxq);
- init_waitqueue_head (&koibnal_data.koib_sched_waitq);
-
- spin_lock_init (&koibnal_data.koib_tx_lock);
- INIT_LIST_HEAD (&koibnal_data.koib_idle_txs);
- INIT_LIST_HEAD (&koibnal_data.koib_idle_nblk_txs);
- init_waitqueue_head(&koibnal_data.koib_idle_tx_waitq);
-
- PORTAL_ALLOC (koibnal_data.koib_tx_descs,
- OPENIBNAL_TX_MSGS * sizeof(koib_tx_t));
- if (koibnal_data.koib_tx_descs == NULL) {
+ for (i = 0; i < kibnal_data.kib_peer_hash_size; i++)
+ INIT_LIST_HEAD(&kibnal_data.kib_peers[i]);
+
+ spin_lock_init (&kibnal_data.kib_connd_lock);
+ INIT_LIST_HEAD (&kibnal_data.kib_connd_peers);
+ INIT_LIST_HEAD (&kibnal_data.kib_connd_conns);
+ init_waitqueue_head (&kibnal_data.kib_connd_waitq);
+
+ spin_lock_init (&kibnal_data.kib_sched_lock);
+ INIT_LIST_HEAD (&kibnal_data.kib_sched_txq);
+ INIT_LIST_HEAD (&kibnal_data.kib_sched_rxq);
+ init_waitqueue_head (&kibnal_data.kib_sched_waitq);
+
+ spin_lock_init (&kibnal_data.kib_tx_lock);
+ INIT_LIST_HEAD (&kibnal_data.kib_idle_txs);
+ INIT_LIST_HEAD (&kibnal_data.kib_idle_nblk_txs);
+ init_waitqueue_head(&kibnal_data.kib_idle_tx_waitq);
+
+ PORTAL_ALLOC (kibnal_data.kib_tx_descs,
+ IBNAL_TX_MSGS * sizeof(kib_tx_t));
+ if (kibnal_data.kib_tx_descs == NULL) {
CERROR ("Can't allocate tx descs\n");
goto failed;
}
/* lists/ptrs/locks initialised */
- koibnal_data.koib_init = OPENIBNAL_INIT_DATA;
+ kibnal_data.kib_init = IBNAL_INIT_DATA;
/*****************************************************/
+
process_id.pid = requested_pid;
- process_id.nid = koibnal_data.koib_nid;
+ process_id.nid = kibnal_data.kib_nid;
- rc = lib_init(&koibnal_lib, nal, process_id,
+ rc = lib_init(&kibnal_lib, nal, process_id,
requested_limits, actual_limits);
if (rc != PTL_OK) {
CERROR("lib_init failed: error %d\n", rc);
}
/* lib interface initialised */
- koibnal_data.koib_init = OPENIBNAL_INIT_LIB;
+ kibnal_data.kib_init = IBNAL_INIT_LIB;
/*****************************************************/
- for (i = 0; i < OPENIBNAL_N_SCHED; i++) {
- rc = koibnal_thread_start (koibnal_scheduler, (void *)i);
+ for (i = 0; i < IBNAL_N_SCHED; i++) {
+ rc = kibnal_thread_start (kibnal_scheduler, (void *)i);
if (rc != 0) {
CERROR("Can't spawn openibnal scheduler[%d]: %d\n",
i, rc);
}
}
- rc = koibnal_thread_start (koibnal_connd, NULL);
+ rc = kibnal_thread_start (kibnal_connd, NULL);
if (rc != 0) {
CERROR ("Can't spawn openibnal connd: %d\n", rc);
goto failed;
}
- koibnal_data.koib_device = ib_device_get_by_index(0);
- if (koibnal_data.koib_device == NULL) {
+ kibnal_data.kib_device = ib_device_get_by_index(0);
+ if (kibnal_data.kib_device == NULL) {
CERROR ("Can't open ib device 0\n");
goto failed;
}
- rc = ib_device_properties_get(koibnal_data.koib_device,
- &koibnal_data.koib_device_props);
+ rc = ib_device_properties_get(kibnal_data.kib_device,
+ &kibnal_data.kib_device_props);
if (rc != 0) {
CERROR ("Can't get device props: %d\n", rc);
goto failed;
}
CDEBUG(D_NET, "Max Initiator: %d Max Responder %d\n",
- koibnal_data.koib_device_props.max_initiator_per_qp,
- koibnal_data.koib_device_props.max_responder_per_qp);
+ kibnal_data.kib_device_props.max_initiator_per_qp,
+ kibnal_data.kib_device_props.max_responder_per_qp);
- koibnal_data.koib_port = 0;
+ kibnal_data.kib_port = 0;
for (i = 1; i <= 2; i++) {
- rc = ib_port_properties_get(koibnal_data.koib_device, i,
- &koibnal_data.koib_port_props);
+ rc = ib_port_properties_get(kibnal_data.kib_device, i,
+ &kibnal_data.kib_port_props);
if (rc == 0) {
- koibnal_data.koib_port = i;
+ kibnal_data.kib_port = i;
break;
}
}
- if (koibnal_data.koib_port == 0) {
+ if (kibnal_data.kib_port == 0) {
CERROR ("Can't find a port\n");
goto failed;
}
- rc = ib_pd_create(koibnal_data.koib_device,
- NULL, &koibnal_data.koib_pd);
+ rc = ib_pd_create(kibnal_data.kib_device,
+ NULL, &kibnal_data.kib_pd);
if (rc != 0) {
CERROR ("Can't create PD: %d\n", rc);
goto failed;
}
/* flag PD initialised */
- koibnal_data.koib_init = OPENIBNAL_INIT_PD;
+ kibnal_data.kib_init = IBNAL_INIT_PD;
/*****************************************************/
-#if OPENIBNAL_FMR
+#if IBNAL_FMR
{
- const int pool_size = OPENIBNAL_NTX + OPENIBNAL_NTX_NBLK;
+ const int pool_size = IBNAL_NTX + IBNAL_NTX_NBLK;
struct ib_fmr_pool_param params = {
.max_pages_per_fmr = PTL_MTU/PAGE_SIZE,
.access = (IB_ACCESS_LOCAL_WRITE |
.flush_arg = NULL,
.cache = 1,
};
- rc = ib_fmr_pool_create(koibnal_data.koib_pd, ¶ms,
- &koibnal_data.koib_fmr_pool);
+ rc = ib_fmr_pool_create(kibnal_data.kib_pd, ¶ms,
+ &kibnal_data.kib_fmr_pool);
if (rc != 0) {
CERROR ("Can't create FMR pool size %d: %d\n",
pool_size, rc);
}
/* flag FMR pool initialised */
- koibnal_data.koib_init = OPENIBNAL_INIT_FMR;
+ kibnal_data.kib_init = IBNAL_INIT_FMR;
#endif
/*****************************************************/
- rc = koibnal_setup_tx_descs();
+ rc = kibnal_setup_tx_descs();
if (rc != 0) {
CERROR ("Can't register tx descs: %d\n", rc);
goto failed;
}
/* flag TX descs initialised */
- koibnal_data.koib_init = OPENIBNAL_INIT_TXD;
+ kibnal_data.kib_init = IBNAL_INIT_TXD;
/*****************************************************/
{
struct ib_cq_callback callback = {
- .context = OPENIBNAL_CALLBACK_CTXT,
+ .context = IBNAL_CALLBACK_CTXT,
.policy = IB_CQ_PROVIDER_REARM,
.function = {
- .entry = koibnal_rx_callback,
+ .entry = kibnal_callback,
},
.arg = NULL,
};
- int nentries = OPENIBNAL_RX_CQ_ENTRIES;
+ int nentries = IBNAL_CQ_ENTRIES;
- rc = ib_cq_create (koibnal_data.koib_device,
+ rc = ib_cq_create (kibnal_data.kib_device,
&nentries, &callback, NULL,
- &koibnal_data.koib_rx_cq);
+ &kibnal_data.kib_cq);
if (rc != 0) {
- CERROR ("Can't create RX CQ: %d\n", rc);
+ CERROR ("Can't create CQ: %d\n", rc);
goto failed;
}
/* I only want solicited events */
- rc = ib_cq_request_notification(koibnal_data.koib_rx_cq, 1);
+ rc = ib_cq_request_notification(kibnal_data.kib_cq, 1);
LASSERT (rc == 0);
}
- /* flag RX CQ initialised */
- koibnal_data.koib_init = OPENIBNAL_INIT_RX_CQ;
- /*****************************************************/
-
- {
- struct ib_cq_callback callback = {
- .context = OPENIBNAL_CALLBACK_CTXT,
- .policy = IB_CQ_PROVIDER_REARM,
- .function = {
- .entry = koibnal_tx_callback,
- },
- .arg = NULL,
- };
- int nentries = OPENIBNAL_TX_CQ_ENTRIES;
-
- rc = ib_cq_create (koibnal_data.koib_device,
- &nentries, &callback, NULL,
- &koibnal_data.koib_tx_cq);
- if (rc != 0) {
- CERROR ("Can't create RX CQ: %d\n", rc);
- goto failed;
- }
-
- /* I only want solicited events */
- rc = ib_cq_request_notification(koibnal_data.koib_tx_cq, 1);
- LASSERT (rc == 0);
- }
-
- /* flag TX CQ initialised */
- koibnal_data.koib_init = OPENIBNAL_INIT_TX_CQ;
+ /* flag CQ initialised */
+ kibnal_data.kib_init = IBNAL_INIT_CQ;
/*****************************************************/
- rc = libcfs_nal_cmd_register(OPENIBNAL, &koibnal_cmd, NULL);
+ rc = libcfs_nal_cmd_register(OPENIBNAL, &kibnal_cmd, NULL);
if (rc != 0) {
CERROR ("Can't initialise command interface (rc = %d)\n", rc);
goto failed;
}
/* flag everything initialised */
- koibnal_data.koib_init = OPENIBNAL_INIT_ALL;
+ kibnal_data.kib_init = IBNAL_INIT_ALL;
/*****************************************************/
printk(KERN_INFO "Lustre: OpenIB NAL loaded "
return (PTL_OK);
failed:
- koibnal_api_shutdown (&koibnal_api);
+ kibnal_api_shutdown (&kibnal_api);
return (PTL_FAIL);
}
void __exit
-koibnal_module_fini (void)
+kibnal_module_fini (void)
{
#ifdef CONFIG_SYSCTL
- if (koibnal_tunables.koib_sysctl != NULL)
- unregister_sysctl_table (koibnal_tunables.koib_sysctl);
+ if (kibnal_tunables.kib_sysctl != NULL)
+ unregister_sysctl_table (kibnal_tunables.kib_sysctl);
#endif
- PtlNIFini(koibnal_ni);
+ PtlNIFini(kibnal_ni);
ptl_unregister_nal(OPENIBNAL);
}
int __init
-koibnal_module_init (void)
+kibnal_module_init (void)
{
int rc;
/* the following must be sizeof(int) for proc_dointvec() */
- LASSERT(sizeof (koibnal_tunables.koib_io_timeout) == sizeof (int));
+ LASSERT(sizeof (kibnal_tunables.kib_io_timeout) == sizeof (int));
- koibnal_api.nal_ni_init = koibnal_api_startup;
- koibnal_api.nal_ni_fini = koibnal_api_shutdown;
+ kibnal_api.nal_ni_init = kibnal_api_startup;
+ kibnal_api.nal_ni_fini = kibnal_api_shutdown;
/* Initialise dynamic tunables to defaults once only */
- koibnal_tunables.koib_io_timeout = OPENIBNAL_IO_TIMEOUT;
+ kibnal_tunables.kib_io_timeout = IBNAL_IO_TIMEOUT;
- rc = ptl_register_nal(OPENIBNAL, &koibnal_api);
+ rc = ptl_register_nal(OPENIBNAL, &kibnal_api);
if (rc != PTL_OK) {
- CERROR("Can't register OPENIBNAL: %d\n", rc);
+ CERROR("Can't register IBNAL: %d\n", rc);
return (-ENOMEM); /* or something... */
}
/* Pure gateways want the NAL started up at module load time... */
- rc = PtlNIInit(OPENIBNAL, LUSTRE_SRV_PTL_PID, NULL, NULL, &koibnal_ni);
+ rc = PtlNIInit(OPENIBNAL, LUSTRE_SRV_PTL_PID, NULL, NULL, &kibnal_ni);
if (rc != PTL_OK && rc != PTL_IFACE_DUP) {
ptl_unregister_nal(OPENIBNAL);
return (-ENODEV);
#ifdef CONFIG_SYSCTL
/* Press on regardless even if registering sysctl doesn't work */
- koibnal_tunables.koib_sysctl =
- register_sysctl_table (koibnal_top_ctl_table, 0);
+ kibnal_tunables.kib_sysctl =
+ register_sysctl_table (kibnal_top_ctl_table, 0);
#endif
return (0);
}
MODULE_DESCRIPTION("Kernel OpenIB NAL v0.01");
MODULE_LICENSE("GPL");
-module_init(koibnal_module_init);
-module_exit(koibnal_module_fini);
+module_init(kibnal_module_init);
+module_exit(kibnal_module_fini);
#include <linux/kmod.h>
#include <linux/sysctl.h>
-#define DEBUG_SUBSYSTEM S_OPENIBNAL
+#define DEBUG_SUBSYSTEM S_IBNAL
#include <linux/kp30.h>
#include <portals/p30.h>
#include <ts_ib_cm.h>
#include <ts_ib_sa_client.h>
-#define OPENIBNAL_SERVICE_NAME "openibnal"
+#define IBNAL_SERVICE_NAME "openibnal"
#if CONFIG_SMP
-# define OPENIBNAL_N_SCHED num_online_cpus() /* # schedulers */
+# define IBNAL_N_SCHED num_online_cpus() /* # schedulers */
#else
-# define OPENIBNAL_N_SCHED 1 /* # schedulers */
+# define IBNAL_N_SCHED 1 /* # schedulers */
#endif
-#define OPENIBNAL_MIN_RECONNECT_INTERVAL HZ /* first failed connection retry... */
-#define OPENIBNAL_MAX_RECONNECT_INTERVAL (60*HZ) /* ...exponentially increasing to this */
+#define IBNAL_MIN_RECONNECT_INTERVAL HZ /* first failed connection retry... */
+#define IBNAL_MAX_RECONNECT_INTERVAL (60*HZ) /* ...exponentially increasing to this */
-#define OPENIBNAL_MSG_SIZE (4<<10) /* max size of queued messages (inc hdr) */
+#define IBNAL_MSG_SIZE (4<<10) /* max size of queued messages (inc hdr) */
-#define OPENIBNAL_MSG_QUEUE_SIZE 8 /* # messages in-flight */
-#define OPENIBNAL_CREDIT_HIGHWATER 6 /* when to eagerly return credits */
-#define OPENIBNAL_RETRY 7 /* # times to retry */
-#define OPENIBNAL_RNR_RETRY 7 /* */
-#define OPENIBNAL_CM_RETRY 7 /* # times to retry connection */
-#define OPENIBNAL_FLOW_CONTROL 1
-#define OPENIBNAL_RESPONDER_RESOURCES 8
+#define IBNAL_MSG_QUEUE_SIZE 8 /* # messages/RDMAs in-flight */
+#define IBNAL_CREDIT_HIGHWATER 6 /* when to eagerly return credits */
+#define IBNAL_RETRY 7 /* # times to retry */
+#define IBNAL_RNR_RETRY 7 /* # receiver-not-ready retries */
+#define IBNAL_CM_RETRY 7 /* # times to retry connection */
+#define IBNAL_FLOW_CONTROL 1
+#define IBNAL_RESPONDER_RESOURCES 8
-#define OPENIBNAL_NTX 64 /* # tx descs */
-#define OPENIBNAL_NTX_NBLK 256 /* # reserved tx descs */
+#define IBNAL_NTX 64 /* # tx descs */
+#define IBNAL_NTX_NBLK 256 /* # reserved tx descs */
-#define OPENIBNAL_PEER_HASH_SIZE 101 /* # peer lists */
+#define IBNAL_PEER_HASH_SIZE 101 /* # peer lists */
-#define OPENIBNAL_RESCHED 100 /* # scheduler loops before reschedule */
+#define IBNAL_RESCHED 100 /* # scheduler loops before reschedule */
-#define OPENIBNAL_CONCURRENT_PEERS 1000 /* # nodes all talking at once to me */
+#define IBNAL_CONCURRENT_PEERS 1000 /* # nodes all talking at once to me */
/* default vals for runtime tunables */
-#define OPENIBNAL_IO_TIMEOUT 50 /* default comms timeout (seconds) */
+#define IBNAL_IO_TIMEOUT 50 /* default comms timeout (seconds) */
/************************/
/* derived constants... */
/* TX messages (shared by all connections) */
-#define OPENIBNAL_TX_MSGS (OPENIBNAL_NTX + OPENIBNAL_NTX_NBLK)
-#define OPENIBNAL_TX_MSG_BYTES (OPENIBNAL_TX_MSGS * OPENIBNAL_MSG_SIZE)
-#define OPENIBNAL_TX_MSG_PAGES ((OPENIBNAL_TX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE)
-
-/* we may have up to 2 completions per transmit */
-#define OPENIBNAL_TX_CQ_ENTRIES (2*OPENIBNAL_TX_MSGS)
+#define IBNAL_TX_MSGS (IBNAL_NTX + IBNAL_NTX_NBLK)
+#define IBNAL_TX_MSG_BYTES (IBNAL_TX_MSGS * IBNAL_MSG_SIZE)
+#define IBNAL_TX_MSG_PAGES ((IBNAL_TX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE)
/* RX messages (per connection) */
-#define OPENIBNAL_RX_MSGS OPENIBNAL_MSG_QUEUE_SIZE
-#define OPENIBNAL_RX_MSG_BYTES (OPENIBNAL_RX_MSGS * OPENIBNAL_MSG_SIZE)
-#define OPENIBNAL_RX_MSG_PAGES ((OPENIBNAL_RX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE)
+#define IBNAL_RX_MSGS IBNAL_MSG_QUEUE_SIZE
+#define IBNAL_RX_MSG_BYTES (IBNAL_RX_MSGS * IBNAL_MSG_SIZE)
+#define IBNAL_RX_MSG_PAGES ((IBNAL_RX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE)
-/* 1 completion per receive, per connection */
-#define OPENIBNAL_RX_CQ_ENTRIES (OPENIBNAL_RX_MSGS * OPENIBNAL_CONCURRENT_PEERS)
+/* we may have up to 2 completions per transmit +
+ 1 completion per receive, per connection */
+#define IBNAL_CQ_ENTRIES ((2*IBNAL_TX_MSGS) + \
+ (IBNAL_RX_MSGS * IBNAL_CONCURRENT_PEERS))
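+/* e.g. with the defaults above: 2*(64+256) + 8*1000 = 8640 entries */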
-#define OPENIBNAL_RDMA_BASE 0x0eeb0000
-#define OPENIBNAL_FMR 1
-#define OPENIBNAL_CKSUM 0
-//#define OPENIBNAL_CALLBACK_CTXT IB_CQ_CALLBACK_PROCESS
-#define OPENIBNAL_CALLBACK_CTXT IB_CQ_CALLBACK_INTERRUPT
+#define IBNAL_RDMA_BASE 0x0eeb0000
+#define IBNAL_FMR 1
+#define IBNAL_CKSUM 0
+#define IBNAL_CHECK_ADVERT 1 /* define to 0 to skip the advert re-check in kibnal_set_mynid() */
+//#define IBNAL_CALLBACK_CTXT IB_CQ_CALLBACK_PROCESS
+#define IBNAL_CALLBACK_CTXT IB_CQ_CALLBACK_INTERRUPT
typedef struct
{
- int koib_io_timeout; /* comms timeout (seconds) */
- struct ctl_table_header *koib_sysctl; /* sysctl interface */
-} koib_tunables_t;
+ int kib_io_timeout; /* comms timeout (seconds) */
+ struct ctl_table_header *kib_sysctl; /* sysctl interface */
+} kib_tunables_t;
typedef struct
{
- int oibp_npages; /* # pages */
- int oibp_mapped; /* mapped? */
- __u64 oibp_vaddr; /* mapped region vaddr */
- __u32 oibp_lkey; /* mapped region lkey */
- __u32 oibp_rkey; /* mapped region rkey */
- struct ib_mr *oibp_handle; /* mapped region handle */
- struct page *oibp_pages[0];
-} koib_pages_t;
+ int ibp_npages; /* # pages */
+ int ibp_mapped; /* mapped? */
+ __u64 ibp_vaddr; /* mapped region vaddr */
+ __u32 ibp_lkey; /* mapped region lkey */
+ __u32 ibp_rkey; /* mapped region rkey */
+ struct ib_mr *ibp_handle; /* mapped region handle */
+ struct page *ibp_pages[0];
+} kib_pages_t;
typedef struct
{
- int koib_init; /* initialisation state */
- __u64 koib_incarnation; /* which one am I */
- int koib_shutdown; /* shut down? */
- atomic_t koib_nthreads; /* # live threads */
-
- __u64 koib_cm_service_id; /* service number I listen on */
- ptl_nid_t koib_nid; /* my NID */
- struct semaphore koib_nid_mutex; /* serialise NID ops */
- struct semaphore koib_nid_signal; /* signal completion */
-
- rwlock_t koib_global_lock; /* stabilize peer/conn ops */
-
- struct list_head *koib_peers; /* hash table of all my known peers */
- int koib_peer_hash_size; /* size of koib_peers */
- atomic_t koib_npeers; /* # peers extant */
- atomic_t koib_nconns; /* # connections extant */
-
- struct list_head koib_connd_conns; /* connections to progress */
- struct list_head koib_connd_peers; /* peers waiting for a connection */
- wait_queue_head_t koib_connd_waitq; /* connection daemons sleep here */
- unsigned long koib_connd_waketime; /* when connd will wake */
- spinlock_t koib_connd_lock; /* serialise */
-
- wait_queue_head_t koib_sched_waitq; /* schedulers sleep here */
- struct list_head koib_sched_txq; /* tx requiring attention */
- struct list_head koib_sched_rxq; /* rx requiring attention */
- spinlock_t koib_sched_lock; /* serialise */
+ int kib_init; /* initialisation state */
+ __u64 kib_incarnation; /* which one am I */
+ int kib_shutdown; /* shut down? */
+ atomic_t kib_nthreads; /* # live threads */
+
+ __u64 kib_service_id; /* service number I listen on */
+ ptl_nid_t kib_nid; /* my NID */
+ struct semaphore kib_nid_mutex; /* serialise NID ops */
+ struct semaphore kib_nid_signal; /* signal completion */
+
+ rwlock_t kib_global_lock; /* stabilize peer/conn ops */
+
+ struct list_head *kib_peers; /* hash table of all my known peers */
+ int kib_peer_hash_size; /* size of kib_peers */
+ atomic_t kib_npeers; /* # peers extant */
+ atomic_t kib_nconns; /* # connections extant */
+
+ struct list_head kib_connd_conns; /* connections to progress */
+ struct list_head kib_connd_peers; /* peers waiting for a connection */
+ wait_queue_head_t kib_connd_waitq; /* connection daemons sleep here */
+ unsigned long kib_connd_waketime; /* when connd will wake */
+ spinlock_t kib_connd_lock; /* serialise */
+
+ wait_queue_head_t kib_sched_waitq; /* schedulers sleep here */
+ struct list_head kib_sched_txq; /* tx requiring attention */
+ struct list_head kib_sched_rxq; /* rx requiring attention */
+ spinlock_t kib_sched_lock; /* serialise */
- struct koib_tx *koib_tx_descs; /* all the tx descriptors */
- koib_pages_t *koib_tx_pages; /* premapped tx msg pages */
-
- struct list_head koib_idle_txs; /* idle tx descriptors */
- struct list_head koib_idle_nblk_txs; /* idle reserved tx descriptors */
- wait_queue_head_t koib_idle_tx_waitq; /* block here for tx descriptor */
- __u64 koib_next_tx_cookie; /* RDMA completion cookie */
- spinlock_t koib_tx_lock; /* serialise */
+ struct kib_tx *kib_tx_descs; /* all the tx descriptors */
+ kib_pages_t *kib_tx_pages; /* premapped tx msg pages */
+
+ struct list_head kib_idle_txs; /* idle tx descriptors */
+ struct list_head kib_idle_nblk_txs; /* idle reserved tx descriptors */
+ wait_queue_head_t kib_idle_tx_waitq; /* block here for tx descriptor */
+ __u64 kib_next_tx_cookie; /* RDMA completion cookie */
+ spinlock_t kib_tx_lock; /* serialise */
- struct ib_device *koib_device; /* "the" device */
- struct ib_device_properties koib_device_props; /* its properties */
- int koib_port; /* port on the device */
- struct ib_port_properties koib_port_props; /* its properties */
- struct ib_pd *koib_pd; /* protection domain */
-#if OPENIBNAL_FMR
- struct ib_fmr_pool *koib_fmr_pool; /* fast memory region pool */
+ struct ib_device *kib_device; /* "the" device */
+ struct ib_device_properties kib_device_props; /* its properties */
+ int kib_port; /* port on the device */
+ struct ib_port_properties kib_port_props; /* its properties */
+ struct ib_pd *kib_pd; /* protection domain */
+#if IBNAL_FMR
+ struct ib_fmr_pool *kib_fmr_pool; /* fast memory region pool */
#endif
- struct ib_cq *koib_rx_cq; /* receive completion queue */
- struct ib_cq *koib_tx_cq; /* transmit completion queue */
- void *koib_listen_handle; /* where I listen for connections */
- struct ib_common_attrib_service koib_service; /* SM service */
+ struct ib_cq *kib_cq; /* completion queue */
+ void *kib_listen_handle; /* where I listen for connections */
-} koib_data_t;
-
-#define OPENIBNAL_INIT_NOTHING 0
-#define OPENIBNAL_INIT_DATA 1
-#define OPENIBNAL_INIT_LIB 2
-#define OPENIBNAL_INIT_PD 3
-#define OPENIBNAL_INIT_FMR 4
-#define OPENIBNAL_INIT_TXD 5
-#define OPENIBNAL_INIT_RX_CQ 6
-#define OPENIBNAL_INIT_TX_CQ 7
-#define OPENIBNAL_INIT_ALL 8
+} kib_data_t;
+
+#define IBNAL_INIT_NOTHING 0
+#define IBNAL_INIT_DATA 1
+#define IBNAL_INIT_LIB 2
+#define IBNAL_INIT_PD 3
+#define IBNAL_INIT_FMR 4
+#define IBNAL_INIT_TXD 5
+#define IBNAL_INIT_CQ 6
+#define IBNAL_INIT_ALL 7
/************************************************************************
* Wire message structs.
__u32 md_lkey;
__u32 md_rkey;
__u64 md_addr;
-} koib_md_t;
+} kib_md_t;
typedef struct
{
__u32 rd_key; /* remote key */
__u32 rd_nob; /* # of bytes */
__u64 rd_addr; /* remote io vaddr */
-} koib_rdma_desc_t;
+} kib_rdma_desc_t;
typedef struct
{
- ptl_hdr_t oibim_hdr; /* portals header */
- char oibim_payload[0]; /* piggy-backed payload */
-} koib_immediate_msg_t;
+ ptl_hdr_t ibim_hdr; /* portals header */
+ char ibim_payload[0]; /* piggy-backed payload */
+} kib_immediate_msg_t;
typedef struct
{
- ptl_hdr_t oibrm_hdr; /* portals header */
- __u64 oibrm_cookie; /* opaque completion cookie */
- koib_rdma_desc_t oibrm_desc; /* where to suck/blow */
-} koib_rdma_msg_t;
+ ptl_hdr_t ibrm_hdr; /* portals header */
+ __u64 ibrm_cookie; /* opaque completion cookie */
+ kib_rdma_desc_t ibrm_desc; /* where to suck/blow */
+} kib_rdma_msg_t;
typedef struct
{
- __u64 oibcm_cookie; /* opaque completion cookie */
- __u32 oibcm_status; /* completion status */
-} koib_completion_msg_t;
+ __u64 ibcm_cookie; /* opaque completion cookie */
+ __u32 ibcm_status; /* completion status */
+} kib_completion_msg_t;
typedef struct
{
- __u32 oibm_magic; /* I'm an openibnal message */
- __u16 oibm_version; /* this is my version number */
- __u8 oibm_type; /* msg type */
- __u8 oibm_credits; /* returned credits */
-#if OPENIBNAL_CKSUM
- __u32 oibm_nob;
- __u32 oibm_cksum;
+ __u32 ibm_magic; /* I'm an openibnal message */
+ __u16 ibm_version; /* this is my version number */
+ __u8 ibm_type; /* msg type */
+ __u8 ibm_credits; /* returned credits */
+#if IBNAL_CKSUM
+ __u32 ibm_nob;
+ __u32 ibm_cksum;
#endif
union {
- koib_immediate_msg_t immediate;
- koib_rdma_msg_t rdma;
- koib_completion_msg_t completion;
- } oibm_u;
-} koib_msg_t;
-
-#define OPENIBNAL_MSG_MAGIC 0x0be91b91 /* unique magic */
-#define OPENIBNAL_MSG_VERSION 1 /* current protocol version */
-
-#define OPENIBNAL_MSG_NOOP 0xd0 /* nothing (just credits) */
-#define OPENIBNAL_MSG_IMMEDIATE 0xd1 /* portals hdr + payload */
-#define OPENIBNAL_MSG_PUT_RDMA 0xd2 /* portals PUT hdr + source rdma desc */
-#define OPENIBNAL_MSG_PUT_DONE 0xd3 /* signal PUT rdma completion */
-#define OPENIBNAL_MSG_GET_RDMA 0xd4 /* portals GET hdr + sink rdma desc */
-#define OPENIBNAL_MSG_GET_DONE 0xd5 /* signal GET rdma completion */
+ kib_immediate_msg_t immediate;
+ kib_rdma_msg_t rdma;
+ kib_completion_msg_t completion;
+ } ibm_u;
+} kib_msg_t;
+
+#define IBNAL_MSG_MAGIC 0x0be91b91 /* unique magic */
+#define IBNAL_MSG_VERSION 1 /* current protocol version */
+
+#define IBNAL_MSG_NOOP 0xd0 /* nothing (just credits) */
+#define IBNAL_MSG_IMMEDIATE 0xd1 /* portals hdr + payload */
+#define IBNAL_MSG_PUT_RDMA 0xd2 /* portals PUT hdr + source rdma desc */
+#define IBNAL_MSG_PUT_DONE 0xd3 /* signal PUT rdma completion */
+#define IBNAL_MSG_GET_RDMA 0xd4 /* portals GET hdr + sink rdma desc */
+#define IBNAL_MSG_GET_DONE 0xd5 /* signal GET rdma completion */
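+
+/* Sketch of the handshake implied by the RDMA message types above (as
+ * read from kibnal_start_passive_rdma()/kibnal_start_active_rdma(); a
+ * summary, not normative):
+ *
+ *   PUT: passive side sends PUT_RDMA (hdr + source desc, REMOTE_READ);
+ *        the peer RDMA-reads the payload and replies with PUT_DONE.
+ *   GET: passive side sends GET_RDMA (hdr + sink desc, REMOTE_WRITE);
+ *        the peer RDMA-writes the reply data and sends GET_DONE.
+ */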
/***********************************************************************/
-typedef struct koib_rx /* receive message */
+typedef struct kib_rx /* receive message */
{
struct list_head rx_list; /* queue for attention */
- struct koib_conn *rx_conn; /* owning conn */
+ struct kib_conn *rx_conn; /* owning conn */
int rx_rdma; /* RDMA completion posted? */
int rx_posted; /* posted? */
__u64 rx_vaddr; /* pre-mapped buffer (hca vaddr) */
- koib_msg_t *rx_msg; /* pre-mapped buffer (host vaddr) */
+ kib_msg_t *rx_msg; /* pre-mapped buffer (host vaddr) */
struct ib_receive_param rx_sp; /* receive work item */
- struct ib_gather_scatter rx_gl; /* and it's memory */
+ struct ib_gather_scatter rx_gl; /* and its memory */
-} koib_rx_t;
+} kib_rx_t;
-typedef struct koib_tx /* transmit message */
+typedef struct kib_tx /* transmit message */
{
struct list_head tx_list; /* queue on idle_txs ibc_tx_queue etc. */
int tx_isnblk; /* I'm reserved for non-blocking sends */
- struct koib_conn *tx_conn; /* owning conn */
+ struct kib_conn *tx_conn; /* owning conn */
int tx_mapped; /* mapped for RDMA? */
int tx_sending; /* # tx callbacks outstanding */
int tx_status; /* completion status */
- int tx_passive_rdma; /* waiting for peer to RDMA? */
- int tx_passive_rdma_wait; /* on ibc_rdma_queue */
- unsigned long tx_passive_rdma_deadline; /* completion deadline */
+ unsigned long tx_deadline; /* completion deadline */
+ int tx_passive_rdma; /* peer sucks/blows */
+ int tx_passive_rdma_wait; /* waiting for peer to complete */
__u64 tx_passive_rdma_cookie; /* completion cookie */
lib_msg_t *tx_libmsg[2]; /* lib msgs to finalize on completion */
- koib_md_t tx_md; /* RDMA mapping (active/passive) */
+ kib_md_t tx_md; /* RDMA mapping (active/passive) */
__u64 tx_vaddr; /* pre-mapped buffer (hca vaddr) */
- koib_msg_t *tx_msg; /* pre-mapped buffer (host vaddr) */
+ kib_msg_t *tx_msg; /* pre-mapped buffer (host vaddr) */
int tx_nsp; /* # send work items */
struct ib_send_param tx_sp[2]; /* send work items... */
struct ib_gather_scatter tx_gl[2]; /* ...and their memory */
-} koib_tx_t;
+} kib_tx_t;
-#define KOIB_TX_UNMAPPED 0
-#define KOIB_TX_MAPPED 1
-#define KOIB_TX_MAPPED_FMR 2
+#define KIB_TX_UNMAPPED 0
+#define KIB_TX_MAPPED 1
+#define KIB_TX_MAPPED_FMR 2
-typedef struct koib_wire_connreq
+typedef struct kib_wire_connreq
{
__u32 wcr_magic; /* I'm an openibnal connreq */
__u16 wcr_version; /* this is my version number */
__u16 wcr_queue_depth; /* this is my receive queue size */
__u64 wcr_nid; /* peer's NID */
__u64 wcr_incarnation; /* peer's incarnation */
-} koib_wire_connreq_t;
+} kib_wire_connreq_t;
-typedef struct koib_connreq
+typedef struct kib_connreq
{
/* connection-in-progress */
- struct koib_conn *cr_conn;
- koib_wire_connreq_t cr_wcr;
+ struct kib_conn *cr_conn;
+ kib_wire_connreq_t cr_wcr;
__u64 cr_tid;
struct ib_common_attrib_service cr_service;
tTS_IB_GID cr_gid;
struct ib_path_record cr_path;
struct ib_cm_active_param cr_connparam;
-} koib_connreq_t;
+} kib_connreq_t;
-typedef struct koib_conn
+typedef struct kib_conn
{
- struct koib_peer *ibc_peer; /* owning peer */
+ struct kib_peer *ibc_peer; /* owning peer */
struct list_head ibc_list; /* stash on peer's conn list */
__u64 ibc_incarnation; /* which instance of the peer */
atomic_t ibc_refcount; /* # users */
int ibc_credits; /* # credits I have */
int ibc_outstanding_credits; /* # credits to return */
struct list_head ibc_tx_queue; /* send queue */
- struct list_head ibc_rdma_queue; /* tx awaiting RDMA completion */
+ struct list_head ibc_active_txs; /* active tx awaiting completion */
spinlock_t ibc_lock; /* serialise */
- koib_rx_t *ibc_rxs; /* the rx descs */
- koib_pages_t *ibc_rx_pages; /* premapped rx msg pages */
+ kib_rx_t *ibc_rxs; /* the rx descs */
+ kib_pages_t *ibc_rx_pages; /* premapped rx msg pages */
struct ib_qp *ibc_qp; /* queue pair */
__u32 ibc_qpn; /* queue pair number */
tTS_IB_CM_COMM_ID ibc_comm_id; /* connection ID? */
- koib_connreq_t *ibc_connreq; /* connection request state */
-} koib_conn_t;
+ kib_connreq_t *ibc_connreq; /* connection request state */
+} kib_conn_t;
-#define OPENIBNAL_CONN_INIT_NOTHING 0 /* initial state */
-#define OPENIBNAL_CONN_INIT_QP 1 /* ibc_qp set up */
-#define OPENIBNAL_CONN_CONNECTING 2 /* started to connect */
-#define OPENIBNAL_CONN_ESTABLISHED 3 /* connection established */
-#define OPENIBNAL_CONN_DEATHROW 4 /* waiting to be closed */
-#define OPENIBNAL_CONN_ZOMBIE 5 /* waiting to be freed */
+#define IBNAL_CONN_INIT_NOTHING 0 /* initial state */
+#define IBNAL_CONN_INIT_QP 1 /* ibc_qp set up */
+#define IBNAL_CONN_CONNECTING 2 /* started to connect */
+#define IBNAL_CONN_ESTABLISHED 3 /* connection established */
+#define IBNAL_CONN_DEATHROW 4 /* waiting to be closed */
+#define IBNAL_CONN_ZOMBIE 5 /* waiting to be freed */
-typedef struct koib_peer
+typedef struct kib_peer
{
struct list_head ibp_list; /* stash on global peer list */
- struct list_head ibp_connd_list; /* schedule on koib_connd_peers */
+ struct list_head ibp_connd_list; /* schedule on kib_connd_peers */
ptl_nid_t ibp_nid; /* who's on the other end(s) */
atomic_t ibp_refcount; /* # users */
int ibp_persistence; /* "known" peer refs */
int ibp_connecting; /* connecting+accepting */
unsigned long ibp_reconnect_time; /* when reconnect may be attempted */
unsigned long ibp_reconnect_interval; /* exponential backoff */
-} koib_peer_t;
+} kib_peer_t;
-extern lib_nal_t koibnal_lib;
-extern koib_data_t koibnal_data;
-extern koib_tunables_t koibnal_tunables;
+extern lib_nal_t kibnal_lib;
+extern kib_data_t kibnal_data;
+extern kib_tunables_t kibnal_tunables;
static inline struct list_head *
-koibnal_nid2peerlist (ptl_nid_t nid)
+kibnal_nid2peerlist (ptl_nid_t nid)
{
- unsigned int hash = ((unsigned int)nid) % koibnal_data.koib_peer_hash_size;
+ unsigned int hash = ((unsigned int)nid) % kibnal_data.kib_peer_hash_size;
- return (&koibnal_data.koib_peers [hash]);
+ return (&kibnal_data.kib_peers [hash]);
}
static inline int
-koibnal_peer_active(koib_peer_t *peer)
+kibnal_peer_active(kib_peer_t *peer)
{
/* Am I in the peer hash table? */
return (!list_empty(&peer->ibp_list));
}
static inline void
-koibnal_queue_tx_locked (koib_tx_t *tx, koib_conn_t *conn)
+kibnal_queue_tx_locked (kib_tx_t *tx, kib_conn_t *conn)
{
/* CAVEAT EMPTOR: tx takes caller's ref on conn */
LASSERT (tx->tx_conn == NULL); /* only set here */
tx->tx_conn = conn;
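+ /* Every queued tx now carries a completion deadline (kib_io_timeout
+ * is in seconds, hence the HZ scaling), not just passive-RDMA ones. */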
+ tx->tx_deadline = jiffies + kibnal_tunables.kib_io_timeout * HZ;
list_add_tail(&tx->tx_list, &conn->ibc_tx_queue);
}
-#define KOIBNAL_SERVICE_KEY_MASK (IB_SA_SERVICE_COMP_MASK_NAME | \
- IB_SA_SERVICE_COMP_MASK_DATA8_1 | \
- IB_SA_SERVICE_COMP_MASK_DATA8_2 | \
- IB_SA_SERVICE_COMP_MASK_DATA8_3 | \
- IB_SA_SERVICE_COMP_MASK_DATA8_4 | \
- IB_SA_SERVICE_COMP_MASK_DATA8_5 | \
- IB_SA_SERVICE_COMP_MASK_DATA8_6 | \
- IB_SA_SERVICE_COMP_MASK_DATA8_7 | \
- IB_SA_SERVICE_COMP_MASK_DATA8_8)
+#define KIBNAL_SERVICE_KEY_MASK (IB_SA_SERVICE_COMP_MASK_NAME | \
+ IB_SA_SERVICE_COMP_MASK_DATA8_1 | \
+ IB_SA_SERVICE_COMP_MASK_DATA8_2 | \
+ IB_SA_SERVICE_COMP_MASK_DATA8_3 | \
+ IB_SA_SERVICE_COMP_MASK_DATA8_4 | \
+ IB_SA_SERVICE_COMP_MASK_DATA8_5 | \
+ IB_SA_SERVICE_COMP_MASK_DATA8_6 | \
+ IB_SA_SERVICE_COMP_MASK_DATA8_7 | \
+ IB_SA_SERVICE_COMP_MASK_DATA8_8)
static inline __u64*
-koibnal_service_nid_field(struct ib_common_attrib_service *srv)
+kibnal_service_nid_field(struct ib_common_attrib_service *srv)
{
- /* must be consistent with KOIBNAL_SERVICE_KEY_MASK */
+ /* must be consistent with KIBNAL_SERVICE_KEY_MASK */
return (__u64 *)srv->service_data8;
}
static inline void
-koibnal_set_service_keys(struct ib_common_attrib_service *srv, ptl_nid_t nid)
+kibnal_set_service_keys(struct ib_common_attrib_service *srv, ptl_nid_t nid)
{
- LASSERT (strlen (OPENIBNAL_SERVICE_NAME) < sizeof(srv->service_name));
+ LASSERT (strlen (IBNAL_SERVICE_NAME) < sizeof(srv->service_name));
memset (srv->service_name, 0, sizeof(srv->service_name));
- strcpy (srv->service_name, OPENIBNAL_SERVICE_NAME);
+ strcpy (srv->service_name, IBNAL_SERVICE_NAME);
- *koibnal_service_nid_field(srv) = cpu_to_le64(nid);
+ *kibnal_service_nid_field(srv) = cpu_to_le64(nid);
}
#if 0
static inline void
-koibnal_show_rdma_attr (koib_conn_t *conn)
+kibnal_show_rdma_attr (kib_conn_t *conn)
{
struct ib_qp_attribute qp_attr;
int rc;
#if CONFIG_X86
static inline __u64
-koibnal_page2phys (struct page *p)
+kibnal_page2phys (struct page *p)
{
__u64 page_number = p - mem_map;
# error "no page->phys"
#endif
-extern koib_peer_t *koibnal_create_peer (ptl_nid_t nid);
-extern void koibnal_put_peer (koib_peer_t *peer);
-extern int koibnal_del_peer (ptl_nid_t nid, int single_share);
-extern koib_peer_t *koibnal_find_peer_locked (ptl_nid_t nid);
-extern void koibnal_unlink_peer_locked (koib_peer_t *peer);
-extern int koibnal_close_stale_conns_locked (koib_peer_t *peer,
+/* CAVEAT EMPTOR:
+ * We rely on tx/rx descriptor alignment to allow us to use the lowest bit
+ * of the work request id as a flag to determine if the completion is for a
+ * transmit or a receive. It seems that the CQ entry's 'op' field
+ * isn't always set correctly on completions that occur after QP teardown. */
+
+static inline __u64
+kibnal_ptr2wreqid (void *ptr, int isrx)
+{
+ unsigned long lptr = (unsigned long)ptr;
+
+ LASSERT ((lptr & 1) == 0);
+ return (__u64)(lptr | (isrx ? 1 : 0));
+}
+
+static inline void *
+kibnal_wreqid2ptr (__u64 wreqid)
+{
+ return (void *)(((unsigned long)wreqid) & ~1UL);
+}
+
+static inline int
+kibnal_wreqid_is_rx (__u64 wreqid)
+{
+ return (wreqid & 1) != 0;
+}
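+
+/* Illustrative round-trip (a sketch, assuming only the 2-byte alignment
+ * asserted above; not called anywhere):
+ *
+ *   __u64 id = kibnal_ptr2wreqid (rx, 1);
+ *
+ *   LASSERT (kibnal_wreqid_is_rx (id));
+ *   LASSERT (kibnal_wreqid2ptr (id) == (void *)rx);
+ */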
+
+extern kib_peer_t *kibnal_create_peer (ptl_nid_t nid);
+extern void kibnal_put_peer (kib_peer_t *peer);
+extern int kibnal_del_peer (ptl_nid_t nid, int single_share);
+extern kib_peer_t *kibnal_find_peer_locked (ptl_nid_t nid);
+extern void kibnal_unlink_peer_locked (kib_peer_t *peer);
+extern int kibnal_close_stale_conns_locked (kib_peer_t *peer,
__u64 incarnation);
-extern koib_conn_t *koibnal_create_conn (void);
-extern void koibnal_put_conn (koib_conn_t *conn);
-extern void koibnal_destroy_conn (koib_conn_t *conn);
-extern int koibnal_alloc_pages (koib_pages_t **pp, int npages, int access);
-extern void koibnal_free_pages (koib_pages_t *p);
+extern kib_conn_t *kibnal_create_conn (void);
+extern void kibnal_put_conn (kib_conn_t *conn);
+extern void kibnal_destroy_conn (kib_conn_t *conn);
+extern int kibnal_alloc_pages (kib_pages_t **pp, int npages, int access);
+extern void kibnal_free_pages (kib_pages_t *p);
-extern void koibnal_check_sends (koib_conn_t *conn);
+extern void kibnal_check_sends (kib_conn_t *conn);
extern tTS_IB_CM_CALLBACK_RETURN
-koibnal_conn_callback (tTS_IB_CM_EVENT event, tTS_IB_CM_COMM_ID cid,
+kibnal_conn_callback (tTS_IB_CM_EVENT event, tTS_IB_CM_COMM_ID cid,
void *param, void *arg);
extern tTS_IB_CM_CALLBACK_RETURN
-koibnal_passive_conn_callback (tTS_IB_CM_EVENT event, tTS_IB_CM_COMM_ID cid,
+kibnal_passive_conn_callback (tTS_IB_CM_EVENT event, tTS_IB_CM_COMM_ID cid,
void *param, void *arg);
-extern void koibnal_close_conn_locked (koib_conn_t *conn, int error);
-extern void koibnal_destroy_conn (koib_conn_t *conn);
-extern int koibnal_thread_start (int (*fn)(void *arg), void *arg);
-extern int koibnal_scheduler(void *arg);
-extern int koibnal_connd (void *arg);
-extern void koibnal_rx_callback (struct ib_cq *cq, struct ib_cq_entry *e, void *arg);
-extern void koibnal_tx_callback (struct ib_cq *cq, struct ib_cq_entry *e, void *arg);
-extern void koibnal_init_tx_msg (koib_tx_t *tx, int type, int body_nob);
-extern int koibnal_close_conn (koib_conn_t *conn, int why);
-extern void koibnal_start_active_rdma (int type, int status,
- koib_rx_t *rx, lib_msg_t *libmsg,
- unsigned int niov,
- struct iovec *iov, ptl_kiov_t *kiov,
- size_t offset, size_t nob);
+extern void kibnal_close_conn_locked (kib_conn_t *conn, int error);
+extern int kibnal_thread_start (int (*fn)(void *arg), void *arg);
+extern int kibnal_scheduler(void *arg);
+extern int kibnal_connd (void *arg);
+extern void kibnal_callback (struct ib_cq *cq, struct ib_cq_entry *e, void *arg);
+extern void kibnal_init_tx_msg (kib_tx_t *tx, int type, int body_nob);
+extern int kibnal_close_conn (kib_conn_t *conn, int why);
+extern void kibnal_start_active_rdma (int type, int status,
+ kib_rx_t *rx, lib_msg_t *libmsg,
+ unsigned int niov,
+ struct iovec *iov, ptl_kiov_t *kiov,
+ size_t offset, size_t nob);
+
*
*/
void
-koibnal_schedule_tx_done (koib_tx_t *tx)
+kibnal_schedule_tx_done (kib_tx_t *tx)
{
unsigned long flags;
- spin_lock_irqsave (&koibnal_data.koib_sched_lock, flags);
+ spin_lock_irqsave (&kibnal_data.kib_sched_lock, flags);
- list_add_tail(&tx->tx_list, &koibnal_data.koib_sched_txq);
- wake_up (&koibnal_data.koib_sched_waitq);
+ list_add_tail(&tx->tx_list, &kibnal_data.kib_sched_txq);
+ wake_up (&kibnal_data.kib_sched_waitq);
- spin_unlock_irqrestore(&koibnal_data.koib_sched_lock, flags);
+ spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags);
}
void
-koibnal_tx_done (koib_tx_t *tx)
+kibnal_tx_done (kib_tx_t *tx)
{
ptl_err_t ptlrc = (tx->tx_status == 0) ? PTL_OK : PTL_FAIL;
unsigned long flags;
int rc;
LASSERT (tx->tx_sending == 0); /* mustn't be awaiting callback */
- LASSERT (!tx->tx_passive_rdma_wait); /* mustn't be on ibc_rdma_queue */
+ LASSERT (!tx->tx_passive_rdma_wait); /* mustn't be awaiting RDMA */
switch (tx->tx_mapped) {
default:
LBUG();
- case KOIB_TX_UNMAPPED:
+ case KIB_TX_UNMAPPED:
break;
- case KOIB_TX_MAPPED:
+ case KIB_TX_MAPPED:
if (in_interrupt()) {
/* can't deregister memory in IRQ context... */
- koibnal_schedule_tx_done(tx);
+ kibnal_schedule_tx_done(tx);
return;
}
rc = ib_memory_deregister(tx->tx_md.md_handle.mr);
LASSERT (rc == 0);
- tx->tx_mapped = KOIB_TX_UNMAPPED;
+ tx->tx_mapped = KIB_TX_UNMAPPED;
break;
-#if OPENIBNAL_FMR
- case KOIB_TX_MAPPED_FMR:
+#if IBNAL_FMR
+ case KIB_TX_MAPPED_FMR:
if (in_interrupt() && tx->tx_status != 0) {
/* can't flush FMRs in IRQ context... */
- koibnal_schedule_tx_done(tx);
+ kibnal_schedule_tx_done(tx);
return;
}
LASSERT (rc == 0);
if (tx->tx_status != 0)
- ib_fmr_pool_force_flush(koibnal_data.koib_fmr_pool);
- tx->tx_mapped = KOIB_TX_UNMAPPED;
+ ib_fmr_pool_force_flush(kibnal_data.kib_fmr_pool);
+ tx->tx_mapped = KIB_TX_UNMAPPED;
break;
#endif
}
if (tx->tx_libmsg[i] == NULL)
continue;
- lib_finalize (&koibnal_lib, NULL, tx->tx_libmsg[i], ptlrc);
+ lib_finalize (&kibnal_lib, NULL, tx->tx_libmsg[i], ptlrc);
tx->tx_libmsg[i] = NULL;
}
if (tx->tx_conn != NULL) {
- koibnal_put_conn (tx->tx_conn);
+ kibnal_put_conn (tx->tx_conn);
tx->tx_conn = NULL;
}
tx->tx_passive_rdma = 0;
tx->tx_status = 0;
- spin_lock_irqsave (&koibnal_data.koib_tx_lock, flags);
+ spin_lock_irqsave (&kibnal_data.kib_tx_lock, flags);
if (tx->tx_isnblk) {
- list_add_tail (&tx->tx_list, &koibnal_data.koib_idle_nblk_txs);
+ list_add_tail (&tx->tx_list, &kibnal_data.kib_idle_nblk_txs);
} else {
- list_add_tail (&tx->tx_list, &koibnal_data.koib_idle_txs);
- wake_up (&koibnal_data.koib_idle_tx_waitq);
+ list_add_tail (&tx->tx_list, &kibnal_data.kib_idle_txs);
+ wake_up (&kibnal_data.kib_idle_tx_waitq);
}
- spin_unlock_irqrestore (&koibnal_data.koib_tx_lock, flags);
+ spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags);
}
-koib_tx_t *
-koibnal_get_idle_tx (int may_block)
+kib_tx_t *
+kibnal_get_idle_tx (int may_block)
{
- unsigned long flags;
- koib_tx_t *tx = NULL;
+ unsigned long flags;
+ kib_tx_t *tx = NULL;
for (;;) {
- spin_lock_irqsave (&koibnal_data.koib_tx_lock, flags);
+ spin_lock_irqsave (&kibnal_data.kib_tx_lock, flags);
/* "normal" descriptor is free */
- if (!list_empty (&koibnal_data.koib_idle_txs)) {
- tx = list_entry (koibnal_data.koib_idle_txs.next,
- koib_tx_t, tx_list);
+ if (!list_empty (&kibnal_data.kib_idle_txs)) {
+ tx = list_entry (kibnal_data.kib_idle_txs.next,
+ kib_tx_t, tx_list);
break;
}
if (!may_block) {
/* may dip into reserve pool */
- if (list_empty (&koibnal_data.koib_idle_nblk_txs)) {
+ if (list_empty (&kibnal_data.kib_idle_nblk_txs)) {
CERROR ("reserved tx desc pool exhausted\n");
break;
}
- tx = list_entry (koibnal_data.koib_idle_nblk_txs.next,
- koib_tx_t, tx_list);
+ tx = list_entry (kibnal_data.kib_idle_nblk_txs.next,
+ kib_tx_t, tx_list);
break;
}
/* block for idle tx */
- spin_unlock_irqrestore (&koibnal_data.koib_tx_lock, flags);
+ spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags);
- wait_event (koibnal_data.koib_idle_tx_waitq,
- !list_empty (&koibnal_data.koib_idle_txs) ||
- koibnal_data.koib_shutdown);
+ wait_event (kibnal_data.kib_idle_tx_waitq,
+ !list_empty (&kibnal_data.kib_idle_txs) ||
+ kibnal_data.kib_shutdown);
}
if (tx != NULL) {
/* Allocate a new passive RDMA completion cookie. It might
* not be needed, but we've got a lock right now and we're
* unlikely to wrap... */
- tx->tx_passive_rdma_cookie = koibnal_data.koib_next_tx_cookie++;
+ tx->tx_passive_rdma_cookie = kibnal_data.kib_next_tx_cookie++;
- LASSERT (tx->tx_mapped == KOIB_TX_UNMAPPED);
+ LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED);
LASSERT (tx->tx_nsp == 0);
LASSERT (tx->tx_sending == 0);
LASSERT (tx->tx_status == 0);
LASSERT (tx->tx_libmsg[1] == NULL);
}
- spin_unlock_irqrestore (&koibnal_data.koib_tx_lock, flags);
+ spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags);
return (tx);
}
int
-koibnal_dist(lib_nal_t *nal, ptl_nid_t nid, unsigned long *dist)
+kibnal_dist(lib_nal_t *nal, ptl_nid_t nid, unsigned long *dist)
{
- /* I would guess that if koibnal_get_peer (nid) == NULL,
+ /* I would guess that if kibnal_get_peer (nid) == NULL,
and we're not routing, then 'nid' is very distant :) */
if ( nal->libnal_ni.ni_pid.nid == nid ) {
*dist = 0;
}
void
-koibnal_complete_passive_rdma(koib_conn_t *conn, __u64 cookie, int status)
+kibnal_complete_passive_rdma(kib_conn_t *conn, __u64 cookie, int status)
{
struct list_head *ttmp;
unsigned long flags;
spin_lock_irqsave (&conn->ibc_lock, flags);
- list_for_each (ttmp, &conn->ibc_rdma_queue) {
- koib_tx_t *tx = list_entry(ttmp, koib_tx_t, tx_list);
-
- LASSERT (tx->tx_passive_rdma);
- LASSERT (tx->tx_passive_rdma_wait);
+ list_for_each (ttmp, &conn->ibc_active_txs) {
+ kib_tx_t *tx = list_entry(ttmp, kib_tx_t, tx_list);
- if (tx->tx_passive_rdma_cookie != cookie)
- continue;
+ LASSERT (tx->tx_passive_rdma ||
+ !tx->tx_passive_rdma_wait);
- CDEBUG(D_NET, "Complete %p "LPD64"\n", tx, cookie);
+ LASSERT (tx->tx_passive_rdma_wait ||
+ tx->tx_sending != 0);
- list_del (&tx->tx_list);
+ if (!tx->tx_passive_rdma_wait ||
+ tx->tx_passive_rdma_cookie != cookie)
+ continue;
+
+ CDEBUG(D_NET, "Complete %p "LPD64": %d\n", tx, cookie, status);
+ tx->tx_status = status;
tx->tx_passive_rdma_wait = 0;
idle = (tx->tx_sending == 0);
- tx->tx_status = status;
+ if (idle)
+ list_del (&tx->tx_list);
spin_unlock_irqrestore (&conn->ibc_lock, flags);
/* I could be racing with tx callbacks. It's whoever
* _makes_ tx idle that frees it */
if (idle)
- koibnal_tx_done (tx);
+ kibnal_tx_done (tx);
return;
}
}
void
-koibnal_post_rx (koib_rx_t *rx, int do_credits)
+kibnal_post_rx (kib_rx_t *rx, int do_credits)
{
- koib_conn_t *conn = rx->rx_conn;
+ kib_conn_t *conn = rx->rx_conn;
int rc;
unsigned long flags;
rx->rx_gl = (struct ib_gather_scatter) {
.address = rx->rx_vaddr,
- .length = OPENIBNAL_MSG_SIZE,
- .key = conn->ibc_rx_pages->oibp_lkey,
+ .length = IBNAL_MSG_SIZE,
+ .key = conn->ibc_rx_pages->ibp_lkey,
};
-
+
rx->rx_sp = (struct ib_receive_param) {
- .work_request_id = (__u64)(unsigned long)rx,
+ .work_request_id = kibnal_ptr2wreqid(rx, 1),
.scatter_list = &rx->rx_gl,
.num_scatter_entries = 1,
.device_specific = NULL,
.signaled = 1,
};
- LASSERT (conn->ibc_state >= OPENIBNAL_CONN_ESTABLISHED);
+ LASSERT (conn->ibc_state >= IBNAL_CONN_ESTABLISHED);
LASSERT (!rx->rx_posted);
rx->rx_posted = 1;
mb();
- if (conn->ibc_state != OPENIBNAL_CONN_ESTABLISHED)
+ if (conn->ibc_state != IBNAL_CONN_ESTABLISHED)
rc = -ECONNABORTED;
else
rc = ib_receive (conn->ibc_qp, &rx->rx_sp, 1);
conn->ibc_outstanding_credits++;
spin_unlock_irqrestore(&conn->ibc_lock, flags);
- koibnal_check_sends(conn);
+ kibnal_check_sends(conn);
}
return;
}
- if (conn->ibc_state == OPENIBNAL_CONN_ESTABLISHED) {
+ if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) {
CERROR ("Error posting receive -> "LPX64": %d\n",
conn->ibc_peer->ibp_nid, rc);
- koibnal_close_conn (rx->rx_conn, rc);
+ kibnal_close_conn (rx->rx_conn, rc);
} else {
CDEBUG (D_NET, "Error posting receive -> "LPX64": %d\n",
conn->ibc_peer->ibp_nid, rc);
}
/* Drop rx's ref */
- koibnal_put_conn (conn);
+ kibnal_put_conn (conn);
}
-#if OPENIBNAL_CKSUM
-__u32 koibnal_cksum (void *ptr, int nob)
+#if IBNAL_CKSUM
+__u32 kibnal_cksum (void *ptr, int nob)
{
char *c = ptr;
__u32 sum = 0;
#endif
void
-koibnal_rx_callback (struct ib_cq *cq, struct ib_cq_entry *e, void *arg)
+kibnal_rx_callback (struct ib_cq_entry *e)
{
- koib_rx_t *rx = (koib_rx_t *)((unsigned long)e->work_request_id);
- koib_msg_t *msg = rx->rx_msg;
- koib_conn_t *conn = rx->rx_conn;
+ kib_rx_t *rx = (kib_rx_t *)kibnal_wreqid2ptr(e->work_request_id);
+ kib_msg_t *msg = rx->rx_msg;
+ kib_conn_t *conn = rx->rx_conn;
int nob = e->bytes_transferred;
- const int base_nob = offsetof(koib_msg_t, oibm_u);
+ const int base_nob = offsetof(kib_msg_t, ibm_u);
int credits;
int flipped;
unsigned long flags;
-#if OPENIBNAL_CKSUM
+#if IBNAL_CKSUM
__u32 msg_cksum;
__u32 computed_cksum;
#endif
/* receives complete with error in any case after we've started
* closing the QP */
- if (conn->ibc_state >= OPENIBNAL_CONN_DEATHROW)
+ if (conn->ibc_state >= IBNAL_CONN_DEATHROW)
goto failed;
/* We don't post receives until the conn is established */
- LASSERT (conn->ibc_state == OPENIBNAL_CONN_ESTABLISHED);
+ LASSERT (conn->ibc_state == IBNAL_CONN_ESTABLISHED);
if (e->status != IB_COMPLETION_STATUS_SUCCESS) {
CERROR("Rx from "LPX64" failed: %d\n",
/* Receiver does any byte flipping if necessary... */
- if (msg->oibm_magic == OPENIBNAL_MSG_MAGIC) {
+ if (msg->ibm_magic == IBNAL_MSG_MAGIC) {
flipped = 0;
} else {
- if (msg->oibm_magic != __swab32(OPENIBNAL_MSG_MAGIC)) {
+ if (msg->ibm_magic != __swab32(IBNAL_MSG_MAGIC)) {
CERROR ("Unrecognised magic: %08x from "LPX64"\n",
- msg->oibm_magic, conn->ibc_peer->ibp_nid);
+ msg->ibm_magic, conn->ibc_peer->ibp_nid);
goto failed;
}
flipped = 1;
- __swab16s (&msg->oibm_version);
- LASSERT (sizeof(msg->oibm_type) == 1);
- LASSERT (sizeof(msg->oibm_credits) == 1);
+ __swab16s (&msg->ibm_version);
+ LASSERT (sizeof(msg->ibm_type) == 1);
+ LASSERT (sizeof(msg->ibm_credits) == 1);
}
- if (msg->oibm_version != OPENIBNAL_MSG_VERSION) {
+ if (msg->ibm_version != IBNAL_MSG_VERSION) {
CERROR ("Incompatible msg version %d (%d expected)\n",
- msg->oibm_version, OPENIBNAL_MSG_VERSION);
+ msg->ibm_version, IBNAL_MSG_VERSION);
goto failed;
}
-#if OPENIBNAL_CKSUM
- if (nob != msg->oibm_nob) {
- CERROR ("Unexpected # bytes %d (%d expected)\n", nob, msg->oibm_nob);
+#if IBNAL_CKSUM
+ if (nob != msg->ibm_nob) {
+ CERROR ("Unexpected # bytes %d (%d expected)\n", nob, msg->ibm_nob);
goto failed;
}
- msg_cksum = le32_to_cpu(msg->oibm_cksum);
- msg->oibm_cksum = 0;
- computed_cksum = koibnal_cksum (msg, nob);
+ msg_cksum = le32_to_cpu(msg->ibm_cksum);
+ msg->ibm_cksum = 0;
+ computed_cksum = kibnal_cksum (msg, nob);
if (msg_cksum != computed_cksum) {
CERROR ("Checksum failure %d: (%d expected)\n",
#endif
/* Have I received credits that will let me send? */
- credits = msg->oibm_credits;
+ credits = msg->ibm_credits;
if (credits != 0) {
spin_lock_irqsave(&conn->ibc_lock, flags);
conn->ibc_credits += credits;
spin_unlock_irqrestore(&conn->ibc_lock, flags);
- koibnal_check_sends(conn);
+ kibnal_check_sends(conn);
}
- switch (msg->oibm_type) {
- case OPENIBNAL_MSG_NOOP:
- koibnal_post_rx (rx, 1);
+ switch (msg->ibm_type) {
+ case IBNAL_MSG_NOOP:
+ kibnal_post_rx (rx, 1);
return;
- case OPENIBNAL_MSG_IMMEDIATE:
- if (nob < base_nob + sizeof (koib_immediate_msg_t)) {
+ case IBNAL_MSG_IMMEDIATE:
+ if (nob < base_nob + sizeof (kib_immediate_msg_t)) {
CERROR ("Short IMMEDIATE from "LPX64": %d\n",
conn->ibc_peer->ibp_nid, nob);
goto failed;
}
break;
- case OPENIBNAL_MSG_PUT_RDMA:
- case OPENIBNAL_MSG_GET_RDMA:
- if (nob < base_nob + sizeof (koib_rdma_msg_t)) {
+ case IBNAL_MSG_PUT_RDMA:
+ case IBNAL_MSG_GET_RDMA:
+ if (nob < base_nob + sizeof (kib_rdma_msg_t)) {
CERROR ("Short RDMA msg from "LPX64": %d\n",
conn->ibc_peer->ibp_nid, nob);
goto failed;
}
if (flipped) {
- __swab32s(&msg->oibm_u.rdma.oibrm_desc.rd_key);
- __swab32s(&msg->oibm_u.rdma.oibrm_desc.rd_nob);
- __swab64s(&msg->oibm_u.rdma.oibrm_desc.rd_addr);
+ __swab32s(&msg->ibm_u.rdma.ibrm_desc.rd_key);
+ __swab32s(&msg->ibm_u.rdma.ibrm_desc.rd_nob);
+ __swab64s(&msg->ibm_u.rdma.ibrm_desc.rd_addr);
}
CDEBUG(D_NET, "%d RDMA: cookie "LPX64", key %x, addr "LPX64", nob %d\n",
- msg->oibm_type, msg->oibm_u.rdma.oibrm_cookie,
- msg->oibm_u.rdma.oibrm_desc.rd_key,
- msg->oibm_u.rdma.oibrm_desc.rd_addr,
- msg->oibm_u.rdma.oibrm_desc.rd_nob);
+ msg->ibm_type, msg->ibm_u.rdma.ibrm_cookie,
+ msg->ibm_u.rdma.ibrm_desc.rd_key,
+ msg->ibm_u.rdma.ibrm_desc.rd_addr,
+ msg->ibm_u.rdma.ibrm_desc.rd_nob);
break;
- case OPENIBNAL_MSG_PUT_DONE:
- case OPENIBNAL_MSG_GET_DONE:
- if (nob < base_nob + sizeof (koib_completion_msg_t)) {
+ case IBNAL_MSG_PUT_DONE:
+ case IBNAL_MSG_GET_DONE:
+ if (nob < base_nob + sizeof (kib_completion_msg_t)) {
CERROR ("Short COMPLETION msg from "LPX64": %d\n",
conn->ibc_peer->ibp_nid, nob);
goto failed;
}
if (flipped)
- __swab32s(&msg->oibm_u.completion.oibcm_status);
+ __swab32s(&msg->ibm_u.completion.ibcm_status);
CDEBUG(D_NET, "%d DONE: cookie "LPX64", status %d\n",
- msg->oibm_type, msg->oibm_u.completion.oibcm_cookie,
- msg->oibm_u.completion.oibcm_status);
+ msg->ibm_type, msg->ibm_u.completion.ibcm_cookie,
+ msg->ibm_u.completion.ibcm_status);
- koibnal_complete_passive_rdma (conn,
- msg->oibm_u.completion.oibcm_cookie,
- msg->oibm_u.completion.oibcm_status);
- koibnal_post_rx (rx, 1);
+ kibnal_complete_passive_rdma (conn,
+ msg->ibm_u.completion.ibcm_cookie,
+ msg->ibm_u.completion.ibcm_status);
+ kibnal_post_rx (rx, 1);
return;
default:
CERROR ("Can't parse type from "LPX64": %d\n",
- conn->ibc_peer->ibp_nid, msg->oibm_type);
+ conn->ibc_peer->ibp_nid, msg->ibm_type);
goto failed;
}
- /* schedule for koibnal_rx() in thread context */
- spin_lock_irqsave(&koibnal_data.koib_sched_lock, flags);
+ /* schedule for kibnal_rx() in thread context */
+ spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
- list_add_tail (&rx->rx_list, &koibnal_data.koib_sched_rxq);
- wake_up (&koibnal_data.koib_sched_waitq);
+ list_add_tail (&rx->rx_list, &kibnal_data.kib_sched_rxq);
+ wake_up (&kibnal_data.kib_sched_waitq);
- spin_unlock_irqrestore(&koibnal_data.koib_sched_lock, flags);
+ spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags);
return;
failed:
CDEBUG(D_NET, "rx %p conn %p\n", rx, conn);
- koibnal_close_conn(conn, -ECONNABORTED);
+ kibnal_close_conn(conn, -ECONNABORTED);
/* Don't re-post rx & drop its ref on conn */
- koibnal_put_conn(conn);
+ kibnal_put_conn(conn);
}
void
-koibnal_rx (koib_rx_t *rx)
+kibnal_rx (kib_rx_t *rx)
{
- koib_msg_t *msg = rx->rx_msg;
+ kib_msg_t *msg = rx->rx_msg;
/* Clear flag so I can detect if I've sent an RDMA completion */
rx->rx_rdma = 0;
- switch (msg->oibm_type) {
- case OPENIBNAL_MSG_GET_RDMA:
- lib_parse(&koibnal_lib, &msg->oibm_u.rdma.oibrm_hdr, rx);
+ switch (msg->ibm_type) {
+ case IBNAL_MSG_GET_RDMA:
+ lib_parse(&kibnal_lib, &msg->ibm_u.rdma.ibrm_hdr, rx);
/* If the incoming get was matched, I'll have initiated the
* RDMA and the completion message... */
if (rx->rx_rdma)
* the peer's GET blocking for the full timeout. */
CERROR ("Completing unmatched RDMA GET from "LPX64"\n",
rx->rx_conn->ibc_peer->ibp_nid);
- koibnal_start_active_rdma (OPENIBNAL_MSG_GET_DONE, -EIO,
- rx, NULL, 0, NULL, NULL, 0, 0);
+ kibnal_start_active_rdma (IBNAL_MSG_GET_DONE, -EIO,
+ rx, NULL, 0, NULL, NULL, 0, 0);
break;
- case OPENIBNAL_MSG_PUT_RDMA:
- lib_parse(&koibnal_lib, &msg->oibm_u.rdma.oibrm_hdr, rx);
+ case IBNAL_MSG_PUT_RDMA:
+ lib_parse(&kibnal_lib, &msg->ibm_u.rdma.ibrm_hdr, rx);
if (rx->rx_rdma)
break;
/* This is most unusual, since even if lib_parse() didn't
rx->rx_conn->ibc_peer->ibp_nid);
break;
- case OPENIBNAL_MSG_IMMEDIATE:
- lib_parse(&koibnal_lib, &msg->oibm_u.immediate.oibim_hdr, rx);
+ case IBNAL_MSG_IMMEDIATE:
+ lib_parse(&kibnal_lib, &msg->ibm_u.immediate.ibim_hdr, rx);
LASSERT (!rx->rx_rdma);
break;
break;
}
- koibnal_post_rx (rx, 1);
+ kibnal_post_rx (rx, 1);
}
#if 0
int
-koibnal_kvaddr_to_phys (unsigned long vaddr, __u64 *physp)
+kibnal_kvaddr_to_phys (unsigned long vaddr, __u64 *physp)
{
struct page *page;
else if (vaddr >= PKMAP_BASE &&
vaddr < (PKMAP_BASE + LAST_PKMAP * PAGE_SIZE))
page = vmalloc_to_page ((void *)vaddr);
- /* in 2.4 ^ just walks the page tables */
+ /* in 2.4 ^ just walks the page tables */
#endif
else
page = virt_to_page (vaddr);
!VALID_PAGE (page))
return (-EFAULT);
- *physp = koibnal_page2phys(page) + (vaddr & (PAGE_SIZE - 1));
+ *physp = kibnal_page2phys(page) + (vaddr & (PAGE_SIZE - 1));
return (0);
}
#endif
int
-koibnal_map_iov (koib_tx_t *tx, enum ib_memory_access access,
+kibnal_map_iov (kib_tx_t *tx, enum ib_memory_access access,
int niov, struct iovec *iov, int offset, int nob)
{
LASSERT (nob > 0);
LASSERT (niov > 0);
- LASSERT (tx->tx_mapped == KOIB_TX_UNMAPPED);
+ LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED);
while (offset >= iov->iov_len) {
offset -= iov->iov_len;
vaddr = (void *)(((unsigned long)iov->iov_base) + offset);
tx->tx_md.md_addr = (__u64)((unsigned long)vaddr);
- rc = ib_memory_register (koibnal_data.koib_pd,
+ rc = ib_memory_register (kibnal_data.kib_pd,
vaddr, nob,
access,
&tx->tx_md.md_handle.mr,
return (rc);
}
- tx->tx_mapped = KOIB_TX_MAPPED;
+ tx->tx_mapped = KIB_TX_MAPPED;
return (0);
}
int
-koibnal_map_kiov (koib_tx_t *tx, enum ib_memory_access access,
+kibnal_map_kiov (kib_tx_t *tx, enum ib_memory_access access,
int nkiov, ptl_kiov_t *kiov,
int offset, int nob)
{
-#if OPENIBNAL_FMR
+#if IBNAL_FMR
__u64 *phys;
- const int mapped = KOIB_TX_MAPPED_FMR;
+ const int mapped = KIB_TX_MAPPED_FMR;
#else
struct ib_physical_buffer *phys;
- const int mapped = KOIB_TX_MAPPED;
+ const int mapped = KIB_TX_MAPPED;
#endif
int page_offset;
int nphys;
LASSERT (nob > 0);
LASSERT (nkiov > 0);
- LASSERT (tx->tx_mapped == KOIB_TX_UNMAPPED);
+ LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED);
while (offset >= kiov->kiov_len) {
offset -= kiov->kiov_len;
}
page_offset = kiov->kiov_offset + offset;
-#if OPENIBNAL_FMR
- phys[0] = koibnal_page2phys(kiov->kiov_page);
+#if IBNAL_FMR
+ phys[0] = kibnal_page2phys(kiov->kiov_page);
#else
- phys[0].address = koibnal_page2phys(kiov->kiov_page);
+ phys[0].address = kibnal_page2phys(kiov->kiov_page);
phys[0].size = PAGE_SIZE;
#endif
nphys = 1;
}
LASSERT (nphys * sizeof (*phys) < phys_size);
-#if OPENIBNAL_FMR
- phys[nphys] = koibnal_page2phys(kiov->kiov_page);
+#if IBNAL_FMR
+ phys[nphys] = kibnal_page2phys(kiov->kiov_page);
#else
- phys[nphys].address = koibnal_page2phys(kiov->kiov_page);
+ phys[nphys].address = kibnal_page2phys(kiov->kiov_page);
phys[nphys].size = PAGE_SIZE;
#endif
nphys++;
for (rc = 0; rc < nphys; rc++)
CWARN (" [%d] "LPX64" / %d\n", rc, phys[rc].address, phys[rc].size);
#endif
- tx->tx_md.md_addr = OPENIBNAL_RDMA_BASE;
+ tx->tx_md.md_addr = IBNAL_RDMA_BASE;
-#if OPENIBNAL_FMR
- rc = ib_fmr_register_physical (koibnal_data.koib_fmr_pool,
+#if IBNAL_FMR
+ rc = ib_fmr_register_physical (kibnal_data.kib_fmr_pool,
phys, nphys,
&tx->tx_md.md_addr,
page_offset,
&tx->tx_md.md_lkey,
&tx->tx_md.md_rkey);
#else
- rc = ib_memory_register_physical (koibnal_data.koib_pd,
+ rc = ib_memory_register_physical (kibnal_data.kib_pd,
phys, nphys,
&tx->tx_md.md_addr,
nob, page_offset,
return (rc);
}
-koib_conn_t *
-koibnal_find_conn_locked (koib_peer_t *peer)
+kib_conn_t *
+kibnal_find_conn_locked (kib_peer_t *peer)
{
struct list_head *tmp;
/* just return the first connection */
list_for_each (tmp, &peer->ibp_conns) {
- return (list_entry(tmp, koib_conn_t, ibc_list));
+ return (list_entry(tmp, kib_conn_t, ibc_list));
}
return (NULL);
}
void
-koibnal_check_sends (koib_conn_t *conn)
+kibnal_check_sends (kib_conn_t *conn)
{
unsigned long flags;
- koib_tx_t *tx;
+ kib_tx_t *tx;
int rc;
int i;
int done;
spin_lock_irqsave (&conn->ibc_lock, flags);
+ LASSERT (conn->ibc_nsends_posted <= IBNAL_MSG_QUEUE_SIZE);
+
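+ /* If nothing is queued but the peer is owed many credits, grab an
+ * idle tx and send an explicit NOOP just to return them; otherwise
+ * credits piggy-back on real messages (redundant NOOPs are reaped
+ * in the loop below). */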
if (list_empty(&conn->ibc_tx_queue) &&
- conn->ibc_outstanding_credits >= OPENIBNAL_CREDIT_HIGHWATER) {
+ conn->ibc_outstanding_credits >= IBNAL_CREDIT_HIGHWATER) {
spin_unlock_irqrestore(&conn->ibc_lock, flags);
-
- tx = koibnal_get_idle_tx(0); /* don't block */
+
+ tx = kibnal_get_idle_tx(0); /* don't block */
if (tx != NULL)
- koibnal_init_tx_msg(tx, OPENIBNAL_MSG_NOOP, 0);
+ kibnal_init_tx_msg(tx, IBNAL_MSG_NOOP, 0);
spin_lock_irqsave(&conn->ibc_lock, flags);
-
+
if (tx != NULL) {
atomic_inc(&conn->ibc_refcount);
- koibnal_queue_tx_locked(tx, conn);
+ kibnal_queue_tx_locked(tx, conn);
}
}
- LASSERT (conn->ibc_nsends_posted <= OPENIBNAL_MSG_QUEUE_SIZE);
-
while (!list_empty (&conn->ibc_tx_queue)) {
- tx = list_entry (conn->ibc_tx_queue.next, koib_tx_t, tx_list);
+ tx = list_entry (conn->ibc_tx_queue.next, kib_tx_t, tx_list);
/* We rely on this for QP sizing */
LASSERT (tx->tx_nsp > 0 && tx->tx_nsp <= 2);
LASSERT (conn->ibc_outstanding_credits >= 0);
- LASSERT (conn->ibc_outstanding_credits <= OPENIBNAL_MSG_QUEUE_SIZE);
+ LASSERT (conn->ibc_outstanding_credits <= IBNAL_MSG_QUEUE_SIZE);
LASSERT (conn->ibc_credits >= 0);
- LASSERT (conn->ibc_credits <= OPENIBNAL_MSG_QUEUE_SIZE);
+ LASSERT (conn->ibc_credits <= IBNAL_MSG_QUEUE_SIZE);
- /* Not on ibc_rdma_queue */
+ /* Not yet on ibc_active_txs */
LASSERT (!tx->tx_passive_rdma_wait);
- if (conn->ibc_nsends_posted == OPENIBNAL_MSG_QUEUE_SIZE)
+ if (conn->ibc_nsends_posted == IBNAL_MSG_QUEUE_SIZE)
break;
if (conn->ibc_credits == 0) /* no credits */
list_del (&tx->tx_list);
- if (tx->tx_msg->oibm_type == OPENIBNAL_MSG_NOOP &&
+ if (tx->tx_msg->ibm_type == IBNAL_MSG_NOOP &&
(!list_empty(&conn->ibc_tx_queue) ||
- conn->ibc_outstanding_credits < OPENIBNAL_CREDIT_HIGHWATER)) {
- /* Redundant NOOP */
+ conn->ibc_outstanding_credits < IBNAL_CREDIT_HIGHWATER)) {
+ /* redundant NOOP */
spin_unlock_irqrestore(&conn->ibc_lock, flags);
- koibnal_tx_done(tx);
+ kibnal_tx_done(tx);
spin_lock_irqsave(&conn->ibc_lock, flags);
continue;
}
-
- /* incoming RDMA completion can find this one now */
- if (tx->tx_passive_rdma) {
- list_add (&tx->tx_list, &conn->ibc_rdma_queue);
- tx->tx_passive_rdma_wait = 1;
- tx->tx_passive_rdma_deadline =
- jiffies + koibnal_tunables.koib_io_timeout * HZ;
- }
- tx->tx_msg->oibm_credits = conn->ibc_outstanding_credits;
+ tx->tx_msg->ibm_credits = conn->ibc_outstanding_credits;
conn->ibc_outstanding_credits = 0;
- /* use the free memory barrier when we unlock to ensure
- * sending set before we can get the tx callback. */
conn->ibc_nsends_posted++;
conn->ibc_credits--;
- tx->tx_sending = tx->tx_nsp;
-#if OPENIBNAL_CKSUM
- tx->tx_msg->oibm_cksum = 0;
- tx->tx_msg->oibm_cksum = koibnal_cksum(tx->tx_msg, tx->tx_msg->oibm_nob);
- CDEBUG(D_NET, "cksum %x, nob %d\n", tx->tx_msg->oibm_cksum, tx->tx_msg->oibm_nob);
+ tx->tx_sending = tx->tx_nsp;
+ tx->tx_passive_rdma_wait = tx->tx_passive_rdma;
+ list_add (&tx->tx_list, &conn->ibc_active_txs);
+#if IBNAL_CKSUM
+ tx->tx_msg->ibm_cksum = 0;
+ tx->tx_msg->ibm_cksum = kibnal_cksum(tx->tx_msg, tx->tx_msg->ibm_nob);
+ CDEBUG(D_NET, "cksum %x, nob %d\n", tx->tx_msg->ibm_cksum, tx->tx_msg->ibm_nob);
#endif
spin_unlock_irqrestore (&conn->ibc_lock, flags);
rc = -ECONNABORTED;
nwork = 0;
- if (conn->ibc_state == OPENIBNAL_CONN_ESTABLISHED) {
+ if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) {
tx->tx_status = 0;
/* Driver only accepts 1 item at a time */
for (i = 0; i < tx->tx_nsp; i++) {
if (rc != 0) {
/* NB credits are transferred in the actual
* message, which can only be the last work item */
- conn->ibc_outstanding_credits += tx->tx_msg->oibm_credits;
+ conn->ibc_outstanding_credits += tx->tx_msg->ibm_credits;
conn->ibc_credits++;
conn->ibc_nsends_posted--;
- tx->tx_sending -= tx->tx_nsp - nwork;
+
tx->tx_status = rc;
+ tx->tx_passive_rdma_wait = 0;
+ tx->tx_sending -= tx->tx_nsp - nwork;
+
done = (tx->tx_sending == 0);
-
- if (tx->tx_passive_rdma) {
- tx->tx_passive_rdma_wait = 0;
+ if (done)
list_del (&tx->tx_list);
- }
spin_unlock_irqrestore (&conn->ibc_lock, flags);
- if (conn->ibc_state == OPENIBNAL_CONN_ESTABLISHED)
+ if (conn->ibc_state == IBNAL_CONN_ESTABLISHED)
CERROR ("Error %d posting transmit to "LPX64"\n",
rc, conn->ibc_peer->ibp_nid);
else
CDEBUG (D_NET, "Error %d posting transmit to "
LPX64"\n", rc, conn->ibc_peer->ibp_nid);
- koibnal_close_conn (conn, rc);
+ kibnal_close_conn (conn, rc);
if (done)
- koibnal_tx_done (tx);
+ kibnal_tx_done (tx);
return;
}
}
void
-koibnal_tx_callback (struct ib_cq *cq, struct ib_cq_entry *e, void *arg)
+kibnal_tx_callback (struct ib_cq_entry *e)
{
- koib_tx_t *tx = (koib_tx_t *)((unsigned long)e->work_request_id);
- koib_conn_t *conn;
+ kib_tx_t *tx = (kib_tx_t *)kibnal_wreqid2ptr(e->work_request_id);
+ kib_conn_t *conn;
unsigned long flags;
int idle;
tx->tx_sending--;
idle = (tx->tx_sending == 0) && /* This is the final callback */
(!tx->tx_passive_rdma_wait); /* Not waiting for RDMA completion */
+ if (idle)
+ list_del(&tx->tx_list);
CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
spin_unlock_irqrestore(&conn->ibc_lock, flags);
if (idle)
- koibnal_tx_done (tx);
+ kibnal_tx_done (tx);
if (e->status != IB_COMPLETION_STATUS_SUCCESS) {
CERROR ("Tx completion to "LPX64" failed: %d\n",
conn->ibc_peer->ibp_nid, e->status);
- koibnal_close_conn (conn, -ENETDOWN);
+ kibnal_close_conn (conn, -ENETDOWN);
} else {
/* can I shovel some more sends out the door? */
- koibnal_check_sends(conn);
+ kibnal_check_sends(conn);
}
- koibnal_put_conn (conn);
+ kibnal_put_conn (conn);
}
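+
+/* NB: a single kib_cq now serves both rx and tx completions, so this is
+ * the only CQ callback; it splits on the work request id's low bit
+ * (see kibnal_ptr2wreqid() above). */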
void
-koibnal_init_tx_msg (koib_tx_t *tx, int type, int body_nob)
+kibnal_callback (struct ib_cq *cq, struct ib_cq_entry *e, void *arg)
+{
+ if (kibnal_wreqid_is_rx(e->work_request_id))
+ kibnal_rx_callback (e);
+ else
+ kibnal_tx_callback (e);
+}
+
+void
+kibnal_init_tx_msg (kib_tx_t *tx, int type, int body_nob)
{
struct ib_gather_scatter *gl = &tx->tx_gl[tx->tx_nsp];
struct ib_send_param *sp = &tx->tx_sp[tx->tx_nsp];
int fence;
- int nob = offsetof (koib_msg_t, oibm_u) + body_nob;
+ int nob = offsetof (kib_msg_t, ibm_u) + body_nob;
LASSERT (tx->tx_nsp >= 0 &&
tx->tx_nsp < sizeof(tx->tx_sp)/sizeof(tx->tx_sp[0]));
- LASSERT (nob <= OPENIBNAL_MSG_SIZE);
+ LASSERT (nob <= IBNAL_MSG_SIZE);
- tx->tx_msg->oibm_magic = OPENIBNAL_MSG_MAGIC;
- tx->tx_msg->oibm_version = OPENIBNAL_MSG_VERSION;
- tx->tx_msg->oibm_type = type;
-#if OPENIBNAL_CKSUM
- tx->tx_msg->oibm_nob = nob;
+ tx->tx_msg->ibm_magic = IBNAL_MSG_MAGIC;
+ tx->tx_msg->ibm_version = IBNAL_MSG_VERSION;
+ tx->tx_msg->ibm_type = type;
+#if IBNAL_CKSUM
+ tx->tx_msg->ibm_nob = nob;
#endif
/* Fence the message if it's bundled with an RDMA read */
fence = (tx->tx_nsp > 0) &&
- (type == OPENIBNAL_MSG_PUT_DONE);
+ (type == IBNAL_MSG_PUT_DONE);
*gl = (struct ib_gather_scatter) {
.address = tx->tx_vaddr,
.length = nob,
- .key = koibnal_data.koib_tx_pages->oibp_lkey,
+ .key = kibnal_data.kib_tx_pages->ibp_lkey,
};
/* NB If this is an RDMA read, the completion message must wait for
* the RDMA to complete. Sends wait for previous RDMA writes
* anyway... */
*sp = (struct ib_send_param) {
- .work_request_id = (__u64)((unsigned long)tx),
+ .work_request_id = kibnal_ptr2wreqid(tx, 0),
.op = IB_OP_SEND,
.gather_list = gl,
.num_gather_entries = 1,
}
void
-koibnal_queue_tx (koib_tx_t *tx, koib_conn_t *conn)
+kibnal_queue_tx (kib_tx_t *tx, kib_conn_t *conn)
{
unsigned long flags;
spin_lock_irqsave(&conn->ibc_lock, flags);
- koibnal_queue_tx_locked (tx, conn);
+ kibnal_queue_tx_locked (tx, conn);
spin_unlock_irqrestore(&conn->ibc_lock, flags);
- koibnal_check_sends(conn);
+ kibnal_check_sends(conn);
}
void
-koibnal_launch_tx (koib_tx_t *tx, ptl_nid_t nid)
+kibnal_launch_tx (kib_tx_t *tx, ptl_nid_t nid)
{
unsigned long flags;
- koib_peer_t *peer;
- koib_conn_t *conn;
- rwlock_t *g_lock = &koibnal_data.koib_global_lock;
+ kib_peer_t *peer;
+ kib_conn_t *conn;
+ rwlock_t *g_lock = &kibnal_data.kib_global_lock;
/* If I get here, I've committed to send, so I complete the tx with
* failure on any problems */
read_lock (g_lock);
- peer = koibnal_find_peer_locked (nid);
+ peer = kibnal_find_peer_locked (nid);
if (peer == NULL) {
read_unlock (g_lock);
tx->tx_status = -EHOSTUNREACH;
- koibnal_tx_done (tx);
+ kibnal_tx_done (tx);
return;
}
- conn = koibnal_find_conn_locked (peer);
+ conn = kibnal_find_conn_locked (peer);
if (conn != NULL) {
CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
atomic_inc (&conn->ibc_refcount); /* 1 ref for the tx */
read_unlock (g_lock);
- koibnal_queue_tx (tx, conn);
+ kibnal_queue_tx (tx, conn);
return;
}
read_unlock (g_lock);
write_lock_irqsave (g_lock, flags);
- peer = koibnal_find_peer_locked (nid);
+ peer = kibnal_find_peer_locked (nid);
if (peer == NULL) {
write_unlock_irqrestore (g_lock, flags);
tx->tx_status = -EHOSTUNREACH;
- koibnal_tx_done (tx);
+ kibnal_tx_done (tx);
return;
}
- conn = koibnal_find_conn_locked (peer);
+ conn = kibnal_find_conn_locked (peer);
if (conn != NULL) {
/* Connection exists; queue message on it */
CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
atomic_inc (&conn->ibc_refcount); /* 1 ref for the tx */
write_unlock_irqrestore (g_lock, flags);
- koibnal_queue_tx (tx, conn);
+ kibnal_queue_tx (tx, conn);
return;
}
if (!time_after_eq(jiffies, peer->ibp_reconnect_time)) {
write_unlock_irqrestore (g_lock, flags);
tx->tx_status = -EHOSTUNREACH;
- koibnal_tx_done (tx);
+ kibnal_tx_done (tx);
return;
}
peer->ibp_connecting = 1;
atomic_inc (&peer->ibp_refcount); /* extra ref for connd */
- spin_lock (&koibnal_data.koib_connd_lock);
+ spin_lock (&kibnal_data.kib_connd_lock);
list_add_tail (&peer->ibp_connd_list,
- &koibnal_data.koib_connd_peers);
- wake_up (&koibnal_data.koib_connd_waitq);
+ &kibnal_data.kib_connd_peers);
+ wake_up (&kibnal_data.kib_connd_waitq);
- spin_unlock (&koibnal_data.koib_connd_lock);
+ spin_unlock (&kibnal_data.kib_connd_lock);
}
/* A connection is being established; queue the message... */
}
ptl_err_t
-koibnal_start_passive_rdma (int type, ptl_nid_t nid,
+kibnal_start_passive_rdma (int type, ptl_nid_t nid,
lib_msg_t *libmsg, ptl_hdr_t *hdr)
{
int nob = libmsg->md->length;
- koib_tx_t *tx;
- koib_msg_t *oibmsg;
+ kib_tx_t *tx;
+ kib_msg_t *ibmsg;
int rc;
int access;
- LASSERT (type == OPENIBNAL_MSG_PUT_RDMA ||
- type == OPENIBNAL_MSG_GET_RDMA);
+ LASSERT (type == IBNAL_MSG_PUT_RDMA ||
+ type == IBNAL_MSG_GET_RDMA);
LASSERT (nob > 0);
LASSERT (!in_interrupt()); /* Mapping could block */
- if (type == OPENIBNAL_MSG_PUT_RDMA) {
+ if (type == IBNAL_MSG_PUT_RDMA) {
access = IB_ACCESS_REMOTE_READ;
} else {
access = IB_ACCESS_REMOTE_WRITE |
IB_ACCESS_LOCAL_WRITE;
}
- tx = koibnal_get_idle_tx (1); /* May block; caller is an app thread */
+ tx = kibnal_get_idle_tx (1); /* May block; caller is an app thread */
LASSERT (tx != NULL);
if ((libmsg->md->options & PTL_MD_KIOV) == 0)
- rc = koibnal_map_iov (tx, access,
- libmsg->md->md_niov,
- libmsg->md->md_iov.iov,
- 0, nob);
+ rc = kibnal_map_iov (tx, access,
+ libmsg->md->md_niov,
+ libmsg->md->md_iov.iov,
+ 0, nob);
else
- rc = koibnal_map_kiov (tx, access,
- libmsg->md->md_niov,
- libmsg->md->md_iov.kiov,
- 0, nob);
+ rc = kibnal_map_kiov (tx, access,
+ libmsg->md->md_niov,
+ libmsg->md->md_iov.kiov,
+ 0, nob);
if (rc != 0) {
CERROR ("Can't map RDMA for "LPX64": %d\n", nid, rc);
goto failed;
}
- if (type == OPENIBNAL_MSG_GET_RDMA) {
+ if (type == IBNAL_MSG_GET_RDMA) {
/* reply gets finalized when tx completes */
- tx->tx_libmsg[1] = lib_create_reply_msg(&koibnal_lib,
+ tx->tx_libmsg[1] = lib_create_reply_msg(&kibnal_lib,
nid, libmsg);
if (tx->tx_libmsg[1] == NULL) {
CERROR ("Can't create reply for GET -> "LPX64"\n",
tx->tx_passive_rdma = 1;
- oibmsg = tx->tx_msg;
+ ibmsg = tx->tx_msg;
- oibmsg->oibm_u.rdma.oibrm_hdr = *hdr;
- oibmsg->oibm_u.rdma.oibrm_cookie = tx->tx_passive_rdma_cookie;
- oibmsg->oibm_u.rdma.oibrm_desc.rd_key = tx->tx_md.md_rkey;
- oibmsg->oibm_u.rdma.oibrm_desc.rd_addr = tx->tx_md.md_addr;
- oibmsg->oibm_u.rdma.oibrm_desc.rd_nob = nob;
+ ibmsg->ibm_u.rdma.ibrm_hdr = *hdr;
+ ibmsg->ibm_u.rdma.ibrm_cookie = tx->tx_passive_rdma_cookie;
+ ibmsg->ibm_u.rdma.ibrm_desc.rd_key = tx->tx_md.md_rkey;
+ ibmsg->ibm_u.rdma.ibrm_desc.rd_addr = tx->tx_md.md_addr;
+ ibmsg->ibm_u.rdma.ibrm_desc.rd_nob = nob;
- koibnal_init_tx_msg (tx, type, sizeof (koib_rdma_msg_t));
+ kibnal_init_tx_msg (tx, type, sizeof (kib_rdma_msg_t));
CDEBUG(D_NET, "Passive: %p cookie "LPX64", key %x, addr "
LPX64", nob %d\n",
/* libmsg gets finalized when tx completes. */
tx->tx_libmsg[0] = libmsg;
- koibnal_launch_tx(tx, nid);
+ kibnal_launch_tx(tx, nid);
return (PTL_OK);
failed:
tx->tx_status = rc;
- koibnal_tx_done (tx);
+ kibnal_tx_done (tx);
return (PTL_FAIL);
}
void
-koibnal_start_active_rdma (int type, int status,
- koib_rx_t *rx, lib_msg_t *libmsg,
+kibnal_start_active_rdma (int type, int status,
+ kib_rx_t *rx, lib_msg_t *libmsg,
unsigned int niov,
struct iovec *iov, ptl_kiov_t *kiov,
size_t offset, size_t nob)
{
- koib_msg_t *rxmsg = rx->rx_msg;
- koib_msg_t *txmsg;
- koib_tx_t *tx;
+ kib_msg_t *rxmsg = rx->rx_msg;
+ kib_msg_t *txmsg;
+ kib_tx_t *tx;
int access;
int rdma_op;
int rc;
/* No data if we're completing with failure */
LASSERT (status == 0 || nob == 0);
- LASSERT (type == OPENIBNAL_MSG_GET_DONE ||
- type == OPENIBNAL_MSG_PUT_DONE);
+ LASSERT (type == IBNAL_MSG_GET_DONE ||
+ type == IBNAL_MSG_PUT_DONE);
/* Flag I'm completing the RDMA. Even if I fail to send the
* completion message, I will have tried my best so further
LASSERT (!rx->rx_rdma);
rx->rx_rdma = 1;
- if (type == OPENIBNAL_MSG_GET_DONE) {
+ if (type == IBNAL_MSG_GET_DONE) {
access = 0;
rdma_op = IB_OP_RDMA_WRITE;
- LASSERT (rxmsg->oibm_type == OPENIBNAL_MSG_GET_RDMA);
+ LASSERT (rxmsg->ibm_type == IBNAL_MSG_GET_RDMA);
} else {
access = IB_ACCESS_LOCAL_WRITE;
rdma_op = IB_OP_RDMA_READ;
- LASSERT (rxmsg->oibm_type == OPENIBNAL_MSG_PUT_RDMA);
+ LASSERT (rxmsg->ibm_type == IBNAL_MSG_PUT_RDMA);
}
- tx = koibnal_get_idle_tx (0); /* Mustn't block */
+ tx = kibnal_get_idle_tx (0); /* Mustn't block */
if (tx == NULL) {
CERROR ("tx descs exhausted on RDMA from "LPX64
" completing locally with failure\n",
- rx->rx_conn->ibc_peer->ibp_nid);
- lib_finalize (&koibnal_lib, NULL, libmsg, PTL_NO_SPACE);
+ rx->rx_conn->ibc_peer->ibp_nid);
+ lib_finalize (&kibnal_lib, NULL, libmsg, PTL_NO_SPACE);
return;
}
LASSERT (tx->tx_nsp == 0);
* message is matched) */
if (kiov != NULL)
- rc = koibnal_map_kiov (tx, access,
- niov, kiov, offset, nob);
+ rc = kibnal_map_kiov (tx, access,
+ niov, kiov, offset, nob);
else
- rc = koibnal_map_iov (tx, access,
- niov, iov, offset, nob);
+ rc = kibnal_map_iov (tx, access,
+ niov, iov, offset, nob);
if (rc != 0) {
CERROR ("Can't map RDMA -> "LPX64": %d\n",
};
tx->tx_sp[0] = (struct ib_send_param) {
- .work_request_id = (__u64)((unsigned long)tx),
+ .work_request_id = kibnal_ptr2wreqid(tx, 0),
.op = rdma_op,
.gather_list = &tx->tx_gl[0],
.num_gather_entries = 1,
- .remote_address = rxmsg->oibm_u.rdma.oibrm_desc.rd_addr,
- .rkey = rxmsg->oibm_u.rdma.oibrm_desc.rd_key,
+ .remote_address = rxmsg->ibm_u.rdma.ibrm_desc.rd_addr,
+ .rkey = rxmsg->ibm_u.rdma.ibrm_desc.rd_key,
.device_specific = NULL,
.solicited_event = 0,
.signaled = 1,
txmsg = tx->tx_msg;
- txmsg->oibm_u.completion.oibcm_cookie = rxmsg->oibm_u.rdma.oibrm_cookie;
- txmsg->oibm_u.completion.oibcm_status = status;
+ txmsg->ibm_u.completion.ibcm_cookie = rxmsg->ibm_u.rdma.ibrm_cookie;
+ txmsg->ibm_u.completion.ibcm_status = status;
- koibnal_init_tx_msg(tx, type, sizeof (koib_completion_msg_t));
+ kibnal_init_tx_msg(tx, type, sizeof (kib_completion_msg_t));
if (status == 0 && nob != 0) {
LASSERT (tx->tx_nsp > 1);
LASSERT (tx->tx_nsp == 1);
/* No RDMA: local completion happens now! */
CDEBUG(D_WARNING,"No data: immediate completion\n");
- lib_finalize (&koibnal_lib, NULL, libmsg,
+ lib_finalize (&kibnal_lib, NULL, libmsg,
status == 0 ? PTL_OK : PTL_FAIL);
}
atomic_read (&rx->rx_conn->ibc_refcount));
atomic_inc (&rx->rx_conn->ibc_refcount);
/* ...and queue it up */
- koibnal_queue_tx(tx, rx->rx_conn);
+ kibnal_queue_tx(tx, rx->rx_conn);
}
ptl_err_t
-koibnal_sendmsg(lib_nal_t *nal,
+kibnal_sendmsg(lib_nal_t *nal,
void *private,
lib_msg_t *libmsg,
ptl_hdr_t *hdr,
size_t payload_offset,
size_t payload_nob)
{
- koib_msg_t *oibmsg;
- koib_tx_t *tx;
+ kib_msg_t *ibmsg;
+ kib_tx_t *tx;
int nob;
/* NB 'private' is different depending on what we're sending.... */
case PTL_MSG_REPLY: {
/* reply's 'private' is the incoming receive */
- koib_rx_t *rx = private;
+ kib_rx_t *rx = private;
/* RDMA reply expected? */
- if (rx->rx_msg->oibm_type == OPENIBNAL_MSG_GET_RDMA) {
- koibnal_start_active_rdma(OPENIBNAL_MSG_GET_DONE, 0,
- rx, libmsg, payload_niov,
- payload_iov, payload_kiov,
- payload_offset, payload_nob);
+ if (rx->rx_msg->ibm_type == IBNAL_MSG_GET_RDMA) {
+ kibnal_start_active_rdma(IBNAL_MSG_GET_DONE, 0,
+ rx, libmsg, payload_niov,
+ payload_iov, payload_kiov,
+ payload_offset, payload_nob);
return (PTL_OK);
}
/* Incoming message consistent with immediate reply? */
- if (rx->rx_msg->oibm_type != OPENIBNAL_MSG_IMMEDIATE) {
+ if (rx->rx_msg->ibm_type != IBNAL_MSG_IMMEDIATE) {
CERROR ("REPLY to "LPX64" bad opbm type %d!!!\n",
- nid, rx->rx_msg->oibm_type);
+ nid, rx->rx_msg->ibm_type);
return (PTL_FAIL);
}
/* Will it fit in a message? */
- nob = offsetof(koib_msg_t, oibm_u.immediate.oibim_payload[payload_nob]);
- if (nob >= OPENIBNAL_MSG_SIZE) {
+ nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]);
+ if (nob >= IBNAL_MSG_SIZE) {
CERROR("REPLY for "LPX64" too big (RDMA not requested): %d\n",
nid, payload_nob);
return (PTL_FAIL);
case PTL_MSG_GET:
/* might the REPLY message be big enough to need RDMA? */
- nob = offsetof(koib_msg_t, oibm_u.immediate.oibim_payload[libmsg->md->length]);
- if (nob > OPENIBNAL_MSG_SIZE)
- return (koibnal_start_passive_rdma(OPENIBNAL_MSG_GET_RDMA,
- nid, libmsg, hdr));
+ nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[libmsg->md->length]);
+ if (nob > IBNAL_MSG_SIZE)
+ return (kibnal_start_passive_rdma(IBNAL_MSG_GET_RDMA,
+ nid, libmsg, hdr));
break;
case PTL_MSG_ACK:
case PTL_MSG_PUT:
/* Is the payload big enough to need RDMA? */
- nob = offsetof(koib_msg_t, oibm_u.immediate.oibim_payload[payload_nob]);
- if (nob > OPENIBNAL_MSG_SIZE)
- return (koibnal_start_passive_rdma(OPENIBNAL_MSG_PUT_RDMA,
- nid, libmsg, hdr));
+ nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]);
+ if (nob > IBNAL_MSG_SIZE)
+ return (kibnal_start_passive_rdma(IBNAL_MSG_PUT_RDMA,
+ nid, libmsg, hdr));
break;
}
- tx = koibnal_get_idle_tx(!(type == PTL_MSG_ACK ||
- type == PTL_MSG_REPLY ||
- in_interrupt()));
+ tx = kibnal_get_idle_tx(!(type == PTL_MSG_ACK ||
+ type == PTL_MSG_REPLY ||
+ in_interrupt()));
if (tx == NULL) {
CERROR ("Can't send %d to "LPX64": tx descs exhausted%s\n",
type, nid, in_interrupt() ? " (intr)" : "");
return (PTL_NO_SPACE);
}
- oibmsg = tx->tx_msg;
- oibmsg->oibm_u.immediate.oibim_hdr = *hdr;
+ ibmsg = tx->tx_msg;
+ ibmsg->ibm_u.immediate.ibim_hdr = *hdr;
if (payload_nob > 0) {
if (payload_kiov != NULL)
- lib_copy_kiov2buf(oibmsg->oibm_u.immediate.oibim_payload,
+ lib_copy_kiov2buf(ibmsg->ibm_u.immediate.ibim_payload,
payload_niov, payload_kiov,
payload_offset, payload_nob);
else
- lib_copy_iov2buf(oibmsg->oibm_u.immediate.oibim_payload,
+ lib_copy_iov2buf(ibmsg->ibm_u.immediate.ibim_payload,
payload_niov, payload_iov,
payload_offset, payload_nob);
}
- koibnal_init_tx_msg (tx, OPENIBNAL_MSG_IMMEDIATE,
- offsetof(koib_immediate_msg_t,
- oibim_payload[payload_nob]));
+ kibnal_init_tx_msg (tx, IBNAL_MSG_IMMEDIATE,
+ offsetof(kib_immediate_msg_t,
+ ibim_payload[payload_nob]));
/* libmsg gets finalized when tx completes */
tx->tx_libmsg[0] = libmsg;
- koibnal_launch_tx(tx, nid);
+ kibnal_launch_tx(tx, nid);
return (PTL_OK);
}
ptl_err_t
-koibnal_send (lib_nal_t *nal, void *private, lib_msg_t *cookie,
+kibnal_send (lib_nal_t *nal, void *private, lib_msg_t *cookie,
ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
unsigned int payload_niov, struct iovec *payload_iov,
size_t payload_offset, size_t payload_len)
{
- return (koibnal_sendmsg(nal, private, cookie,
- hdr, type, nid, pid,
- payload_niov, payload_iov, NULL,
- payload_offset, payload_len));
+ return (kibnal_sendmsg(nal, private, cookie,
+ hdr, type, nid, pid,
+ payload_niov, payload_iov, NULL,
+ payload_offset, payload_len));
}
ptl_err_t
-koibnal_send_pages (lib_nal_t *nal, void *private, lib_msg_t *cookie,
+kibnal_send_pages (lib_nal_t *nal, void *private, lib_msg_t *cookie,
ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
unsigned int payload_niov, ptl_kiov_t *payload_kiov,
size_t payload_offset, size_t payload_len)
{
- return (koibnal_sendmsg(nal, private, cookie,
- hdr, type, nid, pid,
- payload_niov, NULL, payload_kiov,
- payload_offset, payload_len));
+ return (kibnal_sendmsg(nal, private, cookie,
+ hdr, type, nid, pid,
+ payload_niov, NULL, payload_kiov,
+ payload_offset, payload_len));
}
ptl_err_t
-koibnal_recvmsg (lib_nal_t *nal, void *private, lib_msg_t *libmsg,
+kibnal_recvmsg (lib_nal_t *nal, void *private, lib_msg_t *libmsg,
unsigned int niov, struct iovec *iov, ptl_kiov_t *kiov,
size_t offset, size_t mlen, size_t rlen)
{
- koib_rx_t *rx = private;
- koib_msg_t *rxmsg = rx->rx_msg;
- int msg_nob;
+ kib_rx_t *rx = private;
+ kib_msg_t *rxmsg = rx->rx_msg;
+ int msg_nob;
LASSERT (mlen <= rlen);
LASSERT (!in_interrupt ());
/* Either all pages or all vaddrs */
LASSERT (!(kiov != NULL && iov != NULL));
- switch (rxmsg->oibm_type) {
+ switch (rxmsg->ibm_type) {
default:
LBUG();
return (PTL_FAIL);
- case OPENIBNAL_MSG_IMMEDIATE:
- msg_nob = offsetof(koib_msg_t, oibm_u.immediate.oibim_payload[rlen]);
- if (msg_nob > OPENIBNAL_MSG_SIZE) {
+ case IBNAL_MSG_IMMEDIATE:
+ msg_nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[rlen]);
+ if (msg_nob > IBNAL_MSG_SIZE) {
CERROR ("Immediate message from "LPX64" too big: %d\n",
- rxmsg->oibm_u.immediate.oibim_hdr.src_nid, rlen);
+ rxmsg->ibm_u.immediate.ibim_hdr.src_nid, rlen);
return (PTL_FAIL);
}
if (kiov != NULL)
lib_copy_buf2kiov(niov, kiov, offset,
- rxmsg->oibm_u.immediate.oibim_payload,
+ rxmsg->ibm_u.immediate.ibim_payload,
mlen);
else
lib_copy_buf2iov(niov, iov, offset,
- rxmsg->oibm_u.immediate.oibim_payload,
+ rxmsg->ibm_u.immediate.ibim_payload,
mlen);
lib_finalize (nal, NULL, libmsg, PTL_OK);
return (PTL_OK);
- case OPENIBNAL_MSG_GET_RDMA:
+ case IBNAL_MSG_GET_RDMA:
/* We get called here just to discard any junk after the
* GET hdr. */
LASSERT (libmsg == NULL);
lib_finalize (nal, NULL, libmsg, PTL_OK);
return (PTL_OK);
- case OPENIBNAL_MSG_PUT_RDMA:
- koibnal_start_active_rdma (OPENIBNAL_MSG_PUT_DONE, 0,
- rx, libmsg,
- niov, iov, kiov, offset, mlen);
+ case IBNAL_MSG_PUT_RDMA:
+ kibnal_start_active_rdma (IBNAL_MSG_PUT_DONE, 0,
+ rx, libmsg,
+ niov, iov, kiov, offset, mlen);
return (PTL_OK);
}
}
ptl_err_t
-koibnal_recv (lib_nal_t *nal, void *private, lib_msg_t *msg,
+kibnal_recv (lib_nal_t *nal, void *private, lib_msg_t *msg,
unsigned int niov, struct iovec *iov,
size_t offset, size_t mlen, size_t rlen)
{
- return (koibnal_recvmsg (nal, private, msg, niov, iov, NULL,
- offset, mlen, rlen));
+ return (kibnal_recvmsg (nal, private, msg, niov, iov, NULL,
+ offset, mlen, rlen));
}
ptl_err_t
-koibnal_recv_pages (lib_nal_t *nal, void *private, lib_msg_t *msg,
+kibnal_recv_pages (lib_nal_t *nal, void *private, lib_msg_t *msg,
unsigned int niov, ptl_kiov_t *kiov,
size_t offset, size_t mlen, size_t rlen)
{
- return (koibnal_recvmsg (nal, private, msg, niov, NULL, kiov,
- offset, mlen, rlen));
+ return (kibnal_recvmsg (nal, private, msg, niov, NULL, kiov,
+ offset, mlen, rlen));
}
int
-koibnal_thread_start (int (*fn)(void *arg), void *arg)
+kibnal_thread_start (int (*fn)(void *arg), void *arg)
{
long pid = kernel_thread (fn, arg, 0);
if (pid < 0)
return ((int)pid);
- atomic_inc (&koibnal_data.koib_nthreads);
+ atomic_inc (&kibnal_data.kib_nthreads);
return (0);
}
void
-koibnal_thread_fini (void)
+kibnal_thread_fini (void)
{
- atomic_dec (&koibnal_data.koib_nthreads);
+ atomic_dec (&kibnal_data.kib_nthreads);
}
void
-koibnal_close_conn_locked (koib_conn_t *conn, int error)
+kibnal_close_conn_locked (kib_conn_t *conn, int error)
{
- /* This just does the immmediate housekeeping, and schedules the
+ /* This just does the immediate housekeeping, and schedules the
* connection for the connd to finish off.
- * Caller holds koib_global_lock exclusively in irq context */
- koib_peer_t *peer = conn->ibc_peer;
+ * Caller holds kib_global_lock exclusively in irq context */
+ kib_peer_t *peer = conn->ibc_peer;
CDEBUG (error == 0 ? D_NET : D_ERROR,
"closing conn to "LPX64": error %d\n", peer->ibp_nid, error);
- LASSERT (conn->ibc_state == OPENIBNAL_CONN_ESTABLISHED ||
- conn->ibc_state == OPENIBNAL_CONN_CONNECTING);
+ LASSERT (conn->ibc_state == IBNAL_CONN_ESTABLISHED ||
+ conn->ibc_state == IBNAL_CONN_CONNECTING);
- if (conn->ibc_state == OPENIBNAL_CONN_ESTABLISHED) {
- /* koib_connd_conns takes ibc_list's ref */
+ if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) {
+ /* kib_connd_conns takes ibc_list's ref */
list_del (&conn->ibc_list);
} else {
- /* new ref for koib_connd_conns */
+ /* new ref for kib_connd_conns */
CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
atomic_read (&conn->ibc_refcount));
if (list_empty (&peer->ibp_conns) &&
peer->ibp_persistence == 0) {
/* Non-persistent peer with no more conns... */
- koibnal_unlink_peer_locked (peer);
+ kibnal_unlink_peer_locked (peer);
}
- conn->ibc_state = OPENIBNAL_CONN_DEATHROW;
+ conn->ibc_state = IBNAL_CONN_DEATHROW;
/* Schedule conn for closing/destruction */
- spin_lock (&koibnal_data.koib_connd_lock);
+ spin_lock (&kibnal_data.kib_connd_lock);
- list_add_tail (&conn->ibc_list, &koibnal_data.koib_connd_conns);
- wake_up (&koibnal_data.koib_connd_waitq);
+ list_add_tail (&conn->ibc_list, &kibnal_data.kib_connd_conns);
+ wake_up (&kibnal_data.kib_connd_waitq);
- spin_unlock (&koibnal_data.koib_connd_lock);
+ spin_unlock (&kibnal_data.kib_connd_lock);
}
int
-koibnal_close_conn (koib_conn_t *conn, int why)
+kibnal_close_conn (kib_conn_t *conn, int why)
{
unsigned long flags;
int count = 0;
- write_lock_irqsave (&koibnal_data.koib_global_lock, flags);
+ write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
- LASSERT (conn->ibc_state >= OPENIBNAL_CONN_CONNECTING);
+ LASSERT (conn->ibc_state >= IBNAL_CONN_CONNECTING);
- if (conn->ibc_state <= OPENIBNAL_CONN_ESTABLISHED) {
+ if (conn->ibc_state <= IBNAL_CONN_ESTABLISHED) {
count = 1;
- koibnal_close_conn_locked (conn, why);
+ kibnal_close_conn_locked (conn, why);
}
- write_unlock_irqrestore (&koibnal_data.koib_global_lock, flags);
+ write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
return (count);
}
void
-koibnal_peer_connect_failed (koib_peer_t *peer, int active, int rc)
+kibnal_peer_connect_failed (kib_peer_t *peer, int active, int rc)
{
LIST_HEAD (zombies);
- koib_tx_t *tx;
+ kib_tx_t *tx;
unsigned long flags;
LASSERT (rc != 0);
- LASSERT (peer->ibp_reconnect_interval >= OPENIBNAL_MIN_RECONNECT_INTERVAL);
+ LASSERT (peer->ibp_reconnect_interval >= IBNAL_MIN_RECONNECT_INTERVAL);
- write_lock_irqsave (&koibnal_data.koib_global_lock, flags);
+ write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
LASSERT (peer->ibp_connecting != 0);
peer->ibp_connecting--;
if (peer->ibp_connecting != 0) {
/* another connection attempt under way (loopback?)... */
- write_unlock_irqrestore (&koibnal_data.koib_global_lock, flags);
+ write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
return;
}
peer->ibp_reconnect_time = jiffies + peer->ibp_reconnect_interval;
/* Increase reconnection interval */
peer->ibp_reconnect_interval = MIN (peer->ibp_reconnect_interval * 2,
- OPENIBNAL_MAX_RECONNECT_INTERVAL);
+ IBNAL_MAX_RECONNECT_INTERVAL);
/* Take peer's blocked transmits; I'll complete
* them with error */
while (!list_empty (&peer->ibp_tx_queue)) {
tx = list_entry (peer->ibp_tx_queue.next,
- koib_tx_t, tx_list);
+ kib_tx_t, tx_list);
list_del (&tx->tx_list);
list_add_tail (&tx->tx_list, &zombies);
}
- if (koibnal_peer_active(peer) &&
+ if (kibnal_peer_active(peer) &&
(peer->ibp_persistence == 0)) {
/* failed connection attempt on non-persistent peer */
- koibnal_unlink_peer_locked (peer);
+ kibnal_unlink_peer_locked (peer);
}
} else {
/* Can't have blocked transmits if there are connections */
LASSERT (list_empty(&peer->ibp_tx_queue));
}
- write_unlock_irqrestore (&koibnal_data.koib_global_lock, flags);
+ write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
if (!list_empty (&zombies))
CERROR ("Deleting messages for "LPX64": connection failed\n",
peer->ibp_nid);
while (!list_empty (&zombies)) {
- tx = list_entry (zombies.next, koib_tx_t, tx_list);
+ tx = list_entry (zombies.next, kib_tx_t, tx_list);
list_del (&tx->tx_list);
/* complete now */
tx->tx_status = -EHOSTUNREACH;
- koibnal_tx_done (tx);
+ kibnal_tx_done (tx);
}
}
void
-koibnal_connreq_done (koib_conn_t *conn, int active, int status)
+kibnal_connreq_done (kib_conn_t *conn, int active, int status)
{
int state = conn->ibc_state;
- koib_peer_t *peer = conn->ibc_peer;
- koib_tx_t *tx;
+ kib_peer_t *peer = conn->ibc_peer;
+ kib_tx_t *tx;
unsigned long flags;
int rc;
int i;
conn->ibc_connreq = NULL;
}
- if (state == OPENIBNAL_CONN_CONNECTING) {
+ if (state == IBNAL_CONN_CONNECTING) {
/* Install common (active/passive) callback for
* disconnect/idle notification if I got as far as getting
* a CM comm_id */
rc = tsIbCmCallbackModify(conn->ibc_comm_id,
- koibnal_conn_callback, conn);
+ kibnal_conn_callback, conn);
LASSERT (rc == 0);
}
- write_lock_irqsave (&koibnal_data.koib_global_lock, flags);
+ write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
LASSERT (peer->ibp_connecting != 0);
if (status == 0) {
/* connection established... */
- LASSERT (state == OPENIBNAL_CONN_CONNECTING);
- conn->ibc_state = OPENIBNAL_CONN_ESTABLISHED;
+ LASSERT (state == IBNAL_CONN_CONNECTING);
+ conn->ibc_state = IBNAL_CONN_ESTABLISHED;
- if (!koibnal_peer_active(peer)) {
+ if (!kibnal_peer_active(peer)) {
/* ...but peer deleted meantime */
status = -ECONNABORTED;
}
} else {
- LASSERT (state == OPENIBNAL_CONN_INIT_QP ||
- state == OPENIBNAL_CONN_CONNECTING);
+ LASSERT (state == IBNAL_CONN_INIT_QP ||
+ state == IBNAL_CONN_CONNECTING);
}
if (status == 0) {
list_add (&conn->ibc_list, &peer->ibp_conns);
/* reset reconnect interval for next attempt */
- peer->ibp_reconnect_interval = OPENIBNAL_MIN_RECONNECT_INTERVAL;
+ peer->ibp_reconnect_interval = IBNAL_MIN_RECONNECT_INTERVAL;
/* post blocked sends to the new connection */
spin_lock (&conn->ibc_lock);
while (!list_empty (&peer->ibp_tx_queue)) {
tx = list_entry (peer->ibp_tx_queue.next,
- koib_tx_t, tx_list);
+ kib_tx_t, tx_list);
list_del (&tx->tx_list);
conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
atomic_read (&conn->ibc_refcount));
atomic_inc (&conn->ibc_refcount);
- koibnal_queue_tx_locked (tx, conn);
+ kibnal_queue_tx_locked (tx, conn);
}
spin_unlock (&conn->ibc_lock);
/* Nuke any dangling conns from a different peer instance... */
- koibnal_close_stale_conns_locked (conn->ibc_peer,
- conn->ibc_incarnation);
+ kibnal_close_stale_conns_locked (conn->ibc_peer,
+ conn->ibc_incarnation);
- write_unlock_irqrestore (&koibnal_data.koib_global_lock, flags);
+ write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
/* queue up all the receives */
- for (i = 0; i < OPENIBNAL_RX_MSGS; i++) {
+ for (i = 0; i < IBNAL_RX_MSGS; i++) {
/* +1 ref for rx desc */
CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
i, &conn->ibc_rxs[i], conn->ibc_rxs[i].rx_msg,
conn->ibc_rxs[i].rx_vaddr);
- koibnal_post_rx (&conn->ibc_rxs[i], 0);
+ kibnal_post_rx (&conn->ibc_rxs[i], 0);
}
- koibnal_check_sends (conn);
+ kibnal_check_sends (conn);
return;
}
/* connection failed */
- if (state == OPENIBNAL_CONN_CONNECTING) {
+ if (state == IBNAL_CONN_CONNECTING) {
/* schedule for connd to close */
- koibnal_close_conn_locked (conn, status);
+ kibnal_close_conn_locked (conn, status);
} else {
/* Don't have a CM comm_id; just wait for refs to drain */
- conn->ibc_state = OPENIBNAL_CONN_ZOMBIE;
+ conn->ibc_state = IBNAL_CONN_ZOMBIE;
}
- write_unlock_irqrestore (&koibnal_data.koib_global_lock, flags);
+ write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
- koibnal_peer_connect_failed (conn->ibc_peer, active, status);
+ kibnal_peer_connect_failed (conn->ibc_peer, active, status);
- if (state != OPENIBNAL_CONN_CONNECTING) {
+ if (state != IBNAL_CONN_CONNECTING) {
/* drop caller's ref if we're not waiting for the
* IB_CM_IDLE callback */
- koibnal_put_conn (conn);
+ kibnal_put_conn (conn);
}
}
int
-koibnal_accept (koib_conn_t **connp, tTS_IB_CM_COMM_ID cid,
+kibnal_accept (kib_conn_t **connp, tTS_IB_CM_COMM_ID cid,
ptl_nid_t nid, __u64 incarnation, int queue_depth)
{
- koib_conn_t *conn = koibnal_create_conn();
- koib_peer_t *peer;
- koib_peer_t *peer2;
+ kib_conn_t *conn = kibnal_create_conn();
+ kib_peer_t *peer;
+ kib_peer_t *peer2;
unsigned long flags;
if (conn == NULL)
return (-ENOMEM);
- if (queue_depth != OPENIBNAL_MSG_QUEUE_SIZE) {
+ if (queue_depth != IBNAL_MSG_QUEUE_SIZE) {
CERROR("Can't accept "LPX64": bad queue depth %d (%d expected)\n",
- nid, queue_depth, OPENIBNAL_MSG_QUEUE_SIZE);
+ nid, queue_depth, IBNAL_MSG_QUEUE_SIZE);
return (-EPROTO);
}
/* assume 'nid' is a new peer */
- peer = koibnal_create_peer (nid);
+ peer = kibnal_create_peer (nid);
if (peer == NULL) {
CDEBUG(D_NET, "--conn[%p] state %d -> "LPX64" (%d)\n",
conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
atomic_read (&conn->ibc_refcount));
atomic_dec (&conn->ibc_refcount);
- koibnal_destroy_conn(conn);
+ kibnal_destroy_conn(conn);
return (-ENOMEM);
}
- write_lock_irqsave (&koibnal_data.koib_global_lock, flags);
+ write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
- peer2 = koibnal_find_peer_locked(nid);
+ peer2 = kibnal_find_peer_locked(nid);
if (peer2 == NULL) {
/* peer table takes my ref on peer */
list_add_tail (&peer->ibp_list,
- koibnal_nid2peerlist(nid));
+ kibnal_nid2peerlist(nid));
} else {
- koibnal_put_peer (peer);
+ kibnal_put_peer (peer);
peer = peer2;
}
atomic_inc (&peer->ibp_refcount);
peer->ibp_connecting++;
- write_unlock_irqrestore (&koibnal_data.koib_global_lock, flags);
+ write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
conn->ibc_peer = peer;
- conn->ibc_state = OPENIBNAL_CONN_CONNECTING;
+ conn->ibc_state = IBNAL_CONN_CONNECTING;
conn->ibc_comm_id = cid;
conn->ibc_incarnation = incarnation;
- conn->ibc_credits = OPENIBNAL_MSG_QUEUE_SIZE;
+ conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE;
*connp = conn;
return (0);
}
tTS_IB_CM_CALLBACK_RETURN
-koibnal_idle_conn_callback (tTS_IB_CM_EVENT event,
+kibnal_idle_conn_callback (tTS_IB_CM_EVENT event,
tTS_IB_CM_COMM_ID cid,
void *param,
void *arg)
}
tTS_IB_CM_CALLBACK_RETURN
-koibnal_conn_callback (tTS_IB_CM_EVENT event,
+kibnal_conn_callback (tTS_IB_CM_EVENT event,
tTS_IB_CM_COMM_ID cid,
void *param,
void *arg)
{
- koib_conn_t *conn = arg;
- int rc;
+ kib_conn_t *conn = arg;
+ LIST_HEAD (zombies);
+ struct list_head *tmp;
+ struct list_head *nxt;
+ kib_tx_t *tx;
+ unsigned long flags;
+ int done;
+ int rc;
/* Established Connection Notifier */
default:
CERROR("Connection %p -> "LPX64" ERROR %d\n",
conn, conn->ibc_peer->ibp_nid, event);
- koibnal_close_conn (conn, -ECONNABORTED);
+ kibnal_close_conn (conn, -ECONNABORTED);
break;
case TS_IB_CM_DISCONNECTED:
CDEBUG(D_WARNING, "Connection %p -> "LPX64" DISCONNECTED.\n",
conn, conn->ibc_peer->ibp_nid);
- koibnal_close_conn (conn, 0);
+ kibnal_close_conn (conn, 0);
break;
case TS_IB_CM_IDLE:
CDEBUG(D_NET, "Connection %p -> "LPX64" IDLE.\n",
conn, conn->ibc_peer->ibp_nid);
- koibnal_put_conn (conn); /* Lose CM's ref */
+ kibnal_put_conn (conn); /* Lose CM's ref */
/* LASSERT (no further callbacks) */
rc = tsIbCmCallbackModify(cid,
- koibnal_idle_conn_callback, conn);
+ kibnal_idle_conn_callback, conn);
LASSERT (rc == 0);
+
+ /* NB we wait until the connection has closed before
+ * completing outstanding passive RDMAs so we can be sure
+ * the network can't touch the mapped memory any more. */
+
+ spin_lock_irqsave (&conn->ibc_lock, flags);
+
+ /* grab passive RDMAs not waiting for the tx callback */
+ list_for_each_safe (tmp, nxt, &conn->ibc_active_txs) {
+ tx = list_entry (tmp, kib_tx_t, tx_list);
+
+ LASSERT (tx->tx_passive_rdma ||
+ !tx->tx_passive_rdma_wait);
+
+ LASSERT (tx->tx_passive_rdma_wait ||
+ tx->tx_sending != 0);
+
+ /* still waiting for tx callback? */
+ if (!tx->tx_passive_rdma_wait)
+ continue;
+
+ tx->tx_status = -ECONNABORTED;
+ tx->tx_passive_rdma_wait = 0;
+ done = (tx->tx_sending == 0);
+
+ if (!done)
+ continue;
+
+ list_del (&tx->tx_list);
+ list_add (&tx->tx_list, &zombies);
+ }
+
+ /* grab all blocked transmits */
+ list_for_each_safe (tmp, nxt, &conn->ibc_tx_queue) {
+ tx = list_entry (tmp, kib_tx_t, tx_list);
+
+ list_del (&tx->tx_list);
+ list_add (&tx->tx_list, &zombies);
+ }
+
+ spin_unlock_irqrestore (&conn->ibc_lock, flags);
+
+ while (!list_empty(&zombies)) {
+ tx = list_entry (zombies.next, kib_tx_t, tx_list);
+
+ list_del(&tx->tx_list);
+ kibnal_tx_done (tx);
+ }
break;
}
}
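/*
 * Illustrative restatement (no new code paths): the TS_IB_CM_IDLE case
 * above uses a two-phase teardown -- collect victim tx descriptors onto
 * a private list under ibc_lock, then complete them only after dropping
 * the lock, since kibnal_tx_done() must not run under a spinlock.
 */
#if 0   /* example only */
        LIST_HEAD (zombies);

        spin_lock_irqsave (&conn->ibc_lock, flags);
        /* ...move victims from ibc_active_txs/ibc_tx_queue to zombies... */
        spin_unlock_irqrestore (&conn->ibc_lock, flags);

        while (!list_empty (&zombies)) {
                tx = list_entry (zombies.next, kib_tx_t, tx_list);
                list_del (&tx->tx_list);
                kibnal_tx_done (tx);            /* no spinlocks held here */
        }
#endif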
tTS_IB_CM_CALLBACK_RETURN
-koibnal_passive_conn_callback (tTS_IB_CM_EVENT event,
+kibnal_passive_conn_callback (tTS_IB_CM_EVENT event,
tTS_IB_CM_COMM_ID cid,
void *param,
void *arg)
{
- koib_conn_t *conn = arg;
+ kib_conn_t *conn = arg;
int rc;
switch (event) {
CERROR ("Unexpected event %p -> "LPX64": %d\n",
conn, conn->ibc_peer->ibp_nid, event);
- koibnal_connreq_done (conn, 0, -ECONNABORTED);
+ kibnal_connreq_done (conn, 0, -ECONNABORTED);
break;
case TS_IB_CM_REQ_RECEIVED: {
struct ib_cm_req_received_param *req = param;
- koib_wire_connreq_t *wcr = req->remote_private_data;
+ kib_wire_connreq_t *wcr = req->remote_private_data;
LASSERT (conn == NULL);
return TS_IB_CM_CALLBACK_ABORT;
}
- if (wcr->wcr_magic != cpu_to_le32(OPENIBNAL_MSG_MAGIC)) {
+ if (wcr->wcr_magic != cpu_to_le32(IBNAL_MSG_MAGIC)) {
CERROR ("Can't accept LID %04x: bad magic %08x\n",
req->dlid, le32_to_cpu(wcr->wcr_magic));
return TS_IB_CM_CALLBACK_ABORT;
}
- if (wcr->wcr_version != cpu_to_le16(OPENIBNAL_MSG_VERSION)) {
+ if (wcr->wcr_version != cpu_to_le16(IBNAL_MSG_VERSION)) {
CERROR ("Can't accept LID %04x: bad version %d\n",
req->dlid, le16_to_cpu(wcr->wcr_version));
return TS_IB_CM_CALLBACK_ABORT;
}
- rc = koibnal_accept(&conn,
- cid,
- le64_to_cpu(wcr->wcr_nid),
- le64_to_cpu(wcr->wcr_incarnation),
- le16_to_cpu(wcr->wcr_queue_depth));
+ rc = kibnal_accept(&conn,
+ cid,
+ le64_to_cpu(wcr->wcr_nid),
+ le64_to_cpu(wcr->wcr_incarnation),
+ le16_to_cpu(wcr->wcr_queue_depth));
if (rc != 0) {
CERROR ("Can't accept "LPX64": %d\n",
le64_to_cpu(wcr->wcr_nid), rc);
/* update 'arg' for next callback */
rc = tsIbCmCallbackModify(cid,
- koibnal_passive_conn_callback, conn);
+ kibnal_passive_conn_callback, conn);
LASSERT (rc == 0);
req->accept_param.qp = conn->ibc_qp;
- *((koib_wire_connreq_t *)req->accept_param.reply_private_data)
- = (koib_wire_connreq_t) {
- .wcr_magic = cpu_to_le32(OPENIBNAL_MSG_MAGIC),
- .wcr_version = cpu_to_le16(OPENIBNAL_MSG_VERSION),
- .wcr_queue_depth = cpu_to_le32(OPENIBNAL_MSG_QUEUE_SIZE),
- .wcr_nid = cpu_to_le64(koibnal_data.koib_nid),
- .wcr_incarnation = cpu_to_le64(koibnal_data.koib_incarnation),
+ *((kib_wire_connreq_t *)req->accept_param.reply_private_data)
+ = (kib_wire_connreq_t) {
+ .wcr_magic = cpu_to_le32(IBNAL_MSG_MAGIC),
+ .wcr_version = cpu_to_le16(IBNAL_MSG_VERSION),
+ .wcr_queue_depth = cpu_to_le32(IBNAL_MSG_QUEUE_SIZE),
+ .wcr_nid = cpu_to_le64(kibnal_data.kib_nid),
+ .wcr_incarnation = cpu_to_le64(kibnal_data.kib_incarnation),
};
- req->accept_param.reply_private_data_len = sizeof(koib_wire_connreq_t);
- req->accept_param.responder_resources = OPENIBNAL_RESPONDER_RESOURCES;
- req->accept_param.initiator_depth = OPENIBNAL_RESPONDER_RESOURCES;
- req->accept_param.rnr_retry_count = OPENIBNAL_RNR_RETRY;
- req->accept_param.flow_control = OPENIBNAL_FLOW_CONTROL;
+ req->accept_param.reply_private_data_len = sizeof(kib_wire_connreq_t);
+ req->accept_param.responder_resources = IBNAL_RESPONDER_RESOURCES;
+ req->accept_param.initiator_depth = IBNAL_RESPONDER_RESOURCES;
+ req->accept_param.rnr_retry_count = IBNAL_RNR_RETRY;
+ req->accept_param.flow_control = IBNAL_FLOW_CONTROL;
CDEBUG(D_NET, "Proceeding\n");
break;
CDEBUG(D_WARNING, "Connection %p -> "LPX64" ESTABLISHED.\n",
conn, conn->ibc_peer->ibp_nid);
- koibnal_connreq_done (conn, 0, 0);
+ kibnal_connreq_done (conn, 0, 0);
break;
}
- /* NB if the connreq is done, we switch to koibnal_conn_callback */
+ /* NB if the connreq is done, we switch to kibnal_conn_callback */
return TS_IB_CM_CALLBACK_PROCEED;
}
tTS_IB_CM_CALLBACK_RETURN
-koibnal_active_conn_callback (tTS_IB_CM_EVENT event,
+kibnal_active_conn_callback (tTS_IB_CM_EVENT event,
tTS_IB_CM_COMM_ID cid,
void *param,
void *arg)
{
- koib_conn_t *conn = arg;
+ kib_conn_t *conn = arg;
switch (event) {
case TS_IB_CM_REP_RECEIVED: {
struct ib_cm_rep_received_param *rep = param;
- koib_wire_connreq_t *wcr = rep->remote_private_data;
+ kib_wire_connreq_t *wcr = rep->remote_private_data;
if (rep->remote_private_data_len < sizeof (*wcr)) {
CERROR ("Short reply from "LPX64": %d\n",
conn->ibc_peer->ibp_nid,
rep->remote_private_data_len);
- koibnal_connreq_done (conn, 1, -EPROTO);
+ kibnal_connreq_done (conn, 1, -EPROTO);
break;
}
- if (wcr->wcr_magic != cpu_to_le32(OPENIBNAL_MSG_MAGIC)) {
+ if (wcr->wcr_magic != cpu_to_le32(IBNAL_MSG_MAGIC)) {
CERROR ("Can't connect "LPX64": bad magic %08x\n",
conn->ibc_peer->ibp_nid, le32_to_cpu(wcr->wcr_magic));
- koibnal_connreq_done (conn, 1, -EPROTO);
+ kibnal_connreq_done (conn, 1, -EPROTO);
break;
}
- if (wcr->wcr_version != cpu_to_le16(OPENIBNAL_MSG_VERSION)) {
+ if (wcr->wcr_version != cpu_to_le16(IBNAL_MSG_VERSION)) {
CERROR ("Can't connect "LPX64": bad version %d\n",
conn->ibc_peer->ibp_nid, le16_to_cpu(wcr->wcr_version));
- koibnal_connreq_done (conn, 1, -EPROTO);
+ kibnal_connreq_done (conn, 1, -EPROTO);
break;
}
- if (wcr->wcr_queue_depth != cpu_to_le16(OPENIBNAL_MSG_QUEUE_SIZE)) {
+ if (wcr->wcr_queue_depth != cpu_to_le16(IBNAL_MSG_QUEUE_SIZE)) {
CERROR ("Can't connect "LPX64": bad queue depth %d\n",
conn->ibc_peer->ibp_nid, le16_to_cpu(wcr->wcr_queue_depth));
- koibnal_connreq_done (conn, 1, -EPROTO);
+ kibnal_connreq_done (conn, 1, -EPROTO);
break;
}
if (le64_to_cpu(wcr->wcr_nid) != conn->ibc_peer->ibp_nid) {
CERROR ("Unexpected NID "LPX64" from "LPX64"\n",
le64_to_cpu(wcr->wcr_nid), conn->ibc_peer->ibp_nid);
- koibnal_connreq_done (conn, 1, -EPROTO);
+ kibnal_connreq_done (conn, 1, -EPROTO);
break;
}
conn, conn->ibc_peer->ibp_nid);
conn->ibc_incarnation = le64_to_cpu(wcr->wcr_incarnation);
- conn->ibc_credits = OPENIBNAL_MSG_QUEUE_SIZE;
+ conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE;
break;
}
CDEBUG(D_WARNING, "Connection %p -> "LPX64" Established\n",
conn, conn->ibc_peer->ibp_nid);
- koibnal_connreq_done (conn, 1, 0);
+ kibnal_connreq_done (conn, 1, 0);
break;
case TS_IB_CM_IDLE:
CERROR("Connection %p -> "LPX64" IDLE\n",
conn, conn->ibc_peer->ibp_nid);
/* Back out state change: I'm disengaged from CM */
- conn->ibc_state = OPENIBNAL_CONN_INIT_QP;
+ conn->ibc_state = IBNAL_CONN_INIT_QP;
- koibnal_connreq_done (conn, 1, -ECONNABORTED);
+ kibnal_connreq_done (conn, 1, -ECONNABORTED);
break;
default:
CERROR("Connection %p -> "LPX64" ERROR %d\n",
conn, conn->ibc_peer->ibp_nid, event);
- koibnal_connreq_done (conn, 1, -ECONNABORTED);
+ kibnal_connreq_done (conn, 1, -ECONNABORTED);
break;
}
- /* NB if the connreq is done, we switch to koibnal_conn_callback */
+ /* NB if the connreq is done, we switch to kibnal_conn_callback */
return TS_IB_CM_CALLBACK_PROCEED;
}
int
-koibnal_pathreq_callback (tTS_IB_CLIENT_QUERY_TID tid, int status,
+kibnal_pathreq_callback (tTS_IB_CLIENT_QUERY_TID tid, int status,
struct ib_path_record *resp, int remaining,
void *arg)
{
- koib_conn_t *conn = arg;
+ kib_conn_t *conn = arg;
if (status != 0) {
CERROR ("status %d\n", status);
- koibnal_connreq_done (conn, 1, status);
+ kibnal_connreq_done (conn, 1, status);
goto out;
}
conn->ibc_connreq->cr_path = *resp;
- conn->ibc_connreq->cr_wcr = (koib_wire_connreq_t) {
- .wcr_magic = cpu_to_le32(OPENIBNAL_MSG_MAGIC),
- .wcr_version = cpu_to_le16(OPENIBNAL_MSG_VERSION),
- .wcr_queue_depth = cpu_to_le16(OPENIBNAL_MSG_QUEUE_SIZE),
- .wcr_nid = cpu_to_le64(koibnal_data.koib_nid),
- .wcr_incarnation = cpu_to_le64(koibnal_data.koib_incarnation),
+ conn->ibc_connreq->cr_wcr = (kib_wire_connreq_t) {
+ .wcr_magic = cpu_to_le32(IBNAL_MSG_MAGIC),
+ .wcr_version = cpu_to_le16(IBNAL_MSG_VERSION),
+ .wcr_queue_depth = cpu_to_le16(IBNAL_MSG_QUEUE_SIZE),
+ .wcr_nid = cpu_to_le64(kibnal_data.kib_nid),
+ .wcr_incarnation = cpu_to_le64(kibnal_data.kib_incarnation),
};
conn->ibc_connreq->cr_connparam = (struct ib_cm_active_param) {
.qp = conn->ibc_qp,
.req_private_data = &conn->ibc_connreq->cr_wcr,
.req_private_data_len = sizeof(conn->ibc_connreq->cr_wcr),
- .responder_resources = OPENIBNAL_RESPONDER_RESOURCES,
- .initiator_depth = OPENIBNAL_RESPONDER_RESOURCES,
- .retry_count = OPENIBNAL_RETRY,
- .rnr_retry_count = OPENIBNAL_RNR_RETRY,
- .cm_response_timeout = koibnal_tunables.koib_io_timeout,
- .max_cm_retries = OPENIBNAL_CM_RETRY,
- .flow_control = OPENIBNAL_FLOW_CONTROL,
+ .responder_resources = IBNAL_RESPONDER_RESOURCES,
+ .initiator_depth = IBNAL_RESPONDER_RESOURCES,
+ .retry_count = IBNAL_RETRY,
+ .rnr_retry_count = IBNAL_RNR_RETRY,
+ .cm_response_timeout = kibnal_tunables.kib_io_timeout,
+ .max_cm_retries = IBNAL_CM_RETRY,
+ .flow_control = IBNAL_FLOW_CONTROL,
};
/* XXX set timeout just like SDP!!! */
conn->ibc_connreq->cr_path.packet_life = 13;
/* Flag I'm getting involved with the CM... */
- conn->ibc_state = OPENIBNAL_CONN_CONNECTING;
+ conn->ibc_state = IBNAL_CONN_CONNECTING;
CDEBUG(D_NET, "Connecting to, service id "LPX64", on "LPX64"\n",
conn->ibc_connreq->cr_service.service_id,
- *koibnal_service_nid_field(&conn->ibc_connreq->cr_service));
+ *kibnal_service_nid_field(&conn->ibc_connreq->cr_service));
- /* koibnal_connect_callback gets my conn ref */
+ /* kibnal_connect_callback gets my conn ref */
status = ib_cm_connect (&conn->ibc_connreq->cr_connparam,
&conn->ibc_connreq->cr_path, NULL,
conn->ibc_connreq->cr_service.service_id, 0,
- koibnal_active_conn_callback, conn,
+ kibnal_active_conn_callback, conn,
&conn->ibc_comm_id);
if (status != 0) {
CERROR ("Connect: %d\n", status);
/* Back out state change: I've not got a CM comm_id yet... */
- conn->ibc_state = OPENIBNAL_CONN_INIT_QP;
- koibnal_connreq_done (conn, 1, status);
+ conn->ibc_state = IBNAL_CONN_INIT_QP;
+ kibnal_connreq_done (conn, 1, status);
}
out:
}
void
-koibnal_service_get_callback (tTS_IB_CLIENT_QUERY_TID tid, int status,
- struct ib_common_attrib_service *resp, void *arg)
+kibnal_service_get_callback (tTS_IB_CLIENT_QUERY_TID tid, int status,
+ struct ib_common_attrib_service *resp, void *arg)
{
- koib_conn_t *conn = arg;
+ kib_conn_t *conn = arg;
if (status != 0) {
CERROR ("status %d\n", status);
- koibnal_connreq_done (conn, 1, status);
+ kibnal_connreq_done (conn, 1, status);
return;
}
CDEBUG(D_NET, "Got status %d, service id "LPX64", on "LPX64"\n",
status, resp->service_id,
- *koibnal_service_nid_field(resp));
+ *kibnal_service_nid_field(resp));
conn->ibc_connreq->cr_service = *resp;
- status = ib_cached_gid_get(koibnal_data.koib_device,
- koibnal_data.koib_port, 0,
+ status = ib_cached_gid_get(kibnal_data.kib_device,
+ kibnal_data.kib_port, 0,
conn->ibc_connreq->cr_gid);
LASSERT (status == 0);
- /* koibnal_pathreq_callback gets my conn ref */
- status = tsIbPathRecordRequest (koibnal_data.koib_device,
- koibnal_data.koib_port,
+ /* kibnal_pathreq_callback gets my conn ref */
+ status = tsIbPathRecordRequest (kibnal_data.kib_device,
+ kibnal_data.kib_port,
conn->ibc_connreq->cr_gid,
conn->ibc_connreq->cr_service.service_gid,
conn->ibc_connreq->cr_service.service_pkey,
0,
- koibnal_tunables.koib_io_timeout * HZ,
+ kibnal_tunables.kib_io_timeout * HZ,
0,
- koibnal_pathreq_callback, conn,
+ kibnal_pathreq_callback, conn,
&conn->ibc_connreq->cr_tid);
if (status == 0)
return;
CERROR ("Path record request: %d\n", status);
- koibnal_connreq_done (conn, 1, status);
+ kibnal_connreq_done (conn, 1, status);
}
void
-koibnal_connect_peer (koib_peer_t *peer)
+kibnal_connect_peer (kib_peer_t *peer)
{
- koib_conn_t *conn = koibnal_create_conn();
+ kib_conn_t *conn = kibnal_create_conn();
int rc;
LASSERT (peer->ibp_connecting != 0);
if (conn == NULL) {
CERROR ("Can't allocate conn\n");
- koibnal_peer_connect_failed (peer, 1, -ENOMEM);
+ kibnal_peer_connect_failed (peer, 1, -ENOMEM);
return;
}
PORTAL_ALLOC (conn->ibc_connreq, sizeof (*conn->ibc_connreq));
if (conn->ibc_connreq == NULL) {
CERROR ("Can't allocate connreq\n");
- koibnal_connreq_done (conn, 1, -ENOMEM);
+ kibnal_connreq_done (conn, 1, -ENOMEM);
return;
}
memset(conn->ibc_connreq, 0, sizeof (*conn->ibc_connreq));
- koibnal_set_service_keys(&conn->ibc_connreq->cr_service, peer->ibp_nid);
+ kibnal_set_service_keys(&conn->ibc_connreq->cr_service, peer->ibp_nid);
- /* koibnal_service_get_callback gets my conn ref */
- rc = ib_service_get (koibnal_data.koib_device,
- koibnal_data.koib_port,
+ /* kibnal_service_get_callback gets my conn ref */
+ rc = ib_service_get (kibnal_data.kib_device,
+ kibnal_data.kib_port,
&conn->ibc_connreq->cr_service,
- KOIBNAL_SERVICE_KEY_MASK,
- koibnal_tunables.koib_io_timeout * HZ,
- koibnal_service_get_callback, conn,
+ KIBNAL_SERVICE_KEY_MASK,
+ kibnal_tunables.kib_io_timeout * HZ,
+ kibnal_service_get_callback, conn,
&conn->ibc_connreq->cr_tid);
if (rc == 0)
return;
CERROR ("ib_service_get: %d\n", rc);
- koibnal_connreq_done (conn, 1, rc);
+ kibnal_connreq_done (conn, 1, rc);
}
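/*
 * Overview (informational, derived from the code above): active
 * connection establishment is a chain of asynchronous stages, each
 * handing the conn ref on to the next callback:
 *
 *   kibnal_connect_peer()
 *     -> ib_service_get()        ...kibnal_service_get_callback()
 *     -> tsIbPathRecordRequest() ...kibnal_pathreq_callback()
 *     -> ib_cm_connect()         ...kibnal_active_conn_callback()
 *     -> kibnal_connreq_done()   on success or at the first failure
 *
 * Whichever stage fails calls kibnal_connreq_done(conn, 1, status) itself.
 */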
int
-koibnal_conn_timed_out (koib_conn_t *conn)
+kibnal_conn_timed_out (kib_conn_t *conn)
{
- koib_tx_t *tx;
+ kib_tx_t *tx;
struct list_head *ttmp;
unsigned long flags;
- int rc = 0;
spin_lock_irqsave (&conn->ibc_lock, flags);
- list_for_each (ttmp, &conn->ibc_rdma_queue) {
- tx = list_entry (ttmp, koib_tx_t, tx_list);
+ list_for_each (ttmp, &conn->ibc_tx_queue) {
+ tx = list_entry (ttmp, kib_tx_t, tx_list);
- LASSERT (tx->tx_passive_rdma);
- LASSERT (tx->tx_passive_rdma_wait);
+ LASSERT (!tx->tx_passive_rdma_wait);
+ LASSERT (tx->tx_sending == 0);
- if (time_after_eq (jiffies, tx->tx_passive_rdma_deadline)) {
- rc = 1;
- break;
+ if (time_after_eq (jiffies, tx->tx_deadline)) {
+ spin_unlock_irqrestore (&conn->ibc_lock, flags);
+ return 1;
}
}
+
+ list_for_each (ttmp, &conn->ibc_active_txs) {
+ tx = list_entry (ttmp, kib_tx_t, tx_list);
+
+ LASSERT (tx->tx_passive_rdma ||
+ !tx->tx_passive_rdma_wait);
+
+ LASSERT (tx->tx_passive_rdma_wait ||
+ tx->tx_sending != 0);
+
+ if (time_after_eq (jiffies, tx->tx_deadline)) {
+ spin_unlock_irqrestore (&conn->ibc_lock, flags);
+ return 1;
+ }
+ }
+
spin_unlock_irqrestore (&conn->ibc_lock, flags);
- return rc;
+ return 0;
}
void
-koibnal_check_conns (int idx)
+kibnal_check_conns (int idx)
{
- struct list_head *peers = &koibnal_data.koib_peers[idx];
+ struct list_head *peers = &kibnal_data.kib_peers[idx];
struct list_head *ptmp;
- koib_peer_t *peer;
- koib_conn_t *conn;
+ kib_peer_t *peer;
+ kib_conn_t *conn;
struct list_head *ctmp;
again:
/* NB. We expect to have a look at all the peers and not find any
* rdmas to time out, so we just use a shared lock while we
* take a look... */
- read_lock (&koibnal_data.koib_global_lock);
+ read_lock (&kibnal_data.kib_global_lock);
list_for_each (ptmp, peers) {
- peer = list_entry (ptmp, koib_peer_t, ibp_list);
+ peer = list_entry (ptmp, kib_peer_t, ibp_list);
list_for_each (ctmp, &peer->ibp_conns) {
- conn = list_entry (ctmp, koib_conn_t, ibc_list);
+ conn = list_entry (ctmp, kib_conn_t, ibc_list);
+
+ LASSERT (conn->ibc_state == IBNAL_CONN_ESTABLISHED);
- LASSERT (conn->ibc_state == OPENIBNAL_CONN_ESTABLISHED);
/* In case we have enough credits to return via a
* NOOP, but there were no non-blocking tx descs
* free to do it last time... */
- koibnal_check_sends(conn);
+ kibnal_check_sends(conn);
- if (!koibnal_conn_timed_out(conn))
+ if (!kibnal_conn_timed_out(conn))
continue;
CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
atomic_read (&conn->ibc_refcount));
atomic_inc (&conn->ibc_refcount);
- read_unlock (&koibnal_data.koib_global_lock);
+ read_unlock (&kibnal_data.kib_global_lock);
CERROR("Timed out RDMA with "LPX64"\n",
peer->ibp_nid);
- koibnal_close_conn (conn, -ETIMEDOUT);
- koibnal_put_conn (conn);
+ kibnal_close_conn (conn, -ETIMEDOUT);
+ kibnal_put_conn (conn);
/* start again now I've dropped the lock */
goto again;
}
}
- read_unlock (&koibnal_data.koib_global_lock);
+ read_unlock (&kibnal_data.kib_global_lock);
}
void
-koibnal_terminate_conn (koib_conn_t *conn)
+kibnal_terminate_conn (kib_conn_t *conn)
{
- unsigned long flags;
int rc;
- int done;
CDEBUG(D_NET, "conn %p\n", conn);
- LASSERT (conn->ibc_state == OPENIBNAL_CONN_DEATHROW);
- conn->ibc_state = OPENIBNAL_CONN_ZOMBIE;
+ LASSERT (conn->ibc_state == IBNAL_CONN_DEATHROW);
+ conn->ibc_state = IBNAL_CONN_ZOMBIE;
rc = ib_cm_disconnect (conn->ibc_comm_id);
if (rc != 0)
CERROR ("Error %d disconnecting conn %p -> "LPX64"\n",
rc, conn, conn->ibc_peer->ibp_nid);
-
- /* complete blocked passive RDMAs */
- spin_lock_irqsave (&conn->ibc_lock, flags);
-
- while (!list_empty (&conn->ibc_rdma_queue)) {
- koib_tx_t *tx = list_entry (conn->ibc_rdma_queue.next,
- koib_tx_t, tx_list);
-
- LASSERT (tx->tx_passive_rdma);
- LASSERT (tx->tx_passive_rdma_wait);
-
- list_del (&tx->tx_list);
-
- tx->tx_passive_rdma_wait = 0;
- done = (tx->tx_sending == 0);
-
- tx->tx_status = -ECONNABORTED;
-
- spin_unlock_irqrestore (&conn->ibc_lock, flags);
-
- if (done)
- koibnal_tx_done (tx);
-
- spin_lock_irqsave (&conn->ibc_lock, flags);
- }
-
- spin_unlock_irqrestore (&conn->ibc_lock, flags);
-
- /* Complete all blocked transmits */
- koibnal_check_sends(conn);
}
int
-koibnal_connd (void *arg)
+kibnal_connd (void *arg)
{
wait_queue_t wait;
unsigned long flags;
- koib_conn_t *conn;
- koib_peer_t *peer;
+ kib_conn_t *conn;
+ kib_peer_t *peer;
int timeout;
int i;
int peer_index = 0;
unsigned long deadline = jiffies;
- kportal_daemonize ("koibnal_connd");
+ kportal_daemonize ("kibnal_connd");
kportal_blockallsigs ();
init_waitqueue_entry (&wait, current);
- spin_lock_irqsave (&koibnal_data.koib_connd_lock, flags);
+ spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
for (;;) {
- if (!list_empty (&koibnal_data.koib_connd_conns)) {
- conn = list_entry (koibnal_data.koib_connd_conns.next,
- koib_conn_t, ibc_list);
+ if (!list_empty (&kibnal_data.kib_connd_conns)) {
+ conn = list_entry (kibnal_data.kib_connd_conns.next,
+ kib_conn_t, ibc_list);
list_del (&conn->ibc_list);
- spin_unlock_irqrestore (&koibnal_data.koib_connd_lock, flags);
+ spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
switch (conn->ibc_state) {
- case OPENIBNAL_CONN_DEATHROW:
+ case IBNAL_CONN_DEATHROW:
LASSERT (conn->ibc_comm_id != TS_IB_CM_COMM_ID_INVALID);
/* Disconnect: conn becomes a zombie in the
* callback and last ref reschedules it
* here... */
- koibnal_terminate_conn(conn);
- koibnal_put_conn (conn);
+ kibnal_terminate_conn(conn);
+ kibnal_put_conn (conn);
break;
- case OPENIBNAL_CONN_ZOMBIE:
- koibnal_destroy_conn (conn);
+ case IBNAL_CONN_ZOMBIE:
+ kibnal_destroy_conn (conn);
break;
default:
LBUG();
}
- spin_lock_irqsave (&koibnal_data.koib_connd_lock, flags);
+ spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
continue;
}
- if (!list_empty (&koibnal_data.koib_connd_peers)) {
- peer = list_entry (koibnal_data.koib_connd_peers.next,
- koib_peer_t, ibp_connd_list);
+ if (!list_empty (&kibnal_data.kib_connd_peers)) {
+ peer = list_entry (kibnal_data.kib_connd_peers.next,
+ kib_peer_t, ibp_connd_list);
list_del_init (&peer->ibp_connd_list);
- spin_unlock_irqrestore (&koibnal_data.koib_connd_lock, flags);
+ spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
- koibnal_connect_peer (peer);
- koibnal_put_peer (peer);
+ kibnal_connect_peer (peer);
+ kibnal_put_peer (peer);
- spin_lock_irqsave (&koibnal_data.koib_connd_lock, flags);
+ spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
}
/* shut down and nobody left to reap... */
- if (koibnal_data.koib_shutdown &&
- atomic_read(&koibnal_data.koib_nconns) == 0)
+ if (kibnal_data.kib_shutdown &&
+ atomic_read(&kibnal_data.kib_nconns) == 0)
break;
- spin_unlock_irqrestore (&koibnal_data.koib_connd_lock, flags);
+ spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
/* careful with the jiffy wrap... */
while ((timeout = (int)(deadline - jiffies)) <= 0) {
const int n = 4;
const int p = 1;
- int chunk = koibnal_data.koib_peer_hash_size;
+ int chunk = kibnal_data.kib_peer_hash_size;
/* Time to check for RDMA timeouts on a few more
* peers: I do checks every 'p' seconds on a
* connection within (n+1)/n times the timeout
* interval. */
- if (koibnal_tunables.koib_io_timeout > n * p)
+ if (kibnal_tunables.kib_io_timeout > n * p)
chunk = (chunk * n * p) /
- koibnal_tunables.koib_io_timeout;
+ kibnal_tunables.kib_io_timeout;
if (chunk == 0)
chunk = 1;
for (i = 0; i < chunk; i++) {
- koibnal_check_conns (peer_index);
+ kibnal_check_conns (peer_index);
peer_index = (peer_index + 1) %
- koibnal_data.koib_peer_hash_size;
+ kibnal_data.kib_peer_hash_size;
}
deadline += p * HZ;
}
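/*
 * Worked example (illustrative numbers only): with kib_io_timeout = 60s,
 * n = 4, p = 1 and a 101-bucket peer hash, chunk = 101 * 4 * 1 / 60 = 6,
 * so each 1-second wakeup scans 6 buckets and a full sweep of the table
 * takes ~17s (about timeout/n once integer rounding is allowed for);
 * every connection is therefore examined within (n+1)/n times the
 * timeout interval, as the comment above promises.
 */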
- koibnal_data.koib_connd_waketime = jiffies + timeout;
+ kibnal_data.kib_connd_waketime = jiffies + timeout;
set_current_state (TASK_INTERRUPTIBLE);
- add_wait_queue (&koibnal_data.koib_connd_waitq, &wait);
+ add_wait_queue (&kibnal_data.kib_connd_waitq, &wait);
- if (!koibnal_data.koib_shutdown &&
- list_empty (&koibnal_data.koib_connd_conns) &&
- list_empty (&koibnal_data.koib_connd_peers))
+ if (!kibnal_data.kib_shutdown &&
+ list_empty (&kibnal_data.kib_connd_conns) &&
+ list_empty (&kibnal_data.kib_connd_peers))
schedule_timeout (timeout);
set_current_state (TASK_RUNNING);
- remove_wait_queue (&koibnal_data.koib_connd_waitq, &wait);
+ remove_wait_queue (&kibnal_data.kib_connd_waitq, &wait);
- spin_lock_irqsave (&koibnal_data.koib_connd_lock, flags);
+ spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
}
- spin_unlock_irqrestore (&koibnal_data.koib_connd_lock, flags);
+ spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
- koibnal_thread_fini ();
+ kibnal_thread_fini ();
return (0);
}
int
-koibnal_scheduler(void *arg)
+kibnal_scheduler(void *arg)
{
long id = (long)arg;
char name[16];
- koib_rx_t *rx;
- koib_tx_t *tx;
+ kib_rx_t *rx;
+ kib_tx_t *tx;
unsigned long flags;
int rc;
int counter = 0;
int did_something;
- snprintf(name, sizeof(name), "koibnal_sd_%02ld", id);
+ snprintf(name, sizeof(name), "kibnal_sd_%02ld", id);
kportal_daemonize(name);
kportal_blockallsigs();
- spin_lock_irqsave(&koibnal_data.koib_sched_lock, flags);
+ spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
for (;;) {
did_something = 0;
- while (!list_empty(&koibnal_data.koib_sched_txq)) {
- tx = list_entry(koibnal_data.koib_sched_txq.next,
- koib_tx_t, tx_list);
+ while (!list_empty(&kibnal_data.kib_sched_txq)) {
+ tx = list_entry(kibnal_data.kib_sched_txq.next,
+ kib_tx_t, tx_list);
list_del(&tx->tx_list);
- spin_unlock_irqrestore(&koibnal_data.koib_sched_lock,
+ spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
flags);
- koibnal_tx_done(tx);
+ kibnal_tx_done(tx);
- spin_lock_irqsave(&koibnal_data.koib_sched_lock,
+ spin_lock_irqsave(&kibnal_data.kib_sched_lock,
flags);
}
- if (!list_empty(&koibnal_data.koib_sched_rxq)) {
- rx = list_entry(koibnal_data.koib_sched_rxq.next,
- koib_rx_t, rx_list);
+ if (!list_empty(&kibnal_data.kib_sched_rxq)) {
+ rx = list_entry(kibnal_data.kib_sched_rxq.next,
+ kib_rx_t, rx_list);
list_del(&rx->rx_list);
- spin_unlock_irqrestore(&koibnal_data.koib_sched_lock,
+ spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
flags);
- koibnal_rx(rx);
+ kibnal_rx(rx);
did_something = 1;
- spin_lock_irqsave(&koibnal_data.koib_sched_lock,
+ spin_lock_irqsave(&kibnal_data.kib_sched_lock,
flags);
}
/* shut down and no receives to complete... */
- if (koibnal_data.koib_shutdown &&
- atomic_read(&koibnal_data.koib_nconns) == 0)
+ if (kibnal_data.kib_shutdown &&
+ atomic_read(&kibnal_data.kib_nconns) == 0)
break;
/* nothing to do or hogging CPU */
- if (!did_something || counter++ == OPENIBNAL_RESCHED) {
- spin_unlock_irqrestore(&koibnal_data.koib_sched_lock,
+ if (!did_something || counter++ == IBNAL_RESCHED) {
+ spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
flags);
counter = 0;
if (!did_something) {
rc = wait_event_interruptible(
- koibnal_data.koib_sched_waitq,
- !list_empty(&koibnal_data.koib_sched_txq) ||
- !list_empty(&koibnal_data.koib_sched_rxq) ||
- (koibnal_data.koib_shutdown &&
- atomic_read (&koibnal_data.koib_nconns) == 0));
+ kibnal_data.kib_sched_waitq,
+ !list_empty(&kibnal_data.kib_sched_txq) ||
+ !list_empty(&kibnal_data.kib_sched_rxq) ||
+ (kibnal_data.kib_shutdown &&
+ atomic_read (&kibnal_data.kib_nconns) == 0));
} else {
our_cond_resched();
}
- spin_lock_irqsave(&koibnal_data.koib_sched_lock,
+ spin_lock_irqsave(&kibnal_data.kib_sched_lock,
flags);
}
}
- spin_unlock_irqrestore(&koibnal_data.koib_sched_lock, flags);
+ spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags);
- koibnal_thread_fini();
+ kibnal_thread_fini();
return (0);
}
-lib_nal_t koibnal_lib = {
- libnal_data: &koibnal_data, /* NAL private data */
- libnal_send: koibnal_send,
- libnal_send_pages: koibnal_send_pages,
- libnal_recv: koibnal_recv,
- libnal_recv_pages: koibnal_recv_pages,
- libnal_dist: koibnal_dist
+lib_nal_t kibnal_lib = {
+ libnal_data: &kibnal_data, /* NAL private data */
+ libnal_send: kibnal_send,
+ libnal_send_pages: kibnal_send_pages,
+ libnal_recv: kibnal_recv,
+ libnal_recv_pages: kibnal_recv_pages,
+ libnal_dist: kibnal_dist
};
#define QSWNAL_SYSCTL 201
#define QSWNAL_SYSCTL_OPTIMIZED_GETS 1
-#define QSWNAL_SYSCTL_COPY_SMALL_FWD 2
+#define QSWNAL_SYSCTL_OPTIMIZED_PUTS 2
static ctl_table kqswnal_ctl_table[] = {
- {QSWNAL_SYSCTL_OPTIMIZED_GETS, "optimized_puts",
+ {QSWNAL_SYSCTL_OPTIMIZED_PUTS, "optimized_puts",
&kqswnal_tunables.kqn_optimized_puts, sizeof (int),
0644, NULL, &proc_dointvec},
{QSWNAL_SYSCTL_OPTIMIZED_GETS, "optimized_gets",
kqswnal_shutdown(nal_t *nal)
{
unsigned long flags;
+ kqswnal_tx_t *ktx;
+ kqswnal_rx_t *krx;
int do_lib_fini = 0;
/* NB The first ref was this module! */
* ep_dvma_release() get fixed (and releases any mappings in the
* region), we can delete all the code from here --------> */
- if (kqswnal_data.kqn_txds != NULL) {
- int i;
+ for (ktx = kqswnal_data.kqn_txds; ktx != NULL; ktx = ktx->ktx_alloclist) {
+ /* If ktx has a buffer, it got mapped; unmap now. NB only
+ * the pre-mapped stuff is still mapped since all tx descs
+ * must be idle */
- for (i = 0; i < KQSW_NTXMSGS + KQSW_NNBLK_TXMSGS; i++) {
- kqswnal_tx_t *ktx = &kqswnal_data.kqn_txds[i];
-
- /* If ktx has a buffer, it got mapped; unmap now.
- * NB only the pre-mapped stuff is still mapped
- * since all tx descs must be idle */
-
- if (ktx->ktx_buffer != NULL)
- ep_dvma_unload(kqswnal_data.kqn_ep,
- kqswnal_data.kqn_ep_tx_nmh,
- &ktx->ktx_ebuffer);
- }
+ if (ktx->ktx_buffer != NULL)
+ ep_dvma_unload(kqswnal_data.kqn_ep,
+ kqswnal_data.kqn_ep_tx_nmh,
+ &ktx->ktx_ebuffer);
}
- if (kqswnal_data.kqn_rxds != NULL) {
- int i;
-
- for (i = 0; i < KQSW_NRXMSGS_SMALL + KQSW_NRXMSGS_LARGE; i++) {
- kqswnal_rx_t *krx = &kqswnal_data.kqn_rxds[i];
-
- /* If krx_kiov[0].kiov_page got allocated, it got mapped.
- * NB subsequent pages get merged */
+ for (krx = kqswnal_data.kqn_rxds; krx != NULL; krx = krx->krx_alloclist) {
+ /* If krx_kiov[0].kiov_page got allocated, it got mapped.
+ * NB subsequent pages get merged */
- if (krx->krx_kiov[0].kiov_page != NULL)
- ep_dvma_unload(kqswnal_data.kqn_ep,
- kqswnal_data.kqn_ep_rx_nmh,
- &krx->krx_elanbuffer);
- }
+ if (krx->krx_kiov[0].kiov_page != NULL)
+ ep_dvma_unload(kqswnal_data.kqn_ep,
+ kqswnal_data.kqn_ep_rx_nmh,
+ &krx->krx_elanbuffer);
}
/* <----------- to here */
}
#endif
- if (kqswnal_data.kqn_txds != NULL)
- {
- int i;
+ while (kqswnal_data.kqn_txds != NULL) {
+ ktx = kqswnal_data.kqn_txds;
- for (i = 0; i < KQSW_NTXMSGS + KQSW_NNBLK_TXMSGS; i++)
- {
- kqswnal_tx_t *ktx = &kqswnal_data.kqn_txds[i];
-
- if (ktx->ktx_buffer != NULL)
- PORTAL_FREE(ktx->ktx_buffer,
- KQSW_TX_BUFFER_SIZE);
- }
+ if (ktx->ktx_buffer != NULL)
+ PORTAL_FREE(ktx->ktx_buffer, KQSW_TX_BUFFER_SIZE);
- PORTAL_FREE(kqswnal_data.kqn_txds,
- sizeof (kqswnal_tx_t) * (KQSW_NTXMSGS +
- KQSW_NNBLK_TXMSGS));
+ kqswnal_data.kqn_txds = ktx->ktx_alloclist;
+ PORTAL_FREE(ktx, sizeof(*ktx));
}
- if (kqswnal_data.kqn_rxds != NULL)
- {
- int i;
- int j;
+ while (kqswnal_data.kqn_rxds != NULL) {
+ int i;
- for (i = 0; i < KQSW_NRXMSGS_SMALL + KQSW_NRXMSGS_LARGE; i++)
- {
- kqswnal_rx_t *krx = &kqswnal_data.kqn_rxds[i];
+ krx = kqswnal_data.kqn_rxds;
+ for (i = 0; i < krx->krx_npages; i++)
+ if (krx->krx_kiov[i].kiov_page != NULL)
+ __free_page (krx->krx_kiov[i].kiov_page);
- for (j = 0; j < krx->krx_npages; j++)
- if (krx->krx_kiov[j].kiov_page != NULL)
- __free_page (krx->krx_kiov[j].kiov_page);
- }
-
- PORTAL_FREE(kqswnal_data.kqn_rxds,
- sizeof(kqswnal_rx_t) * (KQSW_NRXMSGS_SMALL +
- KQSW_NRXMSGS_LARGE));
+ kqswnal_data.kqn_rxds = krx->krx_alloclist;
+ PORTAL_FREE(krx, sizeof (*krx));
}
/* resets flags, pointers to NULL etc */
#endif
int rc;
int i;
+ kqswnal_rx_t *krx;
+ kqswnal_tx_t *ktx;
int elan_page_idx;
ptl_process_id_t my_process_id;
int pkmem = atomic_read(&portal_kmemory);
/**********************************************************************/
/* Allocate/Initialise transmit descriptors */
- PORTAL_ALLOC(kqswnal_data.kqn_txds,
- sizeof(kqswnal_tx_t) * (KQSW_NTXMSGS + KQSW_NNBLK_TXMSGS));
- if (kqswnal_data.kqn_txds == NULL)
- {
- kqswnal_shutdown (nal);
- return (PTL_NO_SPACE);
- }
-
- /* clear flags, null pointers etc */
- memset(kqswnal_data.kqn_txds, 0,
- sizeof(kqswnal_tx_t) * (KQSW_NTXMSGS + KQSW_NNBLK_TXMSGS));
+ kqswnal_data.kqn_txds = NULL;
for (i = 0; i < (KQSW_NTXMSGS + KQSW_NNBLK_TXMSGS); i++)
{
int premapped_pages;
- kqswnal_tx_t *ktx = &kqswnal_data.kqn_txds[i];
int basepage = i * KQSW_NTXMSGPAGES;
+ PORTAL_ALLOC (ktx, sizeof(*ktx));
+ if (ktx == NULL) {
+ kqswnal_shutdown (nal);
+ return (PTL_NO_SPACE);
+ }
+
+ memset(ktx, 0, sizeof(*ktx)); /* NULL pointers; zero flags */
+ ktx->ktx_alloclist = kqswnal_data.kqn_txds;
+ kqswnal_data.kqn_txds = ktx;
+
PORTAL_ALLOC (ktx->ktx_buffer, KQSW_TX_BUFFER_SIZE);
if (ktx->ktx_buffer == NULL)
{
/**********************************************************************/
/* Allocate/Initialise receive descriptors */
-
- PORTAL_ALLOC (kqswnal_data.kqn_rxds,
- sizeof (kqswnal_rx_t) * (KQSW_NRXMSGS_SMALL + KQSW_NRXMSGS_LARGE));
- if (kqswnal_data.kqn_rxds == NULL)
- {
- kqswnal_shutdown (nal);
- return (PTL_NO_SPACE);
- }
-
- memset(kqswnal_data.kqn_rxds, 0, /* clear flags, null pointers etc */
- sizeof(kqswnal_rx_t) * (KQSW_NRXMSGS_SMALL+KQSW_NRXMSGS_LARGE));
-
+ kqswnal_data.kqn_rxds = NULL;
elan_page_idx = 0;
for (i = 0; i < KQSW_NRXMSGS_SMALL + KQSW_NRXMSGS_LARGE; i++)
{
E3_Addr elanbuffer;
#endif
int j;
- kqswnal_rx_t *krx = &kqswnal_data.kqn_rxds[i];
+
+ PORTAL_ALLOC(krx, sizeof(*krx));
+ if (krx == NULL) {
+ kqswnal_shutdown(nal);
+ return (PTL_NO_SPACE);
+ }
+
+ memset(krx, 0, sizeof(*krx)); /* clear flags, null pointers etc */
+ krx->krx_alloclist = kqswnal_data.kqn_rxds;
+ kqswnal_data.kqn_rxds = krx;
if (i < KQSW_NRXMSGS_SMALL)
{
/**********************************************************************/
/* Queue receives, now that it's OK to run their completion callbacks */
- for (i = 0; i < KQSW_NRXMSGS_SMALL + KQSW_NRXMSGS_LARGE; i++)
- {
- kqswnal_rx_t *krx = &kqswnal_data.kqn_rxds[i];
-
+ for (krx = kqswnal_data.kqn_rxds; krx != NULL; krx = krx->krx_alloclist) {
/* NB this enqueue can allocate/sleep (attr == 0) */
krx->krx_state = KRX_POSTED;
#if MULTIRAIL_EKC
#define KQSW_TX_MAXCONTIG (1<<10) /* largest payload that gets made contiguous on transmit */
#define KQSW_NTXMSGS 8 /* # normal transmit messages */
-#define KQSW_NNBLK_TXMSGS 256 /* # reserved transmit messages if can't block */
+#define KQSW_NNBLK_TXMSGS 512 /* # reserved transmit messages if can't block */
#define KQSW_NRXMSGS_LARGE 64 /* # large receive buffers */
-#define KQSW_EP_ENVELOPES_LARGE 128 /* # large ep envelopes */
+#define KQSW_EP_ENVELOPES_LARGE 256 /* # large ep envelopes */
#define KQSW_NRXMSGS_SMALL 256 /* # small receive buffers */
#define KQSW_EP_ENVELOPES_SMALL 2048 /* # small ep envelopes */
#endif
} kqswnal_remotemd_t;
-typedef struct
+typedef struct kqswnal_rx
{
struct list_head krx_list; /* enqueue -> thread */
+ struct kqswnal_rx *krx_alloclist; /* stack in kqn_rxds */
EP_RCVR *krx_eprx; /* port to post receives to */
EP_RXD *krx_rxd; /* receive descriptor (for repost) */
#if MULTIRAIL_EKC
#define KRX_COMPLETING 3 /* waiting to be completed */
-typedef struct
+typedef struct kqswnal_tx
{
struct list_head ktx_list; /* enqueue idle/active */
struct list_head ktx_delayed_list; /* enqueue delayedtxds */
+ struct kqswnal_tx *ktx_alloclist; /* stack in kqn_txds */
unsigned int ktx_isnblk:1; /* reserved descriptor? */
unsigned int ktx_state:7; /* What I'm doing */
unsigned int ktx_firsttmpfrag:1; /* ktx_frags[0] is in my ebuffer ? 0 : 1 */
char kqn_shuttingdown; /* I'm trying to shut down */
atomic_t kqn_nthreads; /* # threads running */
- kqswnal_rx_t *kqn_rxds; /* all the receive descriptors */
- kqswnal_tx_t *kqn_txds; /* all the transmit descriptors */
+ kqswnal_rx_t *kqn_rxds; /* stack of all the receive descriptors */
+ kqswnal_tx_t *kqn_txds; /* stack of all the transmit descriptors */
struct list_head kqn_idletxds; /* transmit descriptors free to use */
struct list_head kqn_nblk_idletxds; /* reserved free transmit descriptors */
}
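/*
 * Summary sketch (restating the change above, not new code): the flat
 * kqn_txds/kqn_rxds arrays become singly-linked "alloclist" stacks, so
 * descriptors are allocated and freed one at a time.  The idiom in
 * miniature -- push at startup, pop at shutdown:
 */
#if 0   /* example only */
        /* push */
        PORTAL_ALLOC (ktx, sizeof (*ktx));
        memset (ktx, 0, sizeof (*ktx));
        ktx->ktx_alloclist = kqswnal_data.kqn_txds;
        kqswnal_data.kqn_txds = ktx;

        /* pop */
        while (kqswnal_data.kqn_txds != NULL) {
                ktx = kqswnal_data.kqn_txds;
                kqswnal_data.kqn_txds = ktx->ktx_alloclist;
                PORTAL_FREE (ktx, sizeof (*ktx));
        }
#endif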
kscimacnal_data.ksci_nid = (ptl_nid_t)(ntohl(mac_physaddr));
- process_id.pid = requested_pid;
+ process_id.pid = 0;
process_id.nid = kscimacnal_data.ksci_nid;
CDEBUG(D_NET, "calling lib_init with nid "LPX64"\n",
conn2->ksnc_type != conn->ksnc_type ||
conn2->ksnc_incarnation != incarnation)
continue;
-
+
CWARN("Not creating duplicate connection to "
- "%u.%u.%u.%u type %d\n",
+ "%u.%u.%u.%u type %d\n",
HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_type);
rc = -EALREADY;
goto failed_2;
break;
}
+ /* Give conn a ref on sock->file since we're going to return success */
+ get_file(sock->file);
+
conn->ksnc_peer = peer; /* conn takes my ref on peer */
conn->ksnc_incarnation = incarnation;
peer->ksnp_last_alive = jiffies;
ksocknal_putconnsock(conn);
}
- CWARN("New conn nid:"LPX64" [type:%d] %u.%u.%u.%u -> %u.%u.%u.%u/%d"
+ CWARN("New conn nid:"LPX64" %u.%u.%u.%u -> %u.%u.%u.%u/%d"
" incarnation:"LPX64" sched[%d]/%d\n",
- nid, conn->ksnc_type, HIPQUAD(conn->ksnc_myipaddr),
+ nid, HIPQUAD(conn->ksnc_myipaddr),
HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port, incarnation,
(int)(conn->ksnc_scheduler - ksocknal_data.ksnd_schedulers), irq);
rc = -EINVAL;
break;
}
- if (rc != 0)
- fput (sock->file);
+ fput (sock->file);
break;
}
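/*
 * Refcounting note (an assumption made explicit): with the get_file()
 * added in ksocknal_create_conn() and the unconditional fput() above,
 * the caller always drops its own reference on sock->file while a
 * successfully created conn keeps one of its own, so the socket stays
 * pinned for exactly the connection's lifetime.
 */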
case NAL_CMD_CLOSE_CONNECTION: {
#include <portals/lib-p30.h>
#include <portals/nal.h>
#include <portals/socknal.h>
-#include <linux/lustre_idl.h>
-#include <linux/lustre_idl.h>
#define SOCKNAL_N_AUTOCONNECTD 4 /* # socknal autoconnect daemons */
#define SOCKNAL_MIN_RECONNECT_INTERVAL HZ /* first failed connection retry... */
return (0);
}
-int
-ksocknal_connect_peer (ksock_route_t *route, int type)
+static int
+ksocknal_connect_sock(struct socket **sockp, int *may_retry,
+ ksock_route_t *route, int local_port)
{
- struct sockaddr_in ipaddr;
- mm_segment_t oldmm = get_fs();
- struct timeval tv;
- int fd;
+ struct sockaddr_in locaddr;
+ struct sockaddr_in srvaddr;
struct socket *sock;
int rc;
-
+ int option;
+ mm_segment_t oldmm = get_fs();
+ struct timeval tv;
+
+ memset(&locaddr, 0, sizeof(locaddr));
+ locaddr.sin_family = AF_INET;
+ locaddr.sin_port = htons(local_port);
+ locaddr.sin_addr.s_addr =
+ (route->ksnr_myipaddr != 0) ? htonl(route->ksnr_myipaddr)
+ : INADDR_ANY;
+
+ memset (&srvaddr, 0, sizeof (srvaddr));
+ srvaddr.sin_family = AF_INET;
+ srvaddr.sin_port = htons (route->ksnr_port);
+ srvaddr.sin_addr.s_addr = htonl (route->ksnr_ipaddr);
+
+ *may_retry = 0;
+
rc = sock_create (PF_INET, SOCK_STREAM, 0, &sock);
+ *sockp = sock;
if (rc != 0) {
CERROR ("Can't create autoconnect socket: %d\n", rc);
return (rc);
* from userspace. And we actually need the sock->file refcounting
* that this gives you :) */
- fd = sock_map_fd (sock);
- if (fd < 0) {
+ rc = sock_map_fd (sock);
+ if (rc < 0) {
sock_release (sock);
- CERROR ("sock_map_fd error %d\n", fd);
- return (fd);
+ CERROR ("sock_map_fd error %d\n", rc);
+ return (rc);
}
- /* NB the fd now owns the ref on sock->file */
+ /* NB the file descriptor (rc) now owns the ref on sock->file */
LASSERT (sock->file != NULL);
LASSERT (file_count(sock->file) == 1);
+ get_file(sock->file); /* extra ref so sock->file... */
+ sys_close(rc); /* ...survives this close */
+
+ /* Still got a single ref on sock->file */
+ LASSERT (file_count(sock->file) == 1);
+
/* Set the socket timeouts, so our connection attempt completes in
* finite time */
tv.tv_sec = ksocknal_tunables.ksnd_io_timeout;
if (rc != 0) {
CERROR ("Can't set send timeout %d: %d\n",
ksocknal_tunables.ksnd_io_timeout, rc);
- goto out;
+ goto failed;
}
set_fs (KERNEL_DS);
if (rc != 0) {
CERROR ("Can't set receive timeout %d: %d\n",
ksocknal_tunables.ksnd_io_timeout, rc);
- goto out;
+ goto failed;
}
- if (route->ksnr_myipaddr != 0) {
- /* Bind to the local IP address */
- memset (&ipaddr, 0, sizeof (ipaddr));
- ipaddr.sin_family = AF_INET;
- ipaddr.sin_port = htons (0); /* ANY */
- ipaddr.sin_addr.s_addr = htonl(route->ksnr_myipaddr);
+ set_fs (KERNEL_DS);
+ option = 1;
+ rc = sock_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR,
+ (char *)&option, sizeof (option));
+ set_fs (oldmm);
+ if (rc != 0) {
+ CERROR("Can't set SO_REUSEADDR for socket: %d\n", rc);
+ goto failed;
+ }
- rc = sock->ops->bind (sock, (struct sockaddr *)&ipaddr,
- sizeof (ipaddr));
- if (rc != 0) {
- CERROR ("Can't bind to local IP %u.%u.%u.%u: %d\n",
- HIPQUAD(route->ksnr_myipaddr), rc);
- goto out;
- }
+ rc = sock->ops->bind(sock,
+ (struct sockaddr *)&locaddr, sizeof(locaddr));
+ if (rc == -EADDRINUSE) {
+ CDEBUG(D_NET, "Port %d already in use\n", local_port);
+ *may_retry = 1;
+ goto failed;
}
-
- memset (&ipaddr, 0, sizeof (ipaddr));
- ipaddr.sin_family = AF_INET;
- ipaddr.sin_port = htons (route->ksnr_port);
- ipaddr.sin_addr.s_addr = htonl (route->ksnr_ipaddr);
-
- rc = sock->ops->connect (sock, (struct sockaddr *)&ipaddr,
- sizeof (ipaddr), sock->file->f_flags);
if (rc != 0) {
- CERROR ("Can't connect to nid "LPX64
- " local IP: %u.%u.%u.%u,"
- " remote IP: %u.%u.%u.%u/%d: %d\n",
- route->ksnr_peer->ksnp_nid,
- HIPQUAD(route->ksnr_myipaddr),
- HIPQUAD(route->ksnr_ipaddr),
- route->ksnr_port, rc);
- goto out;
+ CERROR("Error trying to bind to reserved port %d: %d\n",
+ local_port, rc);
+ goto failed;
}
- rc = ksocknal_create_conn (route, sock, type);
- if (rc == 0) {
- /* Take an extra ref on sock->file to compensate for the
- * upcoming close which will lose fd's ref on it. */
- get_file (sock->file);
+ rc = sock->ops->connect(sock,
+ (struct sockaddr *)&srvaddr, sizeof(srvaddr),
+ sock->file->f_flags);
+ if (rc == 0)
+ return 0;
+
+ /* EADDRNOTAVAIL probably means we're already connected to the same
+ * peer/port on the same local port on a differently typed
+ * connection. Let our caller retry with a different local
+ * port... */
+ *may_retry = (rc == -EADDRNOTAVAIL);
+
+ CDEBUG(*may_retry ? D_NET : D_ERROR,
+ "Error %d connecting %u.%u.%u.%u/%d -> %u.%u.%u.%u/%d\n", rc,
+ HIPQUAD(route->ksnr_myipaddr), local_port,
+ HIPQUAD(route->ksnr_ipaddr), route->ksnr_port);
+
+ failed:
+ fput(sock->file);
+ return rc;
+}
+
+int
+ksocknal_connect_peer (ksock_route_t *route, int type)
+{
+ struct socket *sock;
+ int rc;
+ int port;
+ int may_retry;
+
+ /* Iterate through reserved ports. With typed connections we may
+ * need several connections to the same peer, each bound to its own
+ * local port; but a port clash only shows up at connect time, and
+ * since bind() has been called by then, each retry needs a fresh
+ * socket. */
+
+ for (port = 1023; port > 512; --port) {
+
+ rc = ksocknal_connect_sock(&sock, &may_retry, route, port);
+
+ if (rc == 0) {
+ rc = ksocknal_create_conn(route, sock, type);
+ fput(sock->file);
+ return rc;
+ }
+
+ if (!may_retry)
+ return rc;
}
- out:
- sys_close (fd);
- return (rc);
+ CERROR("Out of ports trying to bind to a reserved port\n");
+ return (-EADDRINUSE);
}
void
LASSERT (type < SOCKNAL_CONN_NTYPES);
rc = ksocknal_connect_peer (route, type);
-
if (rc != 0)
break;
#endif
unsigned int portal_subsystem_debug = ~0 - (S_PORTALS | S_QSWNAL | S_SOCKNAL |
- S_GMNAL | S_OPENIBNAL);
+ S_GMNAL | S_IBNAL);
EXPORT_SYMBOL(portal_subsystem_debug);
unsigned int portal_debug = (D_WARNING | D_DLMTRACE | D_ERROR | D_EMERG | D_HA |
snprintf(debug_file_name, sizeof(debug_file_path) - 1,
"%s.%ld.%ld", debug_file_path, CURRENT_SECONDS, (long)arg);
+ printk(KERN_ALERT "LustreError: dumping log to %s\n", debug_file_name);
tracefile_dump_all_pages(debug_file_name);
current->journal_info = journal_info;
int portals_debug_mark_buffer(char *text)
{
CDEBUG(D_TRACE,"***************************************************\n");
- CWARN("DEBUG MARKER: %s\n", text);
+ CDEBUG(D_WARNING, "DEBUG MARKER: %s\n", text);
CDEBUG(D_TRACE,"***************************************************\n");
return 0;
char *portals_nid2str(int nal, ptl_nid_t nid, char *str)
{
if (nid == PTL_NID_ANY) {
- snprintf(str, PTL_NALFMT_SIZE - 1, "%s",
- "PTL_NID_ANY");
+ snprintf(str, PTL_NALFMT_SIZE, "%s", "PTL_NID_ANY");
return str;
}
switch(nal){
/* XXX this could be a nal method of some sort, 'cept it's config
* dependent whether (say) socknal NIDs are actually IP addresses... */
-#ifndef CRAY_PORTALS
+#if !CRAY_PORTALS
case TCPNAL:
/* userspace NAL */
+ case IIBNAL:
case OPENIBNAL:
case SOCKNAL:
- snprintf(str, PTL_NALFMT_SIZE - 1, "%u:%u.%u.%u.%u",
+ snprintf(str, PTL_NALFMT_SIZE, "%u:%u.%u.%u.%u",
(__u32)(nid >> 32), HIPQUAD(nid));
break;
case QSWNAL:
case GMNAL:
- snprintf(str, PTL_NALFMT_SIZE - 1, "%u:%u",
+ snprintf(str, PTL_NALFMT_SIZE, "%u:%u",
(__u32)(nid >> 32), (__u32)nid);
break;
#endif
default:
- snprintf(str, PTL_NALFMT_SIZE - 1, "?%d? %llx",
+ snprintf(str, PTL_NALFMT_SIZE, "?%x? %llx",
nal, (long long)nid);
break;
}
return str;
}
-/* bug #4615 */
+
char *portals_id2str(int nal, ptl_process_id_t id, char *str)
{
- switch(nal){
-#ifndef CRAY_PORTALS
- case TCPNAL:
- /* userspace NAL */
- case OPENIBNAL:
- case SOCKNAL:
- snprintf(str, PTL_NALFMT_SIZE - 1, "%u:%u.%u.%u.%u,%u",
- (__u32)(id.nid >> 32), HIPQUAD((id.nid)) , id.pid);
- break;
- case QSWNAL:
- case GMNAL:
- snprintf(str, PTL_NALFMT_SIZE - 1, "%u:%u,%u",
- (__u32)(id.nid >> 32), (__u32)id.nid, id.pid);
- break;
-#endif
- default:
- snprintf(str, PTL_NALFMT_SIZE - 1, "?%d? %llx,%lx",
- nal, (long long)id.nid, (long)id.pid );
- break;
- }
+ int len;
+
+ portals_nid2str(nal, id.nid, str);
+ len = strlen(str);
+ snprintf(str + len, PTL_NALFMT_SIZE, "-%u", id.pid);
return str;
}
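/*
 * Example (illustrative values): for a SOCKNAL id with nid 0x20a0b0c0d
 * and pid 1234, portals_nid2str() writes "2:10.11.12.13" and
 * portals_id2str() appends the pid, yielding "2:10.11.12.13-1234".
 */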
-
#ifdef __KERNEL__
char stack_backtrace[LUSTRE_TRACE_SIZE];
spinlock_t stack_backtrace_lock = SPIN_LOCK_UNLOCKED;
CDEBUG(D_IOCTL, "calling handler nal: %d, cmd: %d\n", nal,
pcfg->pcfg_command);
rc = cmd->nch_handler(pcfg, cmd->nch_private);
+ } else {
+ CERROR("invalid nal: %d, cmd: %d\n", nal, pcfg->pcfg_command);
}
up(&nal_cmd_sem);
portals_debug_mark_buffer(data->ioc_inlbuf1);
RETURN(0);
#if LWT_SUPPORT
- case IOC_PORTAL_LWT_CONTROL:
+ case IOC_PORTAL_LWT_CONTROL:
err = lwt_control (data->ioc_flags, data->ioc_misc);
break;
-
+
case IOC_PORTAL_LWT_SNAPSHOT: {
cycles_t now;
int ncpu;
int total_size;
-
+
err = lwt_snapshot (&now, &ncpu, &total_size,
data->ioc_pbuf1, data->ioc_plen1);
data->ioc_nid = now;
data->ioc_misc = total_size;
/* Hedge against broken user/kernel typedefs (e.g. cycles_t) */
- data->ioc_nid = sizeof(lwt_event_t);
- data->ioc_nid2 = offsetof(lwt_event_t, lwte_where);
+ data->ioc_nid2 = sizeof(lwt_event_t);
+ data->ioc_nid3 = offsetof(lwt_event_t, lwte_where);
if (err == 0 &&
copy_to_user((char *)arg, data, sizeof (*data)))
err = -EFAULT;
break;
}
-
+
case IOC_PORTAL_LWT_LOOKUP_STRING:
err = lwt_lookup_string (&data->ioc_count, data->ioc_pbuf1,
data->ioc_pbuf2, data->ioc_plen2);
break;
}
- if (copy_from_user(&pcfg, (void *)data->ioc_pbuf1,
+ if (copy_from_user(&pcfg, (void *)data->ioc_pbuf1,
sizeof(pcfg))) {
err = -EFAULT;
break;
err = libcfs_nal_cmd(&pcfg);
if (err == 0 &&
- copy_to_user((char *)data->ioc_pbuf1, &pcfg,
+ copy_to_user((char *)data->ioc_pbuf1, &pcfg,
sizeof (pcfg)))
err = -EFAULT;
break;
#include <linux/kp30.h>
#include <linux/portals_compat25.h>
-#include <linux/lustre_compat25.h>
#include <linux/libcfs.h>
#define TCD_MAX_PAGES 1280
prefix = "Lustre";
ptype = KERN_INFO;
}
-
+
printk("%s%s: %d:%d:(%s:%d:%s()) %.*s", ptype, prefix, hdr->ph_pid,
hdr->ph_extern_pid, file, hdr->ph_line_num, fn, len, buf);
}
if (IS_ERR(filp)) {
rc = PTR_ERR(filp);
printk(KERN_ERR "LustreError: can't open %s for dump: rc %d\n",
- filename, rc);
+ filename, rc);
goto out;
}
"(%lu).\n", max * smp_num_cpus, num_physpages / 5 * 4);
return count;
}
+
for (i = 0; i < NR_CPUS; i++) {
struct trace_cpu_data *tcd;
tcd = &trace_data[i].tcd;
me->match_id.nid != src_nid)
continue;
- CDEBUG(D_NET,"match_id.pid [%x], src_pid [%x]\n", me->match_id.pid, src_pid);
+ CDEBUG(D_NET, "match_id.pid [%x], src_pid [%x]\n",
+ me->match_id.pid, src_pid);
if (me->match_id.pid != PTL_PID_ANY &&
me->match_id.pid != src_pid)
CDEBUG (D_IOCTL, "Getting nid for nal [%d]\n", data->ioc_nal);
- err = PtlNIInit(data->ioc_nal, LUSTRE_SRV_PTL_PID, NULL, NULL, &nih);
+ err = PtlNIInit(data->ioc_nal, LUSTRE_SRV_PTL_PID, NULL,
+ NULL, &nih);
if (!(err == PTL_OK || err == PTL_IFACE_DUP))
RETURN (-EINVAL);
CDEBUG (D_IOCTL, "fail nid: [%d] "LPU64" count %d\n",
data->ioc_nal, data->ioc_nid, data->ioc_count);
- err = PtlNIInit(data->ioc_nal, LUSTRE_SRV_PTL_PID, NULL, NULL, &nih);
+ err = PtlNIInit(data->ioc_nal, LUSTRE_SRV_PTL_PID, NULL,
+ NULL, &nih);
if (!(err == PTL_OK || err == PTL_IFACE_DUP))
return (-EINVAL);
*start = page + prd->skip;
user_len = -prd->skip;
- for (; prd->curr != &kpr_routes; prd->curr = prd->curr->next) {
+ while ((prd->curr != NULL) && (prd->curr != &kpr_routes)) {
re = list_entry(prd->curr, kpr_route_entry_t, kpre_list);
ge = re->kpre_gateway;
chunk_len += line_len;
user_len += line_len;
- /* The route table will exceed one page */
- if ((chunk_len > (PAGE_SIZE - 80)) || (user_len > count)) {
- prd->curr = prd->curr->next;
- break;
+                /* Abort if the route list changed under us */
+ if (prd->curr->next == NULL) {
+ prd->curr = NULL;
+ read_unlock(&kpr_rwlock);
+ return sprintf(page, "\nError: Routes Changed\n");
}
+
+ prd->curr = prd->curr->next;
+
+                /* The route table exceeds one page; break out of the while
+                 * loop so the function can be called again with a new page.
+                 */
+ if ((chunk_len > (PAGE_SIZE - 80)) || (user_len > count))
+ break;
}
*eof = 0;
{
connection conn;
struct sockaddr_in addr;
+ struct sockaddr_in locaddr;
unsigned int id[2];
struct timeval tv;
__u64 incarnation;
+ int fd;
+ int option;
+ int rc;
+ int rport;
+ ptl_nid_t peernid = PTL_NID_ANY;
+
port = tcpnal_acceptor_port;
id[0] = ip;
pthread_mutex_lock(&m->conn_lock);
conn = hash_table_find(m->connections, id);
- if (!conn) {
- int fd;
- int option;
- ptl_nid_t peernid = PTL_NID_ANY;
-
- bzero((char *) &addr, sizeof(addr));
- addr.sin_family = AF_INET;
- addr.sin_addr.s_addr = htonl(ip);
- addr.sin_port = htons(port);
-
- if ((fd = socket(AF_INET, SOCK_STREAM, 0)) < 0) {
- perror("tcpnal socket failed");
- exit(-1);
- }
- if (connect(fd, (struct sockaddr *)&addr,
- sizeof(struct sockaddr_in))) {
- perror("tcpnal connect");
- return(0);
- }
+ if (conn)
+ goto out;
+ memset(&addr, 0, sizeof(addr));
+ addr.sin_family = AF_INET;
+ addr.sin_addr.s_addr = htonl(ip);
+ addr.sin_port = htons(port);
+
+ memset(&locaddr, 0, sizeof(locaddr));
+ locaddr.sin_family = AF_INET;
+ locaddr.sin_addr.s_addr = INADDR_ANY;
+
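+        /* walk down the privileged port range looking for a port we can
+         * bind; EADDRINUSE just means the port is taken, so try the
+         * next one down */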
+ for (rport = IPPORT_RESERVED - 1; rport > IPPORT_RESERVED / 2; --rport) {
+ fd = socket(AF_INET, SOCK_STREAM, 0);
+ if (fd < 0) {
+ perror("tcpnal socket failed");
+ goto out;
+ }
+
+ option = 1;
+ rc = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR,
+ &option, sizeof(option));
+ if (rc != 0) {
+ perror ("Can't set SO_REUSEADDR for socket");
+ close(fd);
+ goto out;
+ }
+
+ locaddr.sin_port = htons(rport);
+ rc = bind(fd, (struct sockaddr *)&locaddr, sizeof(locaddr));
+ if (rc == 0 || errno == EACCES) {
+ rc = connect(fd, (struct sockaddr *)&addr,
+ sizeof(struct sockaddr_in));
+ if (rc == 0) {
+ break;
+ } else if (errno != EADDRINUSE && errno != EADDRNOTAVAIL) {
+ perror("Error connecting to remote host");
+ close(fd);
+ goto out;
+ }
+ } else if (errno != EADDRINUSE) {
+ perror("Error binding to privileged port");
+ close(fd);
+ goto out;
+ }
+ close(fd);
+ }
+
+ if (rport == IPPORT_RESERVED / 2) {
+ fprintf(stderr, "Out of ports trying to bind to a reserved port\n");
+ goto out;
+ }
+
#if 1
- option = 1;
- setsockopt(fd, SOL_TCP, TCP_NODELAY, &option, sizeof(option));
- option = 1<<20;
- setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &option, sizeof(option));
- option = 1<<20;
- setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &option, sizeof(option));
+ option = 1;
+ setsockopt(fd, SOL_TCP, TCP_NODELAY, &option, sizeof(option));
+ option = 1<<20;
+ setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &option, sizeof(option));
+ option = 1<<20;
+ setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &option, sizeof(option));
#endif
- gettimeofday(&tv, NULL);
- incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
+ gettimeofday(&tv, NULL);
+ incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
- /* say hello */
- if (tcpnal_hello(fd, &peernid, SOCKNAL_CONN_ANY, incarnation))
+ /* say hello */
+ if (tcpnal_hello(fd, &peernid, SOCKNAL_CONN_ANY, incarnation))
exit(-1);
+
+ conn = allocate_connection(m, ip, port, fd);
+
+ /* let nal thread know this event right away */
+ if (conn)
+ procbridge_wakeup_nal(pb);
- conn = allocate_connection(m, ip, port, fd);
-
- /* let nal thread know this event right away */
- if (conn)
- procbridge_wakeup_nal(pb);
- }
-
+out:
pthread_mutex_unlock(&m->conn_lock);
return (conn);
}
void init_unix_timer(void);
void select_timer_block(when until);
when now(void);
+
+/*
+ * hacking for CFS internal MPI testing
+ */
+#if !CRAY_PORTALS
+#define ENABLE_SELECT_DISPATCH
+#endif
ptl_nid_t tcpnal_mynid;
+#ifdef ENABLE_SELECT_DISPATCH
+procbridge __global_procbridge = NULL;
+#endif
+
/* Function: procbridge_startup
*
* Arguments: pid: requested process id (port offset)
return PTL_FAIL;
}
+#ifdef ENABLE_SELECT_DISPATCH
+ __global_procbridge = p;
+#endif
+
/* create nal thread */
if (pthread_create(&p->t, NULL, nal_thread, &args)) {
perror("nal_init: pthread_create");
#include <sys/time.h>
#include <sys/types.h>
#include <stdlib.h>
+#include <syscall.h>
+#include <pthread.h>
+#include <errno.h>
#include <pqtimer.h>
#include <dispatch.h>
+#include <procbridge.h>
static struct timeval beginning_of_epoch;
i->disabled=1;
}
-static void set_flag(io_handler n,fd_set *fds)
+static void set_flag(io_handler n,fd_set *r, fd_set *w, fd_set *e)
{
- if (n->type & READ_HANDLER) FD_SET(n->fd, &fds[0]);
- if (n->type & WRITE_HANDLER) FD_SET(n->fd,&fds[1]);
- if (n->type & EXCEPTION_HANDLER) FD_SET(n->fd, &fds[2]);
+ if (n->type & READ_HANDLER) FD_SET(n->fd, r);
+ if (n->type & WRITE_HANDLER) FD_SET(n->fd, w);
+ if (n->type & EXCEPTION_HANDLER) FD_SET(n->fd, e);
}
-
-/* Function: select_timer_block
- * Arguments: until: an absolute time when the select should return
- *
- * This function dispatches the various file descriptors' handler
- * functions, if the kernel indicates there is io available.
- */
-void select_timer_block(when until)
+static int prepare_fd_sets(fd_set *r, fd_set *w, fd_set *e)
{
- fd_set fds[3];
- struct timeval timeout;
- struct timeval *timeout_pointer;
- int result;
io_handler j;
io_handler *k;
+ int max = 0;
- /* TODO: loop until the entire interval is expired*/
- if (until){
- when interval=until-now();
- timeout.tv_sec=(interval>>32);
- timeout.tv_usec=((interval<<32)/1000000)>>32;
- timeout_pointer=&timeout;
- } else timeout_pointer=0;
-
- FD_ZERO(&fds[0]);
- FD_ZERO(&fds[1]);
- FD_ZERO(&fds[2]);
+ FD_ZERO(r);
+ FD_ZERO(w);
+ FD_ZERO(e);
for (k=&io_handlers;*k;){
if ((*k)->disabled){
j=*k;
free(j);
}
if (*k) {
- set_flag(*k,fds);
+ set_flag(*k,r,w,e);
+ if ((*k)->fd > max)
+ max = (*k)->fd;
k=&(*k)->next;
}
}
+ return max + 1;
+}
+
+static int execute_callbacks(fd_set *r, fd_set *w, fd_set *e)
+{
+ io_handler j;
+ int n = 0, t;
+
+ for (j = io_handlers; j; j = j->next) {
+ if (j->disabled)
+ continue;
+
+ t = 0;
+ if (FD_ISSET(j->fd, r) && (j->type & READ_HANDLER)) {
+ FD_CLR(j->fd, r);
+ t++;
+ }
+ if (FD_ISSET(j->fd, w) && (j->type & WRITE_HANDLER)) {
+ FD_CLR(j->fd, w);
+ t++;
+ }
+ if (FD_ISSET(j->fd, e) && (j->type & EXCEPTION_HANDLER)) {
+ FD_CLR(j->fd, e);
+ t++;
+ }
+ if (t == 0)
+ continue;
+
+ if (!(*j->function)(j->argument))
+ j->disabled = 1;
+
+ n += t;
+ }
+
+ return n;
+}
- result=select(FD_SETSIZE, &fds[0], &fds[1], &fds[2], timeout_pointer);
+#ifdef ENABLE_SELECT_DISPATCH
- if (result > 0)
- for (j=io_handlers;j;j=j->next){
- if (!(j->disabled) &&
- ((FD_ISSET(j->fd, &fds[0]) && (j->type & READ_HANDLER)) ||
- (FD_ISSET(j->fd, &fds[1]) && (j->type & WRITE_HANDLER)) ||
- (FD_ISSET(j->fd, &fds[2]) && (j->type & EXCEPTION_HANDLER)))){
- if (!(*j->function)(j->argument))
- j->disabled=1;
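+/*
+ * state shared between the intercepted select() below and
+ * select_timer_block(): the application thread fills in its fd_sets and
+ * timeout, sets 'submitted' and sleeps on 'cond'; the dispatch thread
+ * merges those sets into its own select(), copies back any ready fds
+ * and signals 'cond'.
+ */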
+static struct {
+ pthread_mutex_t mutex;
+ pthread_cond_t cond;
+ int submitted;
+ int nready;
+ int maxfd;
+ fd_set *rset;
+ fd_set *wset;
+ fd_set *eset;
+ struct timeval *timeout;
+ struct timeval submit_time;
+} fd_extra = {
+ PTHREAD_MUTEX_INITIALIZER,
+ PTHREAD_COND_INITIALIZER,
+ 0, 0, 0,
+ NULL, NULL, NULL, NULL,
+};
+
+extern int liblustre_wait_event(int timeout);
+extern procbridge __global_procbridge;
+
+/*
+ * this intercepts the select() syscall of user apps
+ * such as MPI libs.
+ */
+int select(int n, fd_set *rset, fd_set *wset, fd_set *eset,
+ struct timeval *timeout)
+{
+ LASSERT(fd_extra.submitted == 0);
+
+ fd_extra.nready = 0;
+ fd_extra.maxfd = n;
+ fd_extra.rset = rset;
+ fd_extra.wset = wset;
+ fd_extra.eset = eset;
+ fd_extra.timeout = timeout;
+
+ liblustre_wait_event(0);
+ pthread_mutex_lock(&fd_extra.mutex);
+ gettimeofday(&fd_extra.submit_time, NULL);
+ fd_extra.submitted = 1;
+ LASSERT(__global_procbridge);
+ procbridge_wakeup_nal(__global_procbridge);
+
+again:
+ if (fd_extra.submitted)
+ pthread_cond_wait(&fd_extra.cond, &fd_extra.mutex);
+ pthread_mutex_unlock(&fd_extra.mutex);
+
+ liblustre_wait_event(0);
+
+ pthread_mutex_lock(&fd_extra.mutex);
+ if (fd_extra.submitted)
+ goto again;
+ pthread_mutex_unlock(&fd_extra.mutex);
+
+ LASSERT(fd_extra.nready >= 0);
+ LASSERT(fd_extra.submitted == 0);
+ return fd_extra.nready;
+}
+
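+/*
+ * OR the application's saved fd_sets into the dispatcher's own sets;
+ * the LASSERTs verify that the two requests never watch the same fd
+ * bits.
+ */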
+static int merge_fds(int max, fd_set *rset, fd_set *wset, fd_set *eset)
+{
+ int i;
+
+ LASSERT(rset);
+ LASSERT(wset);
+ LASSERT(eset);
+
+ for (i = 0; i < __FD_SETSIZE/__NFDBITS; i++) {
+ LASSERT(!fd_extra.rset ||
+ !(__FDS_BITS(rset)[i] & __FDS_BITS(fd_extra.rset)[i]));
+ LASSERT(!fd_extra.wset ||
+ !(__FDS_BITS(wset)[i] & __FDS_BITS(fd_extra.wset)[i]));
+ LASSERT(!fd_extra.eset ||
+ !(__FDS_BITS(eset)[i] & __FDS_BITS(fd_extra.eset)[i]));
+
+ if (fd_extra.rset && __FDS_BITS(fd_extra.rset)[i])
+ __FDS_BITS(rset)[i] |= __FDS_BITS(fd_extra.rset)[i];
+ if (fd_extra.wset && __FDS_BITS(fd_extra.wset)[i])
+ __FDS_BITS(wset)[i] |= __FDS_BITS(fd_extra.wset)[i];
+ if (fd_extra.eset && __FDS_BITS(fd_extra.eset)[i])
+ __FDS_BITS(eset)[i] |= __FDS_BITS(fd_extra.eset)[i];
+ }
+
+ return (fd_extra.maxfd > max ? fd_extra.maxfd : max);
+}
+
+static inline
+int timeval_ge(struct timeval *tv1, struct timeval *tv2)
+{
+ LASSERT(tv1 && tv2);
+ return ((tv1->tv_sec - tv2->tv_sec) * 1000000 +
+ (tv1->tv_usec - tv2->tv_usec) >= 0);
+}
+
+/*
+ * choose the shorter of the two timeouts
+ */
+static struct timeval *choose_timeout(struct timeval *tv1,
+ struct timeval *tv2)
+{
+ if (!tv1)
+ return tv2;
+ else if (!tv2)
+ return tv1;
+
+ if (timeval_ge(tv1, tv2))
+ return tv2;
+ else
+ return tv1;
+}
+
+/* Function: select_timer_block
+ * Arguments: until: an absolute time when the select should return
+ *
+ * This function dispatches the various file descriptors' handler
+ * functions, if the kernel indicates there is io available.
+ */
+void select_timer_block(when until)
+{
+ fd_set fds[3];
+ struct timeval timeout;
+ struct timeval *timeout_pointer, *select_timeout;
+ int max, nready, nexec;
+ int fd_handling;
+
+again:
+ if (until) {
+ when interval;
+
+ interval = until - now();
+ timeout.tv_sec = (interval >> 32);
+ timeout.tv_usec = ((interval << 32) / 1000000) >> 32;
+ timeout_pointer = &timeout;
+ } else
+ timeout_pointer = NULL;
+
+ fd_handling = 0;
+ max = prepare_fd_sets(&fds[0], &fds[1], &fds[2]);
+ select_timeout = timeout_pointer;
+
+ pthread_mutex_lock(&fd_extra.mutex);
+ fd_handling = fd_extra.submitted;
+ pthread_mutex_unlock(&fd_extra.mutex);
+ if (fd_handling) {
+ max = merge_fds(max, &fds[0], &fds[1], &fds[2]);
+ select_timeout = choose_timeout(timeout_pointer, fd_extra.timeout);
+ }
+
+ /* XXX only compile for linux */
+#if __WORDSIZE == 64
+ nready = syscall(SYS_select, max, &fds[0], &fds[1], &fds[2],
+ select_timeout);
+#else
+ nready = syscall(SYS__newselect, max, &fds[0], &fds[1], &fds[2],
+ select_timeout);
+#endif
+ if (nready < 0) {
+                CERROR("select returned err %d, errno %d\n", nready, errno);
+ return;
+ }
+
+ if (nready) {
+ nexec = execute_callbacks(&fds[0], &fds[1], &fds[2]);
+ nready -= nexec;
+ } else
+ nexec = 0;
+
+        /* even if both nready and nexec are 0, we still need to try to
+         * wake up the upper thread since it may have timed out
+         */
+ if (fd_handling) {
+ LASSERT(nready >= 0);
+
+ pthread_mutex_lock(&fd_extra.mutex);
+ if (nready) {
+ if (fd_extra.rset)
+ *fd_extra.rset = fds[0];
+ if (fd_extra.wset)
+ *fd_extra.wset = fds[1];
+ if (fd_extra.eset)
+ *fd_extra.eset = fds[2];
+ fd_extra.nready = nready;
+ fd_extra.submitted = 0;
+ } else {
+ struct timeval t;
+
+ fd_extra.nready = 0;
+ if (fd_extra.timeout) {
+ gettimeofday(&t, NULL);
+ if (timeval_ge(&t, &fd_extra.submit_time))
+ fd_extra.submitted = 0;
}
}
+
+ pthread_cond_signal(&fd_extra.cond);
+ pthread_mutex_unlock(&fd_extra.mutex);
+ }
+
+        /* no portals event found; go back and loop if the time
+         * has not expired */
+        if (!nexec) {
+                if (timeout_pointer == NULL || now() < until)
+ goto again;
+ }
+}
+
+#else /* !ENABLE_SELECT_DISPATCH */
+
+/* Function: select_timer_block
+ * Arguments: until: an absolute time when the select should return
+ *
+ * This function dispatches the various file descriptors' handler
+ * functions, if the kernel indicates there is io available.
+ */
+void select_timer_block(when until)
+{
+ fd_set fds[3];
+ struct timeval timeout;
+ struct timeval *timeout_pointer;
+ int max, nready;
+
+again:
+ if (until) {
+ when interval;
+ interval = until - now();
+ timeout.tv_sec = (interval >> 32);
+ timeout.tv_usec = ((interval << 32) / 1000000) >> 32;
+ timeout_pointer = &timeout;
+ } else
+ timeout_pointer = NULL;
+
+ max = prepare_fd_sets(&fds[0], &fds[1], &fds[2]);
+
+ nready = select(max, &fds[0], &fds[1], &fds[2], timeout_pointer);
+ if (nready > 0)
+ execute_callbacks(&fds[0], &fds[1], &fds[2]);
}
+#endif /* ENABLE_SELECT_DISPATCH */
/* Function: init_unix_timer()
* is called to initialize the library
{
connection conn;
struct sockaddr_in addr;
+ struct sockaddr_in locaddr;
unsigned int id[2];
struct timeval tv;
__u64 incarnation;
+ int fd;
+ int option;
+ int rc;
+ int rport;
+ ptl_nid_t peernid = PTL_NID_ANY;
+
port = tcpnal_acceptor_port;
id[0] = ip;
pthread_mutex_lock(&m->conn_lock);
conn = hash_table_find(m->connections, id);
- if (!conn) {
- int fd;
- int option;
- ptl_nid_t peernid = PTL_NID_ANY;
-
- bzero((char *) &addr, sizeof(addr));
- addr.sin_family = AF_INET;
- addr.sin_addr.s_addr = htonl(ip);
- addr.sin_port = htons(port);
-
- if ((fd = socket(AF_INET, SOCK_STREAM, 0)) < 0) {
- perror("tcpnal socket failed");
- exit(-1);
- }
- if (connect(fd, (struct sockaddr *)&addr,
- sizeof(struct sockaddr_in))) {
- perror("tcpnal connect");
- return(0);
- }
+ if (conn)
+ goto out;
+ memset(&addr, 0, sizeof(addr));
+ addr.sin_family = AF_INET;
+ addr.sin_addr.s_addr = htonl(ip);
+ addr.sin_port = htons(port);
+
+ memset(&locaddr, 0, sizeof(locaddr));
+ locaddr.sin_family = AF_INET;
+ locaddr.sin_addr.s_addr = INADDR_ANY;
+
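+        /* walk down the privileged port range looking for a port we can
+         * bind; EADDRINUSE just means the port is taken, so try the
+         * next one down */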
+ for (rport = IPPORT_RESERVED - 1; rport > IPPORT_RESERVED / 2; --rport) {
+ fd = socket(AF_INET, SOCK_STREAM, 0);
+ if (fd < 0) {
+ perror("tcpnal socket failed");
+ goto out;
+ }
+
+ option = 1;
+ rc = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR,
+ &option, sizeof(option));
+ if (rc != 0) {
+ perror ("Can't set SO_REUSEADDR for socket");
+ close(fd);
+ goto out;
+ }
+
+ locaddr.sin_port = htons(rport);
+ rc = bind(fd, (struct sockaddr *)&locaddr, sizeof(locaddr));
+ if (rc == 0 || errno == EACCES) {
+ rc = connect(fd, (struct sockaddr *)&addr,
+ sizeof(struct sockaddr_in));
+ if (rc == 0) {
+ break;
+ } else if (errno != EADDRINUSE && errno != EADDRNOTAVAIL) {
+ perror("Error connecting to remote host");
+ close(fd);
+ goto out;
+ }
+ } else if (errno != EADDRINUSE) {
+ perror("Error binding to privileged port");
+ close(fd);
+ goto out;
+ }
+ close(fd);
+ }
+
+ if (rport == IPPORT_RESERVED / 2) {
+ fprintf(stderr, "Out of ports trying to bind to a reserved port\n");
+ goto out;
+ }
+
#if 1
- option = 1;
- setsockopt(fd, SOL_TCP, TCP_NODELAY, &option, sizeof(option));
- option = 1<<20;
- setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &option, sizeof(option));
- option = 1<<20;
- setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &option, sizeof(option));
+ option = 1;
+ setsockopt(fd, SOL_TCP, TCP_NODELAY, &option, sizeof(option));
+ option = 1<<20;
+ setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &option, sizeof(option));
+ option = 1<<20;
+ setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &option, sizeof(option));
#endif
- gettimeofday(&tv, NULL);
- incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
+ gettimeofday(&tv, NULL);
+ incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
- /* say hello */
- if (tcpnal_hello(fd, &peernid, SOCKNAL_CONN_ANY, incarnation))
+ /* say hello */
+ if (tcpnal_hello(fd, &peernid, SOCKNAL_CONN_ANY, incarnation))
exit(-1);
+
+ conn = allocate_connection(m, ip, port, fd);
+
+ /* let nal thread know this event right away */
+ if (conn)
+ procbridge_wakeup_nal(pb);
- conn = allocate_connection(m, ip, port, fd);
-
- /* let nal thread know this event right away */
- if (conn)
- procbridge_wakeup_nal(pb);
- }
-
+out:
pthread_mutex_unlock(&m->conn_lock);
return (conn);
}
void init_unix_timer(void);
void select_timer_block(when until);
when now(void);
+
+/*
+ * hacking for CFS internal MPI testing
+ */
+#if !CRAY_PORTALS
+#define ENABLE_SELECT_DISPATCH
+#endif
ptl_nid_t tcpnal_mynid;
+#ifdef ENABLE_SELECT_DISPATCH
+procbridge __global_procbridge = NULL;
+#endif
+
/* Function: procbridge_startup
*
* Arguments: pid: requested process id (port offset)
return PTL_FAIL;
}
+#ifdef ENABLE_SELECT_DISPATCH
+ __global_procbridge = p;
+#endif
+
/* create nal thread */
if (pthread_create(&p->t, NULL, nal_thread, &args)) {
perror("nal_init: pthread_create");
#include <sys/time.h>
#include <sys/types.h>
#include <stdlib.h>
+#include <syscall.h>
+#include <pthread.h>
+#include <errno.h>
#include <pqtimer.h>
#include <dispatch.h>
+#include <procbridge.h>
static struct timeval beginning_of_epoch;
i->disabled=1;
}
-static void set_flag(io_handler n,fd_set *fds)
+static void set_flag(io_handler n,fd_set *r, fd_set *w, fd_set *e)
{
- if (n->type & READ_HANDLER) FD_SET(n->fd, &fds[0]);
- if (n->type & WRITE_HANDLER) FD_SET(n->fd,&fds[1]);
- if (n->type & EXCEPTION_HANDLER) FD_SET(n->fd, &fds[2]);
+ if (n->type & READ_HANDLER) FD_SET(n->fd, r);
+ if (n->type & WRITE_HANDLER) FD_SET(n->fd, w);
+ if (n->type & EXCEPTION_HANDLER) FD_SET(n->fd, e);
}
-
-/* Function: select_timer_block
- * Arguments: until: an absolute time when the select should return
- *
- * This function dispatches the various file descriptors' handler
- * functions, if the kernel indicates there is io available.
- */
-void select_timer_block(when until)
+static int prepare_fd_sets(fd_set *r, fd_set *w, fd_set *e)
{
- fd_set fds[3];
- struct timeval timeout;
- struct timeval *timeout_pointer;
- int result;
io_handler j;
io_handler *k;
+ int max = 0;
- /* TODO: loop until the entire interval is expired*/
- if (until){
- when interval=until-now();
- timeout.tv_sec=(interval>>32);
- timeout.tv_usec=((interval<<32)/1000000)>>32;
- timeout_pointer=&timeout;
- } else timeout_pointer=0;
-
- FD_ZERO(&fds[0]);
- FD_ZERO(&fds[1]);
- FD_ZERO(&fds[2]);
+ FD_ZERO(r);
+ FD_ZERO(w);
+ FD_ZERO(e);
for (k=&io_handlers;*k;){
if ((*k)->disabled){
j=*k;
free(j);
}
if (*k) {
- set_flag(*k,fds);
+ set_flag(*k,r,w,e);
+ if ((*k)->fd > max)
+ max = (*k)->fd;
k=&(*k)->next;
}
}
+ return max + 1;
+}
+
+static int execute_callbacks(fd_set *r, fd_set *w, fd_set *e)
+{
+ io_handler j;
+ int n = 0, t;
+
+ for (j = io_handlers; j; j = j->next) {
+ if (j->disabled)
+ continue;
+
+ t = 0;
+ if (FD_ISSET(j->fd, r) && (j->type & READ_HANDLER)) {
+ FD_CLR(j->fd, r);
+ t++;
+ }
+ if (FD_ISSET(j->fd, w) && (j->type & WRITE_HANDLER)) {
+ FD_CLR(j->fd, w);
+ t++;
+ }
+ if (FD_ISSET(j->fd, e) && (j->type & EXCEPTION_HANDLER)) {
+ FD_CLR(j->fd, e);
+ t++;
+ }
+ if (t == 0)
+ continue;
+
+ if (!(*j->function)(j->argument))
+ j->disabled = 1;
+
+ n += t;
+ }
+
+ return n;
+}
- result=select(FD_SETSIZE, &fds[0], &fds[1], &fds[2], timeout_pointer);
+#ifdef ENABLE_SELECT_DISPATCH
- if (result > 0)
- for (j=io_handlers;j;j=j->next){
- if (!(j->disabled) &&
- ((FD_ISSET(j->fd, &fds[0]) && (j->type & READ_HANDLER)) ||
- (FD_ISSET(j->fd, &fds[1]) && (j->type & WRITE_HANDLER)) ||
- (FD_ISSET(j->fd, &fds[2]) && (j->type & EXCEPTION_HANDLER)))){
- if (!(*j->function)(j->argument))
- j->disabled=1;
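+/*
+ * state shared between the intercepted select() below and
+ * select_timer_block(): the application thread fills in its fd_sets and
+ * timeout, sets 'submitted' and sleeps on 'cond'; the dispatch thread
+ * merges those sets into its own select(), copies back any ready fds
+ * and signals 'cond'.
+ */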
+static struct {
+ pthread_mutex_t mutex;
+ pthread_cond_t cond;
+ int submitted;
+ int nready;
+ int maxfd;
+ fd_set *rset;
+ fd_set *wset;
+ fd_set *eset;
+ struct timeval *timeout;
+ struct timeval submit_time;
+} fd_extra = {
+ PTHREAD_MUTEX_INITIALIZER,
+ PTHREAD_COND_INITIALIZER,
+ 0, 0, 0,
+ NULL, NULL, NULL, NULL,
+};
+
+extern int liblustre_wait_event(int timeout);
+extern procbridge __global_procbridge;
+
+/*
+ * this intercepts the select() syscall of user apps
+ * such as MPI libs.
+ */
+int select(int n, fd_set *rset, fd_set *wset, fd_set *eset,
+ struct timeval *timeout)
+{
+ LASSERT(fd_extra.submitted == 0);
+
+ fd_extra.nready = 0;
+ fd_extra.maxfd = n;
+ fd_extra.rset = rset;
+ fd_extra.wset = wset;
+ fd_extra.eset = eset;
+ fd_extra.timeout = timeout;
+
+ liblustre_wait_event(0);
+ pthread_mutex_lock(&fd_extra.mutex);
+ gettimeofday(&fd_extra.submit_time, NULL);
+ fd_extra.submitted = 1;
+ LASSERT(__global_procbridge);
+ procbridge_wakeup_nal(__global_procbridge);
+
+again:
+ if (fd_extra.submitted)
+ pthread_cond_wait(&fd_extra.cond, &fd_extra.mutex);
+ pthread_mutex_unlock(&fd_extra.mutex);
+
+ liblustre_wait_event(0);
+
+ pthread_mutex_lock(&fd_extra.mutex);
+ if (fd_extra.submitted)
+ goto again;
+ pthread_mutex_unlock(&fd_extra.mutex);
+
+ LASSERT(fd_extra.nready >= 0);
+ LASSERT(fd_extra.submitted == 0);
+ return fd_extra.nready;
+}
+
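+/*
+ * OR the application's saved fd_sets into the dispatcher's own sets;
+ * the LASSERTs verify that the two requests never watch the same fd
+ * bits.
+ */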
+static int merge_fds(int max, fd_set *rset, fd_set *wset, fd_set *eset)
+{
+ int i;
+
+ LASSERT(rset);
+ LASSERT(wset);
+ LASSERT(eset);
+
+ for (i = 0; i < __FD_SETSIZE/__NFDBITS; i++) {
+ LASSERT(!fd_extra.rset ||
+ !(__FDS_BITS(rset)[i] & __FDS_BITS(fd_extra.rset)[i]));
+ LASSERT(!fd_extra.wset ||
+ !(__FDS_BITS(wset)[i] & __FDS_BITS(fd_extra.wset)[i]));
+ LASSERT(!fd_extra.eset ||
+ !(__FDS_BITS(eset)[i] & __FDS_BITS(fd_extra.eset)[i]));
+
+ if (fd_extra.rset && __FDS_BITS(fd_extra.rset)[i])
+ __FDS_BITS(rset)[i] |= __FDS_BITS(fd_extra.rset)[i];
+ if (fd_extra.wset && __FDS_BITS(fd_extra.wset)[i])
+ __FDS_BITS(wset)[i] |= __FDS_BITS(fd_extra.wset)[i];
+ if (fd_extra.eset && __FDS_BITS(fd_extra.eset)[i])
+ __FDS_BITS(eset)[i] |= __FDS_BITS(fd_extra.eset)[i];
+ }
+
+ return (fd_extra.maxfd > max ? fd_extra.maxfd : max);
+}
+
+static inline
+int timeval_ge(struct timeval *tv1, struct timeval *tv2)
+{
+ LASSERT(tv1 && tv2);
+ return ((tv1->tv_sec - tv2->tv_sec) * 1000000 +
+ (tv1->tv_usec - tv2->tv_usec) >= 0);
+}
+
+/*
+ * choose the shorter of the two timeouts
+ */
+static struct timeval *choose_timeout(struct timeval *tv1,
+ struct timeval *tv2)
+{
+ if (!tv1)
+ return tv2;
+ else if (!tv2)
+ return tv1;
+
+ if (timeval_ge(tv1, tv2))
+ return tv2;
+ else
+ return tv1;
+}
+
+/* Function: select_timer_block
+ * Arguments: until: an absolute time when the select should return
+ *
+ * This function dispatches the various file descriptors' handler
+ * functions, if the kernel indicates there is io available.
+ */
+void select_timer_block(when until)
+{
+ fd_set fds[3];
+ struct timeval timeout;
+ struct timeval *timeout_pointer, *select_timeout;
+ int max, nready, nexec;
+ int fd_handling;
+
+again:
+ if (until) {
+ when interval;
+
+ interval = until - now();
+ timeout.tv_sec = (interval >> 32);
+ timeout.tv_usec = ((interval << 32) / 1000000) >> 32;
+ timeout_pointer = &timeout;
+ } else
+ timeout_pointer = NULL;
+
+ fd_handling = 0;
+ max = prepare_fd_sets(&fds[0], &fds[1], &fds[2]);
+ select_timeout = timeout_pointer;
+
+ pthread_mutex_lock(&fd_extra.mutex);
+ fd_handling = fd_extra.submitted;
+ pthread_mutex_unlock(&fd_extra.mutex);
+ if (fd_handling) {
+ max = merge_fds(max, &fds[0], &fds[1], &fds[2]);
+ select_timeout = choose_timeout(timeout_pointer, fd_extra.timeout);
+ }
+
+ /* XXX only compile for linux */
+#if __WORDSIZE == 64
+ nready = syscall(SYS_select, max, &fds[0], &fds[1], &fds[2],
+ select_timeout);
+#else
+ nready = syscall(SYS__newselect, max, &fds[0], &fds[1], &fds[2],
+ select_timeout);
+#endif
+ if (nready < 0) {
+                CERROR("select returned err %d, errno %d\n", nready, errno);
+ return;
+ }
+
+ if (nready) {
+ nexec = execute_callbacks(&fds[0], &fds[1], &fds[2]);
+ nready -= nexec;
+ } else
+ nexec = 0;
+
+        /* even if both nready and nexec are 0, we still need to try to
+         * wake up the upper thread since it may have timed out
+         */
+ if (fd_handling) {
+ LASSERT(nready >= 0);
+
+ pthread_mutex_lock(&fd_extra.mutex);
+ if (nready) {
+ if (fd_extra.rset)
+ *fd_extra.rset = fds[0];
+ if (fd_extra.wset)
+ *fd_extra.wset = fds[1];
+ if (fd_extra.eset)
+ *fd_extra.eset = fds[2];
+ fd_extra.nready = nready;
+ fd_extra.submitted = 0;
+ } else {
+ struct timeval t;
+
+ fd_extra.nready = 0;
+ if (fd_extra.timeout) {
+ gettimeofday(&t, NULL);
+ if (timeval_ge(&t, &fd_extra.submit_time))
+ fd_extra.submitted = 0;
}
}
+
+ pthread_cond_signal(&fd_extra.cond);
+ pthread_mutex_unlock(&fd_extra.mutex);
+ }
+
+        /* no portals event found; go back and loop if the time
+         * has not expired */
+        if (!nexec) {
+                if (timeout_pointer == NULL || now() < until)
+ goto again;
+ }
+}
+
+#else /* !ENABLE_SELECT_DISPATCH */
+
+/* Function: select_timer_block
+ * Arguments: until: an absolute time when the select should return
+ *
+ * This function dispatches the various file descriptors' handler
+ * functions, if the kernel indicates there is io available.
+ */
+void select_timer_block(when until)
+{
+ fd_set fds[3];
+ struct timeval timeout;
+ struct timeval *timeout_pointer;
+ int max, nready;
+
+again:
+ if (until) {
+ when interval;
+ interval = until - now();
+ timeout.tv_sec = (interval >> 32);
+ timeout.tv_usec = ((interval << 32) / 1000000) >> 32;
+ timeout_pointer = &timeout;
+ } else
+ timeout_pointer = NULL;
+
+ max = prepare_fd_sets(&fds[0], &fds[1], &fds[2]);
+
+ nready = select(max, &fds[0], &fds[1], &fds[2], timeout_pointer);
+ if (nready > 0)
+ execute_callbacks(&fds[0], &fds[1], &fds[2]);
}
+#endif /* ENABLE_SELECT_DISPATCH */
/* Function: init_unix_timer()
* is called to initialize the library
newly created junk */
return(PTL_NAL_FAILED);
}
- /* XXX cfs hack */
-// b->lib_nal->libnal_ni.ni_pid.pid=0;
b->lower=m;
return(PTL_OK);
}
newly created junk */
return(PTL_NAL_FAILED);
}
- /* XXX cfs hack */
-// b->lib_nal->libnal_ni.ni_pid.pid=0;
b->lower=m;
return(PTL_OK);
}
void
usage (char *myname)
{
- fprintf (stderr, "Usage: %s [-N nal_id] port\n", myname);
+ fprintf (stderr,
+ "Usage: %s [-N nal_id] [-p] [-l] port\n\n"
+ " -l\tKeep stdin/stdout open\n"
+ " -p\tAllow connections from non-privileged ports\n",
+ myname);
exit (1);
}
int c;
int noclose = 0;
int nal = SOCKNAL;
+ int rport;
+ int require_privports = 1;
- while ((c = getopt (argc, argv, "N:l")) != -1)
- switch (c)
- {
- case 'l':
- noclose = 1;
- break;
-
+ while ((c = getopt (argc, argv, "N:lp")) != -1) {
+ switch (c) {
case 'N':
if (sscanf(optarg, "%d", &nal) != 1 ||
nal < 0 || nal > NAL_MAX_NR)
usage(argv[0]);
break;
-
+ case 'l':
+ noclose = 1;
+ break;
+ case 'p':
+ require_privports = 0;
+ break;
default:
usage (argv[0]);
break;
}
+ }
if (optind >= argc)
usage (argv[0]);
exit(1);
}
- rc = daemon(1, noclose);
+ rc = daemon(0, noclose);
if (rc < 0) {
perror("daemon(): ");
exit(1);
struct portals_cfg pcfg;
#ifdef HAVE_LIBWRAP
struct request_info request;
- char addrstr[INET_ADDRSTRLEN];
#endif
+ char addrstr[INET_ADDRSTRLEN];
cfd = accept(fd, (struct sockaddr *)&clntaddr, &len);
if ( cfd < 0 ) {
continue;
}
#endif
+
+ if (require_privports && ntohs(clntaddr.sin_port) >= IPPORT_RESERVED) {
+ inet_ntop(AF_INET, &clntaddr.sin_addr,
+ addrstr, INET_ADDRSTRLEN);
+ syslog(LOG_ERR, "Closing non-privileged connection from %s:%d\n",
+ addrstr, ntohs(clntaddr.sin_port));
+ rc = close(cfd);
+ if (rc)
+                        perror ("close non-privileged client failed");
+ continue;
+ }
+
show_connection (cfd, clntaddr.sin_addr.s_addr);
PCFG_INIT(pcfg, NAL_CMD_REGISTER_PEER_FD);
#include <portals/list.h>
#include <stdio.h>
+#ifdef HAVE_NETDB_H
#include <netdb.h>
+#endif
#include <stdlib.h>
#include <string.h>
+#include "ioctl.h"
#include <fcntl.h>
#include <errno.h>
#include <unistd.h>
#include <sys/stat.h>
#include <sys/mman.h>
+#ifdef HAVE_LINUX_VERSION_H
#include <linux/version.h>
#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
#define BUG() /* workaround for module.h includes */
#include <linux/module.h>
#endif
+#endif /* !HAVE_LINUX_VERSION_H */
+
#include <sys/utsname.h>
#include <portals/api-support.h>
static char rawbuf[8192];
static char *buf = rawbuf;
static int max = 8192;
-//static int g_pfd = -1;
+/*static int g_pfd = -1;*/
static int subsystem_mask = ~0;
static int debug_mask = ~0;
{"undefined", "mdc", "mds", "osc", "ost", "class", "log", "llite",
"rpc", "mgmt", "portals", "libcfs", "socknal", "qswnal", "pinger",
"filter", "ptlbd", "echo", "ldlm", "lov", "gmnal", "router", "cobd",
- "openibnal", "lmv", "smfs", "cmobd", NULL};
+ "ibnal", NULL};
static const char *portal_debug_masks[] =
{"trace", "inode", "super", "ext2", "malloc", "cache", "info", "ioctl",
"blocks", "net", "warning", "buffs", "other", "dentry", "portals",
fprintf(stderr, "usage: %s [file] [raw]\n", argv[0]);
return 0;
}
- sprintf(filename, "%s.%lu.%u", argc > 1 ? argv[1] : "/tmp/lustre-log",
- time(NULL), getpid());
- if (argc > 2)
+ if (argc > 2) {
raw = atoi(argv[2]);
+ } else if (argc > 1 && (argv[1][0] == '0' || argv[1][0] == '1')) {
+ raw = atoi(argv[1]);
+ argc--;
+        }
+
+        sprintf(filename, "%s.%lu.%u", argc > 1 ? argv[1] :
+                "/tmp/lustre-log", time(NULL), getpid());
+
unlink(filename);
fd = open("/proc/sys/portals/dump_kernel", O_WRONLY);
if (fd < 0) {
+ if (errno == ENOENT) /* no dump file created */
+ return 0;
+
fprintf(stderr, "open(dump_kernel) failed: %s\n",
strerror(errno));
return 1;
int jt_dbg_debug_daemon(int argc, char **argv)
{
int rc, fd;
-
+
if (argc <= 1) {
fprintf(stderr, debug_daemon_usage);
return 0;
}
-
+
fd = open("/proc/sys/portals/daemon_file", O_WRONLY);
if (fd < 0) {
fprintf(stderr, "open(daemon_file) failed: %s\n",
strerror(errno));
return 1;
}
-
+
if (strcasecmp(argv[1], "start") == 0) {
if (argc != 3) {
fprintf(stderr, debug_daemon_usage);
return 1;
}
-
+
rc = write(fd, argv[2], strlen(argv[2]));
if (rc != strlen(argv[2])) {
fprintf(stderr, "write(%s) failed: %s\n", argv[2],
fprintf(stderr, debug_daemon_usage);
return 1;
}
-
+
close(fd);
return 0;
}
{"obdfilter", "lustre/obdfilter"},
{"extN", "lustre/extN"},
{"lov", "lustre/lov"},
- {"lmv", "lustre/lmv"},
{"fsfilt_ext3", "lustre/lvfs"},
{"fsfilt_extN", "lustre/lvfs"},
{"fsfilt_reiserfs", "lustre/lvfs"},
{"ptlbd", "lustre/ptlbd"},
{"mgmt_svc", "lustre/mgmt"},
{"mgmt_cli", "lustre/mgmt"},
- {"cobd", "lustre/cobd"},
- {"cmobd", "lustre/cmobd"},
+ {"conf_obd", "lustre/obdclass"},
{NULL, NULL}
};
static int jt_dbg_modules_2_4(int argc, char **argv)
{
+#ifdef HAVE_LINUX_VERSION_H
#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
struct mod_paths *mp;
char *path = "..";
}
return 0;
-#else /* Headers are 2.6-only */
+#endif /* Headers are 2.6-only */
+#endif /* !HAVE_LINUX_VERSION_H */
return -EINVAL;
-#endif
}
static int jt_dbg_modules_2_5(int argc, char **argv)
#include <stdio.h>
#include <sys/types.h>
+#ifdef HAVE_NETDB_H
#include <netdb.h>
+#endif
#include <sys/socket.h>
+#ifdef HAVE_NETINET_TCP_H
#include <netinet/tcp.h>
-#include <netdb.h>
+#endif
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
+#include "ioctl.h"
#include <sys/ioctl.h>
#include <errno.h>
#include <unistd.h>
static unsigned int g_nal = 0;
-static int g_socket_txmem = 0;
-static int g_socket_rxmem = 0;
-static int g_socket_nonagle = 1;
-
typedef struct
{
char *name;
{"elan", QSWNAL},
{"gm", GMNAL},
{"openib", OPENIBNAL},
+ {"iib", IIBNAL},
{NULL, -1}
};
return ((e == NULL) ? "???" : e->name);
}
+#ifdef HAVE_GETHOSTBYNAME
static struct hostent *
ptl_gethostbyname(char * hname) {
struct hostent *he;
}
return he;
}
+#endif
int
ptl_parse_port (int *port, char *str)
int
ptl_parse_ipaddr (__u32 *ipaddrp, char *str)
{
+#ifdef HAVE_GETHOSTBYNAME
struct hostent *he;
+#endif
if (!strcmp (str, "_all_"))
{
if (ptl_parse_ipquad(ipaddrp, str) == 0)
return (0);
-
+
+#ifdef HAVE_GETHOSTBYNAME
if ((('a' <= str[0] && str[0] <= 'z') ||
('A' <= str[0] && str[0] <= 'Z')) &&
(he = ptl_gethostbyname (str)) != NULL)
*ipaddrp = ntohl(addr); /* HOST byte order */
return (0);
}
+#endif
return (-1);
}
char *
ptl_ipaddr_2_str (__u32 ipaddr, char *str, int lookup)
{
+#ifdef HAVE_GETHOSTBYNAME
__u32 net_ip;
struct hostent *he;
return (str);
}
}
-
+#endif
+
sprintf (str, "%d.%d.%d.%d",
(ipaddr >> 24) & 0xff, (ipaddr >> 16) & 0xff,
(ipaddr >> 8) & 0xff, ipaddr & 0xff);
ptl_nid2str (char *buffer, ptl_nid_t nid)
{
__u64 nid64 = ptl_nid2u64(nid);
+#ifdef HAVE_GETHOSTBYNAME
struct hostent *he = 0;
/* Don't try to resolve NIDs that are e.g. Elan host IDs. Assume
if (he != NULL)
sprintf(buffer, "%#x:%s", (int)(nid64 >> 32), he->h_name);
else
+#endif /* HAVE_GETHOSTBYNAME */
sprintf(buffer, LPX64, nid64);
return (buffer);
return (-1);
}
-
int
jt_ptl_print_interfaces (int argc, char **argv)
{
__u32 ipaddr;
int rc;
__u32 netmask = 0xffffff00;
+ int i;
+ int count;
+ char *end;
if (argc < 2 || argc > 3) {
fprintf (stderr, "usage: %s ipaddr [netmask]\n", argv[0]);
fprintf (stderr, "Can't parse ip: %s\n", argv[1]);
return -1;
}
-
- if (argc > 2 &&
- ptl_parse_ipquad(&netmask, argv[2]) != 0) {
- fprintf (stderr, "Can't parse netmask: %s\n", argv[2]);
- return -1;
+
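+        /* accept either a prefix length (1-31) or a dotted-quad netmask */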
+        if (argc > 2) {
+ count = strtol(argv[2], &end, 0);
+ if (count > 0 && count < 32 && *end == 0) {
+ netmask = 0;
+ for (i = count; i > 0; i--)
+ netmask = netmask|(1<<(32-i));
+ } else if (ptl_parse_ipquad(&netmask, argv[2]) != 0) {
+ fprintf (stderr, "Can't parse netmask: %s\n", argv[2]);
+ return -1;
+ }
}
-
+
PCFG_INIT(pcfg, NAL_CMD_ADD_INTERFACE);
pcfg.pcfg_id = ipaddr;
pcfg.pcfg_misc = netmask;
strerror (errno));
return -1;
}
-
+
return 0;
}
strerror (errno));
return -1;
}
-
+
return 0;
}
-int
+int
jt_ptl_print_peers (int argc, char **argv)
{
struct portals_cfg pcfg;
int index;
int rc;
- if (!g_nal_is_compatible (argv[0], SOCKNAL, OPENIBNAL, 0))
+ if (!g_nal_is_compatible (argv[0], SOCKNAL, OPENIBNAL, IIBNAL, 0))
return -1;
for (index = 0;;index++) {
int port = 0;
int rc;
- if (!g_nal_is_compatible (argv[0], SOCKNAL, OPENIBNAL, 0))
+ if (!g_nal_is_compatible (argv[0], SOCKNAL, OPENIBNAL, IIBNAL, 0))
return -1;
if (g_nal_is_compatible(NULL, SOCKNAL, 0)) {
return 0;
}
} else if (argc != 2) {
- fprintf (stderr, "usage(openib): %s nid\n", argv[0]);
+ fprintf (stderr, "usage(openib,iib): %s nid\n", argv[0]);
return 0;
}
int argidx;
int rc;
- if (!g_nal_is_compatible (argv[0], SOCKNAL, OPENIBNAL, 0))
+ if (!g_nal_is_compatible (argv[0], SOCKNAL, OPENIBNAL, IIBNAL, 0))
return -1;
if (g_nal_is_compatible(NULL, SOCKNAL, 0)) {
}
if (argc > argidx) {
- if (!strcmp (argv[3], "single_share")) {
+ if (!strcmp (argv[argidx], "single_share")) {
single_share = 1;
} else {
-                        fprintf (stderr, "Unrecognised arg %s'\n", argv[3]);
+                        fprintf (stderr, "Unrecognised arg %s\n", argv[argidx]);
int index;
int rc;
- if (!g_nal_is_compatible (argv[0], SOCKNAL, OPENIBNAL, 0))
+ if (!g_nal_is_compatible (argv[0], SOCKNAL, OPENIBNAL, IIBNAL, 0))
return -1;
for (index = 0;;index++) {
int jt_ptl_connect(int argc, char **argv)
{
+#ifndef HAVE_CONNECT
+ /* no connect() support */
+ return -1;
+#else /* HAVE_CONNECT */
struct portals_cfg pcfg;
struct sockaddr_in srvaddr;
+ struct sockaddr_in locaddr;
__u32 ipaddr;
char *flag;
int fd, rc;
int type = SOCKNAL_CONN_ANY;
- int port;
+ int port, rport;
+ int o;
if (argc < 3) {
fprintf(stderr, "usage: %s ip port [type]\n", argv[0]);
return (-1);
}
+ memset(&locaddr, 0, sizeof(locaddr));
+ locaddr.sin_family = AF_INET;
+ locaddr.sin_addr.s_addr = INADDR_ANY;
+
memset(&srvaddr, 0, sizeof(srvaddr));
srvaddr.sin_family = AF_INET;
srvaddr.sin_port = htons(port);
srvaddr.sin_addr.s_addr = htonl(ipaddr);
- fd = socket(PF_INET, SOCK_STREAM, 0);
- if ( fd < 0 ) {
- fprintf(stderr, "socket() failed: %s\n", strerror(errno));
- return -1;
+
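+        /* walk down the privileged port range looking for a port we can
+         * bind; EADDRINUSE just means the port is taken, so try the
+         * next one down */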
+ for (rport = IPPORT_RESERVED - 1; rport > IPPORT_RESERVED / 2; --rport) {
+ fd = socket(PF_INET, SOCK_STREAM, 0);
+ if ( fd < 0 ) {
+ fprintf(stderr, "socket() failed: %s\n", strerror(errno));
+ return -1;
+ }
+
+ o = 1;
+ rc = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR,
+ &o, sizeof(o));
+
+ locaddr.sin_port = htons(rport);
+ rc = bind(fd, (struct sockaddr *)&locaddr, sizeof(locaddr));
+ if (rc == 0 || errno == EACCES) {
+ rc = connect(fd, (struct sockaddr *)&srvaddr, sizeof(srvaddr));
+ if (rc == 0) {
+ break;
+ } else if (errno != EADDRINUSE) {
+ fprintf(stderr, "Error connecting to host: %s\n", strerror(errno));
+ close(fd);
+ return -1;
+ }
+ } else if (errno != EADDRINUSE) {
+                        fprintf(stderr, "Error binding to port %d: %d: %s\n", rport, errno, strerror(errno));
+ close(fd);
+ return -1;
+ }
}
- rc = connect(fd, (struct sockaddr *)&srvaddr, sizeof(srvaddr));
- if ( rc == -1 ) {
- fprintf(stderr, "connect() failed: %s\n", strerror(errno));
+ if (rport == IPPORT_RESERVED / 2) {
+ fprintf(stderr,
+ "Warning: all privileged ports are in use.\n");
return -1;
}
fprintf(stderr, "close failed: %d\n", rc);
return 0;
+#endif /* HAVE_CONNECT */
}
int jt_ptl_disconnect(int argc, char **argv)
return 0;
}
- if (!g_nal_is_compatible (NULL, SOCKNAL, OPENIBNAL, 0))
+ if (!g_nal_is_compatible (NULL, SOCKNAL, OPENIBNAL, IIBNAL, 0))
return 0;
if (argc >= 2 &&
}
/* crappy overloads */
- if (data.ioc_nid != sizeof(lwt_event_t) ||
- data.ioc_nid2 != offsetof(lwt_event_t, lwte_where)) {
+ if (data.ioc_nid2 != sizeof(lwt_event_t) ||
+ data.ioc_nid3 != offsetof(lwt_event_t, lwte_where)) {
fprintf(stderr,"kernel/user LWT event mismatch %d(%d),%d(%d)\n",
- (int)data.ioc_nid, sizeof(lwt_event_t),
- (int)data.ioc_nid2,
+ (int)data.ioc_nid2, sizeof(lwt_event_t),
+ (int)data.ioc_nid3,
(int)offsetof(lwt_event_t, lwte_where));
return (-1);
}
static int
lwt_print(FILE *f, cycles_t t0, cycles_t tlast, double mhz, int cpu, lwt_event_t *e)
{
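+/* pick a hex format wide enough to print a pointer on this word size */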
+#ifndef __WORDSIZE
+# error "__WORDSIZE not defined"
+#elif __WORDSIZE == 32
+# define XFMT "%#010lx"
+#elif __WORDSIZE == 64
+# define XFMT "%#018lx"
+#else
+# error "Unexpected __WORDSIZE"
+#endif
char *where = lwt_get_string(e->lwte_where);
if (where == NULL)
return (-1);
- fprintf(f, "%#010lx %#010lx %#010lx %#010lx: %#010lx %1d %10.6f %10.2f %s\n",
+ fprintf(f, XFMT" "XFMT" "XFMT" "XFMT": "XFMT" %2d %10.6f %10.2f %s\n",
e->lwte_p1, e->lwte_p2, e->lwte_p3, e->lwte_p4,
(long)e->lwte_task, cpu, (e->lwte_when - t0) / (mhz * 1000000.0),
(t0 == e->lwte_when) ? 0.0 : (e->lwte_when - tlast) / mhz,
lwt_put_string(where);
return (0);
+#undef XFMT
}
double
portals/knals/autoMakefile
portals/knals/gmnal/Makefile
portals/knals/gmnal/autoMakefile
+portals/knals/iibnal/Makefile
+portals/knals/iibnal/autoMakefile
portals/knals/openibnal/Makefile
portals/knals/openibnal/autoMakefile
portals/knals/qswnal/Makefile
+ Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ */
+
-+#ident "$Id: kksymoops-2.4.24.vanilla.patch,v 1.9 2004/10/24 17:00:18 yury Exp $"
++#ident "$Id: kksymoops-2.4.24.vanilla.patch,v 1.10 2004/10/29 15:04:35 eeb Exp $"
+
+#ifndef MODUTILS_KALLSYMS_H
+#define MODUTILS_KALLSYMS_H 1
AM_CONDITIONAL(INKERNEL, test x$enable_inkernel = xyes)
# -------- are we building against an external portals? -------
-AC_MSG_CHECKING([if Cray portals should be used])
+AC_MSG_CHECKING([for Cray portals])
AC_ARG_WITH([cray-portals],
AC_HELP_STRING([--with-cray-portals=path],
[path to cray portals]),
[
if test "$with_cray_portals" != no; then
- if test -r $with_cray_portals/include/portals/api.h ; then
- CRAY_PORTALS_PATH=$with_cray_portals
- CRAY_PORTALS_INCLUDE="-I$with_cray_portals/include"
- AC_DEFINE(CRAY_PORTALS, 1, [Building with Cray Portals])
- else
- AC_MSG_ERROR([--with-cray-portals specified badly])
- fi
- fi
+ CRAY_PORTALS_PATH=$with_cray_portals
+ CRAY_PORTALS_INCLUDES="$with_cray_portals/include"
+ CRAY_PORTALS_LIBS="$with_cray_portals"
+ fi
],[with_cray_portals=no])
AC_SUBST(CRAY_PORTALS_PATH)
-AC_MSG_RESULT([$with_cray_portals])
+AC_MSG_RESULT([$CRAY_PORTALS_PATH])
+
+AC_MSG_CHECKING([for Cray portals includes])
+AC_ARG_WITH([cray-portals-includes],
+ AC_HELP_STRING([--with-cray-portals-includes=path],
+ [path to cray portals includes]),
+ [
+ if test "$with_cray_portals_includes" != no; then
+ CRAY_PORTALS_INCLUDES="$with_cray_portals_includes"
+ fi
+ ])
+AC_SUBST(CRAY_PORTALS_INCLUDES)
+AC_MSG_RESULT([$CRAY_PORTALS_INCLUDES])
+
+AC_MSG_CHECKING([for Cray portals libs])
+AC_ARG_WITH([cray-portals-libs],
+ AC_HELP_STRING([--with-cray-portals-libs=path],
+ [path to cray portals libs]),
+ [
+ if test "$with_cray_portals_libs" != no; then
+ CRAY_PORTALS_LIBS="$with_cray_portals_libs"
+ fi
+ ])
+AC_SUBST(CRAY_PORTALS_LIBS)
+AC_MSG_RESULT([$CRAY_PORTALS_LIBS])
+
+if test x$CRAY_PORTALS_INCLUDES != x ; then
+ if test ! -r $CRAY_PORTALS_INCLUDES/portals/api.h ; then
+ AC_MSG_ERROR([Cray portals headers were not found in $CRAY_PORTALS_INCLUDES. Please check the paths passed to --with-cray-portals or --with-cray-portals-includes.])
+ fi
+fi
+if test x$CRAY_PORTALS_LIBS != x ; then
+ if test ! -r $CRAY_PORTALS_LIBS/libportals.a ; then
+ AC_MSG_ERROR([Cray portals libraries were not found in $CRAY_PORTALS_LIBS. Please check the paths passed to --with-cray-portals or --with-cray-portals-libs.])
+ fi
+fi
+AC_MSG_CHECKING([whether to use Cray portals])
+if test x$CRAY_PORTALS_INCLUDES != x -a x$CRAY_PORTALS_LIBS != x ; then
+ with_cray_portals=yes
+ AC_DEFINE(CRAY_PORTALS, 1, [Building with Cray Portals])
+ CRAY_PORTALS_INCLUDES="-I$CRAY_PORTALS_INCLUDES"
+else
+ with_cray_portals=no
+fi
+AC_MSG_RESULT([$with_cray_portals])
AM_CONDITIONAL(CRAY_PORTALS, test x$with_cray_portals != xno)
+# ----------------------------------------
+# some tests for catamount-like systems
+# ----------------------------------------
+AC_ARG_ENABLE([sysio_init],
+ AC_HELP_STRING([--disable-sysio-init],
+ [call sysio init functions when initializing liblustre]),
+ [],[enable_sysio_init=yes])
+AC_MSG_CHECKING([whether to initialize libsysio])
+AC_MSG_RESULT([$enable_sysio_init])
+if test x$enable_sysio_init != xno ; then
+ AC_DEFINE([INIT_SYSIO], 1, [call sysio init functions])
+fi
+
+AC_ARG_ENABLE([urandom],
+ AC_HELP_STRING([--disable-urandom],
+ [disable use of /dev/urandom for liblustre]),
+ [],[enable_urandom=yes])
+AC_MSG_CHECKING([whether to use /dev/urandom for liblustre])
+AC_MSG_RESULT([$enable_urandom])
+if test x$enable_urandom != xno ; then
+ AC_DEFINE([LIBLUSTRE_USE_URANDOM], 1, [use /dev/urandom for random data])
+fi
+
+# -------- check for -lcap and -lpthread ----
+if test x$enable_liblustre = xyes ; then
+ AC_CHECK_LIB([cap], [cap_get_proc],
+ [
+ CAP_LIBS="-lcap"
+ AC_DEFINE([HAVE_LIBCAP], 1, [use libcap])
+ ],
+ [CAP_LIBS=""])
+ AC_SUBST(CAP_LIBS)
+ AC_CHECK_LIB([pthread], [pthread_create],
+ [
+ PTHREAD_LIBS="-lpthread"
+ AC_DEFINE([HAVE_LIBPTHREAD], 1, [use libpthread])
+ ],
+ [PTHREAD_LIBS=""])
+ AC_SUBST(PTHREAD_LIBS)
+fi
+
# -------- enable tests and utils? -------
if test x$enable_tests = xno ; then
AC_MSG_NOTICE([disabling tests])
# ------- Makeflags ------------------
-CPPFLAGS="$CRAY_PORTALS_INCLUDE $CRAY_PORTALS_COMMANDLINE -I\$(top_srcdir)/include -I\$(top_srcdir)/portals/include"
+CPPFLAGS="$CPPFLAGS $CRAY_PORTALS_INCLUDES -I\$(top_srcdir)/include -I\$(top_srcdir)/portals/include"
# liblustre are all the same
LLCPPFLAGS="-D__arch_lib__ -D_LARGEFILE64_SOURCE=1"
AC_DEFINE(CONFIG_LDISKFS_FS_SECURITY, 1, [enable fs security])
fi
-EXTRA_KCFLAGS="-g $CRAY_PORTALS_INCLUDE $CRAY_PORTALS_COMMANDLINE -I$PWD/portals/include -I$PWD/include"
+EXTRA_KCFLAGS="-g $CRAY_PORTALS_INCLUDES -I$PWD/portals/include -I$PWD/include"
# these are like AC_TRY_COMPILE, but try to build modules against the
# kernel, inside the kernel-tests directory
AC_SUBST(OPENIBCPPFLAGS)
AC_SUBST(OPENIBNAL)
+ #### Infinicon IB
+ AC_MSG_CHECKING([if Infinicon IB kernel headers are present])
+        # for now, the only infinicon ib build has headers in /usr/include/iba
+ IIBCPPFLAGS="-I/usr/include -DIN_TREE_BUILD"
+ EXTRA_KCFLAGS_save="$EXTRA_KCFLAGS"
+ EXTRA_KCFLAGS="$EXTRA_KCFLAGS $IIBCPPFLAGS"
+ LUSTRE_MODULE_TRY_COMPILE(
+ [
+ #include <linux/iba/ibt.h>
+ ],[
+ IBT_INTERFACE_UNION interfaces;
+ FSTATUS rc;
+
+ rc = IbtGetInterfaceByVersion(IBT_INTERFACE_VERSION_2,
+ &interfaces);
+
+ return rc == FSUCCESS ? 0 : 1;
+ ],[
+ AC_MSG_RESULT([yes])
+ IIBNAL="iibnal"
+ ],[
+ AC_MSG_RESULT([no])
+ IIBNAL=""
+ IIBCPPFLAGS=""
+ ])
+ EXTRA_KCFLAGS="$EXTRA_KCFLAGS_save"
+ AC_SUBST(IIBCPPFLAGS)
+ AC_SUBST(IIBNAL)
+
# ---------- Red Hat 2.4.18 has iobuf->dovary --------------
# But other kernels don't
AM_CONDITIONAL(BUILD_QSWNAL, test x$QSWNAL = "xqswnal")
AM_CONDITIONAL(BUILD_GMNAL, test x$GMNAL = "xgmnal")
AM_CONDITIONAL(BUILD_OPENIBNAL, test x$OPENIBNAL = "xopenibnal")
+AM_CONDITIONAL(BUILD_IIBNAL, test x$IIBNAL = "xiibnal")
+
+# portals/utils/portals.c
+AC_CHECK_HEADERS([netdb.h netinet/tcp.h asm/types.h])
+AC_CHECK_FUNCS([gethostbyname socket connect])
+
+# portals/utils/debug.c
+AC_CHECK_HEADERS([linux/version.h])
+
+# include/liblustre.h
+AC_CHECK_HEADERS([asm/page.h sys/user.h stdint.h])
+
+# liblustre/llite_lib.h
+AC_CHECK_HEADERS([xtio.h file.h])
+
+# liblustre/dir.c
+AC_CHECK_HEADERS([linux/types.h sys/types.h linux/unistd.h unistd.h])
+
+# liblustre/lutil.c
+AC_CHECK_HEADERS([netinet/in.h arpa/inet.h catamount/data.h])
+AC_CHECK_FUNCS([inet_ntoa])
CPPFLAGS="-include \$(top_builddir)/include/config.h $CPPFLAGS"
EXTRA_KCFLAGS="-include $PWD/include/config.h $EXTRA_KCFLAGS"
AC_SUBST(EXTRA_KCFLAGS)
-#echo "KCPPFLAGS: $KCPPFLAGS"
-#echo "KCFLAGS: $KCFLAGS"
-#echo "LLCPPFLAGS: $LLCPPFLAGS"
-#echo "LLCFLAGS: $LLCFLAGS"
-#echo "MOD_LINK: $MOD_LINK"
-#echo "CFLAGS: $CFLAGS"
-#echo "CPPFLAGS: $CPPFLAGS"
+echo "CPPFLAGS: $CPPFLAGS"
+echo "LLCPPFLAGS: $LLCPPFLAGS"
+echo "CFLAGS: $CFLAGS"
+echo "EXTRA_KCFLAGS: $EXTRA_KCFLAGS"
+echo "LLCFLAGS: $LLCFLAGS"
"gcc version 2.96 20000731 (Mandrake Linux 8.1 2.96-0.62mdk)")
bad_cc
;;
+ # unpatched 'gcc' on rh9. miscompiles a
+ # struct = (type) { .member = value, };
+        # assignment in the iibnal where the struct is a mix
+ # of u64 and u32 bit-fields.
+ "gcc version 3.2.2 20030222 (Red Hat Linux 3.2.2-5)")
+ bad_cc
+ ;;
*)
AC_MSG_RESULT([no known problems])
;;
LIBWRAP=""
fi
AC_SUBST(LIBWRAP)
+
+AC_SUBST(LIBS)
--- /dev/null
+Makefile
+Makefile.in
# include <unistd.h>
# include <time.h>
# include <limits.h>
-# include <asm/types.h>
# ifndef DEBUG_SUBSYSTEM
# define DEBUG_SUBSYSTEM S_UNDEFINED
# endif
printf("%02x:%06x (@%lu %s:%s,l. %d %d %lu): " format, \
(subsys), (mask), (long)time(0), file, fn, line, \
getpid() , stack, ## a);
+
+#undef CWARN
+#undef CERROR
+#define CWARN(format, a...) CDEBUG(D_WARNING, format, ## a)
+#define CERROR(format, a...) CDEBUG(D_ERROR, format, ## a)
#endif
/* support decl needed both by kernel and liblustre */
#define LWT_MEMORY (16<<20)
#if !KLWT_SUPPORT
+# if defined(__KERNEL__)
+# if !defined(BITS_PER_LONG)
+# error "BITS_PER_LONG not defined"
+# endif
+# elif !defined(__WORDSIZE)
+# error "__WORDSIZE not defined"
+# else
+# define BITS_PER_LONG __WORDSIZE
+# endif
+
/* kernel hasn't defined this? */
typedef struct {
long long lwte_when;
data = (struct portal_ioctl_data *)buf;
err = copy_from_user(buf, (void *)arg, sizeof(*hdr));
- if ( err ) {
- EXIT;
- return err;
- }
+ if (err)
+ RETURN(err);
if (hdr->ioc_version != PORTAL_IOCTL_VERSION) {
- CERROR ("PORTALS: version mismatch kernel vs application\n");
- return -EINVAL;
+ CERROR("PORTALS: version mismatch kernel vs application\n");
+ RETURN(-EINVAL);
}
if (hdr->ioc_len + buf >= end) {
- CERROR ("PORTALS: user buffer exceeds kernel buffer\n");
- return -EINVAL;
+ CERROR("PORTALS: user buffer exceeds kernel buffer\n");
+ RETURN(-EINVAL);
}
if (hdr->ioc_len < sizeof(struct portal_ioctl_data)) {
- CERROR ("PORTALS: user buffer too small for ioctl\n");
- return -EINVAL;
+ CERROR("PORTALS: user buffer too small for ioctl\n");
+ RETURN(-EINVAL);
}
err = copy_from_user(buf, (void *)arg, hdr->ioc_len);
- if ( err ) {
- EXIT;
- return err;
- }
+ if (err)
+ RETURN(err);
if (portal_ioctl_is_invalid(data)) {
- CERROR ("PORTALS: ioctl not correctly formatted\n");
- return -EINVAL;
+ CERROR("PORTALS: ioctl not correctly formatted\n");
+ RETURN(-EINVAL);
}
- if (data->ioc_inllen1) {
+ if (data->ioc_inllen1)
data->ioc_inlbuf1 = &data->ioc_bulk[0];
- }
- if (data->ioc_inllen2) {
+ if (data->ioc_inllen2)
data->ioc_inlbuf2 = &data->ioc_bulk[0] +
size_round(data->ioc_inllen1);
- }
- EXIT;
- return 0;
+ RETURN(0);
}
#endif
TCPNAL = 5,
ROUTER = 6,
OPENIBNAL = 7,
+ IIBNAL = 8,
NAL_ENUM_END_MARKER
};
-#define PTL_NALFMT_SIZE 30 /* %u:%u.%u.%u.%u,%u (10+4+4+4+3+4+1) */
+#define PTL_NALFMT_SIZE 32 /* %u:%u.%u.%u.%u,%u (10+4+4+4+3+5+1) */
#define NAL_MAX_NR (NAL_ENUM_END_MARKER - 1)
#ifndef _LIBCFS_H
#define _LIBCFS_H
+#ifdef HAVE_ASM_TYPES_H
#include <asm/types.h>
+#else
+#include "types.h"
+#endif
#ifdef __KERNEL__
# include <linux/time.h>
extern unsigned int portal_debug;
extern unsigned int portal_printk;
-#include <asm/types.h>
struct ptldebug_header {
__u32 ph_len;
__u32 ph_flags;
#define S_GMNAL 0x00080000
#define S_PTLROUTER 0x00100000
#define S_COBD 0x00200000
-#define S_OPENIBNAL 0x00400000
+#define S_IBNAL 0x00400000 /* All IB NALs */
#define S_SM 0x00800000
#define S_ASOBD 0x01000000
#define S_LMV 0x02000000
CDEBUG_STACK, format, ## a); \
} while (0)
-#define CWARN(format, a...) CDEBUG(D_WARNING, format, ## a)
-#define CERROR(format, a...) CDEBUG(D_ERROR, format, ## a)
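+/*
+ * rate-limited console messages: print immediately, then back off
+ * exponentially (capped at CDEBUG_MAX_LIMIT seconds) while messages
+ * keep arriving; while backed off, messages still reach the debug log
+ * via a mask that excludes the console levels.
+ */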
+#define CDEBUG_MAX_LIMIT 600
+#define CDEBUG_LIMIT(cdebug_mask, cdebug_format, a...) \
+do { \
+ static unsigned long cdebug_next; \
+ static int cdebug_count, cdebug_delay = 1; \
+ \
+ CHECK_STACK(CDEBUG_STACK); \
+ if (time_after(jiffies, cdebug_next)) { \
+ portals_debug_msg(DEBUG_SUBSYSTEM, cdebug_mask, __FILE__, \
+ __FUNCTION__, __LINE__, CDEBUG_STACK, \
+ cdebug_format, ## a); \
+ if (cdebug_count) { \
+ portals_debug_msg(DEBUG_SUBSYSTEM, cdebug_mask, \
+ __FILE__, __FUNCTION__, __LINE__, \
+ CDEBUG_STACK, cdebug_format, ## a); \
+ cdebug_count = 0; \
+ } \
+ if (time_after(jiffies, cdebug_next+(CDEBUG_MAX_LIMIT+10)*HZ))\
+ cdebug_delay = cdebug_delay > 8 ? cdebug_delay/8 : 1; \
+ else \
+ cdebug_delay = cdebug_delay*2 >= CDEBUG_MAX_LIMIT*HZ ?\
+ CDEBUG_MAX_LIMIT*HZ : cdebug_delay*2; \
+ cdebug_next = jiffies + cdebug_delay; \
+ } else { \
+ portals_debug_msg(DEBUG_SUBSYSTEM, \
+ portal_debug & ~(D_EMERG|D_ERROR|D_WARNING),\
+ __FILE__, __FUNCTION__, __LINE__, \
+ CDEBUG_STACK, cdebug_format, ## a); \
+ cdebug_count++; \
+ } \
+} while (0)
+
+#define CWARN(format, a...) CDEBUG_LIMIT(D_WARNING, format, ## a)
+#define CERROR(format, a...) CDEBUG_LIMIT(D_ERROR, format, ## a)
#define CEMERG(format, a...) CDEBUG(D_EMERG, format, ## a)
#define GOTO(label, rc) \
/* initial pid */
# if CRAY_PORTALS
/*
+ * 1) ptl_pid_t in cray portals is only 16 bits, not 32 bits, therefore this
+ * is too big.
*
- * 1) ptl_pid_t in cray portals is only 16 bits, not 32 bits, therefore this is too
- * big.
- *
- * 2) the implementation of ernal in cray portals further restricts the pid space
- * that may be used to 0 <= pid <= 255 (an 8 bit value). Returns an error at nal
- * init time for any pid outside this range. Other nals in cray portals don't have
- * this restriction.
+ * 2) the implementation of ernal in cray portals further restricts the pid
+ * space that may be used to 0 <= pid <= 255 (an 8 bit value). Returns
+ * an error at nal init time for any pid outside this range. Other nals
+ * in cray portals don't have this restriction.
* */
#define LUSTRE_PTL_PID 9
# else
call_usermodehelper(path, argv, envp, 1)
# define RECALC_SIGPENDING recalc_sigpending()
# define CURRENT_SECONDS get_seconds()
+# define smp_num_cpus NR_CPUS
+
#elif defined(CONFIG_RH_2_4_20) /* RH 2.4.x */
--- /dev/null
+Makefile
+Makefile.in
#ifndef _BUILD_CHECK_H
#define _BUILD_CHECK_H
-#ifdef CRAY_PORTALS
+#if CRAY_PORTALS
#error "an application got to me instead of cray's includes"
#endif
#define PORTALS_DEV_PATH "/dev/portals"
#define OBD_DEV_ID 1
#define OBD_DEV_PATH "/dev/obd"
-#define SMFS_DEV_ID 2
-#define SMFS_DEV_PATH "/dev/snapdev"
int ptl_name2nal(char *str);
int ptl_parse_ipaddr (__u32 *ipaddrp, char *str);
int ptl_initialize(int argc, char **argv);
int jt_ptl_network(int argc, char **argv);
-int jt_ptl_print_autoconnects (int argc, char **argv);
-int jt_ptl_add_autoconnect (int argc, char **argv);
-int jt_ptl_del_autoconnect (int argc, char **argv);
int jt_ptl_print_interfaces(int argc, char **argv);
int jt_ptl_add_interface(int argc, char **argv);
int jt_ptl_del_interface(int argc, char **argv);
int jt_ptl_add_uuid_old(int argc, char **argv); /* backwards compatibility */
int jt_ptl_close_uuid(int argc, char **argv);
int jt_ptl_del_uuid(int argc, char **argv);
-int jt_ptl_rxmem (int argc, char **argv);
-int jt_ptl_txmem (int argc, char **argv);
-int jt_ptl_nagle (int argc, char **argv);
int jt_ptl_add_route (int argc, char **argv);
int jt_ptl_del_route (int argc, char **argv);
int jt_ptl_notify_router (int argc, char **argv);
@BUILD_GMNAL_TRUE@subdir-m += gmnal
@BUILD_OPENIBNAL_TRUE@subdir-m += openibnal
+@BUILD_IIBNAL_TRUE@subdir-m += iibnal
@BUILD_QSWNAL_TRUE@subdir-m += qswnal
subdir-m += socknal
# This code is issued under the GNU General Public License.
# See the file COPYING in this distribution
-SUBDIRS = gmnal openibnal qswnal socknal
+SUBDIRS = gmnal iibnal openibnal qswnal socknal
--- /dev/null
+.deps
+Makefile
+.*.cmd
+autoMakefile.in
+autoMakefile
+*.ko
+*.mod.c
+.*.flags
+.tmp_versions
+.depend
--- /dev/null
+MODULES := kiibnal
+kiibnal-objs := iibnal.o iibnal_cb.o
+
+EXTRA_POST_CFLAGS := @IIBCPPFLAGS@
+
+@INCLUDE_RULES@
--- /dev/null
+# Copyright (C) 2001 Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+include $(src)/../../Kernelenv
+
+obj-y += kiibnal.o
+kiibnal-objs := iibnal.o iibnal_cb.o
+
--- /dev/null
+# Copyright (C) 2001 Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+if MODULES
+if !CRAY_PORTALS
+if BUILD_IIBNAL
+modulenet_DATA = kiibnal$(KMODEXT)
+endif
+endif
+endif
+
+MOSTLYCLEANFILES = *.o *.ko *.mod.c
+DIST_SOURCES = $(kiibnal-objs:%.o=%.c) iibnal.h
--- /dev/null
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2004 Cluster File Systems, Inc.
+ * Author: Eric Barton <eric@bartonsoftware.com>
+ *
+ * This file is part of Lustre, http://www.lustre.org.
+ *
+ * Lustre is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Lustre is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Lustre; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#include "iibnal.h"
+
+nal_t kibnal_api;
+ptl_handle_ni_t kibnal_ni;
+kib_tunables_t kibnal_tunables;
+
+kib_data_t kibnal_data = {
+ .kib_service_id = IBNAL_SERVICE_NUMBER,
+};
+
+#ifdef CONFIG_SYSCTL
+#define IBNAL_SYSCTL 202
+
+#define IBNAL_SYSCTL_TIMEOUT 1
+
+static ctl_table kibnal_ctl_table[] = {
+ {IBNAL_SYSCTL_TIMEOUT, "timeout",
+ &kibnal_tunables.kib_io_timeout, sizeof (int),
+ 0644, NULL, &proc_dointvec},
+ { 0 }
+};
+
+static ctl_table kibnal_top_ctl_table[] = {
+ {IBNAL_SYSCTL, "iibnal", NULL, 0, 0555, kibnal_ctl_table},
+ { 0 }
+};
+#endif
+
+#ifdef unused
+void
+print_service(IB_SERVICE_RECORD *service, char *tag, int rc)
+{
+ char name[32];
+
+ if (service == NULL)
+ {
+ CWARN("tag : %s\n"
+ "status : %d (NULL)\n", tag, rc);
+ return;
+ }
+ strncpy (name, service->ServiceName, sizeof(name)-1);
+ name[sizeof(name)-1] = 0;
+
+ CWARN("tag : %s\n"
+ "status : %d\n"
+ "service id: "LPX64"\n"
+ "name : %s\n"
+ "NID : "LPX64"\n", tag, rc,
+ service->RID.ServiceID, name,
+ *kibnal_service_nid_field(service));
+}
+#endif
+
+static void
+kibnal_service_setunset_done (void *arg, FABRIC_OPERATION_DATA *fod,
+ FSTATUS frc, uint32 madrc)
+{
+ *(FSTATUS *)arg = frc;
+ up (&kibnal_data.kib_nid_signal);
+}
+
+#if IBNAL_CHECK_ADVERT
+static void
+kibnal_service_query_done (void *arg, QUERY *qry,
+ QUERY_RESULT_VALUES *qry_result)
+{
+ FSTATUS frc = qry_result->Status;
+
+ if (frc != FSUCCESS &&
+ qry_result->ResultDataSize == 0)
+ frc = FERROR;
+
+ *(FSTATUS *)arg = frc;
+ up (&kibnal_data.kib_nid_signal);
+}
+
+static void
+kibnal_check_advert (void)
+{
+ QUERY *qry;
+ IB_SERVICE_RECORD *svc;
+ FSTATUS frc;
+ FSTATUS frc2;
+
+ PORTAL_ALLOC(qry, sizeof(*qry));
+ if (qry == NULL)
+ return;
+
+ memset (qry, 0, sizeof(*qry));
+ qry->InputType = InputTypeServiceRecord;
+ qry->OutputType = OutputTypeServiceRecord;
+ qry->InputValue.ServiceRecordValue.ComponentMask = KIBNAL_SERVICE_KEY_MASK;
+ svc = &qry->InputValue.ServiceRecordValue.ServiceRecord;
+ kibnal_set_service_keys(svc, kibnal_data.kib_nid);
+
+ frc = iibt_sd_query_port_fabric_information(kibnal_data.kib_sd,
+ kibnal_data.kib_port_guid,
+ qry,
+ kibnal_service_query_done,
+ NULL, &frc2);
+ if (frc != FSUCCESS && frc != FPENDING) {
+ CERROR ("Immediate error %d checking SM service\n", frc);
+ } else {
+ down (&kibnal_data.kib_nid_signal);
+ frc = frc2;
+
+                if (frc != FSUCCESS)
+                        CERROR ("Error %d checking SM service\n", frc);
+        }
+
+        PORTAL_FREE(qry, sizeof(*qry));
+}
+#endif
+
+static void fill_fod(FABRIC_OPERATION_DATA *fod, FABRIC_OPERATION_TYPE type)
+{
+ IB_SERVICE_RECORD *svc;
+
+ memset (fod, 0, sizeof(*fod));
+ fod->Type = type;
+
+ svc = &fod->Value.ServiceRecordValue.ServiceRecord;
+ svc->RID.ServiceID = kibnal_data.kib_service_id;
+ svc->RID.ServiceGID.Type.Global.InterfaceID = kibnal_data.kib_port_guid;
+ svc->RID.ServiceGID.Type.Global.SubnetPrefix = DEFAULT_SUBNET_PREFIX;
+ svc->RID.ServiceP_Key = kibnal_data.kib_port_pkey;
+ svc->ServiceLease = 0xffffffff;
+
+ kibnal_set_service_keys(svc, kibnal_data.kib_nid);
+}
+
+static int
+kibnal_advertise (void)
+{
+ FABRIC_OPERATION_DATA *fod;
+ IB_SERVICE_RECORD *svc;
+ FSTATUS frc;
+ FSTATUS frc2;
+
+ LASSERT (kibnal_data.kib_nid != PTL_NID_ANY);
+
+ PORTAL_ALLOC(fod, sizeof(*fod));
+ if (fod == NULL)
+ return (-ENOMEM);
+
+ fill_fod(fod, FabOpSetServiceRecord);
+ svc = &fod->Value.ServiceRecordValue.ServiceRecord;
+
+ CDEBUG(D_NET, "Advertising service id "LPX64" %s:"LPX64"\n",
+ svc->RID.ServiceID,
+ svc->ServiceName, *kibnal_service_nid_field(svc));
+
+ frc = iibt_sd_port_fabric_operation(kibnal_data.kib_sd,
+ kibnal_data.kib_port_guid,
+ fod, kibnal_service_setunset_done,
+ NULL, &frc2);
+
+ if (frc != FSUCCESS && frc != FPENDING) {
+ CERROR ("Immediate error %d advertising NID "LPX64"\n",
+ frc, kibnal_data.kib_nid);
+ goto out;
+ }
+
+ down (&kibnal_data.kib_nid_signal);
+
+ frc = frc2;
+ if (frc != FSUCCESS)
+                CERROR ("Error %d advertising NID "LPX64"\n",
+ frc, kibnal_data.kib_nid);
+out:
+ PORTAL_FREE(fod, sizeof(*fod));
+ return (frc == FSUCCESS) ? 0 : -EINVAL;
+}
+
+static void
+kibnal_unadvertise (int expect_success)
+{
+ FABRIC_OPERATION_DATA *fod;
+ IB_SERVICE_RECORD *svc;
+ FSTATUS frc;
+ FSTATUS frc2;
+
+ LASSERT (kibnal_data.kib_nid != PTL_NID_ANY);
+
+ PORTAL_ALLOC(fod, sizeof(*fod));
+ if (fod == NULL)
+ return;
+
+ fill_fod(fod, FabOpDeleteServiceRecord);
+ svc = &fod->Value.ServiceRecordValue.ServiceRecord;
+
+ CDEBUG(D_NET, "Unadvertising service %s:"LPX64"\n",
+ svc->ServiceName, *kibnal_service_nid_field(svc));
+
+ frc = iibt_sd_port_fabric_operation(kibnal_data.kib_sd,
+ kibnal_data.kib_port_guid,
+ fod, kibnal_service_setunset_done,
+ NULL, &frc2);
+
+ if (frc != FSUCCESS && frc != FPENDING) {
+ CERROR ("Immediate error %d unadvertising NID "LPX64"\n",
+ frc, kibnal_data.kib_nid);
+ goto out;
+ }
+
+ down (&kibnal_data.kib_nid_signal);
+
+ if ((frc2 == FSUCCESS) == !!expect_success)
+ goto out;
+
+ if (expect_success)
+ CERROR("Error %d unadvertising NID "LPX64"\n",
+ frc2, kibnal_data.kib_nid);
+ else
+ CWARN("Removed conflicting NID "LPX64"\n",
+ kibnal_data.kib_nid);
+ out:
+ PORTAL_FREE(fod, sizeof(*fod));
+}
+
+static int
+kibnal_set_mynid(ptl_nid_t nid)
+{
+ struct timeval tv;
+ lib_ni_t *ni = &kibnal_lib.libnal_ni;
+ int rc;
+ FSTATUS frc;
+
+ CDEBUG(D_IOCTL, "setting mynid to "LPX64" (old nid="LPX64")\n",
+ nid, ni->ni_pid.nid);
+
+ do_gettimeofday(&tv);
+
+ down (&kibnal_data.kib_nid_mutex);
+
+ if (nid == kibnal_data.kib_nid) {
+ /* no change of NID */
+ up (&kibnal_data.kib_nid_mutex);
+ return (0);
+ }
+
+ CDEBUG(D_NET, "NID "LPX64"("LPX64")\n",
+ kibnal_data.kib_nid, nid);
+
+ if (kibnal_data.kib_nid != PTL_NID_ANY) {
+
+ kibnal_unadvertise (1);
+
+ frc = iibt_cm_cancel(kibnal_data.kib_cep);
+ if (frc != FSUCCESS && frc != FPENDING)
+ CERROR ("Error %d stopping listener\n", frc);
+
+ frc = iibt_cm_destroy_cep(kibnal_data.kib_cep);
+ if (frc != FSUCCESS)
+ CERROR ("Error %d destroying CEP\n", frc);
+
+ kibnal_data.kib_cep = NULL;
+ }
+
+ kibnal_data.kib_nid = ni->ni_pid.nid = nid;
+ kibnal_data.kib_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
+
+ /* Delete all existing peers and their connections after new
+ * NID/incarnation set to ensure no old connections in our brave
+ * new world. */
+ kibnal_del_peer (PTL_NID_ANY, 0);
+
+ if (kibnal_data.kib_nid == PTL_NID_ANY) {
+ /* No new NID to install */
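+/* worked example: with power = 128MB (0x08000000), a total of
+ * 0x3ff00000 bytes (~1023MB) rounds up to 0x40000000 (1GB); exact
+ * multiples are returned unchanged */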
+ up (&kibnal_data.kib_nid_mutex);
+ return (0);
+ }
+
+ /* remove any previous advert (crashed node etc) */
+ kibnal_unadvertise(0);
+
+ kibnal_data.kib_cep = iibt_cm_create_cep(CM_RC_TYPE);
+ if (kibnal_data.kib_cep == NULL) {
+ CERROR ("Can't create CEP\n");
+ rc = -ENOMEM;
+ } else {
+ CM_LISTEN_INFO info;
+ memset (&info, 0, sizeof(info));
+ info.ListenAddr.EndPt.SID = kibnal_data.kib_service_id;
+
+ frc = iibt_cm_listen(kibnal_data.kib_cep, &info,
+ kibnal_listen_callback, NULL);
+ if (frc != FSUCCESS && frc != FPENDING) {
+ CERROR ("iibt_cm_listen error: %d\n", frc);
+ rc = -EINVAL;
+ } else {
+ rc = 0;
+ }
+ }
+
+ if (rc == 0) {
+ rc = kibnal_advertise();
+ if (rc == 0) {
+#if IBNAL_CHECK_ADVERT
+ kibnal_check_advert();
+#endif
+ up (&kibnal_data.kib_nid_mutex);
+ return (0);
+ }
+
+ iibt_cm_cancel (kibnal_data.kib_cep);
+ iibt_cm_destroy_cep (kibnal_data.kib_cep);
+ /* remove any peers that sprung up while I failed to
+ * advertise myself */
+ kibnal_del_peer (PTL_NID_ANY, 0);
+ }
+
+ kibnal_data.kib_nid = PTL_NID_ANY;
+ up (&kibnal_data.kib_nid_mutex);
+ return (rc);
+}
+
+kib_peer_t *
+kibnal_create_peer (ptl_nid_t nid)
+{
+ kib_peer_t *peer;
+
+ LASSERT (nid != PTL_NID_ANY);
+
+ PORTAL_ALLOC (peer, sizeof (*peer));
+ if (peer == NULL)
+ return (NULL);
+
+ memset(peer, 0, sizeof(*peer)); /* zero flags etc */
+
+ peer->ibp_nid = nid;
+ atomic_set (&peer->ibp_refcount, 1); /* 1 ref for caller */
+
+ INIT_LIST_HEAD (&peer->ibp_list); /* not in the peer table yet */
+ INIT_LIST_HEAD (&peer->ibp_conns);
+ INIT_LIST_HEAD (&peer->ibp_tx_queue);
+
+ peer->ibp_reconnect_time = jiffies;
+ peer->ibp_reconnect_interval = IBNAL_MIN_RECONNECT_INTERVAL;
+
+ atomic_inc (&kibnal_data.kib_npeers);
+ return (peer);
+}
+
+void
+kibnal_destroy_peer (kib_peer_t *peer)
+{
+
+ LASSERT (atomic_read (&peer->ibp_refcount) == 0);
+ LASSERT (peer->ibp_persistence == 0);
+ LASSERT (!kibnal_peer_active(peer));
+ LASSERT (peer->ibp_connecting == 0);
+ LASSERT (list_empty (&peer->ibp_conns));
+ LASSERT (list_empty (&peer->ibp_tx_queue));
+
+ PORTAL_FREE (peer, sizeof (*peer));
+
+ /* NB a peer's connections keep a reference on their peer until
+ * they are destroyed, so we can be assured that _all_ state to do
+ * with this peer has been cleaned up when its refcount drops to
+ * zero. */
+ atomic_dec (&kibnal_data.kib_npeers);
+}
+
+/* the caller is responsible for accounting for the additional reference
+ * that this creates */
+kib_peer_t *
+kibnal_find_peer_locked (ptl_nid_t nid)
+{
+ struct list_head *peer_list = kibnal_nid2peerlist (nid);
+ struct list_head *tmp;
+ kib_peer_t *peer;
+
+ list_for_each (tmp, peer_list) {
+
+ peer = list_entry (tmp, kib_peer_t, ibp_list);
+
+ LASSERT (peer->ibp_persistence != 0 || /* persistent peer */
+ peer->ibp_connecting != 0 || /* creating conns */
+ !list_empty (&peer->ibp_conns)); /* active conn */
+
+ if (peer->ibp_nid != nid)
+ continue;
+
+ CDEBUG(D_NET, "got peer [%p] -> "LPX64" (%d)\n",
+ peer, nid, atomic_read (&peer->ibp_refcount));
+ return (peer);
+ }
+ return (NULL);
+}
+
+kib_peer_t *
+kibnal_get_peer (ptl_nid_t nid)
+{
+ kib_peer_t *peer;
+
+ read_lock (&kibnal_data.kib_global_lock);
+ peer = kibnal_find_peer_locked (nid);
+ if (peer != NULL) /* +1 ref for caller? */
+ kib_peer_addref(peer);
+ read_unlock (&kibnal_data.kib_global_lock);
+
+ return (peer);
+}
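+
+/* illustrative usage sketch: every successful lookup must be balanced
+ * by the caller dropping the ref it was given, e.g.
+ *
+ *   kib_peer_t *peer = kibnal_get_peer(nid);
+ *   if (peer != NULL) {
+ *           ... examine peer ...
+ *           kib_peer_decref(peer);
+ *   }
+ */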
+
+void
+kibnal_unlink_peer_locked (kib_peer_t *peer)
+{
+ LASSERT (peer->ibp_persistence == 0);
+ LASSERT (list_empty(&peer->ibp_conns));
+
+ LASSERT (kibnal_peer_active(peer));
+ list_del_init (&peer->ibp_list);
+ /* lose peerlist's ref */
+ kib_peer_decref(peer);
+}
+
+static int
+kibnal_get_peer_info (int index, ptl_nid_t *nidp, int *persistencep)
+{
+ kib_peer_t *peer;
+ struct list_head *ptmp;
+ int i;
+
+ read_lock (&kibnal_data.kib_global_lock);
+
+ for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
+
+ list_for_each (ptmp, &kibnal_data.kib_peers[i]) {
+
+ peer = list_entry (ptmp, kib_peer_t, ibp_list);
+ LASSERT (peer->ibp_persistence != 0 ||
+ peer->ibp_connecting != 0 ||
+ !list_empty (&peer->ibp_conns));
+
+ if (index-- > 0)
+ continue;
+
+ *nidp = peer->ibp_nid;
+ *persistencep = peer->ibp_persistence;
+
+ read_unlock (&kibnal_data.kib_global_lock);
+ return (0);
+ }
+ }
+
+ read_unlock (&kibnal_data.kib_global_lock);
+ return (-ENOENT);
+}
+
+static int
+kibnal_add_persistent_peer (ptl_nid_t nid)
+{
+ unsigned long flags;
+ kib_peer_t *peer;
+ kib_peer_t *peer2;
+
+ if (nid == PTL_NID_ANY)
+ return (-EINVAL);
+
+ peer = kibnal_create_peer (nid);
+ if (peer == NULL)
+ return (-ENOMEM);
+
+ write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
+
+ peer2 = kibnal_find_peer_locked (nid);
+ if (peer2 != NULL) {
+ kib_peer_decref (peer);
+ peer = peer2;
+ } else {
+ /* peer table takes existing ref on peer */
+ list_add_tail (&peer->ibp_list,
+ kibnal_nid2peerlist (nid));
+ }
+
+ peer->ibp_persistence++;
+
+ write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
+ return (0);
+}
+
+static void
+kibnal_del_peer_locked (kib_peer_t *peer, int single_share)
+{
+ struct list_head *ctmp;
+ struct list_head *cnxt;
+ kib_conn_t *conn;
+
+ if (!single_share)
+ peer->ibp_persistence = 0;
+ else if (peer->ibp_persistence > 0)
+ peer->ibp_persistence--;
+
+ if (peer->ibp_persistence != 0)
+ return;
+
+ list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
+ conn = list_entry(ctmp, kib_conn_t, ibc_list);
+
+ kibnal_close_conn_locked (conn, 0);
+ }
+
+ /* NB peer unlinks itself when last conn is closed */
+}
+
+int
+kibnal_del_peer (ptl_nid_t nid, int single_share)
+{
+ unsigned long flags;
+ struct list_head *ptmp;
+ struct list_head *pnxt;
+ kib_peer_t *peer;
+ int lo;
+ int hi;
+ int i;
+ int rc = -ENOENT;
+
+ write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
+
+ if (nid != PTL_NID_ANY)
+ lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers;
+ else {
+ lo = 0;
+ hi = kibnal_data.kib_peer_hash_size - 1;
+ }
+
+ for (i = lo; i <= hi; i++) {
+ list_for_each_safe (ptmp, pnxt, &kibnal_data.kib_peers[i]) {
+ peer = list_entry (ptmp, kib_peer_t, ibp_list);
+ LASSERT (peer->ibp_persistence != 0 ||
+ peer->ibp_connecting != 0 ||
+ !list_empty (&peer->ibp_conns));
+
+ if (!(nid == PTL_NID_ANY || peer->ibp_nid == nid))
+ continue;
+
+ kibnal_del_peer_locked (peer, single_share);
+ rc = 0; /* matched something */
+
+ if (single_share)
+ goto out;
+ }
+ }
+ out:
+ write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
+
+ return (rc);
+}
+
+static kib_conn_t *
+kibnal_get_conn_by_idx (int index)
+{
+ kib_peer_t *peer;
+ struct list_head *ptmp;
+ kib_conn_t *conn;
+ struct list_head *ctmp;
+ int i;
+
+ read_lock (&kibnal_data.kib_global_lock);
+
+ for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
+ list_for_each (ptmp, &kibnal_data.kib_peers[i]) {
+
+ peer = list_entry (ptmp, kib_peer_t, ibp_list);
+ LASSERT (peer->ibp_persistence > 0 ||
+ peer->ibp_connecting != 0 ||
+ !list_empty (&peer->ibp_conns));
+
+ list_for_each (ctmp, &peer->ibp_conns) {
+ if (index-- > 0)
+ continue;
+
+ conn = list_entry (ctmp, kib_conn_t, ibc_list);
+ CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
+ conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
+ atomic_read (&conn->ibc_refcount));
+ atomic_inc (&conn->ibc_refcount);
+ read_unlock (&kibnal_data.kib_global_lock);
+ return (conn);
+ }
+ }
+ }
+
+ read_unlock (&kibnal_data.kib_global_lock);
+ return (NULL);
+}
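+
+/* the conn returned above carries a ref taken under the global lock;
+ * callers must drop it with kibnal_put_conn() when done, e.g. (sketch)
+ *
+ *   kib_conn_t *conn = kibnal_get_conn_by_idx(idx);
+ *   if (conn != NULL) {
+ *           ... inspect conn ...
+ *           kibnal_put_conn(conn);
+ *   }
+ */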
+
+kib_conn_t *
+kibnal_create_conn (void)
+{
+ kib_conn_t *conn;
+ int i;
+ __u64 vaddr = 0;
+ __u64 vaddr_base;
+ int page_offset;
+ int ipage;
+ int rc;
+ FSTATUS frc;
+ union {
+ IB_QP_ATTRIBUTES_CREATE qp_create;
+ IB_QP_ATTRIBUTES_MODIFY qp_attr;
+ } params;
+
+ PORTAL_ALLOC (conn, sizeof (*conn));
+ if (conn == NULL) {
+ CERROR ("Can't allocate connection\n");
+ return (NULL);
+ }
+
+ /* zero flags, NULL pointers etc... */
+ memset (conn, 0, sizeof (*conn));
+
+ INIT_LIST_HEAD (&conn->ibc_tx_queue);
+ INIT_LIST_HEAD (&conn->ibc_active_txs);
+ spin_lock_init (&conn->ibc_lock);
+
+ atomic_inc (&kibnal_data.kib_nconns);
+ /* well not really, but I call destroy() on failure, which decrements */
+
+ PORTAL_ALLOC (conn->ibc_rxs, IBNAL_RX_MSGS * sizeof (kib_rx_t));
+ if (conn->ibc_rxs == NULL)
+ goto failed;
+ memset (conn->ibc_rxs, 0, IBNAL_RX_MSGS * sizeof(kib_rx_t));
+
+ rc = kibnal_alloc_pages(&conn->ibc_rx_pages, IBNAL_RX_MSG_PAGES, 1);
+ if (rc != 0)
+ goto failed;
+
+ vaddr_base = vaddr = conn->ibc_rx_pages->ibp_vaddr;
+
+ for (i = ipage = page_offset = 0; i < IBNAL_RX_MSGS; i++) {
+ struct page *page = conn->ibc_rx_pages->ibp_pages[ipage];
+ kib_rx_t *rx = &conn->ibc_rxs[i];
+
+ rx->rx_conn = conn;
+ rx->rx_msg = (kib_msg_t *)(((char *)page_address(page)) +
+ page_offset);
+
+ if (kibnal_whole_mem())
+ rx->rx_vaddr = kibnal_page2phys(page) +
+ page_offset +
+ kibnal_data.kib_md.md_addr;
+ else
+ rx->rx_vaddr = vaddr;
+
+ vaddr += IBNAL_MSG_SIZE;
+ LASSERT (vaddr <= vaddr_base + IBNAL_RX_MSG_BYTES);
+
+ page_offset += IBNAL_MSG_SIZE;
+ LASSERT (page_offset <= PAGE_SIZE);
+
+ if (page_offset == PAGE_SIZE) {
+ page_offset = 0;
+ ipage++;
+ LASSERT (ipage <= IBNAL_RX_MSG_PAGES);
+ }
+ }
+
+ params.qp_create = (IB_QP_ATTRIBUTES_CREATE) {
+ .Type = QPTypeReliableConnected,
+ .SendQDepth = IBNAL_TX_MAX_SG *
+ IBNAL_MSG_QUEUE_SIZE,
+ .RecvQDepth = IBNAL_MSG_QUEUE_SIZE,
+ .SendDSListDepth = 1,
+ .RecvDSListDepth = 1,
+ .SendCQHandle = kibnal_data.kib_cq,
+ .RecvCQHandle = kibnal_data.kib_cq,
+ .PDHandle = kibnal_data.kib_pd,
+ .SendSignaledCompletions = TRUE,
+ };
+        frc = iibt_qp_create(kibnal_data.kib_hca, &params.qp_create, NULL,
+                             &conn->ibc_qp, &conn->ibc_qp_attrs);
+        if (frc != FSUCCESS) {
+                CERROR ("Failed to create queue pair: %d\n", frc);
+ goto failed;
+ }
+
+ /* Mark QP created */
+ conn->ibc_state = IBNAL_CONN_INIT_QP;
+
+ params.qp_attr = (IB_QP_ATTRIBUTES_MODIFY) {
+ .RequestState = QPStateInit,
+ .Attrs = (IB_QP_ATTR_PORTGUID |
+ IB_QP_ATTR_PKEYINDEX |
+ IB_QP_ATTR_ACCESSCONTROL),
+ .PortGUID = kibnal_data.kib_port_guid,
+ .PkeyIndex = 0,
+ .AccessControl = {
+ .s = {
+ .RdmaWrite = 1,
+ .RdmaRead = 1,
+ },
+ },
+ };
+        frc = iibt_qp_modify(conn->ibc_qp, &params.qp_attr, NULL);
+        if (frc != FSUCCESS) {
+                CERROR ("Failed to modify queue pair: %d\n", frc);
+ goto failed;
+ }
+
+ /* 1 ref for caller */
+ atomic_set (&conn->ibc_refcount, 1);
+ return (conn);
+
+ failed:
+ kibnal_destroy_conn (conn);
+ return (NULL);
+}
+
+void
+kibnal_destroy_conn (kib_conn_t *conn)
+{
+ int rc;
+ FSTATUS frc;
+
+ CDEBUG (D_NET, "connection %p\n", conn);
+
+ LASSERT (atomic_read (&conn->ibc_refcount) == 0);
+ LASSERT (list_empty(&conn->ibc_tx_queue));
+ LASSERT (list_empty(&conn->ibc_active_txs));
+ LASSERT (conn->ibc_nsends_posted == 0);
+ LASSERT (conn->ibc_connreq == NULL);
+
+ switch (conn->ibc_state) {
+ case IBNAL_CONN_DISCONNECTED:
+ /* called after connection sequence initiated */
+ /* fall through */
+
+ case IBNAL_CONN_INIT_QP:
+ /* _destroy includes an implicit Reset of the QP which
+ * discards posted work */
+ rc = iibt_qp_destroy(conn->ibc_qp);
+ if (rc != 0)
+ CERROR("Can't destroy QP: %d\n", rc);
+ /* fall through */
+
+ case IBNAL_CONN_INIT_NOTHING:
+ break;
+
+ default:
+ LASSERT (0);
+ }
+
+ if (conn->ibc_cep != NULL) {
+ frc = iibt_cm_destroy_cep(conn->ibc_cep);
+ if (frc != 0)
+ CERROR("Can't destroy CEP %p: %d\n", conn->ibc_cep,
+ frc);
+ }
+
+ if (conn->ibc_rx_pages != NULL)
+ kibnal_free_pages(conn->ibc_rx_pages);
+
+ if (conn->ibc_rxs != NULL)
+ PORTAL_FREE(conn->ibc_rxs,
+ IBNAL_RX_MSGS * sizeof(kib_rx_t));
+
+ if (conn->ibc_peer != NULL)
+ kib_peer_decref(conn->ibc_peer);
+
+ PORTAL_FREE(conn, sizeof (*conn));
+
+ atomic_dec(&kibnal_data.kib_nconns);
+
+ if (atomic_read (&kibnal_data.kib_nconns) == 0 &&
+ kibnal_data.kib_shutdown) {
+ /* I just nuked the last connection on shutdown; wake up
+ * everyone so they can exit. */
+ wake_up_all(&kibnal_data.kib_sched_waitq);
+ wake_up_all(&kibnal_data.kib_connd_waitq);
+ }
+}
+
+void
+kibnal_put_conn (kib_conn_t *conn)
+{
+ unsigned long flags;
+
+ CDEBUG (D_NET, "putting conn[%p] state %d -> "LPX64" (%d)\n",
+ conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
+ atomic_read (&conn->ibc_refcount));
+
+ LASSERT (atomic_read (&conn->ibc_refcount) > 0);
+ if (!atomic_dec_and_test (&conn->ibc_refcount))
+ return;
+
+ /* must disconnect before dropping the final ref */
+ LASSERT (conn->ibc_state == IBNAL_CONN_DISCONNECTED);
+
+ spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
+
+ list_add (&conn->ibc_list, &kibnal_data.kib_connd_conns);
+ wake_up (&kibnal_data.kib_connd_waitq);
+
+ spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
+}
+
+static int
+kibnal_close_peer_conns_locked (kib_peer_t *peer, int why)
+{
+ kib_conn_t *conn;
+ struct list_head *ctmp;
+ struct list_head *cnxt;
+ int count = 0;
+
+ list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
+ conn = list_entry (ctmp, kib_conn_t, ibc_list);
+
+ count++;
+ kibnal_close_conn_locked (conn, why);
+ }
+
+ return (count);
+}
+
+int
+kibnal_close_stale_conns_locked (kib_peer_t *peer, __u64 incarnation)
+{
+ kib_conn_t *conn;
+ struct list_head *ctmp;
+ struct list_head *cnxt;
+ int count = 0;
+
+ list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
+ conn = list_entry (ctmp, kib_conn_t, ibc_list);
+
+ if (conn->ibc_incarnation == incarnation)
+ continue;
+
+ CDEBUG(D_NET, "Closing stale conn nid:"LPX64" incarnation:"LPX64"("LPX64")\n",
+ peer->ibp_nid, conn->ibc_incarnation, incarnation);
+
+ count++;
+ kibnal_close_conn_locked (conn, -ESTALE);
+ }
+
+ return (count);
+}
+
+static int
+kibnal_close_matching_conns (ptl_nid_t nid)
+{
+ unsigned long flags;
+ kib_peer_t *peer;
+ struct list_head *ptmp;
+ struct list_head *pnxt;
+ int lo;
+ int hi;
+ int i;
+ int count = 0;
+
+ write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
+
+ if (nid != PTL_NID_ANY)
+ lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers;
+ else {
+ lo = 0;
+ hi = kibnal_data.kib_peer_hash_size - 1;
+ }
+
+ for (i = lo; i <= hi; i++) {
+ list_for_each_safe (ptmp, pnxt, &kibnal_data.kib_peers[i]) {
+
+ peer = list_entry (ptmp, kib_peer_t, ibp_list);
+ LASSERT (peer->ibp_persistence != 0 ||
+ peer->ibp_connecting != 0 ||
+ !list_empty (&peer->ibp_conns));
+
+ if (!(nid == PTL_NID_ANY || nid == peer->ibp_nid))
+ continue;
+
+ count += kibnal_close_peer_conns_locked (peer, 0);
+ }
+ }
+
+ write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
+
+ /* wildcards always succeed */
+ if (nid == PTL_NID_ANY)
+ return (0);
+
+ return (count == 0 ? -ENOENT : 0);
+}
+
+static int
+kibnal_cmd(struct portals_cfg *pcfg, void * private)
+{
+ int rc = -EINVAL;
+ ENTRY;
+
+ LASSERT (pcfg != NULL);
+
+ switch(pcfg->pcfg_command) {
+ case NAL_CMD_GET_PEER: {
+ ptl_nid_t nid = 0;
+ int share_count = 0;
+
+ rc = kibnal_get_peer_info(pcfg->pcfg_count,
+ &nid, &share_count);
+ pcfg->pcfg_nid = nid;
+ pcfg->pcfg_size = 0;
+ pcfg->pcfg_id = 0;
+ pcfg->pcfg_misc = 0;
+ pcfg->pcfg_count = 0;
+ pcfg->pcfg_wait = share_count;
+ break;
+ }
+ case NAL_CMD_ADD_PEER: {
+ rc = kibnal_add_persistent_peer (pcfg->pcfg_nid);
+ break;
+ }
+ case NAL_CMD_DEL_PEER: {
+ rc = kibnal_del_peer (pcfg->pcfg_nid,
+ /* flags == single_share */
+ pcfg->pcfg_flags != 0);
+ break;
+ }
+ case NAL_CMD_GET_CONN: {
+ kib_conn_t *conn = kibnal_get_conn_by_idx (pcfg->pcfg_count);
+
+ if (conn == NULL)
+ rc = -ENOENT;
+ else {
+ rc = 0;
+ pcfg->pcfg_nid = conn->ibc_peer->ibp_nid;
+ pcfg->pcfg_id = 0;
+ pcfg->pcfg_misc = 0;
+ pcfg->pcfg_flags = 0;
+ kibnal_put_conn (conn);
+ }
+ break;
+ }
+ case NAL_CMD_CLOSE_CONNECTION: {
+ rc = kibnal_close_matching_conns (pcfg->pcfg_nid);
+ break;
+ }
+ case NAL_CMD_REGISTER_MYNID: {
+ if (pcfg->pcfg_nid == PTL_NID_ANY)
+ rc = -EINVAL;
+ else
+ rc = kibnal_set_mynid (pcfg->pcfg_nid);
+ break;
+ }
+ }
+
+ RETURN(rc);
+}
+
+void
+kibnal_free_pages (kib_pages_t *p)
+{
+ int npages = p->ibp_npages;
+ int rc;
+ int i;
+
+ if (p->ibp_mapped) {
+ rc = iibt_deregister_memory(p->ibp_handle);
+ if (rc != 0)
+ CERROR ("Deregister error: %d\n", rc);
+ }
+
+ for (i = 0; i < npages; i++)
+ if (p->ibp_pages[i] != NULL)
+ __free_page(p->ibp_pages[i]);
+
+ PORTAL_FREE (p, offsetof(kib_pages_t, ibp_pages[npages]));
+}
+
+int
+kibnal_alloc_pages (kib_pages_t **pp, int npages, int allow_write)
+{
+ kib_pages_t *p;
+ __u64 *phys_pages;
+ int i;
+ FSTATUS frc;
+ IB_ACCESS_CONTROL access;
+
+ memset(&access, 0, sizeof(access));
+ access.s.MWBindable = 1;
+ access.s.LocalWrite = 1;
+ access.s.RdmaRead = 1;
+ access.s.RdmaWrite = 1;
+
+ PORTAL_ALLOC(p, offsetof(kib_pages_t, ibp_pages[npages]));
+ if (p == NULL) {
+ CERROR ("Can't allocate buffer %d\n", npages);
+ return (-ENOMEM);
+ }
+
+ memset (p, 0, offsetof(kib_pages_t, ibp_pages[npages]));
+ p->ibp_npages = npages;
+
+ for (i = 0; i < npages; i++) {
+ p->ibp_pages[i] = alloc_page (GFP_KERNEL);
+ if (p->ibp_pages[i] == NULL) {
+ CERROR ("Can't allocate page %d of %d\n", i, npages);
+ kibnal_free_pages(p);
+ return (-ENOMEM);
+ }
+ }
+
+ if (kibnal_whole_mem())
+ goto out;
+
+ PORTAL_ALLOC(phys_pages, npages * sizeof(*phys_pages));
+ if (phys_pages == NULL) {
+ CERROR ("Can't allocate physarray for %d pages\n", npages);
+ /* XXX free ibp_pages? */
+ kibnal_free_pages(p);
+ return (-ENOMEM);
+ }
+
+ /* if we were using the _contig_ registration variant we would have
+ * an array of PhysAddr/Length pairs, but the discontiguous variant
+ * just takes the PhysAddr */
+ for (i = 0; i < npages; i++)
+ phys_pages[i] = kibnal_page2phys(p->ibp_pages[i]);
+
+ frc = iibt_register_physical_memory(kibnal_data.kib_hca,
+ 0, /* requested vaddr */
+ phys_pages, npages,
+ 0, /* offset */
+ kibnal_data.kib_pd,
+ access,
+ &p->ibp_handle, &p->ibp_vaddr,
+ &p->ibp_lkey, &p->ibp_rkey);
+
+ PORTAL_FREE(phys_pages, npages * sizeof(*phys_pages));
+
+ if (frc != FSUCCESS) {
+ CERROR ("Error %d mapping %d pages\n", frc, npages);
+ kibnal_free_pages(p);
+ return (-ENOMEM);
+ }
+
+ CDEBUG(D_NET, "registered %d pages; handle: %p vaddr "LPX64" "
+ "lkey %x rkey %x\n", npages, p->ibp_handle,
+ p->ibp_vaddr, p->ibp_lkey, p->ibp_rkey);
+
+ p->ibp_mapped = 1;
+out:
+ *pp = p;
+ return (0);
+}
+
+static int
+kibnal_setup_tx_descs (void)
+{
+ int ipage = 0;
+ int page_offset = 0;
+ __u64 vaddr;
+ __u64 vaddr_base;
+ struct page *page;
+ kib_tx_t *tx;
+ int i;
+ int rc;
+
+ /* pre-mapped messages are not bigger than 1 page */
+ LASSERT (IBNAL_MSG_SIZE <= PAGE_SIZE);
+
+ /* No fancy arithmetic when we do the buffer calculations */
+ LASSERT (PAGE_SIZE % IBNAL_MSG_SIZE == 0);
+
+ rc = kibnal_alloc_pages(&kibnal_data.kib_tx_pages, IBNAL_TX_MSG_PAGES,
+ 0);
+ if (rc != 0)
+ return (rc);
+
+ /* ignored for the whole_mem case */
+ vaddr = vaddr_base = kibnal_data.kib_tx_pages->ibp_vaddr;
+
+ for (i = 0; i < IBNAL_TX_MSGS; i++) {
+ page = kibnal_data.kib_tx_pages->ibp_pages[ipage];
+ tx = &kibnal_data.kib_tx_descs[i];
+
+ memset (tx, 0, sizeof(*tx)); /* zero flags etc */
+
+ tx->tx_msg = (kib_msg_t *)(((char *)page_address(page)) +
+ page_offset);
+
+ if (kibnal_whole_mem())
+ tx->tx_vaddr = kibnal_page2phys(page) +
+ page_offset +
+ kibnal_data.kib_md.md_addr;
+ else
+ tx->tx_vaddr = vaddr;
+
+ tx->tx_isnblk = (i >= IBNAL_NTX);
+ tx->tx_mapped = KIB_TX_UNMAPPED;
+
+ CDEBUG(D_NET, "Tx[%d] %p->%p - "LPX64"\n",
+ i, tx, tx->tx_msg, tx->tx_vaddr);
+
+ if (tx->tx_isnblk)
+ list_add (&tx->tx_list,
+ &kibnal_data.kib_idle_nblk_txs);
+ else
+ list_add (&tx->tx_list,
+ &kibnal_data.kib_idle_txs);
+
+ vaddr += IBNAL_MSG_SIZE;
+ LASSERT (vaddr <= vaddr_base + IBNAL_TX_MSG_BYTES);
+
+ page_offset += IBNAL_MSG_SIZE;
+ LASSERT (page_offset <= PAGE_SIZE);
+
+ if (page_offset == PAGE_SIZE) {
+ page_offset = 0;
+ ipage++;
+ LASSERT (ipage <= IBNAL_TX_MSG_PAGES);
+ }
+ }
+
+ return (0);
+}
+
+static void
+kibnal_api_shutdown (nal_t *nal)
+{
+ int i;
+ int rc;
+
+ if (nal->nal_refct != 0) {
+ /* This module got the first ref */
+ PORTAL_MODULE_UNUSE;
+ return;
+ }
+
+ CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n",
+ atomic_read (&portal_kmemory));
+
+ LASSERT(nal == &kibnal_api);
+
+ switch (kibnal_data.kib_init) {
+ default:
+ CERROR ("Unexpected state %d\n", kibnal_data.kib_init);
+ LBUG();
+
+ case IBNAL_INIT_ALL:
+ /* stop calls to nal_cmd */
+ libcfs_nal_cmd_unregister(IIBNAL);
+ /* No new peers */
+
+                /* resetting my NID to PTL_NID_ANY unadvertises me,
+                 * removes my listener and nukes all current peers */
+ kibnal_set_mynid (PTL_NID_ANY);
+
+ /* Wait for all peer state to clean up (crazy) */
+ i = 2;
+ while (atomic_read (&kibnal_data.kib_npeers) != 0) {
+ i++;
+ CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
+ "waiting for %d peers to disconnect (can take a few seconds)\n",
+ atomic_read (&kibnal_data.kib_npeers));
+ set_current_state (TASK_UNINTERRUPTIBLE);
+ schedule_timeout (HZ);
+ }
+ /* fall through */
+
+ case IBNAL_INIT_CQ:
+ rc = iibt_cq_destroy(kibnal_data.kib_cq);
+ if (rc != 0)
+ CERROR ("Destroy CQ error: %d\n", rc);
+ /* fall through */
+
+ case IBNAL_INIT_TXD:
+ kibnal_free_pages (kibnal_data.kib_tx_pages);
+ /* fall through */
+
+ case IBNAL_INIT_MR:
+ if (kibnal_data.kib_md.md_handle != NULL) {
+ rc = iibt_deregister_memory(kibnal_data.kib_md.md_handle);
+ if (rc != FSUCCESS)
+ CERROR ("Deregister memory: %d\n", rc);
+ }
+ /* fall through */
+
+#if IBNAL_FMR
+ case IBNAL_INIT_FMR:
+ rc = ib_fmr_pool_destroy (kibnal_data.kib_fmr_pool);
+ if (rc != 0)
+ CERROR ("Destroy FMR pool error: %d\n", rc);
+ /* fall through */
+#endif
+ case IBNAL_INIT_PD:
+ rc = iibt_pd_free(kibnal_data.kib_pd);
+ if (rc != 0)
+ CERROR ("Destroy PD error: %d\n", rc);
+ /* fall through */
+
+ case IBNAL_INIT_SD:
+ rc = iibt_sd_deregister(kibnal_data.kib_sd);
+ if (rc != 0)
+ CERROR ("Deregister SD error: %d\n", rc);
+ /* fall through */
+
+ case IBNAL_INIT_PORT:
+ /* XXX ??? */
+ /* fall through */
+
+ case IBNAL_INIT_PORTATTRS:
+ PORTAL_FREE(kibnal_data.kib_hca_attrs.PortAttributesList,
+ kibnal_data.kib_hca_attrs.PortAttributesListSize);
+ /* fall through */
+
+ case IBNAL_INIT_HCA:
+ rc = iibt_close_hca(kibnal_data.kib_hca);
+ if (rc != 0)
+ CERROR ("Close HCA error: %d\n", rc);
+ /* fall through */
+
+ case IBNAL_INIT_LIB:
+ lib_fini(&kibnal_lib);
+ /* fall through */
+
+ case IBNAL_INIT_DATA:
+ /* Module refcount only gets to zero when all peers
+ * have been closed so all lists must be empty */
+ LASSERT (atomic_read (&kibnal_data.kib_npeers) == 0);
+ LASSERT (kibnal_data.kib_peers != NULL);
+ for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
+ LASSERT (list_empty (&kibnal_data.kib_peers[i]));
+ }
+ LASSERT (atomic_read (&kibnal_data.kib_nconns) == 0);
+ LASSERT (list_empty (&kibnal_data.kib_sched_rxq));
+ LASSERT (list_empty (&kibnal_data.kib_sched_txq));
+ LASSERT (list_empty (&kibnal_data.kib_connd_conns));
+ LASSERT (list_empty (&kibnal_data.kib_connd_peers));
+
+ /* flag threads to terminate; wake and wait for them to die */
+ kibnal_data.kib_shutdown = 1;
+ wake_up_all (&kibnal_data.kib_sched_waitq);
+ wake_up_all (&kibnal_data.kib_connd_waitq);
+
+ i = 2;
+ while (atomic_read (&kibnal_data.kib_nthreads) != 0) {
+ i++;
+ CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
+ "Waiting for %d threads to terminate\n",
+ atomic_read (&kibnal_data.kib_nthreads));
+ set_current_state (TASK_INTERRUPTIBLE);
+ schedule_timeout (HZ);
+ }
+ /* fall through */
+
+ case IBNAL_INIT_NOTHING:
+ break;
+ }
+
+ if (kibnal_data.kib_tx_descs != NULL)
+ PORTAL_FREE (kibnal_data.kib_tx_descs,
+ IBNAL_TX_MSGS * sizeof(kib_tx_t));
+
+ if (kibnal_data.kib_peers != NULL)
+ PORTAL_FREE (kibnal_data.kib_peers,
+ sizeof (struct list_head) *
+ kibnal_data.kib_peer_hash_size);
+
+ CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n",
+ atomic_read (&portal_kmemory));
+ printk(KERN_INFO "Lustre: Infinicon IB NAL unloaded (final mem %d)\n",
+ atomic_read(&portal_kmemory));
+
+ kibnal_data.kib_init = IBNAL_INIT_NOTHING;
+}
+
+#define roundup_power(val, power) \
+ ( (val + (__u64)(power - 1)) & ~((__u64)(power - 1)) )
+
+/* this isn't very portable or sturdy in the face of funny mem/bus configs */
+static __u64 max_phys_mem(IB_CA_ATTRIBUTES *ca_attr)
+{
+ struct sysinfo si;
+ __u64 ret;
+
+ /* XXX we don't bother with first-gen cards */
+ if (ca_attr->VendorId == 0xd0b7 && ca_attr->DeviceId == 0x3101)
+ return 0ULL;
+
+ si_meminfo(&si);
+ ret = (__u64)max(si.totalram, max_mapnr) * si.mem_unit;
+ return roundup_power(ret, 128 * 1024 * 1024);
+}
+#undef roundup_power
+
+static int
+kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
+ ptl_ni_limits_t *requested_limits,
+ ptl_ni_limits_t *actual_limits)
+{
+ ptl_process_id_t process_id;
+ int pkmem = atomic_read(&portal_kmemory);
+ IB_PORT_ATTRIBUTES *pattr;
+ FSTATUS frc;
+ int rc;
+ int n;
+ int i;
+
+ LASSERT (nal == &kibnal_api);
+
+ if (nal->nal_refct != 0) {
+ if (actual_limits != NULL)
+ *actual_limits = kibnal_lib.libnal_ni.ni_actual_limits;
+ /* This module got the first ref */
+ PORTAL_MODULE_USE;
+ return (PTL_OK);
+ }
+
+ LASSERT (kibnal_data.kib_init == IBNAL_INIT_NOTHING);
+
+ frc = IbtGetInterfaceByVersion(IBT_INTERFACE_VERSION_2,
+ &kibnal_data.kib_interfaces);
+ if (frc != FSUCCESS) {
+ CERROR("IbtGetInterfaceByVersion(IBT_INTERFACE_VERSION_2) = %d\n",
+ frc);
+ return -ENOSYS;
+ }
+
+ init_MUTEX (&kibnal_data.kib_nid_mutex);
+ init_MUTEX_LOCKED (&kibnal_data.kib_nid_signal);
+ kibnal_data.kib_nid = PTL_NID_ANY;
+
+ rwlock_init(&kibnal_data.kib_global_lock);
+
+ kibnal_data.kib_peer_hash_size = IBNAL_PEER_HASH_SIZE;
+ PORTAL_ALLOC (kibnal_data.kib_peers,
+ sizeof (struct list_head) * kibnal_data.kib_peer_hash_size);
+ if (kibnal_data.kib_peers == NULL) {
+ goto failed;
+ }
+ for (i = 0; i < kibnal_data.kib_peer_hash_size; i++)
+ INIT_LIST_HEAD(&kibnal_data.kib_peers[i]);
+
+ spin_lock_init (&kibnal_data.kib_connd_lock);
+ INIT_LIST_HEAD (&kibnal_data.kib_connd_peers);
+ INIT_LIST_HEAD (&kibnal_data.kib_connd_conns);
+ init_waitqueue_head (&kibnal_data.kib_connd_waitq);
+
+ spin_lock_init (&kibnal_data.kib_sched_lock);
+ INIT_LIST_HEAD (&kibnal_data.kib_sched_txq);
+ INIT_LIST_HEAD (&kibnal_data.kib_sched_rxq);
+ init_waitqueue_head (&kibnal_data.kib_sched_waitq);
+
+ spin_lock_init (&kibnal_data.kib_tx_lock);
+ INIT_LIST_HEAD (&kibnal_data.kib_idle_txs);
+ INIT_LIST_HEAD (&kibnal_data.kib_idle_nblk_txs);
+ init_waitqueue_head(&kibnal_data.kib_idle_tx_waitq);
+
+ PORTAL_ALLOC (kibnal_data.kib_tx_descs,
+ IBNAL_TX_MSGS * sizeof(kib_tx_t));
+ if (kibnal_data.kib_tx_descs == NULL) {
+ CERROR ("Can't allocate tx descs\n");
+ goto failed;
+ }
+
+ /* lists/ptrs/locks initialised */
+ kibnal_data.kib_init = IBNAL_INIT_DATA;
+ /*****************************************************/
+
+ process_id.pid = 0;
+ process_id.nid = kibnal_data.kib_nid;
+
+ rc = lib_init(&kibnal_lib, nal, process_id,
+ requested_limits, actual_limits);
+ if (rc != PTL_OK) {
+ CERROR("lib_init failed: error %d\n", rc);
+ goto failed;
+ }
+
+ /* lib interface initialised */
+ kibnal_data.kib_init = IBNAL_INIT_LIB;
+ /*****************************************************/
+
+ for (i = 0; i < IBNAL_N_SCHED; i++) {
+ rc = kibnal_thread_start (kibnal_scheduler, (void *)i);
+ if (rc != 0) {
+ CERROR("Can't spawn iibnal scheduler[%d]: %d\n",
+ i, rc);
+ goto failed;
+ }
+ }
+
+ rc = kibnal_thread_start (kibnal_connd, NULL);
+ if (rc != 0) {
+ CERROR ("Can't spawn iibnal connd: %d\n", rc);
+ goto failed;
+ }
+
+ n = sizeof(kibnal_data.kib_hca_guids) /
+ sizeof(kibnal_data.kib_hca_guids[0]);
+ frc = iibt_get_hca_guids(&n, kibnal_data.kib_hca_guids);
+ if (frc != FSUCCESS) {
+ CERROR ("Can't get channel adapter guids: %d\n", frc);
+ goto failed;
+ }
+ if (n == 0) {
+ CERROR ("No channel adapters found\n");
+ goto failed;
+ }
+
+ /* Infinicon has per-HCA rather than per CQ completion handlers */
+ frc = iibt_open_hca(kibnal_data.kib_hca_guids[0],
+ kibnal_ca_callback,
+ kibnal_ca_async_callback,
+ &kibnal_data.kib_hca,
+ &kibnal_data.kib_hca);
+ if (frc != FSUCCESS) {
+ CERROR ("Can't open CA[0]: %d\n", frc);
+ goto failed;
+ }
+
+ /* Channel Adapter opened */
+ kibnal_data.kib_init = IBNAL_INIT_HCA;
+ /*****************************************************/
+
+ kibnal_data.kib_hca_attrs.PortAttributesList = NULL;
+ kibnal_data.kib_hca_attrs.PortAttributesListSize = 0;
+ frc = iibt_query_hca(kibnal_data.kib_hca,
+ &kibnal_data.kib_hca_attrs, NULL);
+ if (frc != FSUCCESS) {
+ CERROR ("Can't size port attrs: %d\n", frc);
+ goto failed;
+ }
+
+ PORTAL_ALLOC(kibnal_data.kib_hca_attrs.PortAttributesList,
+ kibnal_data.kib_hca_attrs.PortAttributesListSize);
+ if (kibnal_data.kib_hca_attrs.PortAttributesList == NULL)
+ goto failed;
+
+ /* Port attrs allocated */
+ kibnal_data.kib_init = IBNAL_INIT_PORTATTRS;
+ /*****************************************************/
+
+ frc = iibt_query_hca(kibnal_data.kib_hca, &kibnal_data.kib_hca_attrs,
+ NULL);
+ if (frc != FSUCCESS) {
+ CERROR ("Can't get port attrs for CA 0: %d\n", frc);
+ goto failed;
+ }
+
+ for (i = 0, pattr = kibnal_data.kib_hca_attrs.PortAttributesList;
+ pattr != NULL;
+ i++, pattr = pattr->Next) {
+ switch (pattr->PortState) {
+ default:
+ CERROR("Unexpected port[%d] state %d\n",
+ i, pattr->PortState);
+ continue;
+ case PortStateDown:
+ CDEBUG(D_NET, "port[%d] Down\n", i);
+ continue;
+ case PortStateInit:
+ CDEBUG(D_NET, "port[%d] Init\n", i);
+ continue;
+ case PortStateArmed:
+ CDEBUG(D_NET, "port[%d] Armed\n", i);
+ continue;
+
+ case PortStateActive:
+ CDEBUG(D_NET, "port[%d] Active\n", i);
+ kibnal_data.kib_port = i;
+ kibnal_data.kib_port_guid = pattr->GUID;
+ kibnal_data.kib_port_pkey = pattr->PkeyTable[0];
+ break;
+ }
+ break;
+ }
+
+ if (pattr == NULL) {
+ CERROR ("Can't find an active port\n");
+ goto failed;
+ }
+
+ CDEBUG(D_NET, "got guid "LPX64"\n", kibnal_data.kib_port_guid);
+
+ /* Active port found */
+ kibnal_data.kib_init = IBNAL_INIT_PORT;
+ /*****************************************************/
+
+ frc = iibt_sd_register(&kibnal_data.kib_sd, NULL);
+ if (frc != FSUCCESS) {
+ CERROR ("Can't register with SD: %d\n", frc);
+ goto failed;
+ }
+
+ /* Registered with SD OK */
+ kibnal_data.kib_init = IBNAL_INIT_SD;
+ /*****************************************************/
+
+ frc = iibt_pd_allocate(kibnal_data.kib_hca, 0, &kibnal_data.kib_pd);
+ if (frc != FSUCCESS) {
+                CERROR ("Can't create PD: %d\n", frc);
+ goto failed;
+ }
+
+ /* flag PD initialised */
+ kibnal_data.kib_init = IBNAL_INIT_PD;
+ /*****************************************************/
+
+#if IBNAL_FMR
+ {
+ const int pool_size = IBNAL_NTX + IBNAL_NTX_NBLK;
+ struct ib_fmr_pool_param params = {
+ .max_pages_per_fmr = PTL_MTU/PAGE_SIZE,
+ .access = (IB_ACCESS_LOCAL_WRITE |
+ IB_ACCESS_REMOTE_WRITE |
+ IB_ACCESS_REMOTE_READ),
+ .pool_size = pool_size,
+ .dirty_watermark = (pool_size * 3)/4,
+ .flush_function = NULL,
+ .flush_arg = NULL,
+ .cache = 1,
+ };
+                rc = ib_fmr_pool_create(kibnal_data.kib_pd, &params,
+ &kibnal_data.kib_fmr_pool);
+ if (rc != 0) {
+ CERROR ("Can't create FMR pool size %d: %d\n",
+ pool_size, rc);
+ goto failed;
+ }
+ }
+
+ /* flag FMR pool initialised */
+ kibnal_data.kib_init = IBNAL_INIT_FMR;
+#endif
+ /*****************************************************/
+ if (IBNAL_WHOLE_MEM) {
+ IB_MR_PHYS_BUFFER phys;
+ IB_ACCESS_CONTROL access;
+ kib_md_t *md = &kibnal_data.kib_md;
+
+ memset(&access, 0, sizeof(access));
+ access.s.MWBindable = 1;
+ access.s.LocalWrite = 1;
+ access.s.RdmaRead = 1;
+ access.s.RdmaWrite = 1;
+
+ phys.PhysAddr = 0;
+ phys.Length = max_phys_mem(&kibnal_data.kib_hca_attrs);
+ if (phys.Length == 0) {
+ CERROR ("couldn't determine the end of phys mem\n");
+ goto failed;
+ }
+
+ rc = iibt_register_contig_physical_memory(kibnal_data.kib_hca,
+ 0,
+ &phys, 1,
+ 0,
+ kibnal_data.kib_pd,
+ access,
+ &md->md_handle,
+ &md->md_addr,
+ &md->md_lkey,
+ &md->md_rkey);
+ if (rc != FSUCCESS) {
+ CERROR("registering physical memory failed: %d\n",
+ rc);
+ CERROR("falling back to registration per-rdma\n");
+ md->md_handle = NULL;
+ } else {
+ CDEBUG(D_NET, "registered "LPU64" bytes of mem\n",
+ phys.Length);
+ kibnal_data.kib_init = IBNAL_INIT_MR;
+ }
+ }
+
+ /*****************************************************/
+
+ rc = kibnal_setup_tx_descs();
+ if (rc != 0) {
+ CERROR ("Can't register tx descs: %d\n", rc);
+ goto failed;
+ }
+
+ /* flag TX descs initialised */
+ kibnal_data.kib_init = IBNAL_INIT_TXD;
+ /*****************************************************/
+
+ {
+ uint32 nentries;
+
+ frc = iibt_cq_create(kibnal_data.kib_hca, IBNAL_CQ_ENTRIES,
+ &kibnal_data.kib_cq, &kibnal_data.kib_cq,
+ &nentries);
+ if (frc != FSUCCESS) {
+ CERROR ("Can't create RX CQ: %d\n", frc);
+ goto failed;
+ }
+
+ /* flag CQ initialised */
+ kibnal_data.kib_init = IBNAL_INIT_CQ;
+
+ if (nentries < IBNAL_CQ_ENTRIES) {
+ CERROR ("CQ only has %d entries, need %d\n",
+ nentries, IBNAL_CQ_ENTRIES);
+ goto failed;
+ }
+
+ rc = iibt_cq_rearm(kibnal_data.kib_cq, CQEventSelNextWC);
+ if (rc != 0) {
+ CERROR ("Failed to re-arm completion queue: %d\n", rc);
+ goto failed;
+ }
+ }
+
+ /*****************************************************/
+
+ rc = libcfs_nal_cmd_register(IIBNAL, &kibnal_cmd, NULL);
+ if (rc != 0) {
+ CERROR ("Can't initialise command interface (rc = %d)\n", rc);
+ goto failed;
+ }
+
+ /* flag everything initialised */
+ kibnal_data.kib_init = IBNAL_INIT_ALL;
+ /*****************************************************/
+
+ printk(KERN_INFO "Lustre: Infinicon IB NAL loaded "
+ "(initial mem %d)\n", pkmem);
+
+ return (PTL_OK);
+
+ failed:
+ kibnal_api_shutdown (&kibnal_api);
+ return (PTL_FAIL);
+}
+
+void __exit
+kibnal_module_fini (void)
+{
+#ifdef CONFIG_SYSCTL
+ if (kibnal_tunables.kib_sysctl != NULL)
+ unregister_sysctl_table (kibnal_tunables.kib_sysctl);
+#endif
+ PtlNIFini(kibnal_ni);
+
+ ptl_unregister_nal(IIBNAL);
+}
+
+int __init
+kibnal_module_init (void)
+{
+ int rc;
+
+ if (sizeof(kib_wire_connreq_t) > CM_REQUEST_INFO_USER_LEN) {
+ CERROR("sizeof(kib_wire_connreq_t) > CM_REQUEST_INFO_USER_LEN\n");
+ return -EINVAL;
+ }
+
+ /* the following must be sizeof(int) for proc_dointvec() */
+ if (sizeof (kibnal_tunables.kib_io_timeout) != sizeof (int)) {
+ CERROR("sizeof (kibnal_tunables.kib_io_timeout) != sizeof (int)\n");
+ return -EINVAL;
+ }
+
+ kibnal_api.nal_ni_init = kibnal_api_startup;
+ kibnal_api.nal_ni_fini = kibnal_api_shutdown;
+
+ /* Initialise dynamic tunables to defaults once only */
+ kibnal_tunables.kib_io_timeout = IBNAL_IO_TIMEOUT;
+
+ rc = ptl_register_nal(IIBNAL, &kibnal_api);
+ if (rc != PTL_OK) {
+ CERROR("Can't register IBNAL: %d\n", rc);
+ return (-ENOMEM); /* or something... */
+ }
+
+ /* Pure gateways want the NAL started up at module load time... */
+ rc = PtlNIInit(IIBNAL, 0, NULL, NULL, &kibnal_ni);
+ if (rc != PTL_OK && rc != PTL_IFACE_DUP) {
+ ptl_unregister_nal(IIBNAL);
+ return (-ENODEV);
+ }
+
+#ifdef CONFIG_SYSCTL
+ /* Press on regardless even if registering sysctl doesn't work */
+ kibnal_tunables.kib_sysctl =
+ register_sysctl_table (kibnal_top_ctl_table, 0);
+#endif
+ return (0);
+}
+
+MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
+MODULE_DESCRIPTION("Kernel Infinicon IB NAL v0.01");
+MODULE_LICENSE("GPL");
+
+module_init(kibnal_module_init);
+module_exit(kibnal_module_fini);
+
--- /dev/null
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2004 Cluster File Systems, Inc.
+ * Author: Eric Barton <eric@bartonsoftware.com>
+ *
+ * This file is part of Lustre, http://www.lustre.org.
+ *
+ * Lustre is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Lustre is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Lustre; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#ifndef EXPORT_SYMTAB
+# define EXPORT_SYMTAB
+#endif
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/smp_lock.h>
+#include <linux/unistd.h>
+#include <linux/uio.h>
+
+#include <asm/system.h>
+#include <asm/uaccess.h>
+#include <asm/io.h>
+
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/stat.h>
+#include <linux/list.h>
+#include <linux/kmod.h>
+#include <linux/sysctl.h>
+
+#define DEBUG_SUBSYSTEM S_IBNAL
+
+#include <linux/kp30.h>
+#include <portals/p30.h>
+#include <portals/lib-p30.h>
+#include <portals/nal.h>
+
+#include <linux/iba/ibt.h>
+
+#define GCC_VERSION (__GNUC__ * 10000 \
+ + __GNUC_MINOR__ * 100 \
+ + __GNUC_PATCHLEVEL__)
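+/* e.g. GCC 3.2.3 evaluates to 3*10000 + 2*100 + 3 == 30203 */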
+
+/* Test for GCC > 3.2.2 */
+#if GCC_VERSION <= 30202
+/* GCC 3.2.2, and presumably several versions before it, will
+ * miscompile this driver. See
+ * http://gcc.gnu.org/bugzilla/show_bug.cgi?id=9853. */
+#error Invalid GCC version. Must use GCC >= 3.2.3
+#endif
+
+#define IBNAL_SERVICE_NAME "iibnal"
+#define IBNAL_SERVICE_NUMBER 0x11b9a1
+
+#if CONFIG_SMP
+# define IBNAL_N_SCHED num_online_cpus() /* # schedulers */
+#else
+# define IBNAL_N_SCHED 1 /* # schedulers */
+#endif
+
+#define IBNAL_MIN_RECONNECT_INTERVAL HZ /* first failed connection retry... */
+#define IBNAL_MAX_RECONNECT_INTERVAL (60*HZ) /* ...exponentially increasing to this */
+
+#define IBNAL_MSG_SIZE (4<<10) /* max size of queued messages (inc hdr) */
+
+#define IBNAL_MSG_QUEUE_SIZE 8 /* # messages/RDMAs in-flight */
+#define IBNAL_CREDIT_HIGHWATER 7 /* when to eagerly return credits */
+/* 7 would indicate infinite retry attempts; Infinicon recommended 5 */
+#define IBNAL_RETRY 5 /* # times to retry */
+#define IBNAL_RNR_RETRY 5 /* */
+#define IBNAL_CM_RETRY 5 /* # times to retry connection */
+#define IBNAL_FLOW_CONTROL 1
+#define IBNAL_ACK_TIMEOUT 20 /* supposedly 4 secs */
+
+#define IBNAL_NTX 64 /* # tx descs */
+/* this had to be dropped down so that we only register < 255 pages per
+ * region. this will change if we register all memory. */
+#define IBNAL_NTX_NBLK 128 /* # reserved tx descs */
+
+#define IBNAL_PEER_HASH_SIZE 101 /* # peer lists */
+
+#define IBNAL_RESCHED 100 /* # scheduler loops before reschedule */
+
+#define IBNAL_CONCURRENT_PEERS 1000 /* # nodes all talking at once to me */
+
+/* default vals for runtime tunables */
+#define IBNAL_IO_TIMEOUT 50 /* default comms timeout (seconds) */
+
+/************************/
+/* derived constants... */
+
+/* TX messages (shared by all connections) */
+#define IBNAL_TX_MSGS (IBNAL_NTX + IBNAL_NTX_NBLK)
+#define IBNAL_TX_MSG_BYTES (IBNAL_TX_MSGS * IBNAL_MSG_SIZE)
+#define IBNAL_TX_MSG_PAGES ((IBNAL_TX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE)
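+/* e.g. with the defaults above and 4K pages: (64 + 128) = 192 msgs,
+ * 192 * 4K = 768K of premapped buffers, i.e. 192 pages */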
+
+#define IBNAL_TX_MAX_SG (PTL_MD_MAX_IOV + 1)
+
+/* RX messages (per connection) */
+#define IBNAL_RX_MSGS IBNAL_MSG_QUEUE_SIZE
+#define IBNAL_RX_MSG_BYTES (IBNAL_RX_MSGS * IBNAL_MSG_SIZE)
+#define IBNAL_RX_MSG_PAGES ((IBNAL_RX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE)
+
+
+/* we may have up to 2 completions per transmit +
+ 1 completion per receive, per connection */
+#define IBNAL_CQ_ENTRIES ((2*IBNAL_TX_MSGS) + \
+ (IBNAL_RX_MSGS * IBNAL_CONCURRENT_PEERS))
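+/* with the defaults above that is 2*192 tx completions plus 8*1000 rx
+ * completions, i.e. 8384 entries requested */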
+
+#define IBNAL_RDMA_BASE 0x0eeb0000
+#define IBNAL_FMR 0
+#define IBNAL_WHOLE_MEM 1
+#define IBNAL_CKSUM 0
+//#define IBNAL_CALLBACK_CTXT IB_CQ_CALLBACK_PROCESS
+#define IBNAL_CALLBACK_CTXT IB_CQ_CALLBACK_INTERRUPT
+
+/* XXX I have no idea. */
+#define IBNAL_STARTING_PSN 1
+
+typedef struct
+{
+ int kib_io_timeout; /* comms timeout (seconds) */
+ struct ctl_table_header *kib_sysctl; /* sysctl interface */
+} kib_tunables_t;
+
+/* some of these have specific types in the stack that just map back
+ * to the uFOO types, like IB_{L,R}_KEY. */
+typedef struct
+{
+ int ibp_npages; /* # pages */
+ int ibp_mapped; /* mapped? */
+ __u64 ibp_vaddr; /* mapped region vaddr */
+ __u32 ibp_lkey; /* mapped region lkey */
+ __u32 ibp_rkey; /* mapped region rkey */
+ IB_HANDLE ibp_handle; /* mapped region handle */
+ struct page *ibp_pages[0];
+} kib_pages_t;
+
+typedef struct
+{
+ IB_HANDLE md_handle;
+ __u32 md_lkey;
+ __u32 md_rkey;
+ __u64 md_addr;
+} kib_md_t __attribute__((packed));
+
+typedef struct
+{
+ int kib_init; /* initialisation state */
+ __u64 kib_incarnation; /* which one am I */
+ int kib_shutdown; /* shut down? */
+ atomic_t kib_nthreads; /* # live threads */
+
+ __u64 kib_service_id; /* service number I listen on */
+ __u64 kib_port_guid; /* my GUID (lo 64 of GID)*/
+ __u16 kib_port_pkey; /* my pkey, whatever that is */
+ ptl_nid_t kib_nid; /* my NID */
+ struct semaphore kib_nid_mutex; /* serialise NID ops */
+ struct semaphore kib_nid_signal; /* signal completion */
+ IB_HANDLE kib_cep; /* connection end point */
+
+ rwlock_t kib_global_lock; /* stabilize peer/conn ops */
+
+ struct list_head *kib_peers; /* hash table of all my known peers */
+ int kib_peer_hash_size; /* size of kib_peers */
+ atomic_t kib_npeers; /* # peers extant */
+ atomic_t kib_nconns; /* # connections extant */
+
+ struct list_head kib_connd_conns; /* connections to progress */
+ struct list_head kib_connd_peers; /* peers waiting for a connection */
+ wait_queue_head_t kib_connd_waitq; /* connection daemons sleep here */
+ unsigned long kib_connd_waketime; /* when connd will wake */
+ spinlock_t kib_connd_lock; /* serialise */
+
+ wait_queue_head_t kib_sched_waitq; /* schedulers sleep here */
+ struct list_head kib_sched_txq; /* tx requiring attention */
+ struct list_head kib_sched_rxq; /* rx requiring attention */
+ spinlock_t kib_sched_lock; /* serialise */
+
+ struct kib_tx *kib_tx_descs; /* all the tx descriptors */
+ kib_pages_t *kib_tx_pages; /* premapped tx msg pages */
+
+ struct list_head kib_idle_txs; /* idle tx descriptors */
+ struct list_head kib_idle_nblk_txs; /* idle reserved tx descriptors */
+ wait_queue_head_t kib_idle_tx_waitq; /* block here for tx descriptor */
+ __u64 kib_next_tx_cookie; /* RDMA completion cookie */
+ spinlock_t kib_tx_lock; /* serialise */
+
+ IB_HANDLE kib_hca; /* The HCA */
+ int kib_port; /* port on the device */
+ IB_HANDLE kib_pd; /* protection domain */
+ IB_HANDLE kib_sd; /* SD handle */
+ IB_HANDLE kib_cq; /* completion queue */
+ kib_md_t kib_md; /* full-mem registration */
+
+ void *kib_listen_handle; /* where I listen for connections */
+
+ IBT_INTERFACE_UNION kib_interfaces; /* The Infinicon IBT interface */
+
+ uint64 kib_hca_guids[8]; /* all the HCA guids */
+ IB_CA_ATTRIBUTES kib_hca_attrs; /* where to get HCA attrs */
+ FABRIC_OPERATION_DATA kib_fabopdata; /* (un)advertise service record */
+} kib_data_t;
+
+#define IBNAL_INIT_NOTHING 0
+#define IBNAL_INIT_DATA 1
+#define IBNAL_INIT_LIB 2
+#define IBNAL_INIT_HCA 3
+#define IBNAL_INIT_PORTATTRS 4
+#define IBNAL_INIT_PORT 5
+#define IBNAL_INIT_SD 6
+#define IBNAL_INIT_PD 7
+#define IBNAL_INIT_FMR 8
+#define IBNAL_INIT_MR 9
+#define IBNAL_INIT_TXD 10
+#define IBNAL_INIT_CQ 11
+#define IBNAL_INIT_ALL 12
+
+/************************************************************************
+ * Wire message structs.
+ * These are sent in sender's byte order (i.e. receiver flips).
+ * CAVEAT EMPTOR: other structs communicated between nodes (e.g. MAD
+ * private data and SM service info) are LE on the wire.
+ */
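+
+/* a receiver can detect a byte-flipped peer by testing the magic both
+ * ways (illustrative sketch only; __swab32() is the kernel byteswap
+ * helper):
+ *
+ *   if (msg->ibm_magic == IBNAL_MSG_MAGIC)
+ *           flip = 0;
+ *   else if (msg->ibm_magic == __swab32(IBNAL_MSG_MAGIC))
+ *           flip = 1;
+ *   else
+ *           goto reject;
+ */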
+
+/* also kib_md_t above */
+
+typedef struct
+{
+ __u32 rd_key; /* remote key */
+ __u32 rd_nob; /* # of bytes */
+ __u64 rd_addr; /* remote io vaddr */
+} kib_rdma_desc_t __attribute__((packed));
+
+typedef struct
+{
+ ptl_hdr_t ibim_hdr; /* portals header */
+ char ibim_payload[0]; /* piggy-backed payload */
+} kib_immediate_msg_t __attribute__((packed));
+
+/* these arrays serve two purposes during rdma. they are built on the passive
+ * side and sent to the active side as remote arguments. On the active side
+ * the descs are used as a data structure on the way to local gather items.
+ * the different roles result in split local/remote meaning of desc->rd_key */
+typedef struct
+{
+ ptl_hdr_t ibrm_hdr; /* portals header */
+ __u64 ibrm_cookie; /* opaque completion cookie */
+ __u32 ibrm_num_descs; /* how many descs */
+ kib_rdma_desc_t ibrm_desc[0]; /* where to suck/blow */
+} kib_rdma_msg_t __attribute__((packed));
+
+#define kib_rdma_msg_len(num_descs) \
+ offsetof(kib_msg_t, ibm_u.rdma.ibrm_desc[num_descs])
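+/* e.g. kib_rdma_msg_len(2) covers the common header, the fixed ibrm
+ * fields and two 16-byte kib_rdma_desc_t fragments */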
+
+typedef struct
+{
+ __u64 ibcm_cookie; /* opaque completion cookie */
+ __u32 ibcm_status; /* completion status */
+} kib_completion_msg_t __attribute__((packed));
+
+typedef struct
+{
+ __u32 ibm_magic; /* I'm an openibnal message */
+ __u16 ibm_version; /* this is my version number */
+ __u8 ibm_type; /* msg type */
+ __u8 ibm_credits; /* returned credits */
+#if IBNAL_CKSUM
+ __u32 ibm_nob;
+ __u32 ibm_cksum;
+#endif
+ union {
+ kib_immediate_msg_t immediate;
+ kib_rdma_msg_t rdma;
+ kib_completion_msg_t completion;
+ } ibm_u __attribute__((packed));
+} kib_msg_t __attribute__((packed));
+
+#define IBNAL_MSG_MAGIC 0x0be91b91 /* unique magic */
+#define IBNAL_MSG_VERSION 1 /* current protocol version */
+
+#define IBNAL_MSG_NOOP 0xd0 /* nothing (just credits) */
+#define IBNAL_MSG_IMMEDIATE 0xd1 /* portals hdr + payload */
+#define IBNAL_MSG_PUT_RDMA 0xd2 /* portals PUT hdr + source rdma desc */
+#define IBNAL_MSG_PUT_DONE 0xd3 /* signal PUT rdma completion */
+#define IBNAL_MSG_GET_RDMA 0xd4 /* portals GET hdr + sink rdma desc */
+#define IBNAL_MSG_GET_DONE 0xd5 /* signal GET rdma completion */
+
+/***********************************************************************/
+
+typedef struct kib_rx /* receive message */
+{
+ struct list_head rx_list; /* queue for attention */
+ struct kib_conn *rx_conn; /* owning conn */
+ int rx_rdma; /* RDMA completion posted? */
+ int rx_posted; /* posted? */
+ __u64 rx_vaddr; /* pre-mapped buffer (hca vaddr) */
+ kib_msg_t *rx_msg; /* pre-mapped buffer (host vaddr) */
+ IB_WORK_REQ rx_wrq;
+        IB_LOCAL_DATASEGMENT      rx_gl;          /* and its memory */
+} kib_rx_t;
+
+typedef struct kib_tx /* transmit message */
+{
+ struct list_head tx_list; /* queue on idle_txs ibc_tx_queue etc. */
+ int tx_isnblk; /* I'm reserved for non-blocking sends */
+ struct kib_conn *tx_conn; /* owning conn */
+ int tx_mapped; /* mapped for RDMA? */
+ int tx_sending; /* # tx callbacks outstanding */
+ int tx_status; /* completion status */
+ unsigned long tx_deadline; /* completion deadline */
+ int tx_passive_rdma; /* peer sucks/blows */
+ int tx_passive_rdma_wait; /* waiting for peer to complete */
+ __u64 tx_passive_rdma_cookie; /* completion cookie */
+ lib_msg_t *tx_libmsg[2]; /* lib msgs to finalize on completion */
+ kib_md_t tx_md; /* RDMA mapping (active/passive) */
+ __u64 tx_vaddr; /* pre-mapped buffer (hca vaddr) */
+ kib_msg_t *tx_msg; /* pre-mapped buffer (host vaddr) */
+ int tx_nsp; /* # send work items */
+ IB_WORK_REQ tx_wrq[IBNAL_TX_MAX_SG]; /* send work items... */
+ IB_LOCAL_DATASEGMENT tx_gl[IBNAL_TX_MAX_SG]; /* ...and their memory */
+} kib_tx_t;
+
+#define KIB_TX_UNMAPPED 0
+#define KIB_TX_MAPPED 1
+#define KIB_TX_MAPPED_FMR 2
+
+typedef struct kib_wire_connreq
+{
+ __u32 wcr_magic; /* I'm an openibnal connreq */
+ __u16 wcr_version; /* this is my version number */
+ __u16 wcr_queue_depth; /* this is my receive queue size */
+ __u64 wcr_nid; /* peer's NID */
+ __u64 wcr_incarnation; /* peer's incarnation */
+} kib_wire_connreq_t;
+
+typedef struct kib_gid
+{
+ __u64 hi, lo;
+} kib_gid_t;
+
+typedef struct kib_connreq
+{
+ /* connection-in-progress */
+ struct kib_conn *cr_conn;
+ kib_wire_connreq_t cr_wcr;
+ __u64 cr_tid;
+ IB_SERVICE_RECORD cr_service;
+ kib_gid_t cr_gid;
+ IB_PATH_RECORD cr_path;
+ CM_REQUEST_INFO cr_cmreq;
+ CM_CONN_INFO cr_discarded;
+ CM_REJECT_INFO cr_rej_info;
+} kib_connreq_t;
+
+typedef struct kib_conn
+{
+ struct kib_peer *ibc_peer; /* owning peer */
+ struct list_head ibc_list; /* stash on peer's conn list */
+ __u64 ibc_incarnation; /* which instance of the peer */
+ atomic_t ibc_refcount; /* # users */
+ int ibc_state; /* what's happening */
+ atomic_t ibc_nob; /* # bytes buffered */
+ int ibc_nsends_posted; /* # uncompleted sends */
+ int ibc_credits; /* # credits I have */
+ int ibc_outstanding_credits; /* # credits to return */
+ int ibc_rcvd_disconnect;/* received discon request */
+ int ibc_sent_disconnect;/* sent discon request */
+ struct list_head ibc_tx_queue; /* send queue */
+ struct list_head ibc_active_txs; /* active tx awaiting completion */
+ spinlock_t ibc_lock; /* serialise */
+ kib_rx_t *ibc_rxs; /* the rx descs */
+ kib_pages_t *ibc_rx_pages; /* premapped rx msg pages */
+ IB_HANDLE ibc_qp; /* queue pair */
+ IB_HANDLE ibc_cep; /* connection ID? */
+ IB_QP_ATTRIBUTES_QUERY ibc_qp_attrs; /* QP attrs */
+ kib_connreq_t *ibc_connreq; /* connection request state */
+} kib_conn_t;
+
+#define IBNAL_CONN_INIT_NOTHING 0 /* initial state */
+#define IBNAL_CONN_INIT_QP 1 /* ibc_qp set up */
+#define IBNAL_CONN_CONNECTING 2 /* started to connect */
+#define IBNAL_CONN_ESTABLISHED 3 /* connection established */
+#define IBNAL_CONN_SEND_DREQ 4 /* to send disconnect req */
+#define IBNAL_CONN_DREQ 5 /* sent disconnect req */
+#define IBNAL_CONN_DREP 6 /* sent disconnect rep */
+#define IBNAL_CONN_DISCONNECTED 7 /* no more QP or CM traffic */
+
+#define KIB_ASSERT_CONN_STATE(conn, state) do { \
+ LASSERTF((conn)->ibc_state == state, "%d\n", conn->ibc_state); \
+} while (0)
+
+#define KIB_ASSERT_CONN_STATE_RANGE(conn, low, high) do { \
+ LASSERTF(low <= high, "%d %d\n", low, high); \
+ LASSERTF((conn)->ibc_state >= low && (conn)->ibc_state <= high, \
+ "%d\n", conn->ibc_state); \
+} while (0)
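+
+/* e.g. KIB_ASSERT_CONN_STATE_RANGE(conn, IBNAL_CONN_CONNECTING,
+ *                                  IBNAL_CONN_ESTABLISHED) checks that
+ * a conn is somewhere in the active part of its life cycle */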
+
+typedef struct kib_peer
+{
+ struct list_head ibp_list; /* stash on global peer list */
+ struct list_head ibp_connd_list; /* schedule on kib_connd_peers */
+ ptl_nid_t ibp_nid; /* who's on the other end(s) */
+ atomic_t ibp_refcount; /* # users */
+ int ibp_persistence; /* "known" peer refs */
+ struct list_head ibp_conns; /* all active connections */
+ struct list_head ibp_tx_queue; /* msgs waiting for a conn */
+ int ibp_connecting; /* connecting+accepting */
+ unsigned long ibp_reconnect_time; /* when reconnect may be attempted */
+ unsigned long ibp_reconnect_interval; /* exponential backoff */
+} kib_peer_t;
+
+
+extern lib_nal_t kibnal_lib;
+extern kib_data_t kibnal_data;
+extern kib_tunables_t kibnal_tunables;
+
+/******************************************************************************/
+/* Infinicon IBT interface wrappers */
+#define IIBT_IF (kibnal_data.kib_interfaces.ver2)
+
+static inline FSTATUS
+iibt_get_hca_guids(uint32 *hca_count, EUI64 *hca_guid_list)
+{
+ return IIBT_IF.GetCaGuids(hca_count, hca_guid_list);
+}
+
+static inline FSTATUS
+iibt_open_hca(EUI64 hca_guid,
+ IB_COMPLETION_CALLBACK completion_callback,
+ IB_ASYNC_EVENT_CALLBACK async_event_callback,
+ void *arg,
+ IB_HANDLE *handle)
+{
+ return IIBT_IF.Vpi.OpenCA(hca_guid, completion_callback,
+ async_event_callback, arg, handle);
+}
+
+static inline FSTATUS
+iibt_query_hca(IB_HANDLE hca_handle, IB_CA_ATTRIBUTES *attrs, void **argp)
+{
+ return IIBT_IF.Vpi.QueryCA(hca_handle, attrs, argp);
+}
+
+static inline FSTATUS
+iibt_close_hca(IB_HANDLE hca_handle)
+{
+ return IIBT_IF.Vpi.CloseCA(hca_handle);
+}
+
+static inline FSTATUS
+iibt_pd_allocate(IB_HANDLE hca_handle, __u32 max_avs, IB_HANDLE *pd_handle)
+{
+ return IIBT_IF.Vpi.AllocatePD(hca_handle, max_avs, pd_handle);
+}
+
+static inline FSTATUS
+iibt_pd_free(IB_HANDLE pd_handle)
+{
+ return IIBT_IF.Vpi.FreePD(pd_handle);
+}
+
+static inline FSTATUS
+iibt_register_physical_memory(IB_HANDLE hca_handle,
+ IB_VIRT_ADDR requested_io_va,
+ void *phys_buffers, uint64 nphys_buffers,
+ uint32 io_va_offset, IB_HANDLE pd_handle,
+ IB_ACCESS_CONTROL access,
+ IB_HANDLE *mem_handle,
+ IB_VIRT_ADDR *actual_io_va,
+ IB_L_KEY *lkey, IB_R_KEY *rkey)
+{
+ return IIBT_IF.Vpi.RegisterPhysMemRegion(hca_handle, requested_io_va,
+ phys_buffers, nphys_buffers,
+ io_va_offset, pd_handle,
+ access,
+ mem_handle, actual_io_va,
+ lkey, rkey);
+}
+
+static inline FSTATUS
+iibt_register_contig_physical_memory(IB_HANDLE hca_handle,
+ IB_VIRT_ADDR requested_io_va,
+ IB_MR_PHYS_BUFFER *phys_buffers,
+ uint64 nphys_buffers,
+ uint32 io_va_offset, IB_HANDLE pd_handle,
+ IB_ACCESS_CONTROL access,
+ IB_HANDLE *mem_handle,
+ IB_VIRT_ADDR *actual_io_va,
+ IB_L_KEY *lkey, IB_R_KEY *rkey)
+{
+ return IIBT_IF.Vpi.RegisterContigPhysMemRegion(hca_handle,
+ requested_io_va,
+ phys_buffers,
+ nphys_buffers,
+ io_va_offset, pd_handle,
+ access,
+ mem_handle, actual_io_va,
+ lkey, rkey);
+}
+
+static inline FSTATUS
+iibt_register_memory(IB_HANDLE hca_handle,
+ void *virt_addr, unsigned int length,
+ IB_HANDLE pd_handle,
+ IB_ACCESS_CONTROL access,
+ IB_HANDLE *mem_handle,
+ IB_L_KEY *lkey, IB_R_KEY *rkey)
+{
+ return IIBT_IF.Vpi.RegisterMemRegion(hca_handle,
+ virt_addr, length,
+ pd_handle,
+ access,
+ mem_handle,
+ lkey, rkey);
+}
+
+static inline FSTATUS
+iibt_deregister_memory(IB_HANDLE mem_handle)
+{
+ return IIBT_IF.Vpi.DeregisterMemRegion(mem_handle);
+}
+
+static inline FSTATUS
+iibt_cq_create(IB_HANDLE hca_handle, uint32 requested_size,
+ void *arg, IB_HANDLE *cq_handle, uint32 *actual_size)
+{
+ return IIBT_IF.Vpi.CreateCQ(hca_handle, requested_size,
+ arg, cq_handle, actual_size);
+}
+
+static inline FSTATUS
+iibt_cq_poll(IB_HANDLE cq_handle, IB_WORK_COMPLETION *wc)
+{
+ return IIBT_IF.Vpi.PollCQ(cq_handle, wc);
+}
+
+static inline FSTATUS
+iibt_cq_rearm(IB_HANDLE cq_handle, IB_CQ_EVENT_SELECT select)
+{
+ return IIBT_IF.Vpi.RearmCQ(cq_handle, select);
+}
+
+static inline FSTATUS
+iibt_cq_destroy(IB_HANDLE cq_handle)
+{
+ return IIBT_IF.Vpi.DestroyCQ(cq_handle);
+}
+
+static inline FSTATUS
+iibt_qp_create(IB_HANDLE hca_handle, IB_QP_ATTRIBUTES_CREATE *create_attr,
+ void *arg, IB_HANDLE *cq_handle,
+ IB_QP_ATTRIBUTES_QUERY *query_attr)
+{
+ return IIBT_IF.Vpi.CreateQP(hca_handle, create_attr, arg, cq_handle,
+ query_attr);
+}
+
+static inline FSTATUS
+iibt_qp_query(IB_HANDLE qp_handle, IB_QP_ATTRIBUTES_QUERY *query_attr,
+ void **arg_ptr)
+{
+ return IIBT_IF.Vpi.QueryQP(qp_handle, query_attr, arg_ptr);
+}
+
+static inline FSTATUS
+iibt_qp_modify(IB_HANDLE qp_handle, IB_QP_ATTRIBUTES_MODIFY *modify_attr,
+ IB_QP_ATTRIBUTES_QUERY *query_attr)
+{
+ return IIBT_IF.Vpi.ModifyQP(qp_handle, modify_attr, query_attr);
+}
+
+static inline FSTATUS
+iibt_qp_destroy(IB_HANDLE qp_handle)
+{
+ return IIBT_IF.Vpi.DestroyQP(qp_handle);
+}
+
+static inline FSTATUS
+iibt_postrecv(IB_HANDLE qp_handle, IB_WORK_REQ *work_req)
+{
+ return IIBT_IF.Vpi.PostRecv(qp_handle, work_req);
+}
+
+static inline FSTATUS
+iibt_postsend(IB_HANDLE qp_handle, IB_WORK_REQ *work_req)
+{
+ return IIBT_IF.Vpi.PostSend(qp_handle, work_req);
+}
+
+static inline FSTATUS
+iibt_sd_register(IB_HANDLE *sd_handle, CLIENT_CONTROL_PARAMETERS *p)
+{
+ return IIBT_IF.Sdi.Register(sd_handle, p);
+}
+
+static inline FSTATUS
+iibt_sd_deregister(IB_HANDLE sd_handle)
+{
+ return IIBT_IF.Sdi.Deregister(sd_handle);
+}
+
+static inline FSTATUS
+iibt_sd_port_fabric_operation(IB_HANDLE sd_handle, EUI64 port_guid,
+ FABRIC_OPERATION_DATA *fod,
+ PFABRIC_OPERATION_CALLBACK callback,
+ COMMAND_CONTROL_PARAMETERS *p, void *arg)
+{
+ return IIBT_IF.Sdi.PortFabricOperation(sd_handle, port_guid,
+ fod, callback, p, arg);
+}
+
+static inline FSTATUS
+iibt_sd_query_port_fabric_information(IB_HANDLE sd_handle, EUI64 port_guid,
+ QUERY *qry,
+ PQUERY_CALLBACK callback,
+ COMMAND_CONTROL_PARAMETERS *p, void *arg)
+{
+ return IIBT_IF.Sdi.QueryPortFabricInformation(sd_handle, port_guid,
+ qry, callback, p, arg);
+}
+
+static inline IB_HANDLE
+iibt_cm_create_cep(CM_CEP_TYPE type)
+{
+ return IIBT_IF.Cmi.CmCreateCEP(type);
+}
+
+static inline FSTATUS
+iibt_cm_modify_cep(IB_HANDLE cep, uint32 attr, char* value, uint32 len,
+ uint32 offset)
+{
+ return IIBT_IF.Cmi.CmModifyCEP(cep, attr, value, len, offset);
+}
+
+static inline FSTATUS
+iibt_cm_destroy_cep(IB_HANDLE cep_handle)
+{
+ return IIBT_IF.Cmi.CmDestroyCEP(cep_handle);
+}
+
+static inline FSTATUS
+iibt_cm_listen(IB_HANDLE cep, CM_LISTEN_INFO *info,
+ PFN_CM_CALLBACK callback, void *arg)
+{
+ return IIBT_IF.Cmi.CmListen(cep, info, callback, arg);
+}
+
+static inline FSTATUS
+iibt_cm_cancel(IB_HANDLE cep)
+{
+ return IIBT_IF.Cmi.CmCancel(cep);
+}
+
+static inline FSTATUS
+iibt_cm_accept(IB_HANDLE cep,
+ CM_CONN_INFO *send_info, CM_CONN_INFO *recv_info,
+ PFN_CM_CALLBACK callback, void *arg,
+ IB_HANDLE *new_cep)
+{
+ return IIBT_IF.Cmi.CmAccept(cep,
+ send_info, recv_info,
+ callback, arg, new_cep);
+}
+
+static inline FSTATUS
+iibt_cm_reject(IB_HANDLE cep, CM_REJECT_INFO *rej)
+{
+ return IIBT_IF.Cmi.CmReject(cep, rej);
+}
+
+static inline FSTATUS
+iibt_cm_disconnect(IB_HANDLE cep, CM_DREQUEST_INFO *req,
+ CM_DREPLY_INFO *reply)
+{
+ return IIBT_IF.Cmi.CmDisconnect(cep, req, reply);
+}
+
+static inline FSTATUS
+iibt_cm_connect (IB_HANDLE cep, CM_REQUEST_INFO *req,
+ PFN_CM_CALLBACK callback, void *arg)
+{
+ return IIBT_IF.Cmi.CmConnect (cep, req, callback, arg);
+}
+
+static inline int wrq_signals_completion(IB_WORK_REQ *wrq)
+{
+ return wrq->Req.SendRC.Options.s.SignaledCompletion == 1;
+}
+
+
+/******************************************************************************/
+
+/* these macros purposely avoid local variables so they don't increase
+ * stack consumption. */
+
+#define kib_peer_addref(peer) do { \
+ LASSERTF(atomic_read(&peer->ibp_refcount) > 0, "%d\n", \
+ atomic_read(&peer->ibp_refcount)); \
+ CDEBUG(D_NET, "++peer[%p] -> "LPX64" (%d)\n", \
+ peer, peer->ibp_nid, atomic_read (&peer->ibp_refcount)); \
+ atomic_inc(&peer->ibp_refcount); \
+} while (0)
+
+#define kib_peer_decref(peer) do { \
+ LASSERTF(atomic_read(&peer->ibp_refcount) > 0, "%d\n", \
+ atomic_read(&peer->ibp_refcount)); \
+ CDEBUG(D_NET, "--peer[%p] -> "LPX64" (%d)\n", \
+ peer, peer->ibp_nid, atomic_read (&peer->ibp_refcount)); \
+ if (atomic_dec_and_test (&peer->ibp_refcount)) { \
+ CDEBUG (D_NET, "destroying peer "LPX64" %p\n", \
+ peer->ibp_nid, peer); \
+ kibnal_destroy_peer (peer); \
+ } \
+} while (0)
+
+/******************************************************************************/
+
+static inline struct list_head *
+kibnal_nid2peerlist (ptl_nid_t nid)
+{
+ unsigned int hash = ((unsigned int)nid) % kibnal_data.kib_peer_hash_size;
+
+ return (&kibnal_data.kib_peers [hash]);
+}
+
+static inline int
+kibnal_peer_active(kib_peer_t *peer)
+{
+ /* Am I in the peer hash table? */
+ return (!list_empty(&peer->ibp_list));
+}
+
+static inline void
+kibnal_queue_tx_locked (kib_tx_t *tx, kib_conn_t *conn)
+{
+ /* CAVEAT EMPTOR: tx takes caller's ref on conn */
+
+ LASSERT (tx->tx_nsp > 0); /* work items set up */
+ LASSERT (tx->tx_conn == NULL); /* only set here */
+
+ tx->tx_conn = conn;
+ tx->tx_deadline = jiffies + kibnal_tunables.kib_io_timeout * HZ;
+ list_add_tail(&tx->tx_list, &conn->ibc_tx_queue);
+}
+
+#define KIBNAL_SERVICE_KEY_MASK (IB_SERVICE_RECORD_COMP_SERVICENAME | \
+ IB_SERVICE_RECORD_COMP_SERVICEDATA8_1 | \
+ IB_SERVICE_RECORD_COMP_SERVICEDATA8_2 | \
+ IB_SERVICE_RECORD_COMP_SERVICEDATA8_3 | \
+ IB_SERVICE_RECORD_COMP_SERVICEDATA8_4 | \
+ IB_SERVICE_RECORD_COMP_SERVICEDATA8_5 | \
+ IB_SERVICE_RECORD_COMP_SERVICEDATA8_6 | \
+ IB_SERVICE_RECORD_COMP_SERVICEDATA8_7 | \
+ IB_SERVICE_RECORD_COMP_SERVICEDATA8_8)
+
+static inline __u64*
+kibnal_service_nid_field(IB_SERVICE_RECORD *srv)
+{
+ /* must be consistent with KIBNAL_SERVICE_KEY_MASK */
+ return (__u64 *)srv->ServiceData8;
+}
+
+
+static inline void
+kibnal_set_service_keys(IB_SERVICE_RECORD *srv, ptl_nid_t nid)
+{
+ LASSERT (strlen(IBNAL_SERVICE_NAME) < sizeof(srv->ServiceName));
+ memset (srv->ServiceName, 0, sizeof(srv->ServiceName));
+ strcpy (srv->ServiceName, IBNAL_SERVICE_NAME);
+
+ *kibnal_service_nid_field(srv) = cpu_to_le64(nid);
+}
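+
+/* NB the NID lives in the record's ServiceData8 bytes; together with
+ * the service name, the fields in KIBNAL_SERVICE_KEY_MASK should make
+ * an SD query match exactly one peer's service record. */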
+
+#if 0
+static inline void
+kibnal_show_rdma_attr (kib_conn_t *conn)
+{
+ struct ib_qp_attribute qp_attr;
+ int rc;
+
+ memset (&qp_attr, 0, sizeof(qp_attr));
+ rc = ib_qp_query(conn->ibc_qp, &qp_attr);
+ if (rc != 0) {
+ CERROR ("Can't get qp attrs: %d\n", rc);
+ return;
+ }
+
+ CWARN ("RDMA CAPABILITY: write %s read %s\n",
+ (qp_attr.valid_fields & TS_IB_QP_ATTRIBUTE_RDMA_ATOMIC_ENABLE) ?
+ (qp_attr.enable_rdma_write ? "enabled" : "disabled") : "invalid",
+ (qp_attr.valid_fields & TS_IB_QP_ATTRIBUTE_RDMA_ATOMIC_ENABLE) ?
+ (qp_attr.enable_rdma_read ? "enabled" : "disabled") : "invalid");
+}
+#endif
+
+#if CONFIG_X86
+static inline __u64
+kibnal_page2phys (struct page *p)
+{
+ __u64 page_number = p - mem_map;
+
+ return (page_number << PAGE_SHIFT);
+}
+#else
+# error "no page->phys"
+#endif
+
+/* CAVEAT EMPTOR:
+ * We rely on tx/rx descriptor alignment to allow us to use the lowest bit
+ * of the work request id as a flag to determine if the completion is for a
+ * transmit or a receive. It seems that the CQ entry's 'op' field
+ * isn't always set correctly on completions that occur after QP teardown. */
+
+static inline __u64
+kibnal_ptr2wreqid (void *ptr, int isrx)
+{
+ unsigned long lptr = (unsigned long)ptr;
+
+ LASSERT ((lptr & 1) == 0);
+ return (__u64)(lptr | (isrx ? 1 : 0));
+}
+
+static inline void *
+kibnal_wreqid2ptr (__u64 wreqid)
+{
+ return (void *)(((unsigned long)wreqid) & ~1UL);
+}
+
+static inline int
+kibnal_wreqid_is_rx (__u64 wreqid)
+{
+ return (wreqid & 1) != 0;
+}
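+
+/* e.g. rxs are posted with WorkReqId == kibnal_ptr2wreqid(rx, 1), so
+ * their completions have the low bit set and kibnal_ca_callback can
+ * dispatch them to kibnal_rx_callback; tx completions (low bit clear)
+ * go to kibnal_tx_callback. */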
+
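+/* true when a single premapped region (kib_md) covers all of memory, in
+ * which case per-tx registration is skipped and rdma descriptors use the
+ * global keys and address from kib_md. */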
+static inline int
+kibnal_whole_mem(void)
+{
+ return kibnal_data.kib_md.md_handle != NULL;
+}
+
+extern kib_peer_t *kibnal_create_peer (ptl_nid_t nid);
+extern void kibnal_destroy_peer (kib_peer_t *peer);
+extern int kibnal_del_peer (ptl_nid_t nid, int single_share);
+extern kib_peer_t *kibnal_find_peer_locked (ptl_nid_t nid);
+extern void kibnal_unlink_peer_locked (kib_peer_t *peer);
+extern int kibnal_close_stale_conns_locked (kib_peer_t *peer,
+ __u64 incarnation);
+extern kib_conn_t *kibnal_create_conn (void);
+extern void kibnal_put_conn (kib_conn_t *conn);
+extern void kibnal_destroy_conn (kib_conn_t *conn);
+void kibnal_listen_callback(IB_HANDLE cep, CM_CONN_INFO *info, void *arg);
+
+extern int kibnal_alloc_pages (kib_pages_t **pp, int npages, int access);
+extern void kibnal_free_pages (kib_pages_t *p);
+
+extern void kibnal_check_sends (kib_conn_t *conn);
+extern void kibnal_close_conn_locked (kib_conn_t *conn, int error);
+extern int kibnal_thread_start (int (*fn)(void *arg), void *arg);
+extern int kibnal_scheduler(void *arg);
+extern int kibnal_connd (void *arg);
+extern void kibnal_init_tx_msg (kib_tx_t *tx, int type, int body_nob);
+extern void kibnal_close_conn (kib_conn_t *conn, int why);
+extern void kibnal_start_active_rdma (int type, int status,
+ kib_rx_t *rx, lib_msg_t *libmsg,
+ unsigned int niov,
+ struct iovec *iov, ptl_kiov_t *kiov,
+ size_t offset, size_t nob);
+
+void kibnal_ca_async_callback (void *ca_arg, IB_EVENT_RECORD *ev);
+void kibnal_ca_callback (void *ca_arg, void *cq_arg);
--- /dev/null
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2004 Cluster File Systems, Inc.
+ * Author: Eric Barton <eric@bartonsoftware.com>
+ *
+ * This file is part of Lustre, http://www.lustre.org.
+ *
+ * Lustre is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Lustre is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Lustre; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#include "iibnal.h"
+
+/*
+ * LIB functions follow
+ *
+ */
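+
+/* tx descriptors cycle between the idle lists and a conn's tx queues;
+ * kibnal_tx_done returns them once all their send work requests and any
+ * passive RDMA they advertised have completed. */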
+static void
+kibnal_schedule_tx_done (kib_tx_t *tx)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave (&kibnal_data.kib_sched_lock, flags);
+
+ list_add_tail(&tx->tx_list, &kibnal_data.kib_sched_txq);
+ wake_up (&kibnal_data.kib_sched_waitq);
+
+ spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags);
+}
+
+static void
+kibnal_tx_done (kib_tx_t *tx)
+{
+ ptl_err_t ptlrc = (tx->tx_status == 0) ? PTL_OK : PTL_FAIL;
+ unsigned long flags;
+ int i;
+ FSTATUS frc;
+#if IBNAL_FMR
+ int rc;
+#endif
+
+ LASSERT (tx->tx_sending == 0); /* mustn't be awaiting callback */
+ LASSERT (!tx->tx_passive_rdma_wait); /* mustn't be awaiting RDMA */
+
+ switch (tx->tx_mapped) {
+ default:
+ LBUG();
+
+ case KIB_TX_UNMAPPED:
+ break;
+
+ case KIB_TX_MAPPED:
+ if (in_interrupt()) {
+ /* can't deregister memory in IRQ context... */
+ kibnal_schedule_tx_done(tx);
+ return;
+ }
+ frc = iibt_deregister_memory(tx->tx_md.md_handle);
+ LASSERT (frc == FSUCCESS);
+ tx->tx_mapped = KIB_TX_UNMAPPED;
+ break;
+
+#if IBNAL_FMR
+ case KIB_TX_MAPPED_FMR:
+ if (in_interrupt() && tx->tx_status != 0) {
+ /* can't flush FMRs in IRQ context... */
+ kibnal_schedule_tx_done(tx);
+ return;
+ }
+
+ rc = ib_fmr_deregister(tx->tx_md.md_handle.fmr);
+ LASSERT (rc == 0);
+
+ if (tx->tx_status != 0)
+ ib_fmr_pool_force_flush(kibnal_data.kib_fmr_pool);
+ tx->tx_mapped = KIB_TX_UNMAPPED;
+ break;
+#endif
+ }
+
+ for (i = 0; i < 2; i++) {
+ /* tx may have up to 2 libmsgs to finalise */
+ if (tx->tx_libmsg[i] == NULL)
+ continue;
+
+ lib_finalize (&kibnal_lib, NULL, tx->tx_libmsg[i], ptlrc);
+ tx->tx_libmsg[i] = NULL;
+ }
+
+ if (tx->tx_conn != NULL) {
+ kibnal_put_conn (tx->tx_conn);
+ tx->tx_conn = NULL;
+ }
+
+ tx->tx_nsp = 0;
+ tx->tx_passive_rdma = 0;
+ tx->tx_status = 0;
+
+ spin_lock_irqsave (&kibnal_data.kib_tx_lock, flags);
+
+ if (tx->tx_isnblk) {
+ list_add_tail (&tx->tx_list, &kibnal_data.kib_idle_nblk_txs);
+ } else {
+ list_add_tail (&tx->tx_list, &kibnal_data.kib_idle_txs);
+ wake_up (&kibnal_data.kib_idle_tx_waitq);
+ }
+
+ spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags);
+}
+
+static kib_tx_t *
+kibnal_get_idle_tx (int may_block)
+{
+ unsigned long flags;
+ kib_tx_t *tx = NULL;
+ ENTRY;
+
+ for (;;) {
+ spin_lock_irqsave (&kibnal_data.kib_tx_lock, flags);
+
+ /* "normal" descriptor is free */
+ if (!list_empty (&kibnal_data.kib_idle_txs)) {
+ tx = list_entry (kibnal_data.kib_idle_txs.next,
+ kib_tx_t, tx_list);
+ break;
+ }
+
+ if (!may_block) {
+ /* may dip into reserve pool */
+ if (list_empty (&kibnal_data.kib_idle_nblk_txs)) {
+ CERROR ("reserved tx desc pool exhausted\n");
+ break;
+ }
+
+ tx = list_entry (kibnal_data.kib_idle_nblk_txs.next,
+ kib_tx_t, tx_list);
+ break;
+ }
+
+ /* block for idle tx */
+ spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags);
+
+ wait_event (kibnal_data.kib_idle_tx_waitq,
+ !list_empty (&kibnal_data.kib_idle_txs) ||
+ kibnal_data.kib_shutdown);
+ }
+
+ if (tx != NULL) {
+ list_del (&tx->tx_list);
+
+ /* Allocate a new passive RDMA completion cookie. It might
+ * not be needed, but we've got a lock right now and we're
+ * unlikely to wrap... */
+ tx->tx_passive_rdma_cookie = kibnal_data.kib_next_tx_cookie++;
+
+ LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED);
+ LASSERT (tx->tx_nsp == 0);
+ LASSERT (tx->tx_sending == 0);
+ LASSERT (tx->tx_status == 0);
+ LASSERT (tx->tx_conn == NULL);
+ LASSERT (!tx->tx_passive_rdma);
+ LASSERT (!tx->tx_passive_rdma_wait);
+ LASSERT (tx->tx_libmsg[0] == NULL);
+ LASSERT (tx->tx_libmsg[1] == NULL);
+ }
+
+ spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags);
+
+ RETURN(tx);
+}
+
+static int
+kibnal_dist(lib_nal_t *nal, ptl_nid_t nid, unsigned long *dist)
+{
+ /* I would guess that if kibnal_get_peer (nid) == NULL,
+ and we're not routing, then 'nid' is very distant :) */
+ if ( nal->libnal_ni.ni_pid.nid == nid ) {
+ *dist = 0;
+ } else {
+ *dist = 1;
+ }
+
+ return 0;
+}
+
+static void
+kibnal_complete_passive_rdma(kib_conn_t *conn, __u64 cookie, int status)
+{
+ struct list_head *ttmp;
+ unsigned long flags;
+ int idle;
+
+ spin_lock_irqsave (&conn->ibc_lock, flags);
+
+ list_for_each (ttmp, &conn->ibc_active_txs) {
+ kib_tx_t *tx = list_entry(ttmp, kib_tx_t, tx_list);
+
+ LASSERT (tx->tx_passive_rdma ||
+ !tx->tx_passive_rdma_wait);
+
+ LASSERT (tx->tx_passive_rdma_wait ||
+ tx->tx_sending != 0);
+
+ if (!tx->tx_passive_rdma_wait ||
+ tx->tx_passive_rdma_cookie != cookie)
+ continue;
+
+ CDEBUG(D_NET, "Complete %p "LPD64": %d\n", tx, cookie, status);
+
+ tx->tx_status = status;
+ tx->tx_passive_rdma_wait = 0;
+ idle = (tx->tx_sending == 0);
+
+ if (idle)
+ list_del (&tx->tx_list);
+
+ spin_unlock_irqrestore (&conn->ibc_lock, flags);
+
+ /* I could be racing with tx callbacks. It's whoever
+ * _makes_ tx idle that frees it */
+ if (idle)
+ kibnal_tx_done (tx);
+ return;
+ }
+
+ spin_unlock_irqrestore (&conn->ibc_lock, flags);
+
+ CERROR ("Unmatched (late?) RDMA completion "LPX64" from "LPX64"\n",
+ cookie, conn->ibc_peer->ibp_nid);
+}
+
+static __u32
+kibnal_lkey(kib_pages_t *ibp)
+{
+ if (kibnal_whole_mem())
+ return kibnal_data.kib_md.md_lkey;
+
+ return ibp->ibp_lkey;
+}
+
+static void
+kibnal_post_rx (kib_rx_t *rx, int do_credits)
+{
+ kib_conn_t *conn = rx->rx_conn;
+ int rc = 0;
+ unsigned long flags;
+ FSTATUS frc;
+ ENTRY;
+
+ rx->rx_gl = (IB_LOCAL_DATASEGMENT) {
+ .Address = rx->rx_vaddr,
+ .Length = IBNAL_MSG_SIZE,
+ .Lkey = kibnal_lkey(conn->ibc_rx_pages),
+ };
+
+ rx->rx_wrq = (IB_WORK_REQ) {
+ .Operation = WROpRecv,
+ .DSListDepth = 1,
+ .MessageLen = IBNAL_MSG_SIZE,
+ .WorkReqId = kibnal_ptr2wreqid(rx, 1),
+ .DSList = &rx->rx_gl,
+ };
+
+ KIB_ASSERT_CONN_STATE_RANGE(conn, IBNAL_CONN_ESTABLISHED,
+ IBNAL_CONN_DREP);
+ LASSERT (!rx->rx_posted);
+ rx->rx_posted = 1;
+ mb();
+
+ if (conn->ibc_state != IBNAL_CONN_ESTABLISHED)
+ rc = -ECONNABORTED;
+ else {
+ frc = iibt_postrecv(conn->ibc_qp, &rx->rx_wrq);
+ if (frc != FSUCCESS) {
+ CDEBUG(D_NET, "post failed %d\n", frc);
+ rc = -EINVAL;
+ } else {
+ CDEBUG(D_NET, "posted rx %p\n", &rx->rx_wrq);
+ }
+ }
+
+ if (rc == 0) {
+ if (do_credits) {
+ spin_lock_irqsave(&conn->ibc_lock, flags);
+ conn->ibc_outstanding_credits++;
+ spin_unlock_irqrestore(&conn->ibc_lock, flags);
+
+ kibnal_check_sends(conn);
+ }
+ EXIT;
+ return;
+ }
+
+ if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) {
+ CERROR ("Error posting receive -> "LPX64": %d\n",
+ conn->ibc_peer->ibp_nid, rc);
+ kibnal_close_conn (rx->rx_conn, rc);
+ } else {
+ CDEBUG (D_NET, "Error posting receive -> "LPX64": %d\n",
+ conn->ibc_peer->ibp_nid, rc);
+ }
+
+ /* Drop rx's ref */
+ kibnal_put_conn (conn);
+ EXIT;
+}
+
+#if IBNAL_CKSUM
+static inline __u32 kibnal_cksum (void *ptr, int nob)
+{
+ char *c = ptr;
+ __u32 sum = 0;
+
+ while (nob-- > 0)
+ sum = ((sum << 1) | (sum >> 31)) + *c++;
+
+ return (sum);
+}
+#endif
+
+static void hexdump(char *string, void *ptr, int len)
+{
+ unsigned char *c = ptr;
+ int i;
+
+ /* disabled by default; remove this return to get hex dumps */
+ return;
+
+ if (len < 0 || len > 2048) {
+ printk("XXX what the hell? %d\n",len);
+ return;
+ }
+
+ printk("%d bytes of '%s' from 0x%p\n", len, string, ptr);
+
+ for (i = 0; i < len;) {
+ printk("%02x",*(c++));
+ i++;
+ if (!(i & 15)) {
+ printk("\n");
+ } else if (!(i&1)) {
+ printk(" ");
+ }
+ }
+
+ if(len & 15) {
+ printk("\n");
+ }
+}
+
+static void
+kibnal_rx_callback (IB_WORK_COMPLETION *wc)
+{
+ kib_rx_t *rx = (kib_rx_t *)kibnal_wreqid2ptr(wc->WorkReqId);
+ kib_msg_t *msg = rx->rx_msg;
+ kib_conn_t *conn = rx->rx_conn;
+ int nob = wc->Length;
+ const int base_nob = offsetof(kib_msg_t, ibm_u);
+ int credits;
+ int flipped;
+ unsigned long flags;
+ __u32 i;
+#if IBNAL_CKSUM
+ __u32 msg_cksum;
+ __u32 computed_cksum;
+#endif
+
+ /* we set the QP to erroring after we've finished disconnecting,
+ * maybe we should do so sooner. */
+ KIB_ASSERT_CONN_STATE_RANGE(conn, IBNAL_CONN_ESTABLISHED,
+ IBNAL_CONN_DISCONNECTED);
+
+ CDEBUG(D_NET, "rx %p conn %p\n", rx, conn);
+ LASSERT (rx->rx_posted);
+ rx->rx_posted = 0;
+ mb();
+
+ /* receives complete with error in any case after we've started
+ * disconnecting */
+ if (conn->ibc_state > IBNAL_CONN_ESTABLISHED)
+ goto failed;
+
+ if (wc->Status != WRStatusSuccess) {
+ CERROR("Rx from "LPX64" failed: %d\n",
+ conn->ibc_peer->ibp_nid, wc->Status);
+ goto failed;
+ }
+
+ if (nob < base_nob) {
+ CERROR ("Short rx from "LPX64": %d < expected %d\n",
+ conn->ibc_peer->ibp_nid, nob, base_nob);
+ goto failed;
+ }
+
+ hexdump("rx", rx->rx_msg, sizeof(kib_msg_t));
+
+ /* Receiver does any byte flipping if necessary... */
+
+ if (msg->ibm_magic == IBNAL_MSG_MAGIC) {
+ flipped = 0;
+ } else {
+ if (msg->ibm_magic != __swab32(IBNAL_MSG_MAGIC)) {
+ CERROR ("Unrecognised magic: %08x from "LPX64"\n",
+ msg->ibm_magic, conn->ibc_peer->ibp_nid);
+ goto failed;
+ }
+ flipped = 1;
+ __swab16s (&msg->ibm_version);
+ LASSERT (sizeof(msg->ibm_type) == 1);
+ LASSERT (sizeof(msg->ibm_credits) == 1);
+ }
+
+ if (msg->ibm_version != IBNAL_MSG_VERSION) {
+ CERROR ("Incompatible msg version %d (%d expected)\n",
+ msg->ibm_version, IBNAL_MSG_VERSION);
+ goto failed;
+ }
+
+#if IBNAL_CKSUM
+ if (nob != msg->ibm_nob) {
+ CERROR ("Unexpected # bytes %d (%d expected)\n", nob, msg->ibm_nob);
+ goto failed;
+ }
+
+ msg_cksum = le32_to_cpu(msg->ibm_cksum);
+ msg->ibm_cksum = 0;
+ computed_cksum = kibnal_cksum (msg, nob);
+
+ if (msg_cksum != computed_cksum) {
+ CERROR ("Checksum failure %d: (%d expected)\n",
+ computed_cksum, msg_cksum);
+// goto failed;
+ }
+ CDEBUG(D_NET, "cksum %x, nob %d\n", computed_cksum, nob);
+#endif
+
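+ /* NB credits ride on every message, including NOOPs, so a peer
+ * with no payload to send can still return rx buffers to us. */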
+ /* Have I received credits that will let me send? */
+ credits = msg->ibm_credits;
+ if (credits != 0) {
+ spin_lock_irqsave(&conn->ibc_lock, flags);
+ conn->ibc_credits += credits;
+ spin_unlock_irqrestore(&conn->ibc_lock, flags);
+
+ kibnal_check_sends(conn);
+ }
+
+ switch (msg->ibm_type) {
+ case IBNAL_MSG_NOOP:
+ kibnal_post_rx (rx, 1);
+ return;
+
+ case IBNAL_MSG_IMMEDIATE:
+ if (nob < base_nob + sizeof (kib_immediate_msg_t)) {
+ CERROR ("Short IMMEDIATE from "LPX64": %d\n",
+ conn->ibc_peer->ibp_nid, nob);
+ goto failed;
+ }
+ break;
+
+ case IBNAL_MSG_PUT_RDMA:
+ case IBNAL_MSG_GET_RDMA:
+ if (nob < base_nob + sizeof (kib_rdma_msg_t)) {
+ CERROR ("Short RDMA msg from "LPX64": %d\n",
+ conn->ibc_peer->ibp_nid, nob);
+ goto failed;
+ }
+ if (flipped)
+ __swab32s(&msg->ibm_u.rdma.ibrm_num_descs);
+
+ CDEBUG(D_NET, "%d RDMA: cookie "LPX64":\n",
+ msg->ibm_type, msg->ibm_u.rdma.ibrm_cookie);
+
+ if ((msg->ibm_u.rdma.ibrm_num_descs > PTL_MD_MAX_IOV) ||
+ (kib_rdma_msg_len(msg->ibm_u.rdma.ibrm_num_descs) >
+ min(nob, IBNAL_MSG_SIZE))) {
+ CERROR ("num_descs %d too large\n",
+ msg->ibm_u.rdma.ibrm_num_descs);
+ goto failed;
+ }
+
+ for(i = 0; i < msg->ibm_u.rdma.ibrm_num_descs; i++) {
+ kib_rdma_desc_t *desc = &msg->ibm_u.rdma.ibrm_desc[i];
+
+ if (flipped) {
+ __swab32s(&desc->rd_key);
+ __swab32s(&desc->rd_nob);
+ __swab64s(&desc->rd_addr);
+ }
+
+ CDEBUG(D_NET, " key %x, " "addr "LPX64", nob %u\n",
+ desc->rd_key, desc->rd_addr, desc->rd_nob);
+ }
+ break;
+
+ case IBNAL_MSG_PUT_DONE:
+ case IBNAL_MSG_GET_DONE:
+ if (nob < base_nob + sizeof (kib_completion_msg_t)) {
+ CERROR ("Short COMPLETION msg from "LPX64": %d\n",
+ conn->ibc_peer->ibp_nid, nob);
+ goto failed;
+ }
+ if (flipped)
+ __swab32s(&msg->ibm_u.completion.ibcm_status);
+
+ CDEBUG(D_NET, "%d DONE: cookie "LPX64", status %d\n",
+ msg->ibm_type, msg->ibm_u.completion.ibcm_cookie,
+ msg->ibm_u.completion.ibcm_status);
+
+ kibnal_complete_passive_rdma (conn,
+ msg->ibm_u.completion.ibcm_cookie,
+ msg->ibm_u.completion.ibcm_status);
+ kibnal_post_rx (rx, 1);
+ return;
+
+ default:
+ CERROR ("Can't parse type from "LPX64": %d\n",
+ conn->ibc_peer->ibp_nid, msg->ibm_type);
+ goto failed;
+ }
+
+ /* schedule for kibnal_rx() in thread context */
+ spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
+
+ list_add_tail (&rx->rx_list, &kibnal_data.kib_sched_rxq);
+ wake_up (&kibnal_data.kib_sched_waitq);
+
+ spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags);
+ return;
+
+ failed:
+ CDEBUG(D_NET, "rx %p conn %p\n", rx, conn);
+ kibnal_close_conn(conn, -ECONNABORTED);
+
+ /* Don't re-post rx & drop its ref on conn */
+ kibnal_put_conn(conn);
+}
+
+void
+kibnal_rx (kib_rx_t *rx)
+{
+ kib_msg_t *msg = rx->rx_msg;
+
+ /* Clear flag so I can detect if I've sent an RDMA completion */
+ rx->rx_rdma = 0;
+
+ switch (msg->ibm_type) {
+ case IBNAL_MSG_GET_RDMA:
+ lib_parse(&kibnal_lib, &msg->ibm_u.rdma.ibrm_hdr, rx);
+ /* If the incoming get was matched, I'll have initiated the
+ * RDMA and the completion message... */
+ if (rx->rx_rdma)
+ break;
+
+ /* Otherwise, I'll send a failed completion now to prevent
+ * the peer's GET blocking for the full timeout. */
+ CERROR ("Completing unmatched RDMA GET from "LPX64"\n",
+ rx->rx_conn->ibc_peer->ibp_nid);
+ kibnal_start_active_rdma (IBNAL_MSG_GET_DONE, -EIO,
+ rx, NULL, 0, NULL, NULL, 0, 0);
+ break;
+
+ case IBNAL_MSG_PUT_RDMA:
+ lib_parse(&kibnal_lib, &msg->ibm_u.rdma.ibrm_hdr, rx);
+ if (rx->rx_rdma)
+ break;
+ /* This is most unusual, since even if lib_parse() didn't
+ * match anything, it should have asked us to read (and
+ * discard) the payload. The portals header must be
+ * inconsistent with this message type, so it's the
+ * sender's fault for sending garbage and she can time
+ * herself out... */
+ CERROR ("Uncompleted RMDA PUT from "LPX64"\n",
+ rx->rx_conn->ibc_peer->ibp_nid);
+ break;
+
+ case IBNAL_MSG_IMMEDIATE:
+ lib_parse(&kibnal_lib, &msg->ibm_u.immediate.ibim_hdr, rx);
+ LASSERT (!rx->rx_rdma);
+ break;
+
+ default:
+ LBUG();
+ break;
+ }
+
+ kibnal_post_rx (rx, 1);
+}
+
+static struct page *
+kibnal_kvaddr_to_page (unsigned long vaddr)
+{
+ struct page *page;
+
+ if (vaddr >= VMALLOC_START &&
+ vaddr < VMALLOC_END)
+ page = vmalloc_to_page ((void *)vaddr);
+#if CONFIG_HIGHMEM
+ else if (vaddr >= PKMAP_BASE &&
+ vaddr < (PKMAP_BASE + LAST_PKMAP * PAGE_SIZE))
+ page = vmalloc_to_page ((void *)vaddr);
+ /* in 2.4 ^ just walks the page tables */
+#endif
+ else
+ page = virt_to_page (vaddr);
+
+ if (!VALID_PAGE (page))
+ page = NULL;
+
+ return page;
+}
+
+static void
+kibnal_fill_ibrm(kib_tx_t *tx, struct page *page, unsigned long page_offset,
+ unsigned long len, int active)
+{
+ kib_rdma_msg_t *ibrm = &tx->tx_msg->ibm_u.rdma;
+ kib_rdma_desc_t *desc;
+
+ LASSERTF(ibrm->ibrm_num_descs < PTL_MD_MAX_IOV, "%u\n",
+ ibrm->ibrm_num_descs);
+
+ desc = &ibrm->ibrm_desc[ibrm->ibrm_num_descs];
+ if (active)
+ desc->rd_key = kibnal_data.kib_md.md_lkey;
+ else
+ desc->rd_key = kibnal_data.kib_md.md_rkey;
+ desc->rd_nob = len;
+ desc->rd_addr = kibnal_page2phys(page) + page_offset +
+ kibnal_data.kib_md.md_addr;
+
+ ibrm->ibrm_num_descs++;
+}
+
+static int
+kibnal_map_rdma_iov(kib_tx_t *tx, unsigned long vaddr, int nob, int active)
+{
+ struct page *page;
+ int page_offset, len;
+
+ while (nob > 0) {
+ page = kibnal_kvaddr_to_page(vaddr);
+ if (page == NULL)
+ return -EFAULT;
+
+ page_offset = vaddr & (PAGE_SIZE - 1);
+ len = min(nob, (int)PAGE_SIZE - page_offset);
+
+ kibnal_fill_ibrm(tx, page, page_offset, len, active);
+ nob -= len;
+ vaddr += len;
+ }
+ return 0;
+}
+
+static int
+kibnal_map_iov (kib_tx_t *tx, IB_ACCESS_CONTROL access,
+ int niov, struct iovec *iov, int offset, int nob, int active)
+{
+ void *vaddr;
+ FSTATUS frc;
+
+ LASSERT (nob > 0);
+ LASSERT (niov > 0);
+ LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED);
+
+ while (offset >= iov->iov_len) {
+ offset -= iov->iov_len;
+ niov--;
+ iov++;
+ LASSERT (niov > 0);
+ }
+
+ if (nob > iov->iov_len - offset) {
+ CERROR ("Can't map multiple vaddr fragments\n");
+ return (-EMSGSIZE);
+ }
+
+ /* our large contiguous iov could be backed by multiple physical
+ * pages. */
+ if (kibnal_whole_mem()) {
+ int rc;
+ tx->tx_msg->ibm_u.rdma.ibrm_num_descs = 0;
+ rc = kibnal_map_rdma_iov(tx, (unsigned long)iov->iov_base +
+ offset, nob, active);
+ if (rc != 0) {
+ CERROR ("Can't map iov: %d\n", rc);
+ return rc;
+ }
+ return 0;
+ }
+
+ vaddr = (void *)(((unsigned long)iov->iov_base) + offset);
+ tx->tx_md.md_addr = (__u64)((unsigned long)vaddr);
+
+ frc = iibt_register_memory(kibnal_data.kib_hca, vaddr, nob,
+ kibnal_data.kib_pd, access,
+ &tx->tx_md.md_handle, &tx->tx_md.md_lkey,
+ &tx->tx_md.md_rkey);
+ if (frc != FSUCCESS) {
+ CERROR ("Can't map vaddr %p: %d\n", vaddr, frc);
+ return -EINVAL;
+ }
+
+ tx->tx_mapped = KIB_TX_MAPPED;
+ return (0);
+}
+
+static int
+kibnal_map_kiov (kib_tx_t *tx, IB_ACCESS_CONTROL access,
+ int nkiov, ptl_kiov_t *kiov,
+ int offset, int nob, int active)
+{
+ __u64 *phys = NULL;
+ int page_offset;
+ int nphys;
+ int resid;
+ int phys_size = 0;
+ FSTATUS frc;
+ int i, rc = 0;
+
+ CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob);
+
+ LASSERT (nob > 0);
+ LASSERT (nkiov > 0);
+ LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED);
+
+ while (offset >= kiov->kiov_len) {
+ offset -= kiov->kiov_len;
+ nkiov--;
+ kiov++;
+ LASSERT (nkiov > 0);
+ }
+
+ page_offset = kiov->kiov_offset + offset;
+ nphys = 1;
+
+ if (!kibnal_whole_mem()) {
+ phys_size = nkiov * sizeof (*phys);
+ PORTAL_ALLOC(phys, phys_size);
+ if (phys == NULL) {
+ CERROR ("Can't allocate tmp phys\n");
+ return (-ENOMEM);
+ }
+
+ phys[0] = kibnal_page2phys(kiov->kiov_page);
+ } else {
+ tx->tx_msg->ibm_u.rdma.ibrm_num_descs = 0;
+ kibnal_fill_ibrm(tx, kiov->kiov_page, kiov->kiov_offset,
+ kiov->kiov_len, active);
+ }
+
+ resid = nob - (kiov->kiov_len - offset);
+
+ while (resid > 0) {
+ kiov++;
+ nkiov--;
+ LASSERT (nkiov > 0);
+
+ if (kiov->kiov_offset != 0 ||
+ ((resid > PAGE_SIZE) &&
+ kiov->kiov_len < PAGE_SIZE)) {
+ /* Can't have gaps */
+ CERROR ("Can't make payload contiguous in I/O VM:"
+ "page %d, offset %d, len %d \n", nphys,
+ kiov->kiov_offset, kiov->kiov_len);
+
+ for (i = -nphys; i < nkiov; i++)
+ {
+ CERROR("kiov[%d] %p +%d for %d\n",
+ i, kiov[i].kiov_page, kiov[i].kiov_offset, kiov[i].kiov_len);
+ }
+
+ rc = -EINVAL;
+ goto out;
+ }
+
+ if (nphys == PTL_MD_MAX_IOV) {
+ CERROR ("payload too big (%d)\n", nphys);
+ rc = -EMSGSIZE;
+ goto out;
+ }
+
+ if (!kibnal_whole_mem()) {
+ LASSERT (nphys * sizeof (*phys) < phys_size);
+ phys[nphys] = kibnal_page2phys(kiov->kiov_page);
+ } else {
+ if (kib_rdma_msg_len(nphys) > IBNAL_MSG_SIZE) {
+ CERROR ("payload too big (%d)\n", nphys);
+ rc = -EMSGSIZE;
+ goto out;
+ }
+ kibnal_fill_ibrm(tx, kiov->kiov_page,
+ kiov->kiov_offset, kiov->kiov_len,
+ active);
+ }
+
+ nphys ++;
+ resid -= PAGE_SIZE;
+ }
+
+ if (kibnal_whole_mem())
+ goto out;
+
+#if 0
+ CWARN ("nphys %d, nob %d, page_offset %d\n", nphys, nob, page_offset);
+ for (i = 0; i < nphys; i++)
+ CWARN (" [%d] "LPX64"\n", i, phys[i]);
+#endif
+
+#if IBNAL_FMR
+#error "iibnal hasn't learned about FMR yet"
+ rc = ib_fmr_register_physical (kibnal_data.kib_fmr_pool,
+ phys, nphys,
+ &tx->tx_md.md_addr,
+ page_offset,
+ &tx->tx_md.md_handle.fmr,
+ &tx->tx_md.md_lkey,
+ &tx->tx_md.md_rkey);
+#else
+ frc = iibt_register_physical_memory(kibnal_data.kib_hca,
+ IBNAL_RDMA_BASE,
+ phys, nphys,
+ 0, /* offset */
+ kibnal_data.kib_pd,
+ access,
+ &tx->tx_md.md_handle,
+ &tx->tx_md.md_addr,
+ &tx->tx_md.md_lkey,
+ &tx->tx_md.md_rkey);
+#endif
+ if (frc == FSUCCESS) {
+ CDEBUG(D_NET, "Mapped %d pages %d bytes @ offset %d: lkey %x, rkey %x\n",
+ nphys, nob, page_offset, tx->tx_md.md_lkey, tx->tx_md.md_rkey);
+#if IBNAL_FMR
+ tx->tx_mapped = KIB_TX_MAPPED_FMR;
+#else
+ tx->tx_mapped = KIB_TX_MAPPED;
+#endif
+ } else {
+ CERROR ("Can't map phys: %d\n", rc);
+ rc = -EFAULT;
+ }
+
+ out:
+ if (phys != NULL)
+ PORTAL_FREE(phys, phys_size);
+ return (rc);
+}
+
+static kib_conn_t *
+kibnal_find_conn_locked (kib_peer_t *peer)
+{
+ struct list_head *tmp;
+
+ /* just return the first connection */
+ list_for_each (tmp, &peer->ibp_conns) {
+ return (list_entry(tmp, kib_conn_t, ibc_list));
+ }
+
+ return (NULL);
+}
+
+void
+kibnal_check_sends (kib_conn_t *conn)
+{
+ unsigned long flags;
+ kib_tx_t *tx;
+ int rc;
+ int i;
+ int done;
+ int nwork;
+ ENTRY;
+
+ spin_lock_irqsave (&conn->ibc_lock, flags);
+
+ LASSERT (conn->ibc_nsends_posted <= IBNAL_MSG_QUEUE_SIZE);
+
+ if (list_empty(&conn->ibc_tx_queue) &&
+ conn->ibc_outstanding_credits >= IBNAL_CREDIT_HIGHWATER) {
+ spin_unlock_irqrestore(&conn->ibc_lock, flags);
+
+ tx = kibnal_get_idle_tx(0); /* don't block */
+ if (tx != NULL)
+ kibnal_init_tx_msg(tx, IBNAL_MSG_NOOP, 0);
+
+ spin_lock_irqsave(&conn->ibc_lock, flags);
+
+ if (tx != NULL) {
+ atomic_inc(&conn->ibc_refcount);
+ kibnal_queue_tx_locked(tx, conn);
+ }
+ }
+
+ while (!list_empty (&conn->ibc_tx_queue)) {
+ tx = list_entry (conn->ibc_tx_queue.next, kib_tx_t, tx_list);
+
+ /* We rely on this for QP sizing */
+ LASSERT (tx->tx_nsp > 0 && tx->tx_nsp <= IBNAL_TX_MAX_SG);
+
+ LASSERT (conn->ibc_outstanding_credits >= 0);
+ LASSERT (conn->ibc_outstanding_credits <= IBNAL_MSG_QUEUE_SIZE);
+ LASSERT (conn->ibc_credits >= 0);
+ LASSERT (conn->ibc_credits <= IBNAL_MSG_QUEUE_SIZE);
+
+ /* Not on ibc_rdma_queue */
+ LASSERT (!tx->tx_passive_rdma_wait);
+
+ if (conn->ibc_nsends_posted == IBNAL_MSG_QUEUE_SIZE)
+ GOTO(out, 0);
+
+ if (conn->ibc_credits == 0) /* no credits */
+ GOTO(out, 1);
+
+ if (conn->ibc_credits == 1 && /* last credit reserved for */
+ conn->ibc_outstanding_credits == 0) /* giving back credits */
+ GOTO(out, 2);
+
+ list_del (&tx->tx_list);
+
+ if (tx->tx_msg->ibm_type == IBNAL_MSG_NOOP &&
+ (!list_empty(&conn->ibc_tx_queue) ||
+ conn->ibc_outstanding_credits < IBNAL_CREDIT_HIGHWATER)) {
+ /* redundant NOOP */
+ spin_unlock_irqrestore(&conn->ibc_lock, flags);
+ kibnal_tx_done(tx);
+ spin_lock_irqsave(&conn->ibc_lock, flags);
+ continue;
+ }
+
+ tx->tx_msg->ibm_credits = conn->ibc_outstanding_credits;
+ conn->ibc_outstanding_credits = 0;
+
+ conn->ibc_nsends_posted++;
+ conn->ibc_credits--;
+
+ /* we only get a tx completion for the final rdma op */
+ tx->tx_sending = min(tx->tx_nsp, 2);
+ tx->tx_passive_rdma_wait = tx->tx_passive_rdma;
+ list_add (&tx->tx_list, &conn->ibc_active_txs);
+#if IBNAL_CKSUM
+ tx->tx_msg->ibm_cksum = 0;
+ tx->tx_msg->ibm_cksum = kibnal_cksum(tx->tx_msg, tx->tx_msg->ibm_nob);
+ CDEBUG(D_NET, "cksum %x, nob %d\n", tx->tx_msg->ibm_cksum, tx->tx_msg->ibm_nob);
+#endif
+ spin_unlock_irqrestore (&conn->ibc_lock, flags);
+
+ /* NB the gap between removing tx from the queue and sending it
+ * allows message re-ordering to occur */
+
+ LASSERT (tx->tx_nsp > 0);
+
+ rc = -ECONNABORTED;
+ nwork = 0;
+ if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) {
+ tx->tx_status = 0;
+ /* Driver only accepts 1 item at a time */
+ for (i = 0; i < tx->tx_nsp; i++) {
+ hexdump("tx", tx->tx_msg, sizeof(kib_msg_t));
+ rc = iibt_postsend(conn->ibc_qp,
+ &tx->tx_wrq[i]);
+ if (rc != 0)
+ break;
+ if (wrq_signals_completion(&tx->tx_wrq[i]))
+ nwork++;
+ CDEBUG(D_NET, "posted tx wrq %p\n",
+ &tx->tx_wrq[i]);
+ }
+ }
+
+ spin_lock_irqsave (&conn->ibc_lock, flags);
+ if (rc != 0) {
+ /* NB credits are transferred in the actual
+ * message, which can only be the last work item */
+ conn->ibc_outstanding_credits += tx->tx_msg->ibm_credits;
+ conn->ibc_credits++;
+ conn->ibc_nsends_posted--;
+
+ tx->tx_status = rc;
+ tx->tx_passive_rdma_wait = 0;
+ tx->tx_sending -= tx->tx_nsp - nwork;
+
+ done = (tx->tx_sending == 0);
+ if (done)
+ list_del (&tx->tx_list);
+
+ spin_unlock_irqrestore (&conn->ibc_lock, flags);
+
+ if (conn->ibc_state == IBNAL_CONN_ESTABLISHED)
+ CERROR ("Error %d posting transmit to "LPX64"\n",
+ rc, conn->ibc_peer->ibp_nid);
+ else
+ CDEBUG (D_NET, "Error %d posting transmit to "
+ LPX64"\n", rc, conn->ibc_peer->ibp_nid);
+
+ kibnal_close_conn (conn, rc);
+
+ if (done)
+ kibnal_tx_done (tx);
+ return;
+ }
+
+ }
+
+ EXIT;
+out:
+ spin_unlock_irqrestore (&conn->ibc_lock, flags);
+}
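+
+/* the credit scheme in brief: each tx spends one of ibc_credits (an rx
+ * buffer the peer has posted for us) and carries ibc_outstanding_credits
+ * (rx buffers we've reposted) back to the peer. The last credit is only
+ * spent when it also returns credits, so neither side can strand the
+ * other with no buffers. */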
+
+static void
+kibnal_tx_callback (IB_WORK_COMPLETION *wc)
+{
+ kib_tx_t *tx = (kib_tx_t *)kibnal_wreqid2ptr(wc->WorkReqId);
+ kib_conn_t *conn;
+ unsigned long flags;
+ int idle;
+
+ conn = tx->tx_conn;
+ LASSERT (conn != NULL);
+ LASSERT (tx->tx_sending != 0);
+
+ spin_lock_irqsave(&conn->ibc_lock, flags);
+
+ CDEBUG(D_NET, "conn %p tx %p [%d/%d]: %d\n", conn, tx,
+ tx->tx_sending, tx->tx_nsp, wc->Status);
+
+ /* I could be racing with rdma completion. Whoever makes 'tx' idle
+ * gets to free it, which also drops its ref on 'conn'. If it's
+ * not me, then I take an extra ref on conn so it can't disappear
+ * under me. */
+
+ tx->tx_sending--;
+ idle = (tx->tx_sending == 0) && /* This is the final callback */
+ (!tx->tx_passive_rdma_wait); /* Not waiting for RDMA completion */
+ if (idle)
+ list_del(&tx->tx_list);
+
+ CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
+ conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
+ atomic_read (&conn->ibc_refcount));
+ atomic_inc (&conn->ibc_refcount);
+
+ if (tx->tx_sending == 0)
+ conn->ibc_nsends_posted--;
+
+ if (wc->Status != WRStatusSuccess &&
+ tx->tx_status == 0)
+ tx->tx_status = -ECONNABORTED;
+
+ spin_unlock_irqrestore(&conn->ibc_lock, flags);
+
+ if (idle)
+ kibnal_tx_done (tx);
+
+ if (wc->Status != WRStatusSuccess) {
+ CERROR ("Tx completion to "LPX64" failed: %d\n",
+ conn->ibc_peer->ibp_nid, wc->Status);
+ kibnal_close_conn (conn, -ENETDOWN);
+ } else {
+ /* can I shovel some more sends out the door? */
+ kibnal_check_sends(conn);
+ }
+
+ kibnal_put_conn (conn);
+}
+
+void
+kibnal_ca_async_callback (void *ca_arg, IB_EVENT_RECORD *ev)
+{
+ /* XXX flesh out. this seems largely for async errors */
+ CERROR("type: %d code: %u\n", ev->EventType, ev->EventCode);
+}
+
+void
+kibnal_ca_callback (void *ca_arg, void *cq_arg)
+{
+ IB_HANDLE cq = *(IB_HANDLE *)cq_arg;
+ IB_HANDLE ca = *(IB_HANDLE *)ca_arg;
+ IB_WORK_COMPLETION wc;
+ int armed = 0;
+
+ CDEBUG(D_NET, "ca %p cq %p\n", ca, cq);
+
+ for(;;) {
+ while (iibt_cq_poll(cq, &wc) == FSUCCESS) {
+ if (kibnal_wreqid_is_rx(wc.WorkReqId))
+ kibnal_rx_callback(&wc);
+ else
+ kibnal_tx_callback(&wc);
+ }
+ if (armed)
+ return;
+ if (iibt_cq_rearm(cq, CQEventSelNextWC) != FSUCCESS) {
+ CERROR("rearm failed?\n");
+ return;
+ }
+ armed = 1;
+ }
+}
+
+void
+kibnal_init_tx_msg (kib_tx_t *tx, int type, int body_nob)
+{
+ IB_LOCAL_DATASEGMENT *gl = &tx->tx_gl[tx->tx_nsp];
+ IB_WORK_REQ *wrq = &tx->tx_wrq[tx->tx_nsp];
+ int fence;
+ int nob = offsetof (kib_msg_t, ibm_u) + body_nob;
+
+ LASSERT (tx->tx_nsp >= 0 &&
+ tx->tx_nsp < sizeof(tx->tx_wrq)/sizeof(tx->tx_wrq[0]));
+ LASSERT (nob <= IBNAL_MSG_SIZE);
+
+ tx->tx_msg->ibm_magic = IBNAL_MSG_MAGIC;
+ tx->tx_msg->ibm_version = IBNAL_MSG_VERSION;
+ tx->tx_msg->ibm_type = type;
+#if IBNAL_CKSUM
+ tx->tx_msg->ibm_nob = nob;
+#endif
+ /* Fence the message if it's bundled with an RDMA read */
+ fence = (tx->tx_nsp > 0) &&
+ (type == IBNAL_MSG_PUT_DONE);
+
+ *gl = (IB_LOCAL_DATASEGMENT) {
+ .Address = tx->tx_vaddr,
+ .Length = IBNAL_MSG_SIZE,
+ .Lkey = kibnal_lkey(kibnal_data.kib_tx_pages),
+ };
+
+ wrq->WorkReqId = kibnal_ptr2wreqid(tx, 0);
+ wrq->Operation = WROpSend;
+ wrq->DSList = gl;
+ wrq->DSListDepth = 1;
+ wrq->MessageLen = nob;
+ wrq->Req.SendRC.ImmediateData = 0;
+ wrq->Req.SendRC.Options.s.SolicitedEvent = 1;
+ wrq->Req.SendRC.Options.s.SignaledCompletion = 1;
+ wrq->Req.SendRC.Options.s.ImmediateData = 0;
+ wrq->Req.SendRC.Options.s.Fence = fence;
+
+ tx->tx_nsp++;
+}
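+
+/* NB a PUT_DONE is queued on the same QP as the RDMA read work requests
+ * that fetch the data, so the fence set above keeps the completion
+ * message from overtaking the reads. */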
+
+static void
+kibnal_queue_tx (kib_tx_t *tx, kib_conn_t *conn)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&conn->ibc_lock, flags);
+
+ kibnal_queue_tx_locked (tx, conn);
+
+ spin_unlock_irqrestore(&conn->ibc_lock, flags);
+
+ kibnal_check_sends(conn);
+}
+
+static void
+kibnal_launch_tx (kib_tx_t *tx, ptl_nid_t nid)
+{
+ unsigned long flags;
+ kib_peer_t *peer;
+ kib_conn_t *conn;
+ rwlock_t *g_lock = &kibnal_data.kib_global_lock;
+
+ /* If I get here, I've committed to send, so I complete the tx with
+ * failure on any problems */
+
+ LASSERT (tx->tx_conn == NULL); /* only set when assigned a conn */
+ LASSERT (tx->tx_nsp > 0); /* work items have been set up */
+
+ read_lock (g_lock);
+
+ peer = kibnal_find_peer_locked (nid);
+ if (peer == NULL) {
+ read_unlock (g_lock);
+ tx->tx_status = -EHOSTUNREACH;
+ kibnal_tx_done (tx);
+ return;
+ }
+
+ conn = kibnal_find_conn_locked (peer);
+ if (conn != NULL) {
+ CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
+ conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
+ atomic_read (&conn->ibc_refcount));
+ atomic_inc (&conn->ibc_refcount); /* 1 ref for the tx */
+ read_unlock (g_lock);
+
+ kibnal_queue_tx (tx, conn);
+ return;
+ }
+
+ /* Making one or more connections; I'll need a write lock... */
+ read_unlock (g_lock);
+ write_lock_irqsave (g_lock, flags);
+
+ peer = kibnal_find_peer_locked (nid);
+ if (peer == NULL) {
+ write_unlock_irqrestore (g_lock, flags);
+ tx->tx_status = -EHOSTUNREACH;
+ kibnal_tx_done (tx);
+ return;
+ }
+
+ conn = kibnal_find_conn_locked (peer);
+ if (conn != NULL) {
+ /* Connection exists; queue message on it */
+ CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
+ conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
+ atomic_read (&conn->ibc_refcount));
+ atomic_inc (&conn->ibc_refcount); /* 1 ref for the tx */
+ write_unlock_irqrestore (g_lock, flags);
+
+ kibnal_queue_tx (tx, conn);
+ return;
+ }
+
+ if (peer->ibp_connecting == 0) {
+ if (!time_after_eq(jiffies, peer->ibp_reconnect_time)) {
+ write_unlock_irqrestore (g_lock, flags);
+ tx->tx_status = -EHOSTUNREACH;
+ kibnal_tx_done (tx);
+ return;
+ }
+
+ peer->ibp_connecting = 1;
+ kib_peer_addref(peer); /* extra ref for connd */
+
+ spin_lock (&kibnal_data.kib_connd_lock);
+
+ list_add_tail (&peer->ibp_connd_list,
+ &kibnal_data.kib_connd_peers);
+ wake_up (&kibnal_data.kib_connd_waitq);
+
+ spin_unlock (&kibnal_data.kib_connd_lock);
+ }
+
+ /* A connection is being established; queue the message... */
+ list_add_tail (&tx->tx_list, &peer->ibp_tx_queue);
+
+ write_unlock_irqrestore (g_lock, flags);
+}
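+
+/* NB kibnal_launch_tx only takes the global lock for writing when a
+ * connection must be created; the peer and conn lookups are repeated
+ * after the upgrade since the tables may have changed between dropping
+ * the read lock and getting the write lock. */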
+
+static ptl_err_t
+kibnal_start_passive_rdma (int type, ptl_nid_t nid,
+ lib_msg_t *libmsg, ptl_hdr_t *hdr)
+{
+ int nob = libmsg->md->length;
+ kib_tx_t *tx;
+ kib_msg_t *ibmsg;
+ int rc;
+ IB_ACCESS_CONTROL access = {0,};
+
+ LASSERT (type == IBNAL_MSG_PUT_RDMA || type == IBNAL_MSG_GET_RDMA);
+ LASSERT (nob > 0);
+ LASSERT (!in_interrupt()); /* Mapping could block */
+
+ access.s.MWBindable = 1;
+ access.s.LocalWrite = 1;
+ access.s.RdmaRead = 1;
+ access.s.RdmaWrite = 1;
+
+ tx = kibnal_get_idle_tx (1); /* May block; caller is an app thread */
+ LASSERT (tx != NULL);
+
+ if ((libmsg->md->options & PTL_MD_KIOV) == 0)
+ rc = kibnal_map_iov (tx, access,
+ libmsg->md->md_niov,
+ libmsg->md->md_iov.iov,
+ 0, nob, 0);
+ else
+ rc = kibnal_map_kiov (tx, access,
+ libmsg->md->md_niov,
+ libmsg->md->md_iov.kiov,
+ 0, nob, 0);
+
+ if (rc != 0) {
+ CERROR ("Can't map RDMA for "LPX64": %d\n", nid, rc);
+ goto failed;
+ }
+
+ if (type == IBNAL_MSG_GET_RDMA) {
+ /* reply gets finalized when tx completes */
+ tx->tx_libmsg[1] = lib_create_reply_msg(&kibnal_lib,
+ nid, libmsg);
+ if (tx->tx_libmsg[1] == NULL) {
+ CERROR ("Can't create reply for GET -> "LPX64"\n",
+ nid);
+ rc = -ENOMEM;
+ goto failed;
+ }
+ }
+
+ tx->tx_passive_rdma = 1;
+
+ ibmsg = tx->tx_msg;
+
+ ibmsg->ibm_u.rdma.ibrm_hdr = *hdr;
+ ibmsg->ibm_u.rdma.ibrm_cookie = tx->tx_passive_rdma_cookie;
+ /* map_kiov already filled the rdma descs for the whole_mem case */
+ if (!kibnal_whole_mem()) {
+ ibmsg->ibm_u.rdma.ibrm_desc[0].rd_key = tx->tx_md.md_rkey;
+ ibmsg->ibm_u.rdma.ibrm_desc[0].rd_addr = tx->tx_md.md_addr;
+ ibmsg->ibm_u.rdma.ibrm_desc[0].rd_nob = nob;
+ ibmsg->ibm_u.rdma.ibrm_num_descs = 1;
+ }
+
+ kibnal_init_tx_msg (tx, type,
+ kib_rdma_msg_len(ibmsg->ibm_u.rdma.ibrm_num_descs));
+
+ CDEBUG(D_NET, "Passive: %p cookie "LPX64", key %x, addr "
+ LPX64", nob %d\n",
+ tx, tx->tx_passive_rdma_cookie, tx->tx_md.md_rkey,
+ tx->tx_md.md_addr, nob);
+
+ /* libmsg gets finalized when tx completes. */
+ tx->tx_libmsg[0] = libmsg;
+
+ kibnal_launch_tx(tx, nid);
+ return (PTL_OK);
+
+ failed:
+ tx->tx_status = rc;
+ kibnal_tx_done (tx);
+ return (PTL_FAIL);
+}
+
+void
+kibnal_start_active_rdma (int type, int status,
+ kib_rx_t *rx, lib_msg_t *libmsg,
+ unsigned int niov,
+ struct iovec *iov, ptl_kiov_t *kiov,
+ size_t offset, size_t nob)
+{
+ kib_msg_t *rxmsg = rx->rx_msg;
+ kib_msg_t *txmsg;
+ kib_tx_t *tx;
+ IB_ACCESS_CONTROL access = {0,};
+ IB_WR_OP rdma_op;
+ int rc;
+ __u32 i;
+
+ CDEBUG(D_NET, "type %d, status %d, niov %d, offset %d, nob %d\n",
+ type, status, niov, offset, nob);
+
+ /* Called by scheduler */
+ LASSERT (!in_interrupt ());
+
+ /* Either all pages or all vaddrs */
+ LASSERT (!(kiov != NULL && iov != NULL));
+
+ /* No data if we're completing with failure */
+ LASSERT (status == 0 || nob == 0);
+
+ LASSERT (type == IBNAL_MSG_GET_DONE ||
+ type == IBNAL_MSG_PUT_DONE);
+
+ /* Flag I'm completing the RDMA. Even if I fail to send the
+ * completion message, I will have tried my best so further
+ * attempts shouldn't be tried. */
+ LASSERT (!rx->rx_rdma);
+ rx->rx_rdma = 1;
+
+ if (type == IBNAL_MSG_GET_DONE) {
+ rdma_op = WROpRdmaWrite;
+ LASSERT (rxmsg->ibm_type == IBNAL_MSG_GET_RDMA);
+ } else {
+ access.s.LocalWrite = 1;
+ rdma_op = WROpRdmaRead;
+ LASSERT (rxmsg->ibm_type == IBNAL_MSG_PUT_RDMA);
+ }
+
+ tx = kibnal_get_idle_tx (0); /* Mustn't block */
+ if (tx == NULL) {
+ CERROR ("tx descs exhausted on RDMA from "LPX64
+ " completing locally with failure\n",
+ rx->rx_conn->ibc_peer->ibp_nid);
+ lib_finalize (&kibnal_lib, NULL, libmsg, PTL_NO_SPACE);
+ return;
+ }
+ LASSERT (tx->tx_nsp == 0);
+
+ if (nob == 0)
+ GOTO(init_tx, 0);
+
+ /* We actually need to transfer some data (the transfer
+ * size could get truncated to zero when the incoming
+ * message is matched) */
+ if (kiov != NULL)
+ rc = kibnal_map_kiov (tx, access, niov, kiov, offset, nob, 1);
+ else
+ rc = kibnal_map_iov (tx, access, niov, iov, offset, nob, 1);
+
+ if (rc != 0) {
+ CERROR ("Can't map RDMA -> "LPX64": %d\n",
+ rx->rx_conn->ibc_peer->ibp_nid, rc);
+ /* We'll skip the RDMA and complete with failure. */
+ status = rc;
+ nob = 0;
+ GOTO(init_tx, rc);
+ }
+
+ if (!kibnal_whole_mem()) {
+ tx->tx_msg->ibm_u.rdma.ibrm_desc[0].rd_key = tx->tx_md.md_lkey;
+ tx->tx_msg->ibm_u.rdma.ibrm_desc[0].rd_addr = tx->tx_md.md_addr;
+ tx->tx_msg->ibm_u.rdma.ibrm_desc[0].rd_nob = nob;
+ tx->tx_msg->ibm_u.rdma.ibrm_num_descs = 1;
+ }
+
+ /* XXX ugh. different page-sized hosts. */
+ if (tx->tx_msg->ibm_u.rdma.ibrm_num_descs !=
+ rxmsg->ibm_u.rdma.ibrm_num_descs) {
+ CERROR("tx descs (%u) != rx descs (%u)\n",
+ tx->tx_msg->ibm_u.rdma.ibrm_num_descs,
+ rxmsg->ibm_u.rdma.ibrm_num_descs);
+ /* We'll skip the RDMA and complete with failure. */
+ status = -EINVAL;
+ nob = 0;
+ GOTO(init_tx, rc);
+ }
+
+ /* map_kiov filled in the rdma descs which describe our side of the
+ * rdma transfer. */
+ /* ibrm_num_descs was verified in rx_callback */
+ for(i = 0; i < rxmsg->ibm_u.rdma.ibrm_num_descs; i++) {
+ kib_rdma_desc_t *ldesc, *rdesc; /* local, remote */
+ IB_LOCAL_DATASEGMENT *ds = &tx->tx_gl[i];
+ IB_WORK_REQ *wrq = &tx->tx_wrq[i];
+
+ ldesc = &tx->tx_msg->ibm_u.rdma.ibrm_desc[i];
+ rdesc = &rxmsg->ibm_u.rdma.ibrm_desc[i];
+
+ ds->Address = ldesc->rd_addr;
+ ds->Length = ldesc->rd_nob;
+ ds->Lkey = ldesc->rd_key;
+
+ memset(wrq, 0, sizeof(*wrq));
+ wrq->WorkReqId = kibnal_ptr2wreqid(tx, 0);
+ wrq->Operation = rdma_op;
+ wrq->DSList = ds;
+ wrq->DSListDepth = 1;
+ wrq->MessageLen = ds->Length;
+ wrq->Req.SendRC.ImmediateData = 0;
+ wrq->Req.SendRC.Options.s.SolicitedEvent = 0;
+ wrq->Req.SendRC.Options.s.SignaledCompletion = 0;
+ wrq->Req.SendRC.Options.s.ImmediateData = 0;
+ wrq->Req.SendRC.Options.s.Fence = 0;
+ wrq->Req.SendRC.RemoteDS.Address = rdesc->rd_addr;
+ wrq->Req.SendRC.RemoteDS.Rkey = rdesc->rd_key;
+
+ /* only the last rdma post triggers tx completion */
+ if (i == rxmsg->ibm_u.rdma.ibrm_num_descs - 1)
+ wrq->Req.SendRC.Options.s.SignaledCompletion = 1;
+
+ tx->tx_nsp++;
+ }
+
+init_tx:
+ txmsg = tx->tx_msg;
+
+ txmsg->ibm_u.completion.ibcm_cookie = rxmsg->ibm_u.rdma.ibrm_cookie;
+ txmsg->ibm_u.completion.ibcm_status = status;
+
+ kibnal_init_tx_msg(tx, type, sizeof (kib_completion_msg_t));
+
+ if (status == 0 && nob != 0) {
+ LASSERT (tx->tx_nsp > 1);
+ /* RDMA: libmsg gets finalized when the tx completes. This
+ * is after the completion message has been sent, which in
+ * turn is after the RDMA has finished. */
+ tx->tx_libmsg[0] = libmsg;
+ } else {
+ LASSERT (tx->tx_nsp == 1);
+ /* No RDMA: local completion happens now! */
+ CDEBUG(D_WARNING,"No data: immediate completion\n");
+ lib_finalize (&kibnal_lib, NULL, libmsg,
+ status == 0 ? PTL_OK : PTL_FAIL);
+ }
+
+ /* +1 ref for this tx... */
+ CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
+ rx->rx_conn, rx->rx_conn->ibc_state,
+ rx->rx_conn->ibc_peer->ibp_nid,
+ atomic_read (&rx->rx_conn->ibc_refcount));
+ atomic_inc (&rx->rx_conn->ibc_refcount);
+ /* ...and queue it up */
+ kibnal_queue_tx(tx, rx->rx_conn);
+}
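+
+/* NB the passive side (kibnal_start_passive_rdma) only maps and
+ * advertises its buffers; the active side posts the actual RDMA
+ * read/write work requests and then sends the DONE message quoting the
+ * passive side's cookie so it can complete its tx. */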
+
+static ptl_err_t
+kibnal_sendmsg(lib_nal_t *nal,
+ void *private,
+ lib_msg_t *libmsg,
+ ptl_hdr_t *hdr,
+ int type,
+ ptl_nid_t nid,
+ ptl_pid_t pid,
+ unsigned int payload_niov,
+ struct iovec *payload_iov,
+ ptl_kiov_t *payload_kiov,
+ size_t payload_offset,
+ size_t payload_nob)
+{
+ kib_msg_t *ibmsg;
+ kib_tx_t *tx;
+ int nob;
+
+ /* NB 'private' is different depending on what we're sending.... */
+
+ CDEBUG(D_NET, "sending "LPSZ" bytes in %d frags to nid:"LPX64
+ " pid %d\n", payload_nob, payload_niov, nid , pid);
+
+ LASSERT (payload_nob == 0 || payload_niov > 0);
+ LASSERT (payload_niov <= PTL_MD_MAX_IOV);
+
+ /* Thread context if we're sending payload */
+ LASSERT (!in_interrupt() || payload_niov == 0);
+ /* payload is either all vaddrs or all pages */
+ LASSERT (!(payload_kiov != NULL && payload_iov != NULL));
+
+ switch (type) {
+ default:
+ LBUG();
+ return (PTL_FAIL);
+
+ case PTL_MSG_REPLY: {
+ /* reply's 'private' is the incoming receive */
+ kib_rx_t *rx = private;
+
+ /* RDMA reply expected? */
+ if (rx->rx_msg->ibm_type == IBNAL_MSG_GET_RDMA) {
+ kibnal_start_active_rdma(IBNAL_MSG_GET_DONE, 0,
+ rx, libmsg, payload_niov,
+ payload_iov, payload_kiov,
+ payload_offset, payload_nob);
+ return (PTL_OK);
+ }
+
+ /* Incoming message consistent with immediate reply? */
+ if (rx->rx_msg->ibm_type != IBNAL_MSG_IMMEDIATE) {
+ CERROR ("REPLY to "LPX64" bad opbm type %d!!!\n",
+ nid, rx->rx_msg->ibm_type);
+ return (PTL_FAIL);
+ }
+
+ /* Will it fit in a message? */
+ nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]);
+ if (nob >= IBNAL_MSG_SIZE) {
+ CERROR("REPLY for "LPX64" too big (RDMA not requested): %d\n",
+ nid, payload_nob);
+ return (PTL_FAIL);
+ }
+ break;
+ }
+
+ case PTL_MSG_GET:
+ /* might the REPLY message be big enough to need RDMA? */
+ nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[libmsg->md->length]);
+ if (nob > IBNAL_MSG_SIZE)
+ return (kibnal_start_passive_rdma(IBNAL_MSG_GET_RDMA,
+ nid, libmsg, hdr));
+ break;
+
+ case PTL_MSG_ACK:
+ LASSERT (payload_nob == 0);
+ break;
+
+ case PTL_MSG_PUT:
+ /* Is the payload big enough to need RDMA? */
+ nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]);
+ if (nob > IBNAL_MSG_SIZE)
+ return (kibnal_start_passive_rdma(IBNAL_MSG_PUT_RDMA,
+ nid, libmsg, hdr));
+
+ break;
+ }
+
+ tx = kibnal_get_idle_tx(!(type == PTL_MSG_ACK ||
+ type == PTL_MSG_REPLY ||
+ in_interrupt()));
+ if (tx == NULL) {
+ CERROR ("Can't send %d to "LPX64": tx descs exhausted%s\n",
+ type, nid, in_interrupt() ? " (intr)" : "");
+ return (PTL_NO_SPACE);
+ }
+
+ ibmsg = tx->tx_msg;
+ ibmsg->ibm_u.immediate.ibim_hdr = *hdr;
+
+ if (payload_nob > 0) {
+ if (payload_kiov != NULL)
+ lib_copy_kiov2buf(ibmsg->ibm_u.immediate.ibim_payload,
+ payload_niov, payload_kiov,
+ payload_offset, payload_nob);
+ else
+ lib_copy_iov2buf(ibmsg->ibm_u.immediate.ibim_payload,
+ payload_niov, payload_iov,
+ payload_offset, payload_nob);
+ }
+
+ kibnal_init_tx_msg (tx, IBNAL_MSG_IMMEDIATE,
+ offsetof(kib_immediate_msg_t,
+ ibim_payload[payload_nob]));
+
+ /* libmsg gets finalized when tx completes */
+ tx->tx_libmsg[0] = libmsg;
+
+ kibnal_launch_tx(tx, nid);
+ return (PTL_OK);
+}
+
+static ptl_err_t
+kibnal_send (lib_nal_t *nal, void *private, lib_msg_t *cookie,
+ ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
+ unsigned int payload_niov, struct iovec *payload_iov,
+ size_t payload_offset, size_t payload_len)
+{
+ return (kibnal_sendmsg(nal, private, cookie,
+ hdr, type, nid, pid,
+ payload_niov, payload_iov, NULL,
+ payload_offset, payload_len));
+}
+
+static ptl_err_t
+kibnal_send_pages (lib_nal_t *nal, void *private, lib_msg_t *cookie,
+ ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
+ unsigned int payload_niov, ptl_kiov_t *payload_kiov,
+ size_t payload_offset, size_t payload_len)
+{
+ return (kibnal_sendmsg(nal, private, cookie,
+ hdr, type, nid, pid,
+ payload_niov, NULL, payload_kiov,
+ payload_offset, payload_len));
+}
+
+static ptl_err_t
+kibnal_recvmsg (lib_nal_t *nal, void *private, lib_msg_t *libmsg,
+ unsigned int niov, struct iovec *iov, ptl_kiov_t *kiov,
+ size_t offset, size_t mlen, size_t rlen)
+{
+ kib_rx_t *rx = private;
+ kib_msg_t *rxmsg = rx->rx_msg;
+ int msg_nob;
+
+ LASSERT (mlen <= rlen);
+ LASSERT (!in_interrupt ());
+ /* Either all pages or all vaddrs */
+ LASSERT (!(kiov != NULL && iov != NULL));
+
+ switch (rxmsg->ibm_type) {
+ default:
+ LBUG();
+ return (PTL_FAIL);
+
+ case IBNAL_MSG_IMMEDIATE:
+ msg_nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[rlen]);
+ if (msg_nob > IBNAL_MSG_SIZE) {
+ CERROR ("Immediate message from "LPX64" too big: %d\n",
+ rxmsg->ibm_u.immediate.ibim_hdr.src_nid, rlen);
+ return (PTL_FAIL);
+ }
+
+ if (kiov != NULL)
+ lib_copy_buf2kiov(niov, kiov, offset,
+ rxmsg->ibm_u.immediate.ibim_payload,
+ mlen);
+ else
+ lib_copy_buf2iov(niov, iov, offset,
+ rxmsg->ibm_u.immediate.ibim_payload,
+ mlen);
+
+ lib_finalize (nal, NULL, libmsg, PTL_OK);
+ return (PTL_OK);
+
+ case IBNAL_MSG_GET_RDMA:
+ /* We get called here just to discard any junk after the
+ * GET hdr. */
+ LASSERT (libmsg == NULL);
+ lib_finalize (nal, NULL, libmsg, PTL_OK);
+ return (PTL_OK);
+
+ case IBNAL_MSG_PUT_RDMA:
+ kibnal_start_active_rdma (IBNAL_MSG_PUT_DONE, 0,
+ rx, libmsg,
+ niov, iov, kiov, offset, mlen);
+ return (PTL_OK);
+ }
+}
+
+static ptl_err_t
+kibnal_recv (lib_nal_t *nal, void *private, lib_msg_t *msg,
+ unsigned int niov, struct iovec *iov,
+ size_t offset, size_t mlen, size_t rlen)
+{
+ return (kibnal_recvmsg (nal, private, msg, niov, iov, NULL,
+ offset, mlen, rlen));
+}
+
+static ptl_err_t
+kibnal_recv_pages (lib_nal_t *nal, void *private, lib_msg_t *msg,
+ unsigned int niov, ptl_kiov_t *kiov,
+ size_t offset, size_t mlen, size_t rlen)
+{
+ return (kibnal_recvmsg (nal, private, msg, niov, NULL, kiov,
+ offset, mlen, rlen));
+}
+
+/*****************************************************************************
+ * the rest of this file concerns connection management. active connections
+ * start with connect_peer, passive connections start with passive_callback.
+ * active disconnects start with conn_close, cm_callback starts passive
+ * disconnects and contains the guts of how the disconnect state machine
+ * progresses.
+ *****************************************************************************/
+
+int
+kibnal_thread_start (int (*fn)(void *arg), void *arg)
+{
+ long pid = kernel_thread (fn, arg, 0);
+
+ if (pid < 0)
+ return ((int)pid);
+
+ atomic_inc (&kibnal_data.kib_nthreads);
+ return (0);
+}
+
+static void
+kibnal_thread_fini (void)
+{
+ atomic_dec (&kibnal_data.kib_nthreads);
+}
+
+/* this can be called by anyone at any time to close a connection. if
+ * the connection is still established it heads to the connd to start
+ * the disconnection in a safe context. It has no effect if called
+ * on a connection that is already disconnecting */
+void
+kibnal_close_conn_locked (kib_conn_t *conn, int error)
+{
+ /* This just does the immediate housekeeping, and schedules the
+ * connection for the connd to finish off.
+ * Caller holds kib_global_lock exclusively in irq context */
+ kib_peer_t *peer = conn->ibc_peer;
+
+ KIB_ASSERT_CONN_STATE_RANGE(conn, IBNAL_CONN_CONNECTING,
+ IBNAL_CONN_DISCONNECTED);
+
+ if (conn->ibc_state > IBNAL_CONN_ESTABLISHED)
+ return; /* already disconnecting */
+
+ CDEBUG (error == 0 ? D_NET : D_ERROR,
+ "closing conn to "LPX64": error %d\n", peer->ibp_nid, error);
+
+ if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) {
+ /* kib_connd_conns takes ibc_list's ref */
+ list_del (&conn->ibc_list);
+ } else {
+ /* new ref for kib_connd_conns */
+ CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
+ conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
+ atomic_read (&conn->ibc_refcount));
+ atomic_inc (&conn->ibc_refcount);
+ }
+
+ if (list_empty (&peer->ibp_conns) &&
+ peer->ibp_persistence == 0) {
+ /* Non-persistent peer with no more conns... */
+ kibnal_unlink_peer_locked (peer);
+ }
+
+ conn->ibc_state = IBNAL_CONN_SEND_DREQ;
+
+ spin_lock (&kibnal_data.kib_connd_lock);
+
+ list_add_tail (&conn->ibc_list, &kibnal_data.kib_connd_conns);
+ wake_up (&kibnal_data.kib_connd_waitq);
+
+ spin_unlock (&kibnal_data.kib_connd_lock);
+}
+
+void
+kibnal_close_conn (kib_conn_t *conn, int error)
+{
+ unsigned long flags;
+
+ write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
+
+ kibnal_close_conn_locked (conn, error);
+
+ write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
+}
+
+static void
+kibnal_peer_connect_failed (kib_peer_t *peer, int active, int rc)
+{
+ LIST_HEAD (zombies);
+ kib_tx_t *tx;
+ unsigned long flags;
+
+ LASSERT (rc != 0);
+ LASSERT (peer->ibp_reconnect_interval >= IBNAL_MIN_RECONNECT_INTERVAL);
+
+ write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
+
+ LASSERT (peer->ibp_connecting != 0);
+ peer->ibp_connecting--;
+
+ if (peer->ibp_connecting != 0) {
+ /* another connection attempt under way (loopback?)... */
+ write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
+ return;
+ }
+
+ if (list_empty(&peer->ibp_conns)) {
+ /* Say when active connection can be re-attempted */
+ peer->ibp_reconnect_time = jiffies + peer->ibp_reconnect_interval;
+ /* Increase reconnection interval */
+ peer->ibp_reconnect_interval = MIN (peer->ibp_reconnect_interval * 2,
+ IBNAL_MAX_RECONNECT_INTERVAL);
+
+                /* Take peer's blocked transmits; I'll complete
+                 * them with error */
+ while (!list_empty (&peer->ibp_tx_queue)) {
+ tx = list_entry (peer->ibp_tx_queue.next,
+ kib_tx_t, tx_list);
+
+ list_del (&tx->tx_list);
+ list_add_tail (&tx->tx_list, &zombies);
+ }
+
+ if (kibnal_peer_active(peer) &&
+ (peer->ibp_persistence == 0)) {
+ /* failed connection attempt on non-persistent peer */
+ kibnal_unlink_peer_locked (peer);
+ }
+ } else {
+ /* Can't have blocked transmits if there are connections */
+ LASSERT (list_empty(&peer->ibp_tx_queue));
+ }
+
+ write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
+
+ if (!list_empty (&zombies))
+ CERROR ("Deleting messages for "LPX64": connection failed\n",
+ peer->ibp_nid);
+
+ while (!list_empty (&zombies)) {
+ tx = list_entry (zombies.next, kib_tx_t, tx_list);
+
+ list_del (&tx->tx_list);
+ /* complete now */
+ tx->tx_status = -EHOSTUNREACH;
+ kibnal_tx_done (tx);
+ }
+}
+
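+/* For illustration only (the actual constants live in the header and may
+ * differ): with IBNAL_MIN_RECONNECT_INTERVAL == HZ and
+ * IBNAL_MAX_RECONNECT_INTERVAL == 60*HZ, repeated failures above would
+ * delay reconnection by 1, 2, 4, 8, ... seconds, capping at 60 seconds.
+ * A successful connection (kibnal_connreq_done) resets the interval to
+ * the minimum. */
+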
+static void
+kibnal_connreq_done (kib_conn_t *conn, int active, int status)
+{
+ int state = conn->ibc_state;
+ kib_peer_t *peer = conn->ibc_peer;
+ kib_tx_t *tx;
+ unsigned long flags;
+ int i;
+
+ /* passive connection has no connreq & vice versa */
+ LASSERTF(!active == !(conn->ibc_connreq != NULL),
+ "%d %p\n", active, conn->ibc_connreq);
+ if (active) {
+ PORTAL_FREE (conn->ibc_connreq, sizeof (*conn->ibc_connreq));
+ conn->ibc_connreq = NULL;
+ }
+
+ write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
+
+ LASSERT (peer->ibp_connecting != 0);
+
+ if (status == 0) {
+ /* connection established... */
+ KIB_ASSERT_CONN_STATE(conn, IBNAL_CONN_CONNECTING);
+ conn->ibc_state = IBNAL_CONN_ESTABLISHED;
+
+ if (!kibnal_peer_active(peer)) {
+ /* ...but peer deleted meantime */
+ status = -ECONNABORTED;
+ }
+ } else {
+ KIB_ASSERT_CONN_STATE_RANGE(conn, IBNAL_CONN_INIT_QP,
+ IBNAL_CONN_CONNECTING);
+ }
+
+ if (status == 0) {
+ /* Everything worked! */
+
+ peer->ibp_connecting--;
+
+                /* +1 ref for ibc_list; the caller's (i.e. the CM's) ref
+                 * remains until the IB_CM_IDLE callback */
+ CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
+ conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
+ atomic_read (&conn->ibc_refcount));
+ atomic_inc (&conn->ibc_refcount);
+ list_add (&conn->ibc_list, &peer->ibp_conns);
+
+ /* reset reconnect interval for next attempt */
+ peer->ibp_reconnect_interval = IBNAL_MIN_RECONNECT_INTERVAL;
+
+ /* post blocked sends to the new connection */
+ spin_lock (&conn->ibc_lock);
+
+ while (!list_empty (&peer->ibp_tx_queue)) {
+ tx = list_entry (peer->ibp_tx_queue.next,
+ kib_tx_t, tx_list);
+
+ list_del (&tx->tx_list);
+
+ /* +1 ref for each tx */
+ CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
+ conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
+ atomic_read (&conn->ibc_refcount));
+ atomic_inc (&conn->ibc_refcount);
+ kibnal_queue_tx_locked (tx, conn);
+ }
+
+ spin_unlock (&conn->ibc_lock);
+
+ /* Nuke any dangling conns from a different peer instance... */
+ kibnal_close_stale_conns_locked (conn->ibc_peer,
+ conn->ibc_incarnation);
+
+ write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
+
+ /* queue up all the receives */
+ for (i = 0; i < IBNAL_RX_MSGS; i++) {
+ /* +1 ref for rx desc */
+ CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
+ conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
+ atomic_read (&conn->ibc_refcount));
+ atomic_inc (&conn->ibc_refcount);
+
+ CDEBUG(D_NET, "RX[%d] %p->%p - "LPX64"\n",
+ i, &conn->ibc_rxs[i], conn->ibc_rxs[i].rx_msg,
+ conn->ibc_rxs[i].rx_vaddr);
+
+ kibnal_post_rx (&conn->ibc_rxs[i], 0);
+ }
+
+ kibnal_check_sends (conn);
+ return;
+ }
+
+ /* connection failed */
+ if (state == IBNAL_CONN_CONNECTING) {
+ /* schedule for connd to close */
+ kibnal_close_conn_locked (conn, status);
+ } else {
+ /* Don't have a CM comm_id; just wait for refs to drain */
+ conn->ibc_state = IBNAL_CONN_DISCONNECTED;
+ }
+
+ write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
+
+ kibnal_peer_connect_failed (conn->ibc_peer, active, status);
+
+ /* If we didn't establish the connection we don't have to pass
+ * through the disconnect protocol before dropping the CM ref */
+ if (state < IBNAL_CONN_CONNECTING)
+ kibnal_put_conn (conn);
+}
+
+static int
+kibnal_accept (kib_conn_t **connp, IB_HANDLE *cep,
+ ptl_nid_t nid, __u64 incarnation, int queue_depth)
+{
+ kib_conn_t *conn = kibnal_create_conn();
+ kib_peer_t *peer;
+ kib_peer_t *peer2;
+ unsigned long flags;
+
+ if (conn == NULL)
+ return (-ENOMEM);
+
+ if (queue_depth != IBNAL_MSG_QUEUE_SIZE) {
+ CERROR("Can't accept "LPX64": bad queue depth %d (%d expected)\n",
+ nid, queue_depth, IBNAL_MSG_QUEUE_SIZE);
+ atomic_dec (&conn->ibc_refcount);
+ kibnal_destroy_conn(conn);
+ return (-EPROTO);
+ }
+
+ /* assume 'nid' is a new peer */
+ peer = kibnal_create_peer (nid);
+ if (peer == NULL) {
+ CDEBUG(D_NET, "--conn[%p] state %d -> "LPX64" (%d)\n",
+                       conn, conn->ibc_state, nid, /* ibc_peer not set yet */
+ atomic_read (&conn->ibc_refcount));
+ atomic_dec (&conn->ibc_refcount);
+ kibnal_destroy_conn(conn);
+ return (-ENOMEM);
+ }
+
+ write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
+
+ peer2 = kibnal_find_peer_locked(nid);
+ if (peer2 == NULL) {
+ /* peer table takes my ref on peer */
+ list_add_tail (&peer->ibp_list, kibnal_nid2peerlist(nid));
+ } else {
+ kib_peer_decref (peer);
+ peer = peer2;
+ }
+
+ kib_peer_addref(peer); /* +1 ref for conn */
+ peer->ibp_connecting++;
+
+ write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
+
+ conn->ibc_peer = peer;
+ conn->ibc_state = IBNAL_CONN_CONNECTING;
+ /* conn->ibc_cep is set when cm_accept is called */
+ conn->ibc_incarnation = incarnation;
+ conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE;
+
+ *connp = conn;
+ return (0);
+}
+
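+/* Note the allocate-then-check idiom above: the peer is created before
+ * kib_global_lock is taken, the table is re-searched under the lock, and
+ * if another thread won the race the fresh peer is dropped in favour of
+ * the existing entry.  This keeps the allocation outside the spinlock. */
+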
+static void kibnal_set_qp_state(IB_HANDLE *qp, IB_QP_STATE state)
+{
+ IB_QP_ATTRIBUTES_MODIFY modify_attr = {0,};
+ FSTATUS frc;
+
+ modify_attr.RequestState = state;
+
+ frc = iibt_qp_modify(qp, &modify_attr, NULL);
+ if (frc != FSUCCESS)
+ CERROR("couldn't set qp state to %d, error %d\n", state, frc);
+}
+
+static void kibnal_flush_pending(kib_conn_t *conn)
+{
+ LIST_HEAD (zombies);
+ struct list_head *tmp;
+ struct list_head *nxt;
+ kib_tx_t *tx;
+ unsigned long flags;
+ int done;
+
+ /* NB we wait until the connection has closed before completing
+ * outstanding passive RDMAs so we can be sure the network can't
+ * touch the mapped memory any more. */
+ KIB_ASSERT_CONN_STATE(conn, IBNAL_CONN_DISCONNECTED);
+
+ /* set the QP to the error state so that we get flush callbacks
+ * on our posted receives which can then drop their conn refs */
+ kibnal_set_qp_state(conn->ibc_qp, QPStateError);
+
+ spin_lock_irqsave (&conn->ibc_lock, flags);
+
+ /* grab passive RDMAs not waiting for the tx callback */
+ list_for_each_safe (tmp, nxt, &conn->ibc_active_txs) {
+ tx = list_entry (tmp, kib_tx_t, tx_list);
+
+ LASSERT (tx->tx_passive_rdma ||
+ !tx->tx_passive_rdma_wait);
+
+ LASSERT (tx->tx_passive_rdma_wait ||
+ tx->tx_sending != 0);
+
+ /* still waiting for tx callback? */
+ if (!tx->tx_passive_rdma_wait)
+ continue;
+
+ tx->tx_status = -ECONNABORTED;
+ tx->tx_passive_rdma_wait = 0;
+ done = (tx->tx_sending == 0);
+
+ if (!done)
+ continue;
+
+ list_del (&tx->tx_list);
+ list_add (&tx->tx_list, &zombies);
+ }
+
+ /* grab all blocked transmits */
+ list_for_each_safe (tmp, nxt, &conn->ibc_tx_queue) {
+ tx = list_entry (tmp, kib_tx_t, tx_list);
+
+ list_del (&tx->tx_list);
+ list_add (&tx->tx_list, &zombies);
+ }
+
+ spin_unlock_irqrestore (&conn->ibc_lock, flags);
+
+ while (!list_empty(&zombies)) {
+ tx = list_entry (zombies.next, kib_tx_t, tx_list);
+
+ list_del(&tx->tx_list);
+ kibnal_tx_done (tx);
+ }
+}
+
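+/* kibnal_flush_pending() and kibnal_peer_connect_failed() share an idiom:
+ * doomed transmits are moved onto a private 'zombies' list while the lock
+ * is held, then completed with kibnal_tx_done() only after it is dropped,
+ * presumably because tx completion may take other locks or re-enter the
+ * NAL. */
+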
+static void
+kibnal_reject (IB_HANDLE cep, uint16_t reason)
+{
+ CM_REJECT_INFO *rej;
+
+ PORTAL_ALLOC(rej, sizeof(*rej));
+ if (rej == NULL) /* PORTAL_ALLOC() will CERROR on failure */
+ return;
+
+ rej->Reason = reason;
+ iibt_cm_reject(cep, rej);
+ PORTAL_FREE(rej, sizeof(*rej));
+}
+
+static FSTATUS
+kibnal_qp_rts(IB_HANDLE qp_handle, __u32 qpn, __u8 resp_res,
+ IB_PATH_RECORD *path, __u8 init_depth, __u32 send_psn)
+{
+ IB_QP_ATTRIBUTES_MODIFY modify_attr;
+ FSTATUS frc;
+ ENTRY;
+
+ modify_attr = (IB_QP_ATTRIBUTES_MODIFY) {
+ .RequestState = QPStateReadyToRecv,
+ .RecvPSN = IBNAL_STARTING_PSN,
+ .DestQPNumber = qpn,
+ .ResponderResources = resp_res,
+                .MinRnrTimer            = UsecToRnrNakTimer(2000), /* 2 ms */
+ .Attrs = (IB_QP_ATTR_RECVPSN |
+ IB_QP_ATTR_DESTQPNUMBER |
+ IB_QP_ATTR_RESPONDERRESOURCES |
+ IB_QP_ATTR_DESTAV |
+ IB_QP_ATTR_PATHMTU |
+ IB_QP_ATTR_MINRNRTIMER),
+ };
+ GetAVFromPath(0, path, &modify_attr.PathMTU, NULL,
+ &modify_attr.DestAV);
+
+ frc = iibt_qp_modify(qp_handle, &modify_attr, NULL);
+ if (frc != FSUCCESS)
+ RETURN(frc);
+
+ modify_attr = (IB_QP_ATTRIBUTES_MODIFY) {
+ .RequestState = QPStateReadyToSend,
+ .FlowControl = TRUE,
+ .InitiatorDepth = init_depth,
+ .SendPSN = send_psn,
+ .LocalAckTimeout = path->PktLifeTime + 2, /* 2 or 1? */
+ .RetryCount = IBNAL_RETRY,
+ .RnrRetryCount = IBNAL_RNR_RETRY,
+ .Attrs = (IB_QP_ATTR_FLOWCONTROL |
+ IB_QP_ATTR_INITIATORDEPTH |
+ IB_QP_ATTR_SENDPSN |
+ IB_QP_ATTR_LOCALACKTIMEOUT |
+ IB_QP_ATTR_RETRYCOUNT |
+ IB_QP_ATTR_RNRRETRYCOUNT),
+ };
+
+ frc = iibt_qp_modify(qp_handle, &modify_attr, NULL);
+ RETURN(frc);
+}
+
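+/* kibnal_qp_rts() above performs the two standard InfiniBand transitions
+ * in a single call: INIT -> RTR (the first iibt_qp_modify, arming the
+ * receive side with the peer's QP number and the chosen path) and then
+ * RTR -> RTS (the second modify, enabling sends with retry and timeout
+ * parameters).  The attribute names are Infinicon/iibt specific; the
+ * state ladder itself comes from the IB spec. */
+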
+static void
+kibnal_connect_reply (IB_HANDLE cep, CM_CONN_INFO *info, void *arg)
+{
+ IB_CA_ATTRIBUTES *ca_attr = &kibnal_data.kib_hca_attrs;
+ kib_conn_t *conn = arg;
+ kib_wire_connreq_t *wcr;
+ CM_REPLY_INFO *rep = &info->Info.Reply;
+ uint16_t reason;
+ FSTATUS frc;
+
+ wcr = (kib_wire_connreq_t *)info->Info.Reply.PrivateData;
+
+ if (wcr->wcr_magic != cpu_to_le32(IBNAL_MSG_MAGIC)) {
+ CERROR ("Can't connect "LPX64": bad magic %08x\n",
+ conn->ibc_peer->ibp_nid, le32_to_cpu(wcr->wcr_magic));
+ GOTO(reject, reason = RC_USER_REJ);
+ }
+
+ if (wcr->wcr_version != cpu_to_le16(IBNAL_MSG_VERSION)) {
+ CERROR ("Can't connect "LPX64": bad version %d\n",
+                        conn->ibc_peer->ibp_nid, le16_to_cpu(wcr->wcr_version));
+ GOTO(reject, reason = RC_USER_REJ);
+ }
+
+ if (wcr->wcr_queue_depth != cpu_to_le16(IBNAL_MSG_QUEUE_SIZE)) {
+ CERROR ("Can't connect "LPX64": bad queue depth %d\n",
+ conn->ibc_peer->ibp_nid,
+ le16_to_cpu(wcr->wcr_queue_depth));
+ GOTO(reject, reason = RC_USER_REJ);
+ }
+
+ if (le64_to_cpu(wcr->wcr_nid) != conn->ibc_peer->ibp_nid) {
+ CERROR ("Unexpected NID "LPX64" from "LPX64"\n",
+ le64_to_cpu(wcr->wcr_nid), conn->ibc_peer->ibp_nid);
+ GOTO(reject, reason = RC_USER_REJ);
+ }
+
+ CDEBUG(D_NET, "Connection %p -> "LPX64" REP_RECEIVED.\n",
+ conn, conn->ibc_peer->ibp_nid);
+
+ conn->ibc_incarnation = le64_to_cpu(wcr->wcr_incarnation);
+ conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE;
+
+ frc = kibnal_qp_rts(conn->ibc_qp, rep->QPN,
+ min_t(__u8, rep->ArbInitiatorDepth,
+ ca_attr->MaxQPResponderResources),
+ &conn->ibc_connreq->cr_path,
+ min_t(__u8, rep->ArbResponderResources,
+ ca_attr->MaxQPInitiatorDepth),
+ rep->StartingPSN);
+ if (frc != FSUCCESS) {
+ CERROR("Connection %p -> "LPX64" QP RTS/RTR failed: %d\n",
+ conn, conn->ibc_peer->ibp_nid, frc);
+ GOTO(reject, reason = RC_NO_QP);
+ }
+
+ /* the callback arguments are ignored for an active accept */
+ conn->ibc_connreq->cr_discarded.Status = FSUCCESS;
+ frc = iibt_cm_accept(cep, &conn->ibc_connreq->cr_discarded,
+ NULL, NULL, NULL, NULL);
+ if (frc != FCM_CONNECT_ESTABLISHED) {
+ CERROR("Connection %p -> "LPX64" CMAccept failed: %d\n",
+ conn, conn->ibc_peer->ibp_nid, frc);
+ kibnal_connreq_done (conn, 1, -ECONNABORTED);
+ /* XXX don't call reject after accept fails? */
+ return;
+ }
+
+ CDEBUG(D_NET, "Connection %p -> "LPX64" Established\n",
+ conn, conn->ibc_peer->ibp_nid);
+
+ kibnal_connreq_done (conn, 1, 0);
+ return;
+
+reject:
+ kibnal_reject(cep, reason);
+ kibnal_connreq_done (conn, 1, -EPROTO);
+}
+
+/* ib_cm.h has a wealth of information on the CM procedures */
+static void
+kibnal_cm_callback(IB_HANDLE cep, CM_CONN_INFO *info, void *arg)
+{
+ kib_conn_t *conn = arg;
+
+ CDEBUG(D_NET, "status 0x%x\n", info->Status);
+
+ /* Established Connection Notifier */
+ switch (info->Status) {
+ default:
+ CERROR("unknown status %d on Connection %p -> "LPX64"\n",
+ info->Status, conn, conn->ibc_peer->ibp_nid);
+ LBUG();
+ break;
+
+ case FCM_CONNECT_REPLY:
+ kibnal_connect_reply(cep, info, arg);
+ break;
+
+ case FCM_DISCONNECT_REQUEST:
+ /* XXX lock around these state management bits? */
+ if (conn->ibc_state == IBNAL_CONN_ESTABLISHED)
+ kibnal_close_conn (conn, 0);
+ conn->ibc_state = IBNAL_CONN_DREP;
+ iibt_cm_disconnect(conn->ibc_cep, NULL, NULL);
+ break;
+
+ /* these both guarantee that no more cm callbacks will occur */
+ case FCM_DISCONNECTED: /* aka FCM_DISCONNECT_TIMEOUT */
+ case FCM_DISCONNECT_REPLY:
+ CDEBUG(D_NET, "Connection %p -> "LPX64" disconnect done.\n",
+ conn, conn->ibc_peer->ibp_nid);
+
+ conn->ibc_state = IBNAL_CONN_DISCONNECTED;
+ kibnal_flush_pending(conn);
+ kibnal_put_conn(conn); /* Lose CM's ref */
+ break;
+ }
+
+ return;
+}
+
+static int
+kibnal_set_cm_flags(IB_HANDLE cep)
+{
+ FSTATUS frc;
+ uint32 value = 1;
+
+ frc = iibt_cm_modify_cep(cep, CM_FLAG_TIMEWAIT_CALLBACK,
+ (char *)&value, sizeof(value), 0);
+ if (frc != FSUCCESS) {
+ CERROR("error setting timeout callback: %d\n", frc);
+ return -1;
+ }
+
+#if 0
+ frc = iibt_cm_modify_cep(cep, CM_FLAG_ASYNC_ACCEPT, (char *)&value,
+ sizeof(value), 0);
+ if (frc != FSUCCESS) {
+ CERROR("error setting async accept: %d\n", frc);
+ return -1;
+ }
+#endif
+
+ return 0;
+}
+
+void
+kibnal_listen_callback(IB_HANDLE cep, CM_CONN_INFO *info, void *arg)
+{
+ IB_CA_ATTRIBUTES *ca_attr = &kibnal_data.kib_hca_attrs;
+ IB_QP_ATTRIBUTES_QUERY *query;
+ CM_REQUEST_INFO *req;
+ CM_CONN_INFO *rep = NULL, *rcv = NULL;
+ kib_wire_connreq_t *wcr;
+ kib_conn_t *conn = NULL;
+ uint16_t reason = 0;
+ FSTATUS frc;
+ int rc = 0;
+
+ LASSERT(cep);
+ LASSERT(info);
+ LASSERT(arg == NULL); /* no conn yet for passive */
+
+ CDEBUG(D_NET, "status 0x%x\n", info->Status);
+
+ req = &info->Info.Request;
+ wcr = (kib_wire_connreq_t *)req->PrivateData;
+
+ CDEBUG(D_NET, "%d from "LPX64"\n", info->Status,
+ le64_to_cpu(wcr->wcr_nid));
+
+ if (info->Status == FCM_CONNECT_CANCEL)
+ return;
+
+ LASSERT (info->Status == FCM_CONNECT_REQUEST);
+
+ if (wcr->wcr_magic != cpu_to_le32(IBNAL_MSG_MAGIC)) {
+ CERROR ("Can't accept: bad magic %08x\n",
+ le32_to_cpu(wcr->wcr_magic));
+ GOTO(out, reason = RC_USER_REJ);
+ }
+
+ if (wcr->wcr_version != cpu_to_le16(IBNAL_MSG_VERSION)) {
+ CERROR ("Can't accept: bad version %d\n",
+                        le16_to_cpu(wcr->wcr_version));
+ GOTO(out, reason = RC_USER_REJ);
+ }
+
+ rc = kibnal_accept(&conn, cep,
+ le64_to_cpu(wcr->wcr_nid),
+ le64_to_cpu(wcr->wcr_incarnation),
+ le16_to_cpu(wcr->wcr_queue_depth));
+ if (rc != 0) {
+ CERROR ("Can't accept "LPX64": %d\n",
+ le64_to_cpu(wcr->wcr_nid), rc);
+ GOTO(out, reason = RC_NO_RESOURCES);
+ }
+
+ frc = kibnal_qp_rts(conn->ibc_qp, req->CEPInfo.QPN,
+ min_t(__u8, req->CEPInfo.OfferedInitiatorDepth,
+ ca_attr->MaxQPResponderResources),
+ &req->PathInfo.Path,
+ min_t(__u8, req->CEPInfo.OfferedResponderResources,
+ ca_attr->MaxQPInitiatorDepth),
+ req->CEPInfo.StartingPSN);
+
+ if (frc != FSUCCESS) {
+ CERROR ("Can't mark QP RTS/RTR "LPX64": %d\n",
+ le64_to_cpu(wcr->wcr_nid), frc);
+ GOTO(out, reason = RC_NO_QP);
+ }
+
+ frc = iibt_qp_query(conn->ibc_qp, &conn->ibc_qp_attrs, NULL);
+ if (frc != FSUCCESS) {
+ CERROR ("Couldn't query qp attributes "LPX64": %d\n",
+ le64_to_cpu(wcr->wcr_nid), frc);
+ GOTO(out, reason = RC_NO_QP);
+ }
+ query = &conn->ibc_qp_attrs;
+
+ PORTAL_ALLOC(rep, sizeof(*rep));
+ PORTAL_ALLOC(rcv, sizeof(*rcv));
+ if (rep == NULL || rcv == NULL) {
+ CERROR ("can't reply and receive buffers\n");
+ GOTO(out, reason = RC_INSUFFICIENT_RESP_RES);
+ }
+
+ /* don't try to deref this into the incoming wcr :) */
+ wcr = (kib_wire_connreq_t *)rep->Info.Reply.PrivateData;
+
+ rep->Info.Reply = (CM_REPLY_INFO) {
+ .QPN = query->QPNumber,
+ .QKey = query->Qkey,
+ .StartingPSN = query->RecvPSN,
+ .EndToEndFlowControl = query->FlowControl,
+ /* XXX Hmm. */
+ .ArbInitiatorDepth = query->InitiatorDepth,
+ .ArbResponderResources = query->ResponderResources,
+ .TargetAckDelay = 0,
+ .FailoverAccepted = 0,
+ .RnRRetryCount = req->CEPInfo.RnrRetryCount,
+ };
+
+ *wcr = (kib_wire_connreq_t) {
+ .wcr_magic = cpu_to_le32(IBNAL_MSG_MAGIC),
+ .wcr_version = cpu_to_le16(IBNAL_MSG_VERSION),
+                .wcr_queue_depth  = cpu_to_le16(IBNAL_MSG_QUEUE_SIZE),
+ .wcr_nid = cpu_to_le64(kibnal_data.kib_nid),
+ .wcr_incarnation = cpu_to_le64(kibnal_data.kib_incarnation),
+ };
+
+ frc = iibt_cm_accept(cep, rep, rcv, kibnal_cm_callback, conn,
+ &conn->ibc_cep);
+
+ PORTAL_FREE(rep, sizeof(*rep));
+ PORTAL_FREE(rcv, sizeof(*rcv));
+
+ if (frc != FCM_CONNECT_ESTABLISHED) {
+ /* XXX it seems we don't call reject after this point? */
+ CERROR("iibt_cm_accept() failed: %d, aborting\n", frc);
+ rc = -ECONNABORTED;
+ goto out;
+ }
+
+ if (kibnal_set_cm_flags(conn->ibc_cep)) {
+ rc = -ECONNABORTED;
+ goto out;
+ }
+
+ CDEBUG(D_WARNING, "Connection %p -> "LPX64" ESTABLISHED.\n",
+ conn, conn->ibc_peer->ibp_nid);
+
+out:
+ if (reason) {
+ kibnal_reject(cep, reason);
+ rc = -ECONNABORTED;
+ }
+ if (conn != NULL)
+ kibnal_connreq_done(conn, 0, rc);
+
+ return;
+}
+
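+/* The passive path above mirrors the active kibnal_connect_reply(): each
+ * side carries a kib_wire_connreq_t (magic, version, queue depth, NID,
+ * incarnation, all little-endian on the wire) in the CM private data, and
+ * any mismatch is rejected before the connection is established, so both
+ * ends agree on protocol version and credit count before the first
+ * message is posted. */
+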
+static void
+dump_path_records(PATH_RESULTS *results)
+{
+ IB_PATH_RECORD *path;
+ int i;
+
+ for(i = 0; i < results->NumPathRecords; i++) {
+ path = &results->PathRecords[i];
+ CDEBUG(D_NET, "%d: sgid "LPX64":"LPX64" dgid "
+ LPX64":"LPX64" pkey %x\n",
+ i,
+ path->SGID.Type.Global.SubnetPrefix,
+ path->SGID.Type.Global.InterfaceID,
+ path->DGID.Type.Global.SubnetPrefix,
+ path->DGID.Type.Global.InterfaceID,
+ path->P_Key);
+ }
+}
+
+static void
+kibnal_pathreq_callback (void *arg, QUERY *query,
+ QUERY_RESULT_VALUES *query_res)
+{
+ IB_CA_ATTRIBUTES *ca_attr = &kibnal_data.kib_hca_attrs;
+ kib_conn_t *conn = arg;
+ PATH_RESULTS *path;
+ FSTATUS frc;
+
+ if (query_res->Status != FSUCCESS || query_res->ResultDataSize == 0) {
+ CERROR ("status %d data size %d\n", query_res->Status,
+ query_res->ResultDataSize);
+ kibnal_connreq_done (conn, 1, -EINVAL);
+ return;
+ }
+
+ path = (PATH_RESULTS *)query_res->QueryResult;
+
+ if (path->NumPathRecords < 1) {
+ CERROR ("expected path records: %d\n", path->NumPathRecords);
+ kibnal_connreq_done (conn, 1, -EINVAL);
+ return;
+ }
+
+ dump_path_records(path);
+
+ /* just using the first. this is probably a horrible idea. */
+ conn->ibc_connreq->cr_path = path->PathRecords[0];
+
+ conn->ibc_cep = iibt_cm_create_cep(CM_RC_TYPE);
+ if (conn->ibc_cep == NULL) {
+ CERROR ("Can't create CEP\n");
+ kibnal_connreq_done (conn, 1, -EINVAL);
+ return;
+ }
+
+ if (kibnal_set_cm_flags(conn->ibc_cep)) {
+ kibnal_connreq_done (conn, 1, -EINVAL);
+ return;
+ }
+
+ conn->ibc_connreq->cr_wcr = (kib_wire_connreq_t) {
+ .wcr_magic = cpu_to_le32(IBNAL_MSG_MAGIC),
+ .wcr_version = cpu_to_le16(IBNAL_MSG_VERSION),
+ .wcr_queue_depth = cpu_to_le16(IBNAL_MSG_QUEUE_SIZE),
+ .wcr_nid = cpu_to_le64(kibnal_data.kib_nid),
+ .wcr_incarnation = cpu_to_le64(kibnal_data.kib_incarnation),
+ };
+
+ conn->ibc_connreq->cr_cmreq = (CM_REQUEST_INFO) {
+ .SID = conn->ibc_connreq->cr_service.RID.ServiceID,
+ .CEPInfo = (CM_CEP_INFO) {
+ .CaGUID = kibnal_data.kib_hca_guids[0],
+ .EndToEndFlowControl = FALSE,
+ .PortGUID = conn->ibc_connreq->cr_path.SGID.Type.Global.InterfaceID,
+ .RetryCount = IBNAL_RETRY,
+ .RnrRetryCount = IBNAL_RNR_RETRY,
+ .AckTimeout = IBNAL_ACK_TIMEOUT,
+ .StartingPSN = IBNAL_STARTING_PSN,
+ .QPN = conn->ibc_qp_attrs.QPNumber,
+ .QKey = conn->ibc_qp_attrs.Qkey,
+ .OfferedResponderResources = ca_attr->MaxQPResponderResources,
+ .OfferedInitiatorDepth = ca_attr->MaxQPInitiatorDepth,
+ },
+ .PathInfo = (CM_CEP_PATHINFO) {
+ .bSubnetLocal = TRUE,
+ .Path = conn->ibc_connreq->cr_path,
+ },
+ };
+
+#if 0
+ /* XXX set timeout just like SDP!!!*/
+ conn->ibc_connreq->cr_path.packet_life = 13;
+#endif
+ /* Flag I'm getting involved with the CM... */
+ conn->ibc_state = IBNAL_CONN_CONNECTING;
+
+ CDEBUG(D_NET, "Connecting to, service id "LPX64", on "LPX64"\n",
+ conn->ibc_connreq->cr_service.RID.ServiceID,
+ *kibnal_service_nid_field(&conn->ibc_connreq->cr_service));
+
+ memset(conn->ibc_connreq->cr_cmreq.PrivateData, 0,
+ CM_REQUEST_INFO_USER_LEN);
+ memcpy(conn->ibc_connreq->cr_cmreq.PrivateData,
+ &conn->ibc_connreq->cr_wcr, sizeof(conn->ibc_connreq->cr_wcr));
+
+ /* kibnal_cm_callback gets my conn ref */
+ frc = iibt_cm_connect(conn->ibc_cep, &conn->ibc_connreq->cr_cmreq,
+ kibnal_cm_callback, conn);
+ if (frc != FPENDING && frc != FSUCCESS) {
+ CERROR ("Connect: %d\n", frc);
+ /* Back out state change as connect failed */
+ conn->ibc_state = IBNAL_CONN_INIT_QP;
+ kibnal_connreq_done (conn, 1, -EINVAL);
+ }
+}
+
+static void
+dump_service_records(SERVICE_RECORD_RESULTS *results)
+{
+ IB_SERVICE_RECORD *svc;
+ int i;
+
+ for(i = 0; i < results->NumServiceRecords; i++) {
+ svc = &results->ServiceRecords[i];
+ CDEBUG(D_NET, "%d: sid "LPX64" gid "LPX64":"LPX64" pkey %x\n",
+ i,
+ svc->RID.ServiceID,
+ svc->RID.ServiceGID.Type.Global.SubnetPrefix,
+ svc->RID.ServiceGID.Type.Global.InterfaceID,
+ svc->RID.ServiceP_Key);
+ }
+}
+
+
+static void
+kibnal_service_get_callback (void *arg, QUERY *query,
+ QUERY_RESULT_VALUES *query_res)
+{
+ kib_conn_t *conn = arg;
+ SERVICE_RECORD_RESULTS *svc;
+ COMMAND_CONTROL_PARAMETERS sd_params;
+ QUERY path_query;
+ FSTATUS frc;
+
+ if (query_res->Status != FSUCCESS || query_res->ResultDataSize == 0) {
+ CERROR ("status %d data size %d\n", query_res->Status,
+ query_res->ResultDataSize);
+ kibnal_connreq_done (conn, 1, -EINVAL);
+ return;
+ }
+
+ svc = (SERVICE_RECORD_RESULTS *)query_res->QueryResult;
+
+ if (svc->NumServiceRecords < 1) {
+ CERROR ("%d service records\n", svc->NumServiceRecords);
+ kibnal_connreq_done (conn, 1, -EINVAL);
+ return;
+ }
+
+ dump_service_records(svc);
+
+ conn->ibc_connreq->cr_service = svc->ServiceRecords[0];
+
+ CDEBUG(D_NET, "Got status %d, service id "LPX64", on "LPX64"\n",
+               query_res->Status, conn->ibc_connreq->cr_service.RID.ServiceID,
+ *kibnal_service_nid_field(&conn->ibc_connreq->cr_service));
+
+ memset(&path_query, 0, sizeof(path_query));
+ path_query.InputType = InputTypePortGuidPair;
+ path_query.OutputType = OutputTypePathRecord;
+ path_query.InputValue.PortGuidPair.SourcePortGuid = kibnal_data.kib_port_guid;
+ path_query.InputValue.PortGuidPair.DestPortGuid = conn->ibc_connreq->cr_service.RID.ServiceGID.Type.Global.InterfaceID;
+
+ memset(&sd_params, 0, sizeof(sd_params));
+ sd_params.RetryCount = IBNAL_RETRY;
+ sd_params.Timeout = 10 * 1000; /* wait 10 seconds */
+
+ /* kibnal_service_get_callback gets my conn ref */
+
+ frc = iibt_sd_query_port_fabric_information(kibnal_data.kib_sd,
+ kibnal_data.kib_port_guid,
+ &path_query,
+ kibnal_pathreq_callback,
+ &sd_params, conn);
+ if (frc == FPENDING)
+ return;
+
+ CERROR ("Path record request failed: %d\n", frc);
+ kibnal_connreq_done (conn, 1, -EINVAL);
+}
+
+static void
+kibnal_connect_peer (kib_peer_t *peer)
+{
+ COMMAND_CONTROL_PARAMETERS sd_params;
+ QUERY query;
+ FSTATUS frc;
+ kib_conn_t *conn = kibnal_create_conn();
+
+ LASSERT (peer->ibp_connecting != 0);
+
+ if (conn == NULL) {
+ CERROR ("Can't allocate conn\n");
+ kibnal_peer_connect_failed (peer, 1, -ENOMEM);
+ return;
+ }
+
+ conn->ibc_peer = peer;
+ kib_peer_addref(peer);
+
+ PORTAL_ALLOC (conn->ibc_connreq, sizeof (*conn->ibc_connreq));
+ if (conn->ibc_connreq == NULL) {
+ CERROR ("Can't allocate connreq\n");
+ kibnal_connreq_done (conn, 1, -ENOMEM);
+ return;
+ }
+
+ memset(conn->ibc_connreq, 0, sizeof (*conn->ibc_connreq));
+
+ kibnal_set_service_keys(&conn->ibc_connreq->cr_service, peer->ibp_nid);
+
+ memset(&query, 0, sizeof(query));
+ query.InputType = InputTypeServiceRecord;
+ query.OutputType = OutputTypeServiceRecord;
+ query.InputValue.ServiceRecordValue.ServiceRecord = conn->ibc_connreq->cr_service;
+ query.InputValue.ServiceRecordValue.ComponentMask = KIBNAL_SERVICE_KEY_MASK;
+
+ memset(&sd_params, 0, sizeof(sd_params));
+ sd_params.RetryCount = IBNAL_RETRY;
+ sd_params.Timeout = 10 * 1000; /* wait 10 seconds */
+
+ /* kibnal_service_get_callback gets my conn ref */
+ frc = iibt_sd_query_port_fabric_information(kibnal_data.kib_sd,
+ kibnal_data.kib_port_guid,
+ &query,
+ kibnal_service_get_callback,
+ &sd_params, conn);
+ if (frc == FPENDING)
+ return;
+
+ CERROR ("iibt_sd_query_port_fabric_information(): %d\n", frc);
+ kibnal_connreq_done (conn, 1, frc);
+}
+
+static int
+kibnal_conn_timed_out (kib_conn_t *conn)
+{
+ kib_tx_t *tx;
+ struct list_head *ttmp;
+ unsigned long flags;
+
+ spin_lock_irqsave (&conn->ibc_lock, flags);
+
+ list_for_each (ttmp, &conn->ibc_tx_queue) {
+ tx = list_entry (ttmp, kib_tx_t, tx_list);
+
+ LASSERT (!tx->tx_passive_rdma_wait);
+ LASSERT (tx->tx_sending == 0);
+
+ if (time_after_eq (jiffies, tx->tx_deadline)) {
+ spin_unlock_irqrestore (&conn->ibc_lock, flags);
+ return 1;
+ }
+ }
+
+ list_for_each (ttmp, &conn->ibc_active_txs) {
+ tx = list_entry (ttmp, kib_tx_t, tx_list);
+
+ LASSERT (tx->tx_passive_rdma ||
+ !tx->tx_passive_rdma_wait);
+
+ LASSERT (tx->tx_passive_rdma_wait ||
+ tx->tx_sending != 0);
+
+ if (time_after_eq (jiffies, tx->tx_deadline)) {
+ spin_unlock_irqrestore (&conn->ibc_lock, flags);
+ return 1;
+ }
+ }
+
+ spin_unlock_irqrestore (&conn->ibc_lock, flags);
+
+ return 0;
+}
+
+static void
+kibnal_check_conns (int idx)
+{
+ struct list_head *peers = &kibnal_data.kib_peers[idx];
+ struct list_head *ptmp;
+ kib_peer_t *peer;
+ kib_conn_t *conn;
+ struct list_head *ctmp;
+
+ again:
+ /* NB. We expect to have a look at all the peers and not find any
+ * rdmas to time out, so we just use a shared lock while we
+ * take a look... */
+ read_lock (&kibnal_data.kib_global_lock);
+
+ list_for_each (ptmp, peers) {
+ peer = list_entry (ptmp, kib_peer_t, ibp_list);
+
+ list_for_each (ctmp, &peer->ibp_conns) {
+ conn = list_entry (ctmp, kib_conn_t, ibc_list);
+
+ KIB_ASSERT_CONN_STATE(conn, IBNAL_CONN_ESTABLISHED);
+
+ /* In case we have enough credits to return via a
+ * NOOP, but there were no non-blocking tx descs
+ * free to do it last time... */
+ kibnal_check_sends(conn);
+
+ if (!kibnal_conn_timed_out(conn))
+ continue;
+
+ CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
+ conn, conn->ibc_state, peer->ibp_nid,
+ atomic_read (&conn->ibc_refcount));
+
+ atomic_inc (&conn->ibc_refcount);
+ read_unlock (&kibnal_data.kib_global_lock);
+
+ CERROR("Timed out RDMA with "LPX64"\n",
+ peer->ibp_nid);
+
+ kibnal_close_conn (conn, -ETIMEDOUT);
+ kibnal_put_conn (conn);
+
+ /* start again now I've dropped the lock */
+ goto again;
+ }
+ }
+
+ read_unlock (&kibnal_data.kib_global_lock);
+}
+
+static void
+kib_connd_handle_state(kib_conn_t *conn)
+{
+ FSTATUS frc;
+
+ switch (conn->ibc_state) {
+ /* all refs have gone, free and be done with it */
+ case IBNAL_CONN_DISCONNECTED:
+ kibnal_destroy_conn (conn);
+ return; /* avoid put_conn */
+
+ case IBNAL_CONN_SEND_DREQ:
+ frc = iibt_cm_disconnect(conn->ibc_cep, NULL, NULL);
+ if (frc != FSUCCESS) /* XXX do real things */
+ CERROR("disconnect failed: %d\n", frc);
+ conn->ibc_state = IBNAL_CONN_DREQ;
+ break;
+
+ /* a callback got to the conn before we did */
+ case IBNAL_CONN_DREP:
+ break;
+
+ default:
+ CERROR ("Bad conn %p state: %d\n", conn,
+ conn->ibc_state);
+ LBUG();
+ break;
+ }
+
+ /* drop ref from close_conn */
+ kibnal_put_conn(conn);
+}
+
+int
+kibnal_connd (void *arg)
+{
+ wait_queue_t wait;
+ unsigned long flags;
+ kib_conn_t *conn;
+ kib_peer_t *peer;
+ int timeout;
+ int i;
+ int peer_index = 0;
+ unsigned long deadline = jiffies;
+
+ kportal_daemonize ("kibnal_connd");
+ kportal_blockallsigs ();
+
+ init_waitqueue_entry (&wait, current);
+
+ spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
+
+ for (;;) {
+ if (!list_empty (&kibnal_data.kib_connd_conns)) {
+ conn = list_entry (kibnal_data.kib_connd_conns.next,
+ kib_conn_t, ibc_list);
+ list_del (&conn->ibc_list);
+
+ spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
+ kib_connd_handle_state(conn);
+
+ spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
+ continue;
+ }
+
+ if (!list_empty (&kibnal_data.kib_connd_peers)) {
+ peer = list_entry (kibnal_data.kib_connd_peers.next,
+ kib_peer_t, ibp_connd_list);
+
+ list_del_init (&peer->ibp_connd_list);
+ spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
+
+ kibnal_connect_peer (peer);
+ kib_peer_decref (peer);
+
+ spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
+ }
+
+ /* shut down and nobody left to reap... */
+ if (kibnal_data.kib_shutdown &&
+ atomic_read(&kibnal_data.kib_nconns) == 0)
+ break;
+
+ spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
+
+ /* careful with the jiffy wrap... */
+ while ((timeout = (int)(deadline - jiffies)) <= 0) {
+ const int n = 4;
+ const int p = 1;
+ int chunk = kibnal_data.kib_peer_hash_size;
+
+ /* Time to check for RDMA timeouts on a few more
+ * peers: I do checks every 'p' seconds on a
+ * proportion of the peer table and I need to check
+ * every connection 'n' times within a timeout
+ * interval, to ensure I detect a timeout on any
+ * connection within (n+1)/n times the timeout
+ * interval. */
+
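+                        /* Worked example with hypothetical numbers: if
+                         * kib_io_timeout == 60s and the hash size is 101,
+                         * then chunk = 101*4*1/60 == 6, so ~6 peer buckets
+                         * are scanned each second and the whole table is
+                         * swept roughly every 17s, i.e. about n == 4 times
+                         * per timeout interval as intended. */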
+ if (kibnal_tunables.kib_io_timeout > n * p)
+ chunk = (chunk * n * p) /
+ kibnal_tunables.kib_io_timeout;
+ if (chunk == 0)
+ chunk = 1;
+
+ for (i = 0; i < chunk; i++) {
+ kibnal_check_conns (peer_index);
+ peer_index = (peer_index + 1) %
+ kibnal_data.kib_peer_hash_size;
+ }
+
+ deadline += p * HZ;
+ }
+
+ kibnal_data.kib_connd_waketime = jiffies + timeout;
+
+ set_current_state (TASK_INTERRUPTIBLE);
+ add_wait_queue (&kibnal_data.kib_connd_waitq, &wait);
+
+ if (!kibnal_data.kib_shutdown &&
+ list_empty (&kibnal_data.kib_connd_conns) &&
+ list_empty (&kibnal_data.kib_connd_peers))
+ schedule_timeout (timeout);
+
+ set_current_state (TASK_RUNNING);
+ remove_wait_queue (&kibnal_data.kib_connd_waitq, &wait);
+
+ spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
+ }
+
+ spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
+
+ kibnal_thread_fini ();
+ return (0);
+}
+
+int
+kibnal_scheduler(void *arg)
+{
+ long id = (long)arg;
+ char name[16];
+ kib_rx_t *rx;
+ kib_tx_t *tx;
+ unsigned long flags;
+ int rc;
+ int counter = 0;
+ int did_something;
+
+ snprintf(name, sizeof(name), "kibnal_sd_%02ld", id);
+ kportal_daemonize(name);
+ kportal_blockallsigs();
+
+ spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
+
+ for (;;) {
+ did_something = 0;
+
+ while (!list_empty(&kibnal_data.kib_sched_txq)) {
+ tx = list_entry(kibnal_data.kib_sched_txq.next,
+ kib_tx_t, tx_list);
+ list_del(&tx->tx_list);
+ spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
+ flags);
+ kibnal_tx_done(tx);
+
+ spin_lock_irqsave(&kibnal_data.kib_sched_lock,
+ flags);
+ }
+
+ if (!list_empty(&kibnal_data.kib_sched_rxq)) {
+ rx = list_entry(kibnal_data.kib_sched_rxq.next,
+ kib_rx_t, rx_list);
+ list_del(&rx->rx_list);
+ spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
+ flags);
+
+ kibnal_rx(rx);
+
+ did_something = 1;
+ spin_lock_irqsave(&kibnal_data.kib_sched_lock,
+ flags);
+ }
+
+ /* shut down and no receives to complete... */
+ if (kibnal_data.kib_shutdown &&
+ atomic_read(&kibnal_data.kib_nconns) == 0)
+ break;
+
+ /* nothing to do or hogging CPU */
+ if (!did_something || counter++ == IBNAL_RESCHED) {
+ spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
+ flags);
+ counter = 0;
+
+ if (!did_something) {
+ rc = wait_event_interruptible(
+ kibnal_data.kib_sched_waitq,
+ !list_empty(&kibnal_data.kib_sched_txq) ||
+ !list_empty(&kibnal_data.kib_sched_rxq) ||
+ (kibnal_data.kib_shutdown &&
+ atomic_read (&kibnal_data.kib_nconns) == 0));
+ } else {
+ our_cond_resched();
+ }
+
+ spin_lock_irqsave(&kibnal_data.kib_sched_lock,
+ flags);
+ }
+ }
+
+ spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags);
+
+ kibnal_thread_fini();
+ return (0);
+}
+
+
+lib_nal_t kibnal_lib = {
+ libnal_data: &kibnal_data, /* NAL private data */
+ libnal_send: kibnal_send,
+ libnal_send_pages: kibnal_send_pages,
+ libnal_recv: kibnal_recv,
+ libnal_recv_pages: kibnal_recv_pages,
+ libnal_dist: kibnal_dist
+};
--- /dev/null
+.deps
+Makefile
+.*.cmd
+autoMakefile.in
+autoMakefile
+*.ko
+*.mod.c
+.*.flags
+.tmp_versions
+.depend
#include "openibnal.h"
-nal_t koibnal_api;
-ptl_handle_ni_t koibnal_ni;
-koib_data_t koibnal_data;
-koib_tunables_t koibnal_tunables;
+nal_t kibnal_api;
+ptl_handle_ni_t kibnal_ni;
+kib_data_t kibnal_data;
+kib_tunables_t kibnal_tunables;
#ifdef CONFIG_SYSCTL
-#define OPENIBNAL_SYSCTL 202
+#define IBNAL_SYSCTL 202
-#define OPENIBNAL_SYSCTL_TIMEOUT 1
-#define OPENIBNAL_SYSCTL_ZERO_COPY 2
+#define IBNAL_SYSCTL_TIMEOUT 1
-static ctl_table koibnal_ctl_table[] = {
- {OPENIBNAL_SYSCTL_TIMEOUT, "timeout",
- &koibnal_tunables.koib_io_timeout, sizeof (int),
+static ctl_table kibnal_ctl_table[] = {
+ {IBNAL_SYSCTL_TIMEOUT, "timeout",
+ &kibnal_tunables.kib_io_timeout, sizeof (int),
0644, NULL, &proc_dointvec},
{ 0 }
};
-static ctl_table koibnal_top_ctl_table[] = {
- {OPENIBNAL_SYSCTL, "openibnal", NULL, 0, 0555, koibnal_ctl_table},
+static ctl_table kibnal_top_ctl_table[] = {
+ {IBNAL_SYSCTL, "openibnal", NULL, 0, 0555, kibnal_ctl_table},
{ 0 }
};
#endif
"service id: "LPX64"\n"
"name : %s\n"
"NID : "LPX64"\n", tag, rc,
- service->service_id, name, service->service_data64[0]);
+ service->service_id, name,
+ *kibnal_service_nid_field(service));
}
void
-koibnal_service_setunset_done (tTS_IB_CLIENT_QUERY_TID tid, int status,
+kibnal_service_setunset_done (tTS_IB_CLIENT_QUERY_TID tid, int status,
struct ib_common_attrib_service *service, void *arg)
{
*(int *)arg = status;
- up (&koibnal_data.koib_nid_signal);
+ up (&kibnal_data.kib_nid_signal);
}
+#if IBNAL_CHECK_ADVERT
+void
+kibnal_check_advert (void)
+{
+ struct ib_common_attrib_service *svc;
+ __u64 tid;
+ int rc;
+ int rc2;
+
+ PORTAL_ALLOC(svc, sizeof(*svc));
+ if (svc == NULL)
+ return;
+
+ memset (svc, 0, sizeof (*svc));
+ kibnal_set_service_keys(svc, kibnal_data.kib_nid);
+
+ rc = ib_service_get (kibnal_data.kib_device,
+ kibnal_data.kib_port,
+ svc,
+ KIBNAL_SERVICE_KEY_MASK,
+ kibnal_tunables.kib_io_timeout * HZ,
+ kibnal_service_setunset_done, &rc2,
+ &tid);
+
+ if (rc != 0) {
+ CERROR ("Immediate error %d checking SM service\n", rc);
+ } else {
+ down (&kibnal_data.kib_nid_signal);
+ rc = rc2;
+
+ if (rc != 0)
+ CERROR ("Error %d checking SM service\n", rc);
+ }
+
+ PORTAL_FREE(svc, sizeof(*svc));
+}
+#endif
+
int
-koibnal_advertise (void)
+kibnal_advertise (void)
{
+ struct ib_common_attrib_service *svc;
__u64 tid;
int rc;
int rc2;
- LASSERT (koibnal_data.koib_nid != PTL_NID_ANY);
+ LASSERT (kibnal_data.kib_nid != PTL_NID_ANY);
+
+ PORTAL_ALLOC(svc, sizeof(*svc));
+ if (svc == NULL)
+ return (-ENOMEM);
- memset (&koibnal_data.koib_service, 0,
- sizeof (koibnal_data.koib_service));
+ memset (svc, 0, sizeof (*svc));
- koibnal_data.koib_service.service_id
- = koibnal_data.koib_cm_service_id;
+ svc->service_id = kibnal_data.kib_service_id;
- rc = ib_cached_gid_get(koibnal_data.koib_device,
- koibnal_data.koib_port,
+ rc = ib_cached_gid_get(kibnal_data.kib_device,
+ kibnal_data.kib_port,
0,
- koibnal_data.koib_service.service_gid);
+ svc->service_gid);
if (rc != 0) {
CERROR ("Can't get port %d GID: %d\n",
- koibnal_data.koib_port, rc);
- return (rc);
+ kibnal_data.kib_port, rc);
+ goto out;
}
- rc = ib_cached_pkey_get(koibnal_data.koib_device,
- koibnal_data.koib_port,
+ rc = ib_cached_pkey_get(kibnal_data.kib_device,
+ kibnal_data.kib_port,
0,
- &koibnal_data.koib_service.service_pkey);
+ &svc->service_pkey);
if (rc != 0) {
CERROR ("Can't get port %d PKEY: %d\n",
- koibnal_data.koib_port, rc);
- return (rc);
+ kibnal_data.kib_port, rc);
+ goto out;
}
- koibnal_data.koib_service.service_lease = 0xffffffff;
+ svc->service_lease = 0xffffffff;
- koibnal_set_service_keys(&koibnal_data.koib_service, koibnal_data.koib_nid);
+ kibnal_set_service_keys(svc, kibnal_data.kib_nid);
CDEBUG(D_NET, "Advertising service id "LPX64" %s:"LPX64"\n",
- koibnal_data.koib_service.service_id,
- koibnal_data.koib_service.service_name,
- *koibnal_service_nid_field(&koibnal_data.koib_service));
+ svc->service_id,
+ svc->service_name, *kibnal_service_nid_field(svc));
- rc = ib_service_set (koibnal_data.koib_device,
- koibnal_data.koib_port,
- &koibnal_data.koib_service,
+ rc = ib_service_set (kibnal_data.kib_device,
+ kibnal_data.kib_port,
+ svc,
IB_SA_SERVICE_COMP_MASK_ID |
IB_SA_SERVICE_COMP_MASK_GID |
IB_SA_SERVICE_COMP_MASK_PKEY |
IB_SA_SERVICE_COMP_MASK_LEASE |
- KOIBNAL_SERVICE_KEY_MASK,
- koibnal_tunables.koib_io_timeout * HZ,
- koibnal_service_setunset_done, &rc2, &tid);
+ KIBNAL_SERVICE_KEY_MASK,
+ kibnal_tunables.kib_io_timeout * HZ,
+ kibnal_service_setunset_done, &rc2, &tid);
- if (rc == 0) {
- down (&koibnal_data.koib_nid_signal);
- rc = rc2;
+ if (rc != 0) {
+ CERROR ("Immediate error %d advertising NID "LPX64"\n",
+ rc, kibnal_data.kib_nid);
+ goto out;
}
-
- if (rc != 0)
- CERROR ("Error %d advertising SM service\n", rc);
+ down (&kibnal_data.kib_nid_signal);
+
+ rc = rc2;
+ if (rc != 0)
+ CERROR ("Error %d advertising NID "LPX64"\n",
+ rc, kibnal_data.kib_nid);
+ out:
+ PORTAL_FREE(svc, sizeof(*svc));
return (rc);
}
-int
-koibnal_unadvertise (int expect_success)
+void
+kibnal_unadvertise (int expect_success)
{
+ struct ib_common_attrib_service *svc;
__u64 tid;
int rc;
int rc2;
- LASSERT (koibnal_data.koib_nid != PTL_NID_ANY);
+ LASSERT (kibnal_data.kib_nid != PTL_NID_ANY);
- memset (&koibnal_data.koib_service, 0,
- sizeof (koibnal_data.koib_service));
+ PORTAL_ALLOC(svc, sizeof(*svc));
+ if (svc == NULL)
+ return;
- koibnal_set_service_keys(&koibnal_data.koib_service, koibnal_data.koib_nid);
+ memset (svc, 0, sizeof(*svc));
+
+ kibnal_set_service_keys(svc, kibnal_data.kib_nid);
CDEBUG(D_NET, "Unadvertising service %s:"LPX64"\n",
- koibnal_data.koib_service.service_name,
- *koibnal_service_nid_field(&koibnal_data.koib_service));
-
- rc = ib_service_delete (koibnal_data.koib_device,
- koibnal_data.koib_port,
- &koibnal_data.koib_service,
- KOIBNAL_SERVICE_KEY_MASK,
- koibnal_tunables.koib_io_timeout * HZ,
- koibnal_service_setunset_done, &rc2, &tid);
+ svc->service_name, *kibnal_service_nid_field(svc));
+
+ rc = ib_service_delete (kibnal_data.kib_device,
+ kibnal_data.kib_port,
+ svc,
+ KIBNAL_SERVICE_KEY_MASK,
+ kibnal_tunables.kib_io_timeout * HZ,
+ kibnal_service_setunset_done, &rc2, &tid);
if (rc != 0) {
CERROR ("Immediate error %d unadvertising NID "LPX64"\n",
- rc, koibnal_data.koib_nid);
- return (rc);
+ rc, kibnal_data.kib_nid);
+ goto out;
}
- down (&koibnal_data.koib_nid_signal);
+ down (&kibnal_data.kib_nid_signal);
if ((rc2 == 0) == !!expect_success)
- return (0);
+ goto out; /* success: rc == 0 */
if (expect_success)
CERROR("Error %d unadvertising NID "LPX64"\n",
- rc, koibnal_data.koib_nid);
+ rc, kibnal_data.kib_nid);
else
CWARN("Removed conflicting NID "LPX64"\n",
- koibnal_data.koib_nid);
-
- return (rc);
-}
-
-int
-koibnal_check_advert (void)
-{
- __u64 tid;
- int rc;
- int rc2;
-
- static struct ib_common_attrib_service srv;
-
- memset (&srv, 0, sizeof (srv));
-
- koibnal_set_service_keys(&srv, koibnal_data.koib_nid);
-
- rc = ib_service_get (koibnal_data.koib_device,
- koibnal_data.koib_port,
- &srv,
- KOIBNAL_SERVICE_KEY_MASK,
- koibnal_tunables.koib_io_timeout * HZ,
- koibnal_service_setunset_done, &rc2,
- &tid);
-
- if (rc != 0) {
- CERROR ("Immediate error %d checking SM service\n", rc);
- } else {
- down (&koibnal_data.koib_nid_signal);
- rc = rc2;
-
- if (rc != 0)
- CERROR ("Error %d checking SM service\n", rc);
- }
-
- return (rc);
+ kibnal_data.kib_nid);
+ out:
+ PORTAL_FREE(svc, sizeof(*svc));
}
int
-koibnal_set_mynid(ptl_nid_t nid)
+kibnal_set_mynid(ptl_nid_t nid)
{
struct timeval tv;
- lib_ni_t *ni = &koibnal_lib.libnal_ni;
+ lib_ni_t *ni = &kibnal_lib.libnal_ni;
int rc;
CDEBUG(D_IOCTL, "setting mynid to "LPX64" (old nid="LPX64")\n",
do_gettimeofday(&tv);
- down (&koibnal_data.koib_nid_mutex);
+ down (&kibnal_data.kib_nid_mutex);
- if (nid == koibnal_data.koib_nid) {
+ if (nid == kibnal_data.kib_nid) {
/* no change of NID */
- up (&koibnal_data.koib_nid_mutex);
+ up (&kibnal_data.kib_nid_mutex);
return (0);
}
CDEBUG(D_NET, "NID "LPX64"("LPX64")\n",
- koibnal_data.koib_nid, nid);
+ kibnal_data.kib_nid, nid);
- if (koibnal_data.koib_nid != PTL_NID_ANY) {
+ if (kibnal_data.kib_nid != PTL_NID_ANY) {
- koibnal_unadvertise (1);
+ kibnal_unadvertise (1);
- rc = ib_cm_listen_stop (koibnal_data.koib_listen_handle);
+ rc = ib_cm_listen_stop (kibnal_data.kib_listen_handle);
if (rc != 0)
CERROR ("Error %d stopping listener\n", rc);
}
- koibnal_data.koib_nid = ni->ni_pid.nid = nid;
- koibnal_data.koib_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
+ kibnal_data.kib_nid = ni->ni_pid.nid = nid;
+ kibnal_data.kib_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
/* Delete all existing peers and their connections after new
* NID/incarnation set to ensure no old connections in our brave
* new world. */
- koibnal_del_peer (PTL_NID_ANY, 0);
-
- rc = 0;
- if (koibnal_data.koib_nid != PTL_NID_ANY) {
- /* New NID installed */
+ kibnal_del_peer (PTL_NID_ANY, 0);
- /* remove any previous advert (crashed node etc) */
- koibnal_unadvertise(0);
+ if (kibnal_data.kib_nid == PTL_NID_ANY) {
+ /* No new NID to install */
+ up (&kibnal_data.kib_nid_mutex);
+ return (0);
+ }
+
+ /* remove any previous advert (crashed node etc) */
+ kibnal_unadvertise(0);
- /* Assign new service number */
- koibnal_data.koib_cm_service_id = ib_cm_service_assign();
- CDEBUG(D_NET, "service_id "LPX64"\n", koibnal_data.koib_cm_service_id);
+ /* Assign new service number */
+ kibnal_data.kib_service_id = ib_cm_service_assign();
+ CDEBUG(D_NET, "service_id "LPX64"\n", kibnal_data.kib_service_id);
- rc = ib_cm_listen(koibnal_data.koib_cm_service_id,
- TS_IB_CM_SERVICE_EXACT_MASK,
- koibnal_passive_conn_callback, NULL,
- &koibnal_data.koib_listen_handle);
- if (rc != 0) {
- CERROR ("ib_cm_listen error: %d\n", rc);
- goto out;
+ rc = ib_cm_listen(kibnal_data.kib_service_id,
+ TS_IB_CM_SERVICE_EXACT_MASK,
+ kibnal_passive_conn_callback, NULL,
+ &kibnal_data.kib_listen_handle);
+ if (rc == 0) {
+ rc = kibnal_advertise();
+ if (rc == 0) {
+#if IBNAL_CHECK_ADVERT
+ kibnal_check_advert();
+#endif
+ up (&kibnal_data.kib_nid_mutex);
+ return (0);
}
- rc = koibnal_advertise();
-
- koibnal_check_advert();
- }
-
- out:
- if (rc != 0) {
- koibnal_data.koib_nid = PTL_NID_ANY;
+ ib_cm_listen_stop(kibnal_data.kib_listen_handle);
/* remove any peers that sprung up while I failed to
* advertise myself */
- koibnal_del_peer (PTL_NID_ANY, 0);
+ kibnal_del_peer (PTL_NID_ANY, 0);
}
-
- up (&koibnal_data.koib_nid_mutex);
- return (0);
+
+ kibnal_data.kib_nid = PTL_NID_ANY;
+ up (&kibnal_data.kib_nid_mutex);
+ return (rc);
}
-koib_peer_t *
-koibnal_create_peer (ptl_nid_t nid)
+kib_peer_t *
+kibnal_create_peer (ptl_nid_t nid)
{
- koib_peer_t *peer;
+ kib_peer_t *peer;
LASSERT (nid != PTL_NID_ANY);
INIT_LIST_HEAD (&peer->ibp_tx_queue);
peer->ibp_reconnect_time = jiffies;
- peer->ibp_reconnect_interval = OPENIBNAL_MIN_RECONNECT_INTERVAL;
+ peer->ibp_reconnect_interval = IBNAL_MIN_RECONNECT_INTERVAL;
- atomic_inc (&koibnal_data.koib_npeers);
+ atomic_inc (&kibnal_data.kib_npeers);
return (peer);
}
void
-koibnal_destroy_peer (koib_peer_t *peer)
+kibnal_destroy_peer (kib_peer_t *peer)
{
CDEBUG (D_NET, "peer "LPX64" %p deleted\n", peer->ibp_nid, peer);
LASSERT (atomic_read (&peer->ibp_refcount) == 0);
LASSERT (peer->ibp_persistence == 0);
- LASSERT (!koibnal_peer_active(peer));
+ LASSERT (!kibnal_peer_active(peer));
LASSERT (peer->ibp_connecting == 0);
LASSERT (list_empty (&peer->ibp_conns));
LASSERT (list_empty (&peer->ibp_tx_queue));
* they are destroyed, so we can be assured that _all_ state to do
* with this peer has been cleaned up when its refcount drops to
* zero. */
- atomic_dec (&koibnal_data.koib_npeers);
+ atomic_dec (&kibnal_data.kib_npeers);
}
void
-koibnal_put_peer (koib_peer_t *peer)
+kibnal_put_peer (kib_peer_t *peer)
{
CDEBUG (D_OTHER, "putting peer[%p] -> "LPX64" (%d)\n",
peer, peer->ibp_nid,
if (!atomic_dec_and_test (&peer->ibp_refcount))
return;
- koibnal_destroy_peer (peer);
+ kibnal_destroy_peer (peer);
}
-koib_peer_t *
-koibnal_find_peer_locked (ptl_nid_t nid)
+kib_peer_t *
+kibnal_find_peer_locked (ptl_nid_t nid)
{
- struct list_head *peer_list = koibnal_nid2peerlist (nid);
+ struct list_head *peer_list = kibnal_nid2peerlist (nid);
struct list_head *tmp;
- koib_peer_t *peer;
+ kib_peer_t *peer;
list_for_each (tmp, peer_list) {
- peer = list_entry (tmp, koib_peer_t, ibp_list);
+ peer = list_entry (tmp, kib_peer_t, ibp_list);
LASSERT (peer->ibp_persistence != 0 || /* persistent peer */
peer->ibp_connecting != 0 || /* creating conns */
return (NULL);
}
-koib_peer_t *
-koibnal_get_peer (ptl_nid_t nid)
+kib_peer_t *
+kibnal_get_peer (ptl_nid_t nid)
{
- koib_peer_t *peer;
+ kib_peer_t *peer;
- read_lock (&koibnal_data.koib_global_lock);
- peer = koibnal_find_peer_locked (nid);
+ read_lock (&kibnal_data.kib_global_lock);
+ peer = kibnal_find_peer_locked (nid);
if (peer != NULL) /* +1 ref for caller? */
atomic_inc (&peer->ibp_refcount);
- read_unlock (&koibnal_data.koib_global_lock);
+ read_unlock (&kibnal_data.kib_global_lock);
return (peer);
}
void
-koibnal_unlink_peer_locked (koib_peer_t *peer)
+kibnal_unlink_peer_locked (kib_peer_t *peer)
{
LASSERT (peer->ibp_persistence == 0);
LASSERT (list_empty(&peer->ibp_conns));
- LASSERT (koibnal_peer_active(peer));
+ LASSERT (kibnal_peer_active(peer));
list_del_init (&peer->ibp_list);
/* lose peerlist's ref */
- koibnal_put_peer (peer);
+ kibnal_put_peer (peer);
}
int
-koibnal_get_peer_info (int index, ptl_nid_t *nidp, int *persistencep)
+kibnal_get_peer_info (int index, ptl_nid_t *nidp, int *persistencep)
{
- koib_peer_t *peer;
+ kib_peer_t *peer;
struct list_head *ptmp;
int i;
- read_lock (&koibnal_data.koib_global_lock);
+ read_lock (&kibnal_data.kib_global_lock);
- for (i = 0; i < koibnal_data.koib_peer_hash_size; i++) {
+ for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
- list_for_each (ptmp, &koibnal_data.koib_peers[i]) {
+ list_for_each (ptmp, &kibnal_data.kib_peers[i]) {
- peer = list_entry (ptmp, koib_peer_t, ibp_list);
+ peer = list_entry (ptmp, kib_peer_t, ibp_list);
LASSERT (peer->ibp_persistence != 0 ||
peer->ibp_connecting != 0 ||
!list_empty (&peer->ibp_conns));
*nidp = peer->ibp_nid;
*persistencep = peer->ibp_persistence;
- read_unlock (&koibnal_data.koib_global_lock);
+ read_unlock (&kibnal_data.kib_global_lock);
return (0);
}
}
- read_unlock (&koibnal_data.koib_global_lock);
+ read_unlock (&kibnal_data.kib_global_lock);
return (-ENOENT);
}
int
-koibnal_add_persistent_peer (ptl_nid_t nid)
+kibnal_add_persistent_peer (ptl_nid_t nid)
{
unsigned long flags;
- koib_peer_t *peer;
- koib_peer_t *peer2;
+ kib_peer_t *peer;
+ kib_peer_t *peer2;
if (nid == PTL_NID_ANY)
return (-EINVAL);
- peer = koibnal_create_peer (nid);
+ peer = kibnal_create_peer (nid);
if (peer == NULL)
return (-ENOMEM);
- write_lock_irqsave (&koibnal_data.koib_global_lock, flags);
+ write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
- peer2 = koibnal_find_peer_locked (nid);
+ peer2 = kibnal_find_peer_locked (nid);
if (peer2 != NULL) {
- koibnal_put_peer (peer);
+ kibnal_put_peer (peer);
peer = peer2;
} else {
/* peer table takes existing ref on peer */
list_add_tail (&peer->ibp_list,
- koibnal_nid2peerlist (nid));
+ kibnal_nid2peerlist (nid));
}
peer->ibp_persistence++;
- write_unlock_irqrestore (&koibnal_data.koib_global_lock, flags);
+ write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
return (0);
}
void
-koibnal_del_peer_locked (koib_peer_t *peer, int single_share)
+kibnal_del_peer_locked (kib_peer_t *peer, int single_share)
{
struct list_head *ctmp;
struct list_head *cnxt;
- koib_conn_t *conn;
+ kib_conn_t *conn;
if (!single_share)
peer->ibp_persistence = 0;
return;
list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
- conn = list_entry(ctmp, koib_conn_t, ibc_list);
+ conn = list_entry(ctmp, kib_conn_t, ibc_list);
- koibnal_close_conn_locked (conn, 0);
+ kibnal_close_conn_locked (conn, 0);
}
/* NB peer unlinks itself when last conn is closed */
}
int
-koibnal_del_peer (ptl_nid_t nid, int single_share)
+kibnal_del_peer (ptl_nid_t nid, int single_share)
{
unsigned long flags;
struct list_head *ptmp;
struct list_head *pnxt;
- koib_peer_t *peer;
+ kib_peer_t *peer;
int lo;
int hi;
int i;
int rc = -ENOENT;
- write_lock_irqsave (&koibnal_data.koib_global_lock, flags);
+ write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
if (nid != PTL_NID_ANY)
- lo = hi = koibnal_nid2peerlist(nid) - koibnal_data.koib_peers;
+ lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers;
else {
lo = 0;
- hi = koibnal_data.koib_peer_hash_size - 1;
+ hi = kibnal_data.kib_peer_hash_size - 1;
}
for (i = lo; i <= hi; i++) {
- list_for_each_safe (ptmp, pnxt, &koibnal_data.koib_peers[i]) {
- peer = list_entry (ptmp, koib_peer_t, ibp_list);
+ list_for_each_safe (ptmp, pnxt, &kibnal_data.kib_peers[i]) {
+ peer = list_entry (ptmp, kib_peer_t, ibp_list);
LASSERT (peer->ibp_persistence != 0 ||
peer->ibp_connecting != 0 ||
!list_empty (&peer->ibp_conns));
if (!(nid == PTL_NID_ANY || peer->ibp_nid == nid))
continue;
- koibnal_del_peer_locked (peer, single_share);
+ kibnal_del_peer_locked (peer, single_share);
rc = 0; /* matched something */
if (single_share)
}
}
out:
- write_unlock_irqrestore (&koibnal_data.koib_global_lock, flags);
+ write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
return (rc);
}
-koib_conn_t *
-koibnal_get_conn_by_idx (int index)
+kib_conn_t *
+kibnal_get_conn_by_idx (int index)
{
- koib_peer_t *peer;
+ kib_peer_t *peer;
struct list_head *ptmp;
- koib_conn_t *conn;
+ kib_conn_t *conn;
struct list_head *ctmp;
int i;
- read_lock (&koibnal_data.koib_global_lock);
+ read_lock (&kibnal_data.kib_global_lock);
- for (i = 0; i < koibnal_data.koib_peer_hash_size; i++) {
- list_for_each (ptmp, &koibnal_data.koib_peers[i]) {
+ for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
+ list_for_each (ptmp, &kibnal_data.kib_peers[i]) {
- peer = list_entry (ptmp, koib_peer_t, ibp_list);
+ peer = list_entry (ptmp, kib_peer_t, ibp_list);
LASSERT (peer->ibp_persistence > 0 ||
peer->ibp_connecting != 0 ||
!list_empty (&peer->ibp_conns));
if (index-- > 0)
continue;
- conn = list_entry (ctmp, koib_conn_t, ibc_list);
+ conn = list_entry (ctmp, kib_conn_t, ibc_list);
CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
atomic_read (&conn->ibc_refcount));
atomic_inc (&conn->ibc_refcount);
- read_unlock (&koibnal_data.koib_global_lock);
+ read_unlock (&kibnal_data.kib_global_lock);
return (conn);
}
}
}
- read_unlock (&koibnal_data.koib_global_lock);
+ read_unlock (&kibnal_data.kib_global_lock);
return (NULL);
}
-koib_conn_t *
-koibnal_create_conn (void)
+kib_conn_t *
+kibnal_create_conn (void)
{
- koib_conn_t *conn;
+ kib_conn_t *conn;
int i;
__u64 vaddr = 0;
__u64 vaddr_base;
memset (conn, 0, sizeof (*conn));
INIT_LIST_HEAD (&conn->ibc_tx_queue);
- INIT_LIST_HEAD (&conn->ibc_rdma_queue);
+ INIT_LIST_HEAD (&conn->ibc_active_txs);
spin_lock_init (&conn->ibc_lock);
- atomic_inc (&koibnal_data.koib_nconns);
+ atomic_inc (&kibnal_data.kib_nconns);
/* well not really, but I call destroy() on failure, which decrements */
- PORTAL_ALLOC (conn->ibc_rxs, OPENIBNAL_RX_MSGS * sizeof (koib_rx_t));
+ PORTAL_ALLOC (conn->ibc_rxs, IBNAL_RX_MSGS * sizeof (kib_rx_t));
if (conn->ibc_rxs == NULL)
goto failed;
- memset (conn->ibc_rxs, 0, OPENIBNAL_RX_MSGS * sizeof(koib_rx_t));
+ memset (conn->ibc_rxs, 0, IBNAL_RX_MSGS * sizeof(kib_rx_t));
- rc = koibnal_alloc_pages(&conn->ibc_rx_pages,
- OPENIBNAL_RX_MSG_PAGES,
- IB_ACCESS_LOCAL_WRITE);
+ rc = kibnal_alloc_pages(&conn->ibc_rx_pages,
+ IBNAL_RX_MSG_PAGES,
+ IB_ACCESS_LOCAL_WRITE);
if (rc != 0)
goto failed;
- vaddr_base = vaddr = conn->ibc_rx_pages->oibp_vaddr;
+ vaddr_base = vaddr = conn->ibc_rx_pages->ibp_vaddr;
- for (i = ipage = page_offset = 0; i < OPENIBNAL_RX_MSGS; i++) {
- struct page *page = conn->ibc_rx_pages->oibp_pages[ipage];
- koib_rx_t *rx = &conn->ibc_rxs[i];
+ for (i = ipage = page_offset = 0; i < IBNAL_RX_MSGS; i++) {
+ struct page *page = conn->ibc_rx_pages->ibp_pages[ipage];
+ kib_rx_t *rx = &conn->ibc_rxs[i];
rx->rx_conn = conn;
rx->rx_vaddr = vaddr;
- rx->rx_msg = (koib_msg_t *)(((char *)page_address(page)) + page_offset);
+ rx->rx_msg = (kib_msg_t *)(((char *)page_address(page)) + page_offset);
- vaddr += OPENIBNAL_MSG_SIZE;
- LASSERT (vaddr <= vaddr_base + OPENIBNAL_RX_MSG_BYTES);
+ vaddr += IBNAL_MSG_SIZE;
+ LASSERT (vaddr <= vaddr_base + IBNAL_RX_MSG_BYTES);
- page_offset += OPENIBNAL_MSG_SIZE;
+ page_offset += IBNAL_MSG_SIZE;
LASSERT (page_offset <= PAGE_SIZE);
if (page_offset == PAGE_SIZE) {
page_offset = 0;
ipage++;
- LASSERT (ipage <= OPENIBNAL_RX_MSG_PAGES);
+ LASSERT (ipage <= IBNAL_RX_MSG_PAGES);
}
}
params.qp_create = (struct ib_qp_create_param) {
.limit = {
/* Sends have an optional RDMA */
- .max_outstanding_send_request = 2 * OPENIBNAL_MSG_QUEUE_SIZE,
- .max_outstanding_receive_request = OPENIBNAL_MSG_QUEUE_SIZE,
+ .max_outstanding_send_request = 2 * IBNAL_MSG_QUEUE_SIZE,
+ .max_outstanding_receive_request = IBNAL_MSG_QUEUE_SIZE,
.max_send_gather_element = 1,
.max_receive_scatter_element = 1,
},
- .pd = koibnal_data.koib_pd,
- .send_queue = koibnal_data.koib_tx_cq,
- .receive_queue = koibnal_data.koib_rx_cq,
+ .pd = kibnal_data.kib_pd,
+ .send_queue = kibnal_data.kib_cq,
+ .receive_queue = kibnal_data.kib_cq,
.send_policy = IB_WQ_SIGNAL_SELECTABLE,
.receive_policy = IB_WQ_SIGNAL_SELECTABLE,
.rd_domain = 0,
}
/* Mark QP created */
- conn->ibc_state = OPENIBNAL_CONN_INIT_QP;
+ conn->ibc_state = IBNAL_CONN_INIT_QP;
params.qp_attr = (struct ib_qp_attribute) {
.state = IB_QP_STATE_INIT,
- .port = koibnal_data.koib_port,
+ .port = kibnal_data.kib_port,
.enable_rdma_read = 1,
.enable_rdma_write = 1,
.valid_fields = (IB_QP_ATTRIBUTE_STATE |
return (conn);
failed:
- koibnal_destroy_conn (conn);
+ kibnal_destroy_conn (conn);
return (NULL);
}
void
-koibnal_destroy_conn (koib_conn_t *conn)
+kibnal_destroy_conn (kib_conn_t *conn)
{
int rc;
LASSERT (atomic_read (&conn->ibc_refcount) == 0);
LASSERT (list_empty(&conn->ibc_tx_queue));
- LASSERT (list_empty(&conn->ibc_rdma_queue));
+ LASSERT (list_empty(&conn->ibc_active_txs));
LASSERT (conn->ibc_nsends_posted == 0);
LASSERT (conn->ibc_connreq == NULL);
switch (conn->ibc_state) {
- case OPENIBNAL_CONN_ZOMBIE:
+ case IBNAL_CONN_ZOMBIE:
/* called after connection sequence initiated */
- case OPENIBNAL_CONN_INIT_QP:
+ case IBNAL_CONN_INIT_QP:
rc = ib_qp_destroy(conn->ibc_qp);
if (rc != 0)
CERROR("Can't destroy QP: %d\n", rc);
/* fall through */
- case OPENIBNAL_CONN_INIT_NOTHING:
+ case IBNAL_CONN_INIT_NOTHING:
break;
default:
}
if (conn->ibc_rx_pages != NULL)
- koibnal_free_pages(conn->ibc_rx_pages);
+ kibnal_free_pages(conn->ibc_rx_pages);
if (conn->ibc_rxs != NULL)
PORTAL_FREE(conn->ibc_rxs,
- OPENIBNAL_RX_MSGS * sizeof(koib_rx_t));
+ IBNAL_RX_MSGS * sizeof(kib_rx_t));
if (conn->ibc_peer != NULL)
- koibnal_put_peer(conn->ibc_peer);
+ kibnal_put_peer(conn->ibc_peer);
PORTAL_FREE(conn, sizeof (*conn));
- atomic_dec(&koibnal_data.koib_nconns);
+ atomic_dec(&kibnal_data.kib_nconns);
- if (atomic_read (&koibnal_data.koib_nconns) == 0 &&
- koibnal_data.koib_shutdown) {
+ if (atomic_read (&kibnal_data.kib_nconns) == 0 &&
+ kibnal_data.kib_shutdown) {
/* I just nuked the last connection on shutdown; wake up
* everyone so they can exit. */
- wake_up_all(&koibnal_data.koib_sched_waitq);
- wake_up_all(&koibnal_data.koib_connd_waitq);
+ wake_up_all(&kibnal_data.kib_sched_waitq);
+ wake_up_all(&kibnal_data.kib_connd_waitq);
}
}
void
-koibnal_put_conn (koib_conn_t *conn)
+kibnal_put_conn (kib_conn_t *conn)
{
unsigned long flags;
return;
/* last ref only goes on zombies */
- LASSERT (conn->ibc_state == OPENIBNAL_CONN_ZOMBIE);
+ LASSERT (conn->ibc_state == IBNAL_CONN_ZOMBIE);
- spin_lock_irqsave (&koibnal_data.koib_connd_lock, flags);
+ spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
- list_add (&conn->ibc_list, &koibnal_data.koib_connd_conns);
- wake_up (&koibnal_data.koib_connd_waitq);
+ list_add (&conn->ibc_list, &kibnal_data.kib_connd_conns);
+ wake_up (&kibnal_data.kib_connd_waitq);
- spin_unlock_irqrestore (&koibnal_data.koib_connd_lock, flags);
+ spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
}
int
-koibnal_close_peer_conns_locked (koib_peer_t *peer, int why)
+kibnal_close_peer_conns_locked (kib_peer_t *peer, int why)
{
- koib_conn_t *conn;
+ kib_conn_t *conn;
struct list_head *ctmp;
struct list_head *cnxt;
int count = 0;
list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
- conn = list_entry (ctmp, koib_conn_t, ibc_list);
+ conn = list_entry (ctmp, kib_conn_t, ibc_list);
count++;
- koibnal_close_conn_locked (conn, why);
+ kibnal_close_conn_locked (conn, why);
}
return (count);
}
int
-koibnal_close_stale_conns_locked (koib_peer_t *peer, __u64 incarnation)
+kibnal_close_stale_conns_locked (kib_peer_t *peer, __u64 incarnation)
{
- koib_conn_t *conn;
+ kib_conn_t *conn;
struct list_head *ctmp;
struct list_head *cnxt;
int count = 0;
list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
- conn = list_entry (ctmp, koib_conn_t, ibc_list);
+ conn = list_entry (ctmp, kib_conn_t, ibc_list);
if (conn->ibc_incarnation == incarnation)
continue;
peer->ibp_nid, conn->ibc_incarnation, incarnation);
count++;
- koibnal_close_conn_locked (conn, -ESTALE);
+ kibnal_close_conn_locked (conn, -ESTALE);
}
return (count);
}
int
-koibnal_close_matching_conns (ptl_nid_t nid)
+kibnal_close_matching_conns (ptl_nid_t nid)
{
unsigned long flags;
- koib_peer_t *peer;
+ kib_peer_t *peer;
struct list_head *ptmp;
struct list_head *pnxt;
int lo;
int i;
int count = 0;
- write_lock_irqsave (&koibnal_data.koib_global_lock, flags);
+ write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
if (nid != PTL_NID_ANY)
- lo = hi = koibnal_nid2peerlist(nid) - koibnal_data.koib_peers;
+ lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers;
else {
lo = 0;
- hi = koibnal_data.koib_peer_hash_size - 1;
+ hi = kibnal_data.kib_peer_hash_size - 1;
}
for (i = lo; i <= hi; i++) {
- list_for_each_safe (ptmp, pnxt, &koibnal_data.koib_peers[i]) {
+ list_for_each_safe (ptmp, pnxt, &kibnal_data.kib_peers[i]) {
- peer = list_entry (ptmp, koib_peer_t, ibp_list);
+ peer = list_entry (ptmp, kib_peer_t, ibp_list);
LASSERT (peer->ibp_persistence != 0 ||
peer->ibp_connecting != 0 ||
!list_empty (&peer->ibp_conns));
if (!(nid == PTL_NID_ANY || nid == peer->ibp_nid))
continue;
- count += koibnal_close_peer_conns_locked (peer, 0);
+ count += kibnal_close_peer_conns_locked (peer, 0);
}
}
- write_unlock_irqrestore (&koibnal_data.koib_global_lock, flags);
+ write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
/* wildcards always succeed */
if (nid == PTL_NID_ANY)
}
int
-koibnal_cmd(struct portals_cfg *pcfg, void * private)
+kibnal_cmd(struct portals_cfg *pcfg, void * private)
{
int rc = -EINVAL;
ptl_nid_t nid = 0;
int share_count = 0;
- rc = koibnal_get_peer_info(pcfg->pcfg_count,
- &nid, &share_count);
+ rc = kibnal_get_peer_info(pcfg->pcfg_count,
+ &nid, &share_count);
pcfg->pcfg_nid = nid;
pcfg->pcfg_size = 0;
pcfg->pcfg_id = 0;
break;
}
case NAL_CMD_ADD_PEER: {
- rc = koibnal_add_persistent_peer (pcfg->pcfg_nid);
+ rc = kibnal_add_persistent_peer (pcfg->pcfg_nid);
break;
}
case NAL_CMD_DEL_PEER: {
- rc = koibnal_del_peer (pcfg->pcfg_nid,
+ rc = kibnal_del_peer (pcfg->pcfg_nid,
/* flags == single_share */
pcfg->pcfg_flags != 0);
break;
}
case NAL_CMD_GET_CONN: {
- koib_conn_t *conn = koibnal_get_conn_by_idx (pcfg->pcfg_count);
+ kib_conn_t *conn = kibnal_get_conn_by_idx (pcfg->pcfg_count);
if (conn == NULL)
rc = -ENOENT;
pcfg->pcfg_id = 0;
pcfg->pcfg_misc = 0;
pcfg->pcfg_flags = 0;
- koibnal_put_conn (conn);
+ kibnal_put_conn (conn);
}
break;
}
case NAL_CMD_CLOSE_CONNECTION: {
- rc = koibnal_close_matching_conns (pcfg->pcfg_nid);
+ rc = kibnal_close_matching_conns (pcfg->pcfg_nid);
break;
}
case NAL_CMD_REGISTER_MYNID: {
if (pcfg->pcfg_nid == PTL_NID_ANY)
rc = -EINVAL;
else
- rc = koibnal_set_mynid (pcfg->pcfg_nid);
+ rc = kibnal_set_mynid (pcfg->pcfg_nid);
break;
}
}
}
void
-koibnal_free_pages (koib_pages_t *p)
+kibnal_free_pages (kib_pages_t *p)
{
- int npages = p->oibp_npages;
+ int npages = p->ibp_npages;
int rc;
int i;
- if (p->oibp_mapped) {
- rc = ib_memory_deregister(p->oibp_handle);
+ if (p->ibp_mapped) {
+ rc = ib_memory_deregister(p->ibp_handle);
if (rc != 0)
CERROR ("Deregister error: %d\n", rc);
}
for (i = 0; i < npages; i++)
- if (p->oibp_pages[i] != NULL)
- __free_page(p->oibp_pages[i]);
+ if (p->ibp_pages[i] != NULL)
+ __free_page(p->ibp_pages[i]);
- PORTAL_FREE (p, offsetof(koib_pages_t, oibp_pages[npages]));
+ PORTAL_FREE (p, offsetof(kib_pages_t, ibp_pages[npages]));
}
int
-koibnal_alloc_pages (koib_pages_t **pp, int npages, int access)
+kibnal_alloc_pages (kib_pages_t **pp, int npages, int access)
{
- koib_pages_t *p;
+ kib_pages_t *p;
struct ib_physical_buffer *phys_pages;
int i;
int rc;
- PORTAL_ALLOC(p, offsetof(koib_pages_t, oibp_pages[npages]));
+ PORTAL_ALLOC(p, offsetof(kib_pages_t, ibp_pages[npages]));
if (p == NULL) {
CERROR ("Can't allocate buffer %d\n", npages);
return (-ENOMEM);
}
- memset (p, 0, offsetof(koib_pages_t, oibp_pages[npages]));
- p->oibp_npages = npages;
+ memset (p, 0, offsetof(kib_pages_t, ibp_pages[npages]));
+ p->ibp_npages = npages;
for (i = 0; i < npages; i++) {
- p->oibp_pages[i] = alloc_page (GFP_KERNEL);
- if (p->oibp_pages[i] == NULL) {
+ p->ibp_pages[i] = alloc_page (GFP_KERNEL);
+ if (p->ibp_pages[i] == NULL) {
CERROR ("Can't allocate page %d of %d\n", i, npages);
- koibnal_free_pages(p);
+ kibnal_free_pages(p);
return (-ENOMEM);
}
}
PORTAL_ALLOC(phys_pages, npages * sizeof(*phys_pages));
if (phys_pages == NULL) {
CERROR ("Can't allocate physarray for %d pages\n", npages);
- koibnal_free_pages(p);
+ kibnal_free_pages(p);
return (-ENOMEM);
}
for (i = 0; i < npages; i++) {
phys_pages[i].size = PAGE_SIZE;
phys_pages[i].address =
- koibnal_page2phys(p->oibp_pages[i]);
+ kibnal_page2phys(p->ibp_pages[i]);
}
- p->oibp_vaddr = 0;
- rc = ib_memory_register_physical(koibnal_data.koib_pd,
+ p->ibp_vaddr = 0;
+ rc = ib_memory_register_physical(kibnal_data.kib_pd,
phys_pages, npages,
- &p->oibp_vaddr,
+ &p->ibp_vaddr,
npages * PAGE_SIZE, 0,
access,
- &p->oibp_handle,
- &p->oibp_lkey,
- &p->oibp_rkey);
+ &p->ibp_handle,
+ &p->ibp_lkey,
+ &p->ibp_rkey);
PORTAL_FREE(phys_pages, npages * sizeof(*phys_pages));
if (rc != 0) {
CERROR ("Error %d mapping %d pages\n", rc, npages);
- koibnal_free_pages(p);
+ kibnal_free_pages(p);
return (rc);
}
- p->oibp_mapped = 1;
+ p->ibp_mapped = 1;
*pp = p;
return (0);
}
int
-koibnal_setup_tx_descs (void)
+kibnal_setup_tx_descs (void)
{
int ipage = 0;
int page_offset = 0;
__u64 vaddr;
__u64 vaddr_base;
struct page *page;
- koib_tx_t *tx;
+ kib_tx_t *tx;
int i;
int rc;
/* pre-mapped messages are not bigger than 1 page */
- LASSERT (OPENIBNAL_MSG_SIZE <= PAGE_SIZE);
+ LASSERT (IBNAL_MSG_SIZE <= PAGE_SIZE);
/* No fancy arithmetic when we do the buffer calculations */
- LASSERT (PAGE_SIZE % OPENIBNAL_MSG_SIZE == 0);
+ LASSERT (PAGE_SIZE % IBNAL_MSG_SIZE == 0);
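 /* Editorial worked example, not part of the patch: IBNAL_MSG_SIZE is
  * 4K, so with 4K pages each page holds exactly one message, and with
  * 16K pages it holds four.  The loop below bumps vaddr and page_offset
  * by IBNAL_MSG_SIZE per descriptor and advances ipage at each page
  * boundary, which is why the two LASSERTs above must hold. */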
- rc = koibnal_alloc_pages(&koibnal_data.koib_tx_pages,
- OPENIBNAL_TX_MSG_PAGES,
- 0); /* local read access only */
+ rc = kibnal_alloc_pages(&kibnal_data.kib_tx_pages,
+ IBNAL_TX_MSG_PAGES,
+ 0); /* local read access only */
if (rc != 0)
return (rc);
- vaddr = vaddr_base = koibnal_data.koib_tx_pages->oibp_vaddr;
+ vaddr = vaddr_base = kibnal_data.kib_tx_pages->ibp_vaddr;
- for (i = 0; i < OPENIBNAL_TX_MSGS; i++) {
- page = koibnal_data.koib_tx_pages->oibp_pages[ipage];
- tx = &koibnal_data.koib_tx_descs[i];
+ for (i = 0; i < IBNAL_TX_MSGS; i++) {
+ page = kibnal_data.kib_tx_pages->ibp_pages[ipage];
+ tx = &kibnal_data.kib_tx_descs[i];
memset (tx, 0, sizeof(*tx)); /* zero flags etc */
- tx->tx_msg = (koib_msg_t *)(((char *)page_address(page)) + page_offset);
+ tx->tx_msg = (kib_msg_t *)(((char *)page_address(page)) + page_offset);
tx->tx_vaddr = vaddr;
- tx->tx_isnblk = (i >= OPENIBNAL_NTX);
- tx->tx_mapped = KOIB_TX_UNMAPPED;
+ tx->tx_isnblk = (i >= IBNAL_NTX);
+ tx->tx_mapped = KIB_TX_UNMAPPED;
CDEBUG(D_NET, "Tx[%d] %p->%p - "LPX64"\n",
i, tx, tx->tx_msg, tx->tx_vaddr);
if (tx->tx_isnblk)
list_add (&tx->tx_list,
- &koibnal_data.koib_idle_nblk_txs);
+ &kibnal_data.kib_idle_nblk_txs);
else
list_add (&tx->tx_list,
- &koibnal_data.koib_idle_txs);
+ &kibnal_data.kib_idle_txs);
- vaddr += OPENIBNAL_MSG_SIZE;
- LASSERT (vaddr <= vaddr_base + OPENIBNAL_TX_MSG_BYTES);
+ vaddr += IBNAL_MSG_SIZE;
+ LASSERT (vaddr <= vaddr_base + IBNAL_TX_MSG_BYTES);
- page_offset += OPENIBNAL_MSG_SIZE;
+ page_offset += IBNAL_MSG_SIZE;
LASSERT (page_offset <= PAGE_SIZE);
if (page_offset == PAGE_SIZE) {
page_offset = 0;
ipage++;
- LASSERT (ipage <= OPENIBNAL_TX_MSG_PAGES);
+ LASSERT (ipage <= IBNAL_TX_MSG_PAGES);
}
}
}
void
-koibnal_api_shutdown (nal_t *nal)
+kibnal_api_shutdown (nal_t *nal)
{
int i;
int rc;
CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n",
atomic_read (&portal_kmemory));
- LASSERT(nal == &koibnal_api);
+ LASSERT(nal == &kibnal_api);
- switch (koibnal_data.koib_init) {
+ switch (kibnal_data.kib_init) {
default:
- CERROR ("Unexpected state %d\n", koibnal_data.koib_init);
+ CERROR ("Unexpected state %d\n", kibnal_data.kib_init);
LBUG();
- case OPENIBNAL_INIT_ALL:
+ case IBNAL_INIT_ALL:
/* stop calls to nal_cmd */
libcfs_nal_cmd_unregister(OPENIBNAL);
/* No new peers */
/* resetting my NID to unadvertises me, removes my
* listener and nukes all current peers */
- koibnal_set_mynid (PTL_NID_ANY);
+ kibnal_set_mynid (PTL_NID_ANY);
/* Wait for all peer state to clean up */
i = 2;
- while (atomic_read (&koibnal_data.koib_npeers) != 0) {
+ while (atomic_read (&kibnal_data.kib_npeers) != 0) {
i++;
CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
"waiting for %d peers to close down\n",
- atomic_read (&koibnal_data.koib_npeers));
+ atomic_read (&kibnal_data.kib_npeers));
set_current_state (TASK_INTERRUPTIBLE);
schedule_timeout (HZ);
}
/* fall through */
- case OPENIBNAL_INIT_TX_CQ:
- rc = ib_cq_destroy (koibnal_data.koib_tx_cq);
- if (rc != 0)
- CERROR ("Destroy tx CQ error: %d\n", rc);
- /* fall through */
-
- case OPENIBNAL_INIT_RX_CQ:
- rc = ib_cq_destroy (koibnal_data.koib_rx_cq);
+ case IBNAL_INIT_CQ:
+ rc = ib_cq_destroy (kibnal_data.kib_cq);
if (rc != 0)
- CERROR ("Destroy rx CQ error: %d\n", rc);
+ CERROR ("Destroy CQ error: %d\n", rc);
/* fall through */
- case OPENIBNAL_INIT_TXD:
- koibnal_free_pages (koibnal_data.koib_tx_pages);
+ case IBNAL_INIT_TXD:
+ kibnal_free_pages (kibnal_data.kib_tx_pages);
/* fall through */
-#if OPENIBNAL_FMR
- case OPENIBNAL_INIT_FMR:
- rc = ib_fmr_pool_destroy (koibnal_data.koib_fmr_pool);
+#if IBNAL_FMR
+ case IBNAL_INIT_FMR:
+ rc = ib_fmr_pool_destroy (kibnal_data.kib_fmr_pool);
if (rc != 0)
CERROR ("Destroy FMR pool error: %d\n", rc);
/* fall through */
#endif
- case OPENIBNAL_INIT_PD:
- rc = ib_pd_destroy(koibnal_data.koib_pd);
+ case IBNAL_INIT_PD:
+ rc = ib_pd_destroy(kibnal_data.kib_pd);
if (rc != 0)
CERROR ("Destroy PD error: %d\n", rc);
/* fall through */
- case OPENIBNAL_INIT_LIB:
- lib_fini(&koibnal_lib);
+ case IBNAL_INIT_LIB:
+ lib_fini(&kibnal_lib);
/* fall through */
- case OPENIBNAL_INIT_DATA:
+ case IBNAL_INIT_DATA:
/* Module refcount only gets to zero when all peers
* have been closed so all lists must be empty */
- LASSERT (atomic_read (&koibnal_data.koib_npeers) == 0);
- LASSERT (koibnal_data.koib_peers != NULL);
- for (i = 0; i < koibnal_data.koib_peer_hash_size; i++) {
- LASSERT (list_empty (&koibnal_data.koib_peers[i]));
+ LASSERT (atomic_read (&kibnal_data.kib_npeers) == 0);
+ LASSERT (kibnal_data.kib_peers != NULL);
+ for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
+ LASSERT (list_empty (&kibnal_data.kib_peers[i]));
}
- LASSERT (atomic_read (&koibnal_data.koib_nconns) == 0);
- LASSERT (list_empty (&koibnal_data.koib_sched_rxq));
- LASSERT (list_empty (&koibnal_data.koib_sched_txq));
- LASSERT (list_empty (&koibnal_data.koib_connd_conns));
- LASSERT (list_empty (&koibnal_data.koib_connd_peers));
+ LASSERT (atomic_read (&kibnal_data.kib_nconns) == 0);
+ LASSERT (list_empty (&kibnal_data.kib_sched_rxq));
+ LASSERT (list_empty (&kibnal_data.kib_sched_txq));
+ LASSERT (list_empty (&kibnal_data.kib_connd_conns));
+ LASSERT (list_empty (&kibnal_data.kib_connd_peers));
/* flag threads to terminate; wake and wait for them to die */
- koibnal_data.koib_shutdown = 1;
- wake_up_all (&koibnal_data.koib_sched_waitq);
- wake_up_all (&koibnal_data.koib_connd_waitq);
+ kibnal_data.kib_shutdown = 1;
+ wake_up_all (&kibnal_data.kib_sched_waitq);
+ wake_up_all (&kibnal_data.kib_connd_waitq);
i = 2;
- while (atomic_read (&koibnal_data.koib_nthreads) != 0) {
+ while (atomic_read (&kibnal_data.kib_nthreads) != 0) {
i++;
CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
"Waiting for %d threads to terminate\n",
- atomic_read (&koibnal_data.koib_nthreads));
+ atomic_read (&kibnal_data.kib_nthreads));
set_current_state (TASK_INTERRUPTIBLE);
schedule_timeout (HZ);
}
/* fall through */
- case OPENIBNAL_INIT_NOTHING:
+ case IBNAL_INIT_NOTHING:
break;
}
- if (koibnal_data.koib_tx_descs != NULL)
- PORTAL_FREE (koibnal_data.koib_tx_descs,
- OPENIBNAL_TX_MSGS * sizeof(koib_tx_t));
+ if (kibnal_data.kib_tx_descs != NULL)
+ PORTAL_FREE (kibnal_data.kib_tx_descs,
+ IBNAL_TX_MSGS * sizeof(kib_tx_t));
- if (koibnal_data.koib_peers != NULL)
- PORTAL_FREE (koibnal_data.koib_peers,
+ if (kibnal_data.kib_peers != NULL)
+ PORTAL_FREE (kibnal_data.kib_peers,
sizeof (struct list_head) *
- koibnal_data.koib_peer_hash_size);
+ kibnal_data.kib_peer_hash_size);
CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n",
atomic_read (&portal_kmemory));
printk(KERN_INFO "Lustre: OpenIB NAL unloaded (final mem %d)\n",
atomic_read(&portal_kmemory));
- koibnal_data.koib_init = OPENIBNAL_INIT_NOTHING;
+ kibnal_data.kib_init = IBNAL_INIT_NOTHING;
}
int
-koibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
+kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
ptl_ni_limits_t *requested_limits,
ptl_ni_limits_t *actual_limits)
{
int rc;
int i;
- LASSERT (nal == &koibnal_api);
+ LASSERT (nal == &kibnal_api);
if (nal->nal_refct != 0) {
if (actual_limits != NULL)
- *actual_limits = koibnal_lib.libnal_ni.ni_actual_limits;
+ *actual_limits = kibnal_lib.libnal_ni.ni_actual_limits;
/* This module got the first ref */
PORTAL_MODULE_USE;
return (PTL_OK);
}
- LASSERT (koibnal_data.koib_init == OPENIBNAL_INIT_NOTHING);
+ LASSERT (kibnal_data.kib_init == IBNAL_INIT_NOTHING);
- memset (&koibnal_data, 0, sizeof (koibnal_data)); /* zero pointers, flags etc */
+ memset (&kibnal_data, 0, sizeof (kibnal_data)); /* zero pointers, flags etc */
- init_MUTEX (&koibnal_data.koib_nid_mutex);
- init_MUTEX_LOCKED (&koibnal_data.koib_nid_signal);
- koibnal_data.koib_nid = PTL_NID_ANY;
+ init_MUTEX (&kibnal_data.kib_nid_mutex);
+ init_MUTEX_LOCKED (&kibnal_data.kib_nid_signal);
+ kibnal_data.kib_nid = PTL_NID_ANY;
- rwlock_init(&koibnal_data.koib_global_lock);
+ rwlock_init(&kibnal_data.kib_global_lock);
- koibnal_data.koib_peer_hash_size = OPENIBNAL_PEER_HASH_SIZE;
- PORTAL_ALLOC (koibnal_data.koib_peers,
- sizeof (struct list_head) * koibnal_data.koib_peer_hash_size);
- if (koibnal_data.koib_peers == NULL) {
+ kibnal_data.kib_peer_hash_size = IBNAL_PEER_HASH_SIZE;
+ PORTAL_ALLOC (kibnal_data.kib_peers,
+ sizeof (struct list_head) * kibnal_data.kib_peer_hash_size);
+ if (kibnal_data.kib_peers == NULL) {
goto failed;
}
- for (i = 0; i < koibnal_data.koib_peer_hash_size; i++)
- INIT_LIST_HEAD(&koibnal_data.koib_peers[i]);
-
- spin_lock_init (&koibnal_data.koib_connd_lock);
- INIT_LIST_HEAD (&koibnal_data.koib_connd_peers);
- INIT_LIST_HEAD (&koibnal_data.koib_connd_conns);
- init_waitqueue_head (&koibnal_data.koib_connd_waitq);
-
- spin_lock_init (&koibnal_data.koib_sched_lock);
- INIT_LIST_HEAD (&koibnal_data.koib_sched_txq);
- INIT_LIST_HEAD (&koibnal_data.koib_sched_rxq);
- init_waitqueue_head (&koibnal_data.koib_sched_waitq);
-
- spin_lock_init (&koibnal_data.koib_tx_lock);
- INIT_LIST_HEAD (&koibnal_data.koib_idle_txs);
- INIT_LIST_HEAD (&koibnal_data.koib_idle_nblk_txs);
- init_waitqueue_head(&koibnal_data.koib_idle_tx_waitq);
-
- PORTAL_ALLOC (koibnal_data.koib_tx_descs,
- OPENIBNAL_TX_MSGS * sizeof(koib_tx_t));
- if (koibnal_data.koib_tx_descs == NULL) {
+ for (i = 0; i < kibnal_data.kib_peer_hash_size; i++)
+ INIT_LIST_HEAD(&kibnal_data.kib_peers[i]);
+
+ spin_lock_init (&kibnal_data.kib_connd_lock);
+ INIT_LIST_HEAD (&kibnal_data.kib_connd_peers);
+ INIT_LIST_HEAD (&kibnal_data.kib_connd_conns);
+ init_waitqueue_head (&kibnal_data.kib_connd_waitq);
+
+ spin_lock_init (&kibnal_data.kib_sched_lock);
+ INIT_LIST_HEAD (&kibnal_data.kib_sched_txq);
+ INIT_LIST_HEAD (&kibnal_data.kib_sched_rxq);
+ init_waitqueue_head (&kibnal_data.kib_sched_waitq);
+
+ spin_lock_init (&kibnal_data.kib_tx_lock);
+ INIT_LIST_HEAD (&kibnal_data.kib_idle_txs);
+ INIT_LIST_HEAD (&kibnal_data.kib_idle_nblk_txs);
+ init_waitqueue_head(&kibnal_data.kib_idle_tx_waitq);
+
+ PORTAL_ALLOC (kibnal_data.kib_tx_descs,
+ IBNAL_TX_MSGS * sizeof(kib_tx_t));
+ if (kibnal_data.kib_tx_descs == NULL) {
CERROR ("Can't allocate tx descs\n");
goto failed;
}
/* lists/ptrs/locks initialised */
- koibnal_data.koib_init = OPENIBNAL_INIT_DATA;
+ kibnal_data.kib_init = IBNAL_INIT_DATA;
/*****************************************************/
+
process_id.pid = requested_pid;
- process_id.nid = koibnal_data.koib_nid;
+ process_id.nid = kibnal_data.kib_nid;
- rc = lib_init(&koibnal_lib, nal, process_id,
+ rc = lib_init(&kibnal_lib, nal, process_id,
requested_limits, actual_limits);
if (rc != PTL_OK) {
CERROR("lib_init failed: error %d\n", rc);
}
/* lib interface initialised */
- koibnal_data.koib_init = OPENIBNAL_INIT_LIB;
+ kibnal_data.kib_init = IBNAL_INIT_LIB;
/*****************************************************/
- for (i = 0; i < OPENIBNAL_N_SCHED; i++) {
- rc = koibnal_thread_start (koibnal_scheduler, (void *)i);
+ for (i = 0; i < IBNAL_N_SCHED; i++) {
+ rc = kibnal_thread_start (kibnal_scheduler, (void *)i);
if (rc != 0) {
CERROR("Can't spawn openibnal scheduler[%d]: %d\n",
i, rc);
}
}
- rc = koibnal_thread_start (koibnal_connd, NULL);
+ rc = kibnal_thread_start (kibnal_connd, NULL);
if (rc != 0) {
CERROR ("Can't spawn openibnal connd: %d\n", rc);
goto failed;
}
- koibnal_data.koib_device = ib_device_get_by_index(0);
- if (koibnal_data.koib_device == NULL) {
+ kibnal_data.kib_device = ib_device_get_by_index(0);
+ if (kibnal_data.kib_device == NULL) {
CERROR ("Can't open ib device 0\n");
goto failed;
}
- rc = ib_device_properties_get(koibnal_data.koib_device,
- &koibnal_data.koib_device_props);
+ rc = ib_device_properties_get(kibnal_data.kib_device,
+ &kibnal_data.kib_device_props);
if (rc != 0) {
CERROR ("Can't get device props: %d\n", rc);
goto failed;
}
CDEBUG(D_NET, "Max Initiator: %d Max Responder %d\n",
- koibnal_data.koib_device_props.max_initiator_per_qp,
- koibnal_data.koib_device_props.max_responder_per_qp);
+ kibnal_data.kib_device_props.max_initiator_per_qp,
+ kibnal_data.kib_device_props.max_responder_per_qp);
- koibnal_data.koib_port = 0;
+ kibnal_data.kib_port = 0;
for (i = 1; i <= 2; i++) {
- rc = ib_port_properties_get(koibnal_data.koib_device, i,
- &koibnal_data.koib_port_props);
+ rc = ib_port_properties_get(kibnal_data.kib_device, i,
+ &kibnal_data.kib_port_props);
if (rc == 0) {
- koibnal_data.koib_port = i;
+ kibnal_data.kib_port = i;
break;
}
}
- if (koibnal_data.koib_port == 0) {
+ if (kibnal_data.kib_port == 0) {
CERROR ("Can't find a port\n");
goto failed;
}
- rc = ib_pd_create(koibnal_data.koib_device,
- NULL, &koibnal_data.koib_pd);
+ rc = ib_pd_create(kibnal_data.kib_device,
+ NULL, &kibnal_data.kib_pd);
if (rc != 0) {
CERROR ("Can't create PD: %d\n", rc);
goto failed;
}
/* flag PD initialised */
- koibnal_data.koib_init = OPENIBNAL_INIT_PD;
+ kibnal_data.kib_init = IBNAL_INIT_PD;
/*****************************************************/
-#if OPENIBNAL_FMR
+#if IBNAL_FMR
{
- const int pool_size = OPENIBNAL_NTX + OPENIBNAL_NTX_NBLK;
+ const int pool_size = IBNAL_NTX + IBNAL_NTX_NBLK;
struct ib_fmr_pool_param params = {
.max_pages_per_fmr = PTL_MTU/PAGE_SIZE,
.access = (IB_ACCESS_LOCAL_WRITE |
.flush_arg = NULL,
.cache = 1,
};
- rc = ib_fmr_pool_create(koibnal_data.koib_pd, ¶ms,
- &koibnal_data.koib_fmr_pool);
+ rc = ib_fmr_pool_create(kibnal_data.kib_pd, ¶ms,
+ &kibnal_data.kib_fmr_pool);
if (rc != 0) {
CERROR ("Can't create FMR pool size %d: %d\n",
pool_size, rc);
}
/* flag FMR pool initialised */
- koibnal_data.koib_init = OPENIBNAL_INIT_FMR;
+ kibnal_data.kib_init = IBNAL_INIT_FMR;
#endif
/*****************************************************/
- rc = koibnal_setup_tx_descs();
+ rc = kibnal_setup_tx_descs();
if (rc != 0) {
CERROR ("Can't register tx descs: %d\n", rc);
goto failed;
}
/* flag TX descs initialised */
- koibnal_data.koib_init = OPENIBNAL_INIT_TXD;
+ kibnal_data.kib_init = IBNAL_INIT_TXD;
/*****************************************************/
{
struct ib_cq_callback callback = {
- .context = OPENIBNAL_CALLBACK_CTXT,
+ .context = IBNAL_CALLBACK_CTXT,
.policy = IB_CQ_PROVIDER_REARM,
.function = {
- .entry = koibnal_rx_callback,
+ .entry = kibnal_callback,
},
.arg = NULL,
};
- int nentries = OPENIBNAL_RX_CQ_ENTRIES;
+ int nentries = IBNAL_CQ_ENTRIES;
- rc = ib_cq_create (koibnal_data.koib_device,
+ rc = ib_cq_create (kibnal_data.kib_device,
&nentries, &callback, NULL,
- &koibnal_data.koib_rx_cq);
+ &kibnal_data.kib_cq);
if (rc != 0) {
- CERROR ("Can't create RX CQ: %d\n", rc);
+ CERROR ("Can't create CQ: %d\n", rc);
goto failed;
}
/* I only want solicited events */
- rc = ib_cq_request_notification(koibnal_data.koib_rx_cq, 1);
+ rc = ib_cq_request_notification(kibnal_data.kib_cq, 1);
LASSERT (rc == 0);
}
- /* flag RX CQ initialised */
- koibnal_data.koib_init = OPENIBNAL_INIT_RX_CQ;
- /*****************************************************/
-
- {
- struct ib_cq_callback callback = {
- .context = OPENIBNAL_CALLBACK_CTXT,
- .policy = IB_CQ_PROVIDER_REARM,
- .function = {
- .entry = koibnal_tx_callback,
- },
- .arg = NULL,
- };
- int nentries = OPENIBNAL_TX_CQ_ENTRIES;
-
- rc = ib_cq_create (koibnal_data.koib_device,
- &nentries, &callback, NULL,
- &koibnal_data.koib_tx_cq);
- if (rc != 0) {
- CERROR ("Can't create RX CQ: %d\n", rc);
- goto failed;
- }
-
- /* I only want solicited events */
- rc = ib_cq_request_notification(koibnal_data.koib_tx_cq, 1);
- LASSERT (rc == 0);
- }
-
- /* flag TX CQ initialised */
- koibnal_data.koib_init = OPENIBNAL_INIT_TX_CQ;
+ /* flag CQ initialised */
+ kibnal_data.kib_init = IBNAL_INIT_CQ;
/*****************************************************/
- rc = libcfs_nal_cmd_register(OPENIBNAL, &koibnal_cmd, NULL);
+ rc = libcfs_nal_cmd_register(OPENIBNAL, &kibnal_cmd, NULL);
if (rc != 0) {
CERROR ("Can't initialise command interface (rc = %d)\n", rc);
goto failed;
}
/* flag everything initialised */
- koibnal_data.koib_init = OPENIBNAL_INIT_ALL;
+ kibnal_data.kib_init = IBNAL_INIT_ALL;
/*****************************************************/
printk(KERN_INFO "Lustre: OpenIB NAL loaded "
return (PTL_OK);
failed:
- koibnal_api_shutdown (&koibnal_api);
+ kibnal_api_shutdown (&kibnal_api);
return (PTL_FAIL);
}
void __exit
-koibnal_module_fini (void)
+kibnal_module_fini (void)
{
#ifdef CONFIG_SYSCTL
- if (koibnal_tunables.koib_sysctl != NULL)
- unregister_sysctl_table (koibnal_tunables.koib_sysctl);
+ if (kibnal_tunables.kib_sysctl != NULL)
+ unregister_sysctl_table (kibnal_tunables.kib_sysctl);
#endif
- PtlNIFini(koibnal_ni);
+ PtlNIFini(kibnal_ni);
ptl_unregister_nal(OPENIBNAL);
}
int __init
-koibnal_module_init (void)
+kibnal_module_init (void)
{
int rc;
/* the following must be sizeof(int) for proc_dointvec() */
- LASSERT(sizeof (koibnal_tunables.koib_io_timeout) == sizeof (int));
+ LASSERT(sizeof (kibnal_tunables.kib_io_timeout) == sizeof (int));
- koibnal_api.nal_ni_init = koibnal_api_startup;
- koibnal_api.nal_ni_fini = koibnal_api_shutdown;
+ kibnal_api.nal_ni_init = kibnal_api_startup;
+ kibnal_api.nal_ni_fini = kibnal_api_shutdown;
/* Initialise dynamic tunables to defaults once only */
- koibnal_tunables.koib_io_timeout = OPENIBNAL_IO_TIMEOUT;
+ kibnal_tunables.kib_io_timeout = IBNAL_IO_TIMEOUT;
- rc = ptl_register_nal(OPENIBNAL, &koibnal_api);
+ rc = ptl_register_nal(OPENIBNAL, &kibnal_api);
if (rc != PTL_OK) {
- CERROR("Can't register OPENIBNAL: %d\n", rc);
+ CERROR("Can't register IBNAL: %d\n", rc);
return (-ENOMEM); /* or something... */
}
/* Pure gateways want the NAL started up at module load time... */
- rc = PtlNIInit(OPENIBNAL, LUSTRE_SRV_PTL_PID, NULL, NULL, &koibnal_ni);
+ rc = PtlNIInit(OPENIBNAL, LUSTRE_SRV_PTL_PID, NULL, NULL, &kibnal_ni);
if (rc != PTL_OK && rc != PTL_IFACE_DUP) {
ptl_unregister_nal(OPENIBNAL);
return (-ENODEV);
#ifdef CONFIG_SYSCTL
/* Press on regardless even if registering sysctl doesn't work */
- koibnal_tunables.koib_sysctl =
- register_sysctl_table (koibnal_top_ctl_table, 0);
+ kibnal_tunables.kib_sysctl =
+ register_sysctl_table (kibnal_top_ctl_table, 0);
#endif
return (0);
}
MODULE_DESCRIPTION("Kernel OpenIB NAL v0.01");
MODULE_LICENSE("GPL");
-module_init(koibnal_module_init);
-module_exit(koibnal_module_fini);
+module_init(kibnal_module_init);
+module_exit(kibnal_module_fini);
#include <linux/kmod.h>
#include <linux/sysctl.h>
-#define DEBUG_SUBSYSTEM S_OPENIBNAL
+#define DEBUG_SUBSYSTEM S_IBNAL
#include <linux/kp30.h>
#include <portals/p30.h>
#include <ts_ib_cm.h>
#include <ts_ib_sa_client.h>
-#define OPENIBNAL_SERVICE_NAME "openibnal"
+#define IBNAL_SERVICE_NAME "openibnal"
#if CONFIG_SMP
-# define OPENIBNAL_N_SCHED num_online_cpus() /* # schedulers */
+# define IBNAL_N_SCHED num_online_cpus() /* # schedulers */
#else
-# define OPENIBNAL_N_SCHED 1 /* # schedulers */
+# define IBNAL_N_SCHED 1 /* # schedulers */
#endif
-#define OPENIBNAL_MIN_RECONNECT_INTERVAL HZ /* first failed connection retry... */
-#define OPENIBNAL_MAX_RECONNECT_INTERVAL (60*HZ) /* ...exponentially increasing to this */
+#define IBNAL_MIN_RECONNECT_INTERVAL HZ /* first failed connection retry... */
+#define IBNAL_MAX_RECONNECT_INTERVAL (60*HZ) /* ...exponentially increasing to this */
-#define OPENIBNAL_MSG_SIZE (4<<10) /* max size of queued messages (inc hdr) */
+#define IBNAL_MSG_SIZE (4<<10) /* max size of queued messages (inc hdr) */
-#define OPENIBNAL_MSG_QUEUE_SIZE 8 /* # messages in-flight */
-#define OPENIBNAL_CREDIT_HIGHWATER 6 /* when to eagerly return credits */
-#define OPENIBNAL_RETRY 7 /* # times to retry */
-#define OPENIBNAL_RNR_RETRY 7 /* */
-#define OPENIBNAL_CM_RETRY 7 /* # times to retry connection */
-#define OPENIBNAL_FLOW_CONTROL 1
-#define OPENIBNAL_RESPONDER_RESOURCES 8
+#define IBNAL_MSG_QUEUE_SIZE 8 /* # messages/RDMAs in-flight */
+#define IBNAL_CREDIT_HIGHWATER 6 /* when to eagerly return credits */
+#define IBNAL_RETRY 7 /* # times to retry */
+#define IBNAL_RNR_RETRY 7 /* # RNR retries (7 == infinite) */
+#define IBNAL_CM_RETRY 7 /* # times to retry connection */
+#define IBNAL_FLOW_CONTROL 1
+#define IBNAL_RESPONDER_RESOURCES 8
-#define OPENIBNAL_NTX 64 /* # tx descs */
-#define OPENIBNAL_NTX_NBLK 256 /* # reserved tx descs */
+#define IBNAL_NTX 64 /* # tx descs */
+#define IBNAL_NTX_NBLK 256 /* # reserved tx descs */
-#define OPENIBNAL_PEER_HASH_SIZE 101 /* # peer lists */
+#define IBNAL_PEER_HASH_SIZE 101 /* # peer lists */
-#define OPENIBNAL_RESCHED 100 /* # scheduler loops before reschedule */
+#define IBNAL_RESCHED 100 /* # scheduler loops before reschedule */
-#define OPENIBNAL_CONCURRENT_PEERS 1000 /* # nodes all talking at once to me */
+#define IBNAL_CONCURRENT_PEERS 1000 /* # nodes all talking at once to me */
/* default vals for runtime tunables */
-#define OPENIBNAL_IO_TIMEOUT 50 /* default comms timeout (seconds) */
+#define IBNAL_IO_TIMEOUT 50 /* default comms timeout (seconds) */
/************************/
/* derived constants... */
/* TX messages (shared by all connections) */
-#define OPENIBNAL_TX_MSGS (OPENIBNAL_NTX + OPENIBNAL_NTX_NBLK)
-#define OPENIBNAL_TX_MSG_BYTES (OPENIBNAL_TX_MSGS * OPENIBNAL_MSG_SIZE)
-#define OPENIBNAL_TX_MSG_PAGES ((OPENIBNAL_TX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE)
-
-/* we may have up to 2 completions per transmit */
-#define OPENIBNAL_TX_CQ_ENTRIES (2*OPENIBNAL_TX_MSGS)
+#define IBNAL_TX_MSGS (IBNAL_NTX + IBNAL_NTX_NBLK)
+#define IBNAL_TX_MSG_BYTES (IBNAL_TX_MSGS * IBNAL_MSG_SIZE)
+#define IBNAL_TX_MSG_PAGES ((IBNAL_TX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE)
/* RX messages (per connection) */
-#define OPENIBNAL_RX_MSGS OPENIBNAL_MSG_QUEUE_SIZE
-#define OPENIBNAL_RX_MSG_BYTES (OPENIBNAL_RX_MSGS * OPENIBNAL_MSG_SIZE)
-#define OPENIBNAL_RX_MSG_PAGES ((OPENIBNAL_RX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE)
+#define IBNAL_RX_MSGS IBNAL_MSG_QUEUE_SIZE
+#define IBNAL_RX_MSG_BYTES (IBNAL_RX_MSGS * IBNAL_MSG_SIZE)
+#define IBNAL_RX_MSG_PAGES ((IBNAL_RX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE)
-/* 1 completion per receive, per connection */
-#define OPENIBNAL_RX_CQ_ENTRIES (OPENIBNAL_RX_MSGS * OPENIBNAL_CONCURRENT_PEERS)
+/* we may have up to 2 completions per transmit +
+ 1 completion per receive, per connection */
+#define IBNAL_CQ_ENTRIES ((2*IBNAL_TX_MSGS) + \
+ (IBNAL_RX_MSGS * IBNAL_CONCURRENT_PEERS))
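/* Editorial sketch of the arithmetic, not part of the patch: with the
 * default tunables above,
 *   IBNAL_TX_MSGS  = IBNAL_NTX + IBNAL_NTX_NBLK = 64 + 256 = 320
 *   tx completions = 2 * 320                    = 640
 *   rx completions = 8 * 1000                   = 8000
 * so the single merged CQ is sized for 8640 entries. */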
-#define OPENIBNAL_RDMA_BASE 0x0eeb0000
-#define OPENIBNAL_FMR 1
-#define OPENIBNAL_CKSUM 0
-//#define OPENIBNAL_CALLBACK_CTXT IB_CQ_CALLBACK_PROCESS
-#define OPENIBNAL_CALLBACK_CTXT IB_CQ_CALLBACK_INTERRUPT
+#define IBNAL_RDMA_BASE 0x0eeb0000
+#define IBNAL_FMR 1
+#define IBNAL_CKSUM 0
+//#define IBNAL_CALLBACK_CTXT IB_CQ_CALLBACK_PROCESS
+#define IBNAL_CALLBACK_CTXT IB_CQ_CALLBACK_INTERRUPT
typedef struct
{
- int koib_io_timeout; /* comms timeout (seconds) */
- struct ctl_table_header *koib_sysctl; /* sysctl interface */
-} koib_tunables_t;
+ int kib_io_timeout; /* comms timeout (seconds) */
+ struct ctl_table_header *kib_sysctl; /* sysctl interface */
+} kib_tunables_t;
typedef struct
{
- int oibp_npages; /* # pages */
- int oibp_mapped; /* mapped? */
- __u64 oibp_vaddr; /* mapped region vaddr */
- __u32 oibp_lkey; /* mapped region lkey */
- __u32 oibp_rkey; /* mapped region rkey */
- struct ib_mr *oibp_handle; /* mapped region handle */
- struct page *oibp_pages[0];
-} koib_pages_t;
+ int ibp_npages; /* # pages */
+ int ibp_mapped; /* mapped? */
+ __u64 ibp_vaddr; /* mapped region vaddr */
+ __u32 ibp_lkey; /* mapped region lkey */
+ __u32 ibp_rkey; /* mapped region rkey */
+ struct ib_mr *ibp_handle; /* mapped region handle */
+ struct page *ibp_pages[0];
+} kib_pages_t;
typedef struct
{
- int koib_init; /* initialisation state */
- __u64 koib_incarnation; /* which one am I */
- int koib_shutdown; /* shut down? */
- atomic_t koib_nthreads; /* # live threads */
-
- __u64 koib_cm_service_id; /* service number I listen on */
- ptl_nid_t koib_nid; /* my NID */
- struct semaphore koib_nid_mutex; /* serialise NID ops */
- struct semaphore koib_nid_signal; /* signal completion */
-
- rwlock_t koib_global_lock; /* stabilize peer/conn ops */
-
- struct list_head *koib_peers; /* hash table of all my known peers */
- int koib_peer_hash_size; /* size of koib_peers */
- atomic_t koib_npeers; /* # peers extant */
- atomic_t koib_nconns; /* # connections extant */
-
- struct list_head koib_connd_conns; /* connections to progress */
- struct list_head koib_connd_peers; /* peers waiting for a connection */
- wait_queue_head_t koib_connd_waitq; /* connection daemons sleep here */
- unsigned long koib_connd_waketime; /* when connd will wake */
- spinlock_t koib_connd_lock; /* serialise */
-
- wait_queue_head_t koib_sched_waitq; /* schedulers sleep here */
- struct list_head koib_sched_txq; /* tx requiring attention */
- struct list_head koib_sched_rxq; /* rx requiring attention */
- spinlock_t koib_sched_lock; /* serialise */
+ int kib_init; /* initialisation state */
+ __u64 kib_incarnation; /* which one am I */
+ int kib_shutdown; /* shut down? */
+ atomic_t kib_nthreads; /* # live threads */
+
+ __u64 kib_service_id; /* service number I listen on */
+ ptl_nid_t kib_nid; /* my NID */
+ struct semaphore kib_nid_mutex; /* serialise NID ops */
+ struct semaphore kib_nid_signal; /* signal completion */
+
+ rwlock_t kib_global_lock; /* stabilize peer/conn ops */
+
+ struct list_head *kib_peers; /* hash table of all my known peers */
+ int kib_peer_hash_size; /* size of kib_peers */
+ atomic_t kib_npeers; /* # peers extant */
+ atomic_t kib_nconns; /* # connections extant */
+
+ struct list_head kib_connd_conns; /* connections to progress */
+ struct list_head kib_connd_peers; /* peers waiting for a connection */
+ wait_queue_head_t kib_connd_waitq; /* connection daemons sleep here */
+ unsigned long kib_connd_waketime; /* when connd will wake */
+ spinlock_t kib_connd_lock; /* serialise */
+
+ wait_queue_head_t kib_sched_waitq; /* schedulers sleep here */
+ struct list_head kib_sched_txq; /* tx requiring attention */
+ struct list_head kib_sched_rxq; /* rx requiring attention */
+ spinlock_t kib_sched_lock; /* serialise */
- struct koib_tx *koib_tx_descs; /* all the tx descriptors */
- koib_pages_t *koib_tx_pages; /* premapped tx msg pages */
-
- struct list_head koib_idle_txs; /* idle tx descriptors */
- struct list_head koib_idle_nblk_txs; /* idle reserved tx descriptors */
- wait_queue_head_t koib_idle_tx_waitq; /* block here for tx descriptor */
- __u64 koib_next_tx_cookie; /* RDMA completion cookie */
- spinlock_t koib_tx_lock; /* serialise */
+ struct kib_tx *kib_tx_descs; /* all the tx descriptors */
+ kib_pages_t *kib_tx_pages; /* premapped tx msg pages */
+
+ struct list_head kib_idle_txs; /* idle tx descriptors */
+ struct list_head kib_idle_nblk_txs; /* idle reserved tx descriptors */
+ wait_queue_head_t kib_idle_tx_waitq; /* block here for tx descriptor */
+ __u64 kib_next_tx_cookie; /* RDMA completion cookie */
+ spinlock_t kib_tx_lock; /* serialise */
- struct ib_device *koib_device; /* "the" device */
- struct ib_device_properties koib_device_props; /* its properties */
- int koib_port; /* port on the device */
- struct ib_port_properties koib_port_props; /* its properties */
- struct ib_pd *koib_pd; /* protection domain */
-#if OPENIBNAL_FMR
- struct ib_fmr_pool *koib_fmr_pool; /* fast memory region pool */
+ struct ib_device *kib_device; /* "the" device */
+ struct ib_device_properties kib_device_props; /* its properties */
+ int kib_port; /* port on the device */
+ struct ib_port_properties kib_port_props; /* its properties */
+ struct ib_pd *kib_pd; /* protection domain */
+#if IBNAL_FMR
+ struct ib_fmr_pool *kib_fmr_pool; /* fast memory region pool */
#endif
- struct ib_cq *koib_rx_cq; /* receive completion queue */
- struct ib_cq *koib_tx_cq; /* transmit completion queue */
- void *koib_listen_handle; /* where I listen for connections */
- struct ib_common_attrib_service koib_service; /* SM service */
+ struct ib_cq *kib_cq; /* completion queue */
+ void *kib_listen_handle; /* where I listen for connections */
-} koib_data_t;
-
-#define OPENIBNAL_INIT_NOTHING 0
-#define OPENIBNAL_INIT_DATA 1
-#define OPENIBNAL_INIT_LIB 2
-#define OPENIBNAL_INIT_PD 3
-#define OPENIBNAL_INIT_FMR 4
-#define OPENIBNAL_INIT_TXD 5
-#define OPENIBNAL_INIT_RX_CQ 6
-#define OPENIBNAL_INIT_TX_CQ 7
-#define OPENIBNAL_INIT_ALL 8
+} kib_data_t;
+
+#define IBNAL_INIT_NOTHING 0
+#define IBNAL_INIT_DATA 1
+#define IBNAL_INIT_LIB 2
+#define IBNAL_INIT_PD 3
+#define IBNAL_INIT_FMR 4
+#define IBNAL_INIT_TXD 5
+#define IBNAL_INIT_CQ 6
+#define IBNAL_INIT_ALL 7
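/* (editorial note) kibnal_api_shutdown() switches on kib_init and falls
 * through downwards from whichever state startup reached, so teardown
 * always undoes exactly the completed steps, in reverse order. */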
/************************************************************************
* Wire message structs.
__u32 md_lkey;
__u32 md_rkey;
__u64 md_addr;
-} koib_md_t;
+} kib_md_t;
typedef struct
{
__u32 rd_key; /* remote key */
__u32 rd_nob; /* # of bytes */
__u64 rd_addr; /* remote io vaddr */
-} koib_rdma_desc_t;
+} kib_rdma_desc_t;
typedef struct
{
- ptl_hdr_t oibim_hdr; /* portals header */
- char oibim_payload[0]; /* piggy-backed payload */
-} koib_immediate_msg_t;
+ ptl_hdr_t ibim_hdr; /* portals header */
+ char ibim_payload[0]; /* piggy-backed payload */
+} kib_immediate_msg_t;
typedef struct
{
- ptl_hdr_t oibrm_hdr; /* portals header */
- __u64 oibrm_cookie; /* opaque completion cookie */
- koib_rdma_desc_t oibrm_desc; /* where to suck/blow */
-} koib_rdma_msg_t;
+ ptl_hdr_t ibrm_hdr; /* portals header */
+ __u64 ibrm_cookie; /* opaque completion cookie */
+ kib_rdma_desc_t ibrm_desc; /* where to suck/blow */
+} kib_rdma_msg_t;
typedef struct
{
- __u64 oibcm_cookie; /* opaque completion cookie */
- __u32 oibcm_status; /* completion status */
-} koib_completion_msg_t;
+ __u64 ibcm_cookie; /* opaque completion cookie */
+ __u32 ibcm_status; /* completion status */
+} kib_completion_msg_t;
typedef struct
{
- __u32 oibm_magic; /* I'm an openibnal message */
- __u16 oibm_version; /* this is my version number */
- __u8 oibm_type; /* msg type */
- __u8 oibm_credits; /* returned credits */
-#if OPENIBNAL_CKSUM
- __u32 oibm_nob;
- __u32 oibm_cksum;
+ __u32 ibm_magic; /* I'm an openibnal message */
+ __u16 ibm_version; /* this is my version number */
+ __u8 ibm_type; /* msg type */
+ __u8 ibm_credits; /* returned credits */
+#if IBNAL_CKSUM
+ __u32 ibm_nob;
+ __u32 ibm_cksum;
#endif
union {
- koib_immediate_msg_t immediate;
- koib_rdma_msg_t rdma;
- koib_completion_msg_t completion;
- } oibm_u;
-} koib_msg_t;
-
-#define OPENIBNAL_MSG_MAGIC 0x0be91b91 /* unique magic */
-#define OPENIBNAL_MSG_VERSION 1 /* current protocol version */
-
-#define OPENIBNAL_MSG_NOOP 0xd0 /* nothing (just credits) */
-#define OPENIBNAL_MSG_IMMEDIATE 0xd1 /* portals hdr + payload */
-#define OPENIBNAL_MSG_PUT_RDMA 0xd2 /* portals PUT hdr + source rdma desc */
-#define OPENIBNAL_MSG_PUT_DONE 0xd3 /* signal PUT rdma completion */
-#define OPENIBNAL_MSG_GET_RDMA 0xd4 /* portals GET hdr + sink rdma desc */
-#define OPENIBNAL_MSG_GET_DONE 0xd5 /* signal GET rdma completion */
+ kib_immediate_msg_t immediate;
+ kib_rdma_msg_t rdma;
+ kib_completion_msg_t completion;
+ } ibm_u;
+} kib_msg_t;
+
+#define IBNAL_MSG_MAGIC 0x0be91b91 /* unique magic */
+#define IBNAL_MSG_VERSION 1 /* current protocol version */
+
+#define IBNAL_MSG_NOOP 0xd0 /* nothing (just credits) */
+#define IBNAL_MSG_IMMEDIATE 0xd1 /* portals hdr + payload */
+#define IBNAL_MSG_PUT_RDMA 0xd2 /* portals PUT hdr + source rdma desc */
+#define IBNAL_MSG_PUT_DONE 0xd3 /* signal PUT rdma completion */
+#define IBNAL_MSG_GET_RDMA 0xd4 /* portals GET hdr + sink rdma desc */
+#define IBNAL_MSG_GET_DONE 0xd5 /* signal GET rdma completion */
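/* Editorial sketch (hypothetical helper, not in this patch; the real
 * code presumably fills these fields via kibnal_init_tx_msg(), declared
 * below): the common header every outgoing kib_msg_t must carry. */
static inline void
kibnal_init_msg_hdr_sketch (kib_msg_t *msg, int type, int credits)
{
        msg->ibm_magic   = IBNAL_MSG_MAGIC;
        msg->ibm_version = IBNAL_MSG_VERSION;
        msg->ibm_type    = type;        /* e.g. IBNAL_MSG_IMMEDIATE */
        msg->ibm_credits = credits;     /* flow-control credits returned */
}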
/***********************************************************************/
-typedef struct koib_rx /* receive message */
+typedef struct kib_rx /* receive message */
{
struct list_head rx_list; /* queue for attention */
- struct koib_conn *rx_conn; /* owning conn */
+ struct kib_conn *rx_conn; /* owning conn */
int rx_rdma; /* RDMA completion posted? */
int rx_posted; /* posted? */
__u64 rx_vaddr; /* pre-mapped buffer (hca vaddr) */
- koib_msg_t *rx_msg; /* pre-mapped buffer (host vaddr) */
+ kib_msg_t *rx_msg; /* pre-mapped buffer (host vaddr) */
struct ib_receive_param rx_sp; /* receive work item */
struct ib_gather_scatter rx_gl; /* and its memory */
-} koib_rx_t;
+} kib_rx_t;
-typedef struct koib_tx /* transmit message */
+typedef struct kib_tx /* transmit message */
{
struct list_head tx_list; /* queue on idle_txs, ibc_tx_queue, etc. */
int tx_isnblk; /* I'm reserved for non-blocking sends */
- struct koib_conn *tx_conn; /* owning conn */
+ struct kib_conn *tx_conn; /* owning conn */
int tx_mapped; /* mapped for RDMA? */
int tx_sending; /* # tx callbacks outstanding */
int tx_status; /* completion status */
- int tx_passive_rdma; /* waiting for peer to RDMA? */
- int tx_passive_rdma_wait; /* on ibc_rdma_queue */
- unsigned long tx_passive_rdma_deadline; /* completion deadline */
+ unsigned long tx_deadline; /* completion deadline */
+ int tx_passive_rdma; /* peer sucks/blows */
+ int tx_passive_rdma_wait; /* waiting for peer to complete */
__u64 tx_passive_rdma_cookie; /* completion cookie */
lib_msg_t *tx_libmsg[2]; /* lib msgs to finalize on completion */
- koib_md_t tx_md; /* RDMA mapping (active/passive) */
+ kib_md_t tx_md; /* RDMA mapping (active/passive) */
__u64 tx_vaddr; /* pre-mapped buffer (hca vaddr) */
- koib_msg_t *tx_msg; /* pre-mapped buffer (host vaddr) */
+ kib_msg_t *tx_msg; /* pre-mapped buffer (host vaddr) */
int tx_nsp; /* # send work items */
struct ib_send_param tx_sp[2]; /* send work items... */
struct ib_gather_scatter tx_gl[2]; /* ...and their memory */
-} koib_tx_t;
+} kib_tx_t;
-#define KOIB_TX_UNMAPPED 0
-#define KOIB_TX_MAPPED 1
-#define KOIB_TX_MAPPED_FMR 2
+#define KIB_TX_UNMAPPED 0
+#define KIB_TX_MAPPED 1
+#define KIB_TX_MAPPED_FMR 2
-typedef struct koib_wire_connreq
+typedef struct kib_wire_connreq
{
__u32 wcr_magic; /* I'm an openibnal connreq */
__u16 wcr_version; /* this is my version number */
__u16 wcr_queue_depth; /* this is my receive queue size */
__u64 wcr_nid; /* peer's NID */
__u64 wcr_incarnation; /* peer's incarnation */
-} koib_wire_connreq_t;
+} kib_wire_connreq_t;
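/* Editorial sketch (hypothetical helper; exact byte-order handling is
 * an assumption): the validation these fields imply before the passive
 * side accepts a connreq. */
static int
kibnal_check_connreq_sketch (kib_wire_connreq_t *wcr)
{
        if (wcr->wcr_magic != cpu_to_le32 (IBNAL_MSG_MAGIC))
                return (-EPROTO);       /* not an openibnal connreq */
        if (wcr->wcr_version != cpu_to_le16 (IBNAL_MSG_VERSION))
                return (-EPROTO);       /* protocol version mismatch */
        if (wcr->wcr_queue_depth != cpu_to_le16 (IBNAL_MSG_QUEUE_SIZE))
                return (-EPROTO);       /* both sides must agree on depth */
        return (0);
}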
-typedef struct koib_connreq
+typedef struct kib_connreq
{
/* connection-in-progress */
- struct koib_conn *cr_conn;
- koib_wire_connreq_t cr_wcr;
+ struct kib_conn *cr_conn;
+ kib_wire_connreq_t cr_wcr;
__u64 cr_tid;
struct ib_common_attrib_service cr_service;
tTS_IB_GID cr_gid;
struct ib_path_record cr_path;
struct ib_cm_active_param cr_connparam;
-} koib_connreq_t;
+} kib_connreq_t;
-typedef struct koib_conn
+typedef struct kib_conn
{
- struct koib_peer *ibc_peer; /* owning peer */
+ struct kib_peer *ibc_peer; /* owning peer */
struct list_head ibc_list; /* stash on peer's conn list */
__u64 ibc_incarnation; /* which instance of the peer */
atomic_t ibc_refcount; /* # users */
int ibc_credits; /* # credits I have */
int ibc_outstanding_credits; /* # credits to return */
struct list_head ibc_tx_queue; /* send queue */
- struct list_head ibc_rdma_queue; /* tx awaiting RDMA completion */
+ struct list_head ibc_active_txs; /* active tx awaiting completion */
spinlock_t ibc_lock; /* serialise */
- koib_rx_t *ibc_rxs; /* the rx descs */
- koib_pages_t *ibc_rx_pages; /* premapped rx msg pages */
+ kib_rx_t *ibc_rxs; /* the rx descs */
+ kib_pages_t *ibc_rx_pages; /* premapped rx msg pages */
struct ib_qp *ibc_qp; /* queue pair */
__u32 ibc_qpn; /* queue pair number */
tTS_IB_CM_COMM_ID ibc_comm_id; /* connection ID? */
- koib_connreq_t *ibc_connreq; /* connection request state */
-} koib_conn_t;
+ kib_connreq_t *ibc_connreq; /* connection request state */
+} kib_conn_t;
-#define OPENIBNAL_CONN_INIT_NOTHING 0 /* initial state */
-#define OPENIBNAL_CONN_INIT_QP 1 /* ibc_qp set up */
-#define OPENIBNAL_CONN_CONNECTING 2 /* started to connect */
-#define OPENIBNAL_CONN_ESTABLISHED 3 /* connection established */
-#define OPENIBNAL_CONN_DEATHROW 4 /* waiting to be closed */
-#define OPENIBNAL_CONN_ZOMBIE 5 /* waiting to be freed */
+#define IBNAL_CONN_INIT_NOTHING 0 /* initial state */
+#define IBNAL_CONN_INIT_QP 1 /* ibc_qp set up */
+#define IBNAL_CONN_CONNECTING 2 /* started to connect */
+#define IBNAL_CONN_ESTABLISHED 3 /* connection established */
+#define IBNAL_CONN_DEATHROW 4 /* waiting to be closed */
+#define IBNAL_CONN_ZOMBIE 5 /* waiting to be freed */
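/* (editorial note) normal lifecycle: INIT_NOTHING -> INIT_QP ->
 * CONNECTING -> ESTABLISHED; on close a conn passes through DEATHROW
 * (being disconnected) to ZOMBIE, where the last kibnal_put_conn()
 * hands it to the connd for kibnal_destroy_conn(). */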
-typedef struct koib_peer
+typedef struct kib_peer
{
struct list_head ibp_list; /* stash on global peer list */
- struct list_head ibp_connd_list; /* schedule on koib_connd_peers */
+ struct list_head ibp_connd_list; /* schedule on kib_connd_peers */
ptl_nid_t ibp_nid; /* who's on the other end(s) */
atomic_t ibp_refcount; /* # users */
int ibp_persistence; /* "known" peer refs */
int ibp_connecting; /* connecting+accepting */
unsigned long ibp_reconnect_time; /* when reconnect may be attempted */
unsigned long ibp_reconnect_interval; /* exponential backoff */
-} koib_peer_t;
+} kib_peer_t;
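/* Editorial sketch (hypothetical helper; the real update lives in the
 * connection daemon path, not shown in this hunk) of how the two
 * reconnect fields cooperate, assuming the interval doubles per failed
 * attempt up to the cap: */
static inline void
kibnal_peer_backoff_sketch (kib_peer_t *peer)
{
        peer->ibp_reconnect_interval *= 2;
        if (peer->ibp_reconnect_interval > IBNAL_MAX_RECONNECT_INTERVAL)
                peer->ibp_reconnect_interval = IBNAL_MAX_RECONNECT_INTERVAL;
        /* no new connection attempt before this time */
        peer->ibp_reconnect_time = jiffies + peer->ibp_reconnect_interval;
}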
-extern lib_nal_t koibnal_lib;
-extern koib_data_t koibnal_data;
-extern koib_tunables_t koibnal_tunables;
+extern lib_nal_t kibnal_lib;
+extern kib_data_t kibnal_data;
+extern kib_tunables_t kibnal_tunables;
static inline struct list_head *
-koibnal_nid2peerlist (ptl_nid_t nid)
+kibnal_nid2peerlist (ptl_nid_t nid)
{
- unsigned int hash = ((unsigned int)nid) % koibnal_data.koib_peer_hash_size;
+ unsigned int hash = ((unsigned int)nid) % kibnal_data.kib_peer_hash_size;
- return (&koibnal_data.koib_peers [hash]);
+ return (&kibnal_data.kib_peers [hash]);
}
static inline int
-koibnal_peer_active(koib_peer_t *peer)
+kibnal_peer_active(kib_peer_t *peer)
{
/* Am I in the peer hash table? */
return (!list_empty(&peer->ibp_list));
}
static inline void
-koibnal_queue_tx_locked (koib_tx_t *tx, koib_conn_t *conn)
+kibnal_queue_tx_locked (kib_tx_t *tx, kib_conn_t *conn)
{
/* CAVEAT EMPTOR: tx takes caller's ref on conn */
LASSERT (tx->tx_conn == NULL); /* only set here */
tx->tx_conn = conn;
+ tx->tx_deadline = jiffies + kibnal_tunables.kib_io_timeout * HZ;
list_add_tail(&tx->tx_list, &conn->ibc_tx_queue);
}
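/* Editorial sketch (hypothetical helper, assuming the standard jiffies
 * comparison macros): the tx_deadline stamped above is what a timeout
 * sweep would test. */
static inline int
kibnal_tx_timed_out_sketch (kib_tx_t *tx)
{
        return (time_after_eq (jiffies, tx->tx_deadline));
}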
-#define KOIBNAL_SERVICE_KEY_MASK (IB_SA_SERVICE_COMP_MASK_NAME | \
- IB_SA_SERVICE_COMP_MASK_DATA8_1 | \
- IB_SA_SERVICE_COMP_MASK_DATA8_2 | \
- IB_SA_SERVICE_COMP_MASK_DATA8_3 | \
- IB_SA_SERVICE_COMP_MASK_DATA8_4 | \
- IB_SA_SERVICE_COMP_MASK_DATA8_5 | \
- IB_SA_SERVICE_COMP_MASK_DATA8_6 | \
- IB_SA_SERVICE_COMP_MASK_DATA8_7 | \
- IB_SA_SERVICE_COMP_MASK_DATA8_8)
+#define KIBNAL_SERVICE_KEY_MASK (IB_SA_SERVICE_COMP_MASK_NAME | \
+ IB_SA_SERVICE_COMP_MASK_DATA8_1 | \
+ IB_SA_SERVICE_COMP_MASK_DATA8_2 | \
+ IB_SA_SERVICE_COMP_MASK_DATA8_3 | \
+ IB_SA_SERVICE_COMP_MASK_DATA8_4 | \
+ IB_SA_SERVICE_COMP_MASK_DATA8_5 | \
+ IB_SA_SERVICE_COMP_MASK_DATA8_6 | \
+ IB_SA_SERVICE_COMP_MASK_DATA8_7 | \
+ IB_SA_SERVICE_COMP_MASK_DATA8_8)
static inline __u64*
-koibnal_service_nid_field(struct ib_common_attrib_service *srv)
+kibnal_service_nid_field(struct ib_common_attrib_service *srv)
{
- /* must be consistent with KOIBNAL_SERVICE_KEY_MASK */
+ /* must be consistent with KIBNAL_SERVICE_KEY_MASK */
return (__u64 *)srv->service_data8;
}
static inline void
-koibnal_set_service_keys(struct ib_common_attrib_service *srv, ptl_nid_t nid)
+kibnal_set_service_keys(struct ib_common_attrib_service *srv, ptl_nid_t nid)
{
- LASSERT (strlen (OPENIBNAL_SERVICE_NAME) < sizeof(srv->service_name));
+ LASSERT (strlen (IBNAL_SERVICE_NAME) < sizeof(srv->service_name));
memset (srv->service_name, 0, sizeof(srv->service_name));
- strcpy (srv->service_name, OPENIBNAL_SERVICE_NAME);
+ strcpy (srv->service_name, IBNAL_SERVICE_NAME);
- *koibnal_service_nid_field(srv) = cpu_to_le64(nid);
+ *kibnal_service_nid_field(srv) = cpu_to_le64(nid);
}
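/* Editorial sketch, not part of the patch: how the two service helpers
 * combine; the actual SA advertise/query call is assumed context. */
static void
kibnal_fill_service_sketch (struct ib_common_attrib_service *srv)
{
        memset (srv, 0, sizeof (*srv));
        kibnal_set_service_keys (srv, kibnal_data.kib_nid);
        /* pass srv together with KIBNAL_SERVICE_KEY_MASK, so that only
         * the service name and the 8 data8 bytes holding the
         * little-endian NID participate in matching */
}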
#if 0
static inline void
-koibnal_show_rdma_attr (koib_conn_t *conn)
+kibnal_show_rdma_attr (kib_conn_t *conn)
{
struct ib_qp_attribute qp_attr;
int rc;
#if CONFIG_X86
static inline __u64
-koibnal_page2phys (struct page *p)
+kibnal_page2phys (struct page *p)
{
__u64 page_number = p - mem_map;
# error "no page->phys"
#endif
-extern koib_peer_t *koibnal_create_peer (ptl_nid_t nid);
-extern void koibnal_put_peer (koib_peer_t *peer);
-extern int koibnal_del_peer (ptl_nid_t nid, int single_share);
-extern koib_peer_t *koibnal_find_peer_locked (ptl_nid_t nid);
-extern void koibnal_unlink_peer_locked (koib_peer_t *peer);
-extern int koibnal_close_stale_conns_locked (koib_peer_t *peer,
+/* CAVEAT EMPTOR:
+ * We rely on tx/rx descriptor alignment to allow us to use the lowest bit
+ * of the work request id as a flag to determine if the completion is for a
+ * transmit or a receive. It seems that the CQ entry's 'op' field
+ * isn't always set correctly on completions that occur after QP teardown. */
+
+static inline __u64
+kibnal_ptr2wreqid (void *ptr, int isrx)
+{
+ unsigned long lptr = (unsigned long)ptr;
+
+ LASSERT ((lptr & 1) == 0);
+ return (__u64)(lptr | (isrx ? 1 : 0));
+}
+
+static inline void *
+kibnal_wreqid2ptr (__u64 wreqid)
+{
+ return (void *)(((unsigned long)wreqid) & ~1UL);
+}
+
+static inline int
+kibnal_wreqid_is_rx (__u64 wreqid)
+{
+ return (wreqid & 1) != 0;
+}
+
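/* Editorial sketch of the merged completion handler (the real
 * kibnal_callback body is not in this hunk; kibnal_tx_callback is
 * assumed to take the same one-argument form as kibnal_rx_callback): */
void
kibnal_callback_sketch (struct ib_cq *cq, struct ib_cq_entry *e, void *arg)
{
        /* the low bit of the work request id distinguishes rx from tx */
        if (kibnal_wreqid_is_rx (e->work_request_id))
                kibnal_rx_callback (e);
        else
                kibnal_tx_callback (e);
}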
+extern kib_peer_t *kibnal_create_peer (ptl_nid_t nid);
+extern void kibnal_put_peer (kib_peer_t *peer);
+extern int kibnal_del_peer (ptl_nid_t nid, int single_share);
+extern kib_peer_t *kibnal_find_peer_locked (ptl_nid_t nid);
+extern void kibnal_unlink_peer_locked (kib_peer_t *peer);
+extern int kibnal_close_stale_conns_locked (kib_peer_t *peer,
__u64 incarnation);
-extern koib_conn_t *koibnal_create_conn (void);
-extern void koibnal_put_conn (koib_conn_t *conn);
-extern void koibnal_destroy_conn (koib_conn_t *conn);
-extern int koibnal_alloc_pages (koib_pages_t **pp, int npages, int access);
-extern void koibnal_free_pages (koib_pages_t *p);
+extern kib_conn_t *kibnal_create_conn (void);
+extern void kibnal_put_conn (kib_conn_t *conn);
+extern void kibnal_destroy_conn (kib_conn_t *conn);
+extern int kibnal_alloc_pages (kib_pages_t **pp, int npages, int access);
+extern void kibnal_free_pages (kib_pages_t *p);
-extern void koibnal_check_sends (koib_conn_t *conn);
+extern void kibnal_check_sends (kib_conn_t *conn);
extern tTS_IB_CM_CALLBACK_RETURN
-koibnal_conn_callback (tTS_IB_CM_EVENT event, tTS_IB_CM_COMM_ID cid,
+kibnal_conn_callback (tTS_IB_CM_EVENT event, tTS_IB_CM_COMM_ID cid,
void *param, void *arg);
extern tTS_IB_CM_CALLBACK_RETURN
-koibnal_passive_conn_callback (tTS_IB_CM_EVENT event, tTS_IB_CM_COMM_ID cid,
+kibnal_passive_conn_callback (tTS_IB_CM_EVENT event, tTS_IB_CM_COMM_ID cid,
void *param, void *arg);
-extern void koibnal_close_conn_locked (koib_conn_t *conn, int error);
-extern void koibnal_destroy_conn (koib_conn_t *conn);
-extern int koibnal_thread_start (int (*fn)(void *arg), void *arg);
-extern int koibnal_scheduler(void *arg);
-extern int koibnal_connd (void *arg);
-extern void koibnal_rx_callback (struct ib_cq *cq, struct ib_cq_entry *e, void *arg);
-extern void koibnal_tx_callback (struct ib_cq *cq, struct ib_cq_entry *e, void *arg);
-extern void koibnal_init_tx_msg (koib_tx_t *tx, int type, int body_nob);
-extern int koibnal_close_conn (koib_conn_t *conn, int why);
-extern void koibnal_start_active_rdma (int type, int status,
- koib_rx_t *rx, lib_msg_t *libmsg,
- unsigned int niov,
- struct iovec *iov, ptl_kiov_t *kiov,
- size_t offset, size_t nob);
+extern void kibnal_close_conn_locked (kib_conn_t *conn, int error);
+extern void kibnal_destroy_conn (kib_conn_t *conn);
+extern int kibnal_thread_start (int (*fn)(void *arg), void *arg);
+extern int kibnal_scheduler(void *arg);
+extern int kibnal_connd (void *arg);
+extern void kibnal_callback (struct ib_cq *cq, struct ib_cq_entry *e, void *arg);
+extern void kibnal_init_tx_msg (kib_tx_t *tx, int type, int body_nob);
+extern int kibnal_close_conn (kib_conn_t *conn, int why);
+extern void kibnal_start_active_rdma (int type, int status,
+ kib_rx_t *rx, lib_msg_t *libmsg,
+ unsigned int niov,
+ struct iovec *iov, ptl_kiov_t *kiov,
+ size_t offset, size_t nob);
+
*
*/
void
-koibnal_schedule_tx_done (koib_tx_t *tx)
+kibnal_schedule_tx_done (kib_tx_t *tx)
{
unsigned long flags;
- spin_lock_irqsave (&koibnal_data.koib_sched_lock, flags);
+ spin_lock_irqsave (&kibnal_data.kib_sched_lock, flags);
- list_add_tail(&tx->tx_list, &koibnal_data.koib_sched_txq);
- wake_up (&koibnal_data.koib_sched_waitq);
+ list_add_tail(&tx->tx_list, &kibnal_data.kib_sched_txq);
+ wake_up (&kibnal_data.kib_sched_waitq);
- spin_unlock_irqrestore(&koibnal_data.koib_sched_lock, flags);
+ spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags);
}
void
-koibnal_tx_done (koib_tx_t *tx)
+kibnal_tx_done (kib_tx_t *tx)
{
ptl_err_t ptlrc = (tx->tx_status == 0) ? PTL_OK : PTL_FAIL;
unsigned long flags;
int rc;
LASSERT (tx->tx_sending == 0); /* mustn't be awaiting callback */
- LASSERT (!tx->tx_passive_rdma_wait); /* mustn't be on ibc_rdma_queue */
+ LASSERT (!tx->tx_passive_rdma_wait); /* mustn't be awaiting RDMA */
switch (tx->tx_mapped) {
default:
LBUG();
- case KOIB_TX_UNMAPPED:
+ case KIB_TX_UNMAPPED:
break;
- case KOIB_TX_MAPPED:
+ case KIB_TX_MAPPED:
if (in_interrupt()) {
/* can't deregister memory in IRQ context... */
- koibnal_schedule_tx_done(tx);
+ kibnal_schedule_tx_done(tx);
return;
}
rc = ib_memory_deregister(tx->tx_md.md_handle.mr);
LASSERT (rc == 0);
- tx->tx_mapped = KOIB_TX_UNMAPPED;
+ tx->tx_mapped = KIB_TX_UNMAPPED;
break;
-#if OPENIBNAL_FMR
- case KOIB_TX_MAPPED_FMR:
+#if IBNAL_FMR
+ case KIB_TX_MAPPED_FMR:
if (in_interrupt() && tx->tx_status != 0) {
/* can't flush FMRs in IRQ context... */
- koibnal_schedule_tx_done(tx);
+ kibnal_schedule_tx_done(tx);
return;
}
LASSERT (rc == 0);
if (tx->tx_status != 0)
- ib_fmr_pool_force_flush(koibnal_data.koib_fmr_pool);
- tx->tx_mapped = KOIB_TX_UNMAPPED;
+ ib_fmr_pool_force_flush(kibnal_data.kib_fmr_pool);
+ tx->tx_mapped = KIB_TX_UNMAPPED;
break;
#endif
}
if (tx->tx_libmsg[i] == NULL)
continue;
- lib_finalize (&koibnal_lib, NULL, tx->tx_libmsg[i], ptlrc);
+ lib_finalize (&kibnal_lib, NULL, tx->tx_libmsg[i], ptlrc);
tx->tx_libmsg[i] = NULL;
}
if (tx->tx_conn != NULL) {
- koibnal_put_conn (tx->tx_conn);
+ kibnal_put_conn (tx->tx_conn);
tx->tx_conn = NULL;
}
tx->tx_passive_rdma = 0;
tx->tx_status = 0;
- spin_lock_irqsave (&koibnal_data.koib_tx_lock, flags);
+ spin_lock_irqsave (&kibnal_data.kib_tx_lock, flags);
if (tx->tx_isnblk) {
- list_add_tail (&tx->tx_list, &koibnal_data.koib_idle_nblk_txs);
+ list_add_tail (&tx->tx_list, &kibnal_data.kib_idle_nblk_txs);
} else {
- list_add_tail (&tx->tx_list, &koibnal_data.koib_idle_txs);
- wake_up (&koibnal_data.koib_idle_tx_waitq);
+ list_add_tail (&tx->tx_list, &kibnal_data.kib_idle_txs);
+ wake_up (&kibnal_data.kib_idle_tx_waitq);
}
- spin_unlock_irqrestore (&koibnal_data.koib_tx_lock, flags);
+ spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags);
}
-koib_tx_t *
-koibnal_get_idle_tx (int may_block)
+kib_tx_t *
+kibnal_get_idle_tx (int may_block)
{
- unsigned long flags;
- koib_tx_t *tx = NULL;
+ unsigned long flags;
+ kib_tx_t *tx = NULL;
for (;;) {
- spin_lock_irqsave (&koibnal_data.koib_tx_lock, flags);
+ spin_lock_irqsave (&kibnal_data.kib_tx_lock, flags);
/* "normal" descriptor is free */
- if (!list_empty (&koibnal_data.koib_idle_txs)) {
- tx = list_entry (koibnal_data.koib_idle_txs.next,
- koib_tx_t, tx_list);
+ if (!list_empty (&kibnal_data.kib_idle_txs)) {
+ tx = list_entry (kibnal_data.kib_idle_txs.next,
+ kib_tx_t, tx_list);
break;
}
if (!may_block) {
/* may dip into reserve pool */
- if (list_empty (&koibnal_data.koib_idle_nblk_txs)) {
+ if (list_empty (&kibnal_data.kib_idle_nblk_txs)) {
CERROR ("reserved tx desc pool exhausted\n");
break;
}
- tx = list_entry (koibnal_data.koib_idle_nblk_txs.next,
- koib_tx_t, tx_list);
+ tx = list_entry (kibnal_data.kib_idle_nblk_txs.next,
+ kib_tx_t, tx_list);
break;
}
/* block for idle tx */
- spin_unlock_irqrestore (&koibnal_data.koib_tx_lock, flags);
+ spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags);
- wait_event (koibnal_data.koib_idle_tx_waitq,
- !list_empty (&koibnal_data.koib_idle_txs) ||
- koibnal_data.koib_shutdown);
+ wait_event (kibnal_data.kib_idle_tx_waitq,
+ !list_empty (&kibnal_data.kib_idle_txs) ||
+ kibnal_data.kib_shutdown);
}
if (tx != NULL) {
/* Allocate a new passive RDMA completion cookie. It might
* not be needed, but we've got a lock right now and we're
* unlikely to wrap... */
- tx->tx_passive_rdma_cookie = koibnal_data.koib_next_tx_cookie++;
+ tx->tx_passive_rdma_cookie = kibnal_data.kib_next_tx_cookie++;
- LASSERT (tx->tx_mapped == KOIB_TX_UNMAPPED);
+ LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED);
LASSERT (tx->tx_nsp == 0);
LASSERT (tx->tx_sending == 0);
LASSERT (tx->tx_status == 0);
LASSERT (tx->tx_libmsg[1] == NULL);
}
- spin_unlock_irqrestore (&koibnal_data.koib_tx_lock, flags);
+ spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags);
return (tx);
}
int
-koibnal_dist(lib_nal_t *nal, ptl_nid_t nid, unsigned long *dist)
+kibnal_dist(lib_nal_t *nal, ptl_nid_t nid, unsigned long *dist)
{
- /* I would guess that if koibnal_get_peer (nid) == NULL,
+ /* I would guess that if kibnal_get_peer (nid) == NULL,
and we're not routing, then 'nid' is very distant :) */
if ( nal->libnal_ni.ni_pid.nid == nid ) {
*dist = 0;
}
void
-koibnal_complete_passive_rdma(koib_conn_t *conn, __u64 cookie, int status)
+kibnal_complete_passive_rdma(kib_conn_t *conn, __u64 cookie, int status)
{
struct list_head *ttmp;
unsigned long flags;
spin_lock_irqsave (&conn->ibc_lock, flags);
- list_for_each (ttmp, &conn->ibc_rdma_queue) {
- koib_tx_t *tx = list_entry(ttmp, koib_tx_t, tx_list);
-
- LASSERT (tx->tx_passive_rdma);
- LASSERT (tx->tx_passive_rdma_wait);
+ list_for_each (ttmp, &conn->ibc_active_txs) {
+ kib_tx_t *tx = list_entry(ttmp, kib_tx_t, tx_list);
- if (tx->tx_passive_rdma_cookie != cookie)
- continue;
+ LASSERT (tx->tx_passive_rdma ||
+ !tx->tx_passive_rdma_wait);
- CDEBUG(D_NET, "Complete %p "LPD64"\n", tx, cookie);
+ LASSERT (tx->tx_passive_rdma_wait ||
+ tx->tx_sending != 0);
- list_del (&tx->tx_list);
+ if (!tx->tx_passive_rdma_wait ||
+ tx->tx_passive_rdma_cookie != cookie)
+ continue;
+
+ CDEBUG(D_NET, "Complete %p "LPD64": %d\n", tx, cookie, status);
+ tx->tx_status = status;
tx->tx_passive_rdma_wait = 0;
idle = (tx->tx_sending == 0);
- tx->tx_status = status;
+ if (idle)
+ list_del (&tx->tx_list);
spin_unlock_irqrestore (&conn->ibc_lock, flags);
/* I could be racing with tx callbacks. It's whoever
* _makes_ tx idle that frees it */
if (idle)
- koibnal_tx_done (tx);
+ kibnal_tx_done (tx);
return;
}
}
void
-koibnal_post_rx (koib_rx_t *rx, int do_credits)
+kibnal_post_rx (kib_rx_t *rx, int do_credits)
{
- koib_conn_t *conn = rx->rx_conn;
+ kib_conn_t *conn = rx->rx_conn;
int rc;
unsigned long flags;
rx->rx_gl = (struct ib_gather_scatter) {
.address = rx->rx_vaddr,
- .length = OPENIBNAL_MSG_SIZE,
- .key = conn->ibc_rx_pages->oibp_lkey,
+ .length = IBNAL_MSG_SIZE,
+ .key = conn->ibc_rx_pages->ibp_lkey,
};
-
+
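+ /* NB the work request id is tagged as an rx (2nd arg != 0)
+ * so the shared completion handler can tell rx completions
+ * from tx completions. */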
rx->rx_sp = (struct ib_receive_param) {
- .work_request_id = (__u64)(unsigned long)rx,
+ .work_request_id = kibnal_ptr2wreqid(rx, 1),
.scatter_list = &rx->rx_gl,
.num_scatter_entries = 1,
.device_specific = NULL,
.signaled = 1,
};
- LASSERT (conn->ibc_state >= OPENIBNAL_CONN_ESTABLISHED);
+ LASSERT (conn->ibc_state >= IBNAL_CONN_ESTABLISHED);
LASSERT (!rx->rx_posted);
rx->rx_posted = 1;
mb();
- if (conn->ibc_state != OPENIBNAL_CONN_ESTABLISHED)
+ if (conn->ibc_state != IBNAL_CONN_ESTABLISHED)
rc = -ECONNABORTED;
else
rc = ib_receive (conn->ibc_qp, &rx->rx_sp, 1);
conn->ibc_outstanding_credits++;
spin_unlock_irqrestore(&conn->ibc_lock, flags);
- koibnal_check_sends(conn);
+ kibnal_check_sends(conn);
}
return;
}
- if (conn->ibc_state == OPENIBNAL_CONN_ESTABLISHED) {
+ if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) {
CERROR ("Error posting receive -> "LPX64": %d\n",
conn->ibc_peer->ibp_nid, rc);
- koibnal_close_conn (rx->rx_conn, rc);
+ kibnal_close_conn (rx->rx_conn, rc);
} else {
CDEBUG (D_NET, "Error posting receive -> "LPX64": %d\n",
conn->ibc_peer->ibp_nid, rc);
}
/* Drop rx's ref */
- koibnal_put_conn (conn);
+ kibnal_put_conn (conn);
}
-#if OPENIBNAL_CKSUM
-__u32 koibnal_cksum (void *ptr, int nob)
+#if IBNAL_CKSUM
+__u32 kibnal_cksum (void *ptr, int nob)
{
char *c = ptr;
__u32 sum = 0;
#endif
void
-koibnal_rx_callback (struct ib_cq *cq, struct ib_cq_entry *e, void *arg)
+kibnal_rx_callback (struct ib_cq_entry *e)
{
- koib_rx_t *rx = (koib_rx_t *)((unsigned long)e->work_request_id);
- koib_msg_t *msg = rx->rx_msg;
- koib_conn_t *conn = rx->rx_conn;
+ kib_rx_t *rx = (kib_rx_t *)kibnal_wreqid2ptr(e->work_request_id);
+ kib_msg_t *msg = rx->rx_msg;
+ kib_conn_t *conn = rx->rx_conn;
int nob = e->bytes_transferred;
- const int base_nob = offsetof(koib_msg_t, oibm_u);
+ const int base_nob = offsetof(kib_msg_t, ibm_u);
int credits;
int flipped;
unsigned long flags;
-#if OPENIBNAL_CKSUM
+#if IBNAL_CKSUM
__u32 msg_cksum;
__u32 computed_cksum;
#endif
/* receives complete with error in any case after we've started
* closing the QP */
- if (conn->ibc_state >= OPENIBNAL_CONN_DEATHROW)
+ if (conn->ibc_state >= IBNAL_CONN_DEATHROW)
goto failed;
/* We don't post receives until the conn is established */
- LASSERT (conn->ibc_state == OPENIBNAL_CONN_ESTABLISHED);
+ LASSERT (conn->ibc_state == IBNAL_CONN_ESTABLISHED);
if (e->status != IB_COMPLETION_STATUS_SUCCESS) {
CERROR("Rx from "LPX64" failed: %d\n",
/* Receiver does any byte flipping if necessary... */
- if (msg->oibm_magic == OPENIBNAL_MSG_MAGIC) {
+ if (msg->ibm_magic == IBNAL_MSG_MAGIC) {
flipped = 0;
} else {
- if (msg->oibm_magic != __swab32(OPENIBNAL_MSG_MAGIC)) {
+ if (msg->ibm_magic != __swab32(IBNAL_MSG_MAGIC)) {
CERROR ("Unrecognised magic: %08x from "LPX64"\n",
- msg->oibm_magic, conn->ibc_peer->ibp_nid);
+ msg->ibm_magic, conn->ibc_peer->ibp_nid);
goto failed;
}
flipped = 1;
- __swab16s (&msg->oibm_version);
- LASSERT (sizeof(msg->oibm_type) == 1);
- LASSERT (sizeof(msg->oibm_credits) == 1);
+ __swab16s (&msg->ibm_version);
+ LASSERT (sizeof(msg->ibm_type) == 1);
+ LASSERT (sizeof(msg->ibm_credits) == 1);
}
- if (msg->oibm_version != OPENIBNAL_MSG_VERSION) {
+ if (msg->ibm_version != IBNAL_MSG_VERSION) {
CERROR ("Incompatible msg version %d (%d expected)\n",
- msg->oibm_version, OPENIBNAL_MSG_VERSION);
+ msg->ibm_version, IBNAL_MSG_VERSION);
goto failed;
}
-#if OPENIBNAL_CKSUM
- if (nob != msg->oibm_nob) {
- CERROR ("Unexpected # bytes %d (%d expected)\n", nob, msg->oibm_nob);
+#if IBNAL_CKSUM
+ if (nob != msg->ibm_nob) {
+ CERROR ("Unexpected # bytes %d (%d expected)\n", nob, msg->ibm_nob);
goto failed;
}
- msg_cksum = le32_to_cpu(msg->oibm_cksum);
- msg->oibm_cksum = 0;
- computed_cksum = koibnal_cksum (msg, nob);
+ msg_cksum = le32_to_cpu(msg->ibm_cksum);
+ msg->ibm_cksum = 0;
+ computed_cksum = kibnal_cksum (msg, nob);
if (msg_cksum != computed_cksum) {
CERROR ("Checksum failure %d: (%d expected)\n",
#endif
/* Have I received credits that will let me send? */
- credits = msg->oibm_credits;
+ credits = msg->ibm_credits;
if (credits != 0) {
spin_lock_irqsave(&conn->ibc_lock, flags);
conn->ibc_credits += credits;
spin_unlock_irqrestore(&conn->ibc_lock, flags);
- koibnal_check_sends(conn);
+ kibnal_check_sends(conn);
}
- switch (msg->oibm_type) {
- case OPENIBNAL_MSG_NOOP:
- koibnal_post_rx (rx, 1);
+ switch (msg->ibm_type) {
+ case IBNAL_MSG_NOOP:
+ kibnal_post_rx (rx, 1);
return;
- case OPENIBNAL_MSG_IMMEDIATE:
- if (nob < base_nob + sizeof (koib_immediate_msg_t)) {
+ case IBNAL_MSG_IMMEDIATE:
+ if (nob < base_nob + sizeof (kib_immediate_msg_t)) {
CERROR ("Short IMMEDIATE from "LPX64": %d\n",
conn->ibc_peer->ibp_nid, nob);
goto failed;
}
break;
- case OPENIBNAL_MSG_PUT_RDMA:
- case OPENIBNAL_MSG_GET_RDMA:
- if (nob < base_nob + sizeof (koib_rdma_msg_t)) {
+ case IBNAL_MSG_PUT_RDMA:
+ case IBNAL_MSG_GET_RDMA:
+ if (nob < base_nob + sizeof (kib_rdma_msg_t)) {
CERROR ("Short RDMA msg from "LPX64": %d\n",
conn->ibc_peer->ibp_nid, nob);
goto failed;
}
if (flipped) {
- __swab32s(&msg->oibm_u.rdma.oibrm_desc.rd_key);
- __swab32s(&msg->oibm_u.rdma.oibrm_desc.rd_nob);
- __swab64s(&msg->oibm_u.rdma.oibrm_desc.rd_addr);
+ __swab32s(&msg->ibm_u.rdma.ibrm_desc.rd_key);
+ __swab32s(&msg->ibm_u.rdma.ibrm_desc.rd_nob);
+ __swab64s(&msg->ibm_u.rdma.ibrm_desc.rd_addr);
}
CDEBUG(D_NET, "%d RDMA: cookie "LPX64", key %x, addr "LPX64", nob %d\n",
- msg->oibm_type, msg->oibm_u.rdma.oibrm_cookie,
- msg->oibm_u.rdma.oibrm_desc.rd_key,
- msg->oibm_u.rdma.oibrm_desc.rd_addr,
- msg->oibm_u.rdma.oibrm_desc.rd_nob);
+ msg->ibm_type, msg->ibm_u.rdma.ibrm_cookie,
+ msg->ibm_u.rdma.ibrm_desc.rd_key,
+ msg->ibm_u.rdma.ibrm_desc.rd_addr,
+ msg->ibm_u.rdma.ibrm_desc.rd_nob);
break;
- case OPENIBNAL_MSG_PUT_DONE:
- case OPENIBNAL_MSG_GET_DONE:
- if (nob < base_nob + sizeof (koib_completion_msg_t)) {
+ case IBNAL_MSG_PUT_DONE:
+ case IBNAL_MSG_GET_DONE:
+ if (nob < base_nob + sizeof (kib_completion_msg_t)) {
CERROR ("Short COMPLETION msg from "LPX64": %d\n",
conn->ibc_peer->ibp_nid, nob);
goto failed;
}
if (flipped)
- __swab32s(&msg->oibm_u.completion.oibcm_status);
+ __swab32s(&msg->ibm_u.completion.ibcm_status);
CDEBUG(D_NET, "%d DONE: cookie "LPX64", status %d\n",
- msg->oibm_type, msg->oibm_u.completion.oibcm_cookie,
- msg->oibm_u.completion.oibcm_status);
+ msg->ibm_type, msg->ibm_u.completion.ibcm_cookie,
+ msg->ibm_u.completion.ibcm_status);
- koibnal_complete_passive_rdma (conn,
- msg->oibm_u.completion.oibcm_cookie,
- msg->oibm_u.completion.oibcm_status);
- koibnal_post_rx (rx, 1);
+ kibnal_complete_passive_rdma (conn,
+ msg->ibm_u.completion.ibcm_cookie,
+ msg->ibm_u.completion.ibcm_status);
+ kibnal_post_rx (rx, 1);
return;
default:
CERROR ("Can't parse type from "LPX64": %d\n",
- conn->ibc_peer->ibp_nid, msg->oibm_type);
+ conn->ibc_peer->ibp_nid, msg->ibm_type);
goto failed;
}
- /* schedule for koibnal_rx() in thread context */
- spin_lock_irqsave(&koibnal_data.koib_sched_lock, flags);
+ /* schedule for kibnal_rx() in thread context */
+ spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
- list_add_tail (&rx->rx_list, &koibnal_data.koib_sched_rxq);
- wake_up (&koibnal_data.koib_sched_waitq);
+ list_add_tail (&rx->rx_list, &kibnal_data.kib_sched_rxq);
+ wake_up (&kibnal_data.kib_sched_waitq);
- spin_unlock_irqrestore(&koibnal_data.koib_sched_lock, flags);
+ spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags);
return;
failed:
CDEBUG(D_NET, "rx %p conn %p\n", rx, conn);
- koibnal_close_conn(conn, -ECONNABORTED);
+ kibnal_close_conn(conn, -ECONNABORTED);
/* Don't re-post rx & drop its ref on conn */
- koibnal_put_conn(conn);
+ kibnal_put_conn(conn);
}
void
-koibnal_rx (koib_rx_t *rx)
+kibnal_rx (kib_rx_t *rx)
{
- koib_msg_t *msg = rx->rx_msg;
+ kib_msg_t *msg = rx->rx_msg;
/* Clear flag so I can detect if I've sent an RDMA completion */
rx->rx_rdma = 0;
- switch (msg->oibm_type) {
- case OPENIBNAL_MSG_GET_RDMA:
- lib_parse(&koibnal_lib, &msg->oibm_u.rdma.oibrm_hdr, rx);
+ switch (msg->ibm_type) {
+ case IBNAL_MSG_GET_RDMA:
+ lib_parse(&kibnal_lib, &msg->ibm_u.rdma.ibrm_hdr, rx);
/* If the incoming get was matched, I'll have initiated the
* RDMA and the completion message... */
if (rx->rx_rdma)
* the peer's GET blocking for the full timeout. */
CERROR ("Completing unmatched RDMA GET from "LPX64"\n",
rx->rx_conn->ibc_peer->ibp_nid);
- koibnal_start_active_rdma (OPENIBNAL_MSG_GET_DONE, -EIO,
- rx, NULL, 0, NULL, NULL, 0, 0);
+ kibnal_start_active_rdma (IBNAL_MSG_GET_DONE, -EIO,
+ rx, NULL, 0, NULL, NULL, 0, 0);
break;
- case OPENIBNAL_MSG_PUT_RDMA:
- lib_parse(&koibnal_lib, &msg->oibm_u.rdma.oibrm_hdr, rx);
+ case IBNAL_MSG_PUT_RDMA:
+ lib_parse(&kibnal_lib, &msg->ibm_u.rdma.ibrm_hdr, rx);
if (rx->rx_rdma)
break;
/* This is most unusual, since even if lib_parse() didn't
rx->rx_conn->ibc_peer->ibp_nid);
break;
- case OPENIBNAL_MSG_IMMEDIATE:
- lib_parse(&koibnal_lib, &msg->oibm_u.immediate.oibim_hdr, rx);
+ case IBNAL_MSG_IMMEDIATE:
+ lib_parse(&kibnal_lib, &msg->ibm_u.immediate.ibim_hdr, rx);
LASSERT (!rx->rx_rdma);
break;
break;
}
- koibnal_post_rx (rx, 1);
+ kibnal_post_rx (rx, 1);
}
#if 0
int
-koibnal_kvaddr_to_phys (unsigned long vaddr, __u64 *physp)
+kibnal_kvaddr_to_phys (unsigned long vaddr, __u64 *physp)
{
struct page *page;
else if (vaddr >= PKMAP_BASE &&
vaddr < (PKMAP_BASE + LAST_PKMAP * PAGE_SIZE))
page = vmalloc_to_page ((void *)vaddr);
- /* in 2.4 ^ just walks the page tables */
+ /* in 2.4 ^ just walks the page tables */
#endif
else
page = virt_to_page (vaddr);
!VALID_PAGE (page))
return (-EFAULT);
- *physp = koibnal_page2phys(page) + (vaddr & (PAGE_SIZE - 1));
+ *physp = kibnal_page2phys(page) + (vaddr & (PAGE_SIZE - 1));
return (0);
}
#endif
int
-koibnal_map_iov (koib_tx_t *tx, enum ib_memory_access access,
+kibnal_map_iov (kib_tx_t *tx, enum ib_memory_access access,
int niov, struct iovec *iov, int offset, int nob)
{
LASSERT (nob > 0);
LASSERT (niov > 0);
- LASSERT (tx->tx_mapped == KOIB_TX_UNMAPPED);
+ LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED);
while (offset >= iov->iov_len) {
offset -= iov->iov_len;
vaddr = (void *)(((unsigned long)iov->iov_base) + offset);
tx->tx_md.md_addr = (__u64)((unsigned long)vaddr);
- rc = ib_memory_register (koibnal_data.koib_pd,
+ rc = ib_memory_register (kibnal_data.kib_pd,
vaddr, nob,
access,
&tx->tx_md.md_handle.mr,
return (rc);
}
- tx->tx_mapped = KOIB_TX_MAPPED;
+ tx->tx_mapped = KIB_TX_MAPPED;
return (0);
}
int
-koibnal_map_kiov (koib_tx_t *tx, enum ib_memory_access access,
+kibnal_map_kiov (kib_tx_t *tx, enum ib_memory_access access,
int nkiov, ptl_kiov_t *kiov,
int offset, int nob)
{
-#if OPENIBNAL_FMR
+#if IBNAL_FMR
__u64 *phys;
- const int mapped = KOIB_TX_MAPPED_FMR;
+ const int mapped = KIB_TX_MAPPED_FMR;
#else
struct ib_physical_buffer *phys;
- const int mapped = KOIB_TX_MAPPED;
+ const int mapped = KIB_TX_MAPPED;
#endif
int page_offset;
int nphys;
LASSERT (nob > 0);
LASSERT (nkiov > 0);
- LASSERT (tx->tx_mapped == KOIB_TX_UNMAPPED);
+ LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED);
while (offset >= kiov->kiov_len) {
offset -= kiov->kiov_len;
}
page_offset = kiov->kiov_offset + offset;
-#if OPENIBNAL_FMR
- phys[0] = koibnal_page2phys(kiov->kiov_page);
+#if IBNAL_FMR
+ phys[0] = kibnal_page2phys(kiov->kiov_page);
#else
- phys[0].address = koibnal_page2phys(kiov->kiov_page);
+ phys[0].address = kibnal_page2phys(kiov->kiov_page);
phys[0].size = PAGE_SIZE;
#endif
nphys = 1;
}
LASSERT (nphys * sizeof (*phys) < phys_size);
-#if OPENIBNAL_FMR
- phys[nphys] = koibnal_page2phys(kiov->kiov_page);
+#if IBNAL_FMR
+ phys[nphys] = kibnal_page2phys(kiov->kiov_page);
#else
- phys[nphys].address = koibnal_page2phys(kiov->kiov_page);
+ phys[nphys].address = kibnal_page2phys(kiov->kiov_page);
phys[nphys].size = PAGE_SIZE;
#endif
nphys++;
for (rc = 0; rc < nphys; rc++)
CWARN (" [%d] "LPX64" / %d\n", rc, phys[rc].address, phys[rc].size);
#endif
- tx->tx_md.md_addr = OPENIBNAL_RDMA_BASE;
+ tx->tx_md.md_addr = IBNAL_RDMA_BASE;
-#if OPENIBNAL_FMR
- rc = ib_fmr_register_physical (koibnal_data.koib_fmr_pool,
+#if IBNAL_FMR
+ rc = ib_fmr_register_physical (kibnal_data.kib_fmr_pool,
phys, nphys,
&tx->tx_md.md_addr,
page_offset,
&tx->tx_md.md_lkey,
&tx->tx_md.md_rkey);
#else
- rc = ib_memory_register_physical (koibnal_data.koib_pd,
+ rc = ib_memory_register_physical (kibnal_data.kib_pd,
phys, nphys,
&tx->tx_md.md_addr,
nob, page_offset,
return (rc);
}
-koib_conn_t *
-koibnal_find_conn_locked (koib_peer_t *peer)
+kib_conn_t *
+kibnal_find_conn_locked (kib_peer_t *peer)
{
struct list_head *tmp;
/* just return the first connection */
list_for_each (tmp, &peer->ibp_conns) {
- return (list_entry(tmp, koib_conn_t, ibc_list));
+ return (list_entry(tmp, kib_conn_t, ibc_list));
}
return (NULL);
}
void
-koibnal_check_sends (koib_conn_t *conn)
+kibnal_check_sends (kib_conn_t *conn)
{
unsigned long flags;
- koib_tx_t *tx;
+ kib_tx_t *tx;
int rc;
int i;
int done;
spin_lock_irqsave (&conn->ibc_lock, flags);
+ LASSERT (conn->ibc_nsends_posted <= IBNAL_MSG_QUEUE_SIZE);
+
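+ /* Nothing queued to piggyback returned credits on? Send
+ * the peer an explicit NOOP once the credits I owe reach
+ * the high water mark. */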
if (list_empty(&conn->ibc_tx_queue) &&
- conn->ibc_outstanding_credits >= OPENIBNAL_CREDIT_HIGHWATER) {
+ conn->ibc_outstanding_credits >= IBNAL_CREDIT_HIGHWATER) {
spin_unlock_irqrestore(&conn->ibc_lock, flags);
-
- tx = koibnal_get_idle_tx(0); /* don't block */
+
+ tx = kibnal_get_idle_tx(0); /* don't block */
if (tx != NULL)
- koibnal_init_tx_msg(tx, OPENIBNAL_MSG_NOOP, 0);
+ kibnal_init_tx_msg(tx, IBNAL_MSG_NOOP, 0);
spin_lock_irqsave(&conn->ibc_lock, flags);
-
+
if (tx != NULL) {
atomic_inc(&conn->ibc_refcount);
- koibnal_queue_tx_locked(tx, conn);
+ kibnal_queue_tx_locked(tx, conn);
}
}
- LASSERT (conn->ibc_nsends_posted <= OPENIBNAL_MSG_QUEUE_SIZE);
-
while (!list_empty (&conn->ibc_tx_queue)) {
- tx = list_entry (conn->ibc_tx_queue.next, koib_tx_t, tx_list);
+ tx = list_entry (conn->ibc_tx_queue.next, kib_tx_t, tx_list);
/* We rely on this for QP sizing */
LASSERT (tx->tx_nsp > 0 && tx->tx_nsp <= 2);
LASSERT (conn->ibc_outstanding_credits >= 0);
- LASSERT (conn->ibc_outstanding_credits <= OPENIBNAL_MSG_QUEUE_SIZE);
+ LASSERT (conn->ibc_outstanding_credits <= IBNAL_MSG_QUEUE_SIZE);
LASSERT (conn->ibc_credits >= 0);
- LASSERT (conn->ibc_credits <= OPENIBNAL_MSG_QUEUE_SIZE);
+ LASSERT (conn->ibc_credits <= IBNAL_MSG_QUEUE_SIZE);
- /* Not on ibc_rdma_queue */
+ /* Not on ibc_active_txs yet */
LASSERT (!tx->tx_passive_rdma_wait);
- if (conn->ibc_nsends_posted == OPENIBNAL_MSG_QUEUE_SIZE)
+ if (conn->ibc_nsends_posted == IBNAL_MSG_QUEUE_SIZE)
break;
if (conn->ibc_credits == 0) /* no credits */
list_del (&tx->tx_list);
- if (tx->tx_msg->oibm_type == OPENIBNAL_MSG_NOOP &&
+ if (tx->tx_msg->ibm_type == IBNAL_MSG_NOOP &&
(!list_empty(&conn->ibc_tx_queue) ||
- conn->ibc_outstanding_credits < OPENIBNAL_CREDIT_HIGHWATER)) {
- /* Redundant NOOP */
+ conn->ibc_outstanding_credits < IBNAL_CREDIT_HIGHWATER)) {
+ /* redundant NOOP */
spin_unlock_irqrestore(&conn->ibc_lock, flags);
- koibnal_tx_done(tx);
+ kibnal_tx_done(tx);
spin_lock_irqsave(&conn->ibc_lock, flags);
continue;
}
-
- /* incoming RDMA completion can find this one now */
- if (tx->tx_passive_rdma) {
- list_add (&tx->tx_list, &conn->ibc_rdma_queue);
- tx->tx_passive_rdma_wait = 1;
- tx->tx_passive_rdma_deadline =
- jiffies + koibnal_tunables.koib_io_timeout * HZ;
- }
- tx->tx_msg->oibm_credits = conn->ibc_outstanding_credits;
+ tx->tx_msg->ibm_credits = conn->ibc_outstanding_credits;
conn->ibc_outstanding_credits = 0;
- /* use the free memory barrier when we unlock to ensure
- * sending set before we can get the tx callback. */
conn->ibc_nsends_posted++;
conn->ibc_credits--;
- tx->tx_sending = tx->tx_nsp;
-#if OPENIBNAL_CKSUM
- tx->tx_msg->oibm_cksum = 0;
- tx->tx_msg->oibm_cksum = koibnal_cksum(tx->tx_msg, tx->tx_msg->oibm_nob);
- CDEBUG(D_NET, "cksum %x, nob %d\n", tx->tx_msg->oibm_cksum, tx->tx_msg->oibm_nob);
+ tx->tx_sending = tx->tx_nsp;
+ tx->tx_passive_rdma_wait = tx->tx_passive_rdma;
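+ /* NB the tx stays on ibc_active_txs until it's idle: all
+ * sends completed and, for a passive RDMA, the peer's
+ * completion message received. Whoever makes it idle
+ * removes it from the list and frees it. */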
+ list_add (&tx->tx_list, &conn->ibc_active_txs);
+#if IBNAL_CKSUM
+ tx->tx_msg->ibm_cksum = 0;
+ tx->tx_msg->ibm_cksum = kibnal_cksum(tx->tx_msg, tx->tx_msg->ibm_nob);
+ CDEBUG(D_NET, "cksum %x, nob %d\n", tx->tx_msg->ibm_cksum, tx->tx_msg->ibm_nob);
#endif
spin_unlock_irqrestore (&conn->ibc_lock, flags);
rc = -ECONNABORTED;
nwork = 0;
- if (conn->ibc_state == OPENIBNAL_CONN_ESTABLISHED) {
+ if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) {
tx->tx_status = 0;
/* Driver only accepts 1 item at a time */
for (i = 0; i < tx->tx_nsp; i++) {
if (rc != 0) {
/* NB credits are transferred in the actual
* message, which can only be the last work item */
- conn->ibc_outstanding_credits += tx->tx_msg->oibm_credits;
+ conn->ibc_outstanding_credits += tx->tx_msg->ibm_credits;
conn->ibc_credits++;
conn->ibc_nsends_posted--;
- tx->tx_sending -= tx->tx_nsp - nwork;
+
tx->tx_status = rc;
+ tx->tx_passive_rdma_wait = 0;
+ tx->tx_sending -= tx->tx_nsp - nwork;
+
done = (tx->tx_sending == 0);
-
- if (tx->tx_passive_rdma) {
- tx->tx_passive_rdma_wait = 0;
+ if (done)
list_del (&tx->tx_list);
- }
spin_unlock_irqrestore (&conn->ibc_lock, flags);
- if (conn->ibc_state == OPENIBNAL_CONN_ESTABLISHED)
+ if (conn->ibc_state == IBNAL_CONN_ESTABLISHED)
CERROR ("Error %d posting transmit to "LPX64"\n",
rc, conn->ibc_peer->ibp_nid);
else
CDEBUG (D_NET, "Error %d posting transmit to "
LPX64"\n", rc, conn->ibc_peer->ibp_nid);
- koibnal_close_conn (conn, rc);
+ kibnal_close_conn (conn, rc);
if (done)
- koibnal_tx_done (tx);
+ kibnal_tx_done (tx);
return;
}
}
void
-koibnal_tx_callback (struct ib_cq *cq, struct ib_cq_entry *e, void *arg)
+kibnal_tx_callback (struct ib_cq_entry *e)
{
- koib_tx_t *tx = (koib_tx_t *)((unsigned long)e->work_request_id);
- koib_conn_t *conn;
+ kib_tx_t *tx = (kib_tx_t *)kibnal_wreqid2ptr(e->work_request_id);
+ kib_conn_t *conn;
unsigned long flags;
int idle;
tx->tx_sending--;
idle = (tx->tx_sending == 0) && /* This is the final callback */
(!tx->tx_passive_rdma_wait); /* Not waiting for RDMA completion */
+ if (idle)
+ list_del(&tx->tx_list);
CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
spin_unlock_irqrestore(&conn->ibc_lock, flags);
if (idle)
- koibnal_tx_done (tx);
+ kibnal_tx_done (tx);
if (e->status != IB_COMPLETION_STATUS_SUCCESS) {
CERROR ("Tx completion to "LPX64" failed: %d\n",
conn->ibc_peer->ibp_nid, e->status);
- koibnal_close_conn (conn, -ENETDOWN);
+ kibnal_close_conn (conn, -ENETDOWN);
} else {
/* can I shovel some more sends out the door? */
- koibnal_check_sends(conn);
+ kibnal_check_sends(conn);
}
- koibnal_put_conn (conn);
+ kibnal_put_conn (conn);
}
void
-koibnal_init_tx_msg (koib_tx_t *tx, int type, int body_nob)
+kibnal_callback (struct ib_cq *cq, struct ib_cq_entry *e, void *arg)
+{
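+ /* rx and tx completions share one callback; the flag that
+ * kibnal_ptr2wreqid() folded into the work request id says
+ * which kind of descriptor this is. */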
+ if (kibnal_wreqid_is_rx(e->work_request_id))
+ kibnal_rx_callback (e);
+ else
+ kibnal_tx_callback (e);
+}
+
+void
+kibnal_init_tx_msg (kib_tx_t *tx, int type, int body_nob)
{
struct ib_gather_scatter *gl = &tx->tx_gl[tx->tx_nsp];
struct ib_send_param *sp = &tx->tx_sp[tx->tx_nsp];
int fence;
- int nob = offsetof (koib_msg_t, oibm_u) + body_nob;
+ int nob = offsetof (kib_msg_t, ibm_u) + body_nob;
LASSERT (tx->tx_nsp >= 0 &&
tx->tx_nsp < sizeof(tx->tx_sp)/sizeof(tx->tx_sp[0]));
- LASSERT (nob <= OPENIBNAL_MSG_SIZE);
+ LASSERT (nob <= IBNAL_MSG_SIZE);
- tx->tx_msg->oibm_magic = OPENIBNAL_MSG_MAGIC;
- tx->tx_msg->oibm_version = OPENIBNAL_MSG_VERSION;
- tx->tx_msg->oibm_type = type;
-#if OPENIBNAL_CKSUM
- tx->tx_msg->oibm_nob = nob;
+ tx->tx_msg->ibm_magic = IBNAL_MSG_MAGIC;
+ tx->tx_msg->ibm_version = IBNAL_MSG_VERSION;
+ tx->tx_msg->ibm_type = type;
+#if IBNAL_CKSUM
+ tx->tx_msg->ibm_nob = nob;
#endif
/* Fence the message if it's bundled with an RDMA read */
fence = (tx->tx_nsp > 0) &&
- (type == OPENIBNAL_MSG_PUT_DONE);
+ (type == IBNAL_MSG_PUT_DONE);
*gl = (struct ib_gather_scatter) {
.address = tx->tx_vaddr,
.length = nob,
- .key = koibnal_data.koib_tx_pages->oibp_lkey,
+ .key = kibnal_data.kib_tx_pages->ibp_lkey,
};
/* NB If this is an RDMA read, the completion message must wait for
* the RDMA to complete. Sends wait for previous RDMA writes
* anyway... */
*sp = (struct ib_send_param) {
- .work_request_id = (__u64)((unsigned long)tx),
+ .work_request_id = kibnal_ptr2wreqid(tx, 0),
.op = IB_OP_SEND,
.gather_list = gl,
.num_gather_entries = 1,
}
void
-koibnal_queue_tx (koib_tx_t *tx, koib_conn_t *conn)
+kibnal_queue_tx (kib_tx_t *tx, kib_conn_t *conn)
{
unsigned long flags;
spin_lock_irqsave(&conn->ibc_lock, flags);
- koibnal_queue_tx_locked (tx, conn);
+ kibnal_queue_tx_locked (tx, conn);
spin_unlock_irqrestore(&conn->ibc_lock, flags);
- koibnal_check_sends(conn);
+ kibnal_check_sends(conn);
}
void
-koibnal_launch_tx (koib_tx_t *tx, ptl_nid_t nid)
+kibnal_launch_tx (kib_tx_t *tx, ptl_nid_t nid)
{
unsigned long flags;
- koib_peer_t *peer;
- koib_conn_t *conn;
- rwlock_t *g_lock = &koibnal_data.koib_global_lock;
+ kib_peer_t *peer;
+ kib_conn_t *conn;
+ rwlock_t *g_lock = &kibnal_data.kib_global_lock;
/* If I get here, I've committed to send, so I complete the tx with
* failure on any problems */
read_lock (g_lock);
- peer = koibnal_find_peer_locked (nid);
+ peer = kibnal_find_peer_locked (nid);
if (peer == NULL) {
read_unlock (g_lock);
tx->tx_status = -EHOSTUNREACH;
- koibnal_tx_done (tx);
+ kibnal_tx_done (tx);
return;
}
- conn = koibnal_find_conn_locked (peer);
+ conn = kibnal_find_conn_locked (peer);
if (conn != NULL) {
CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
atomic_inc (&conn->ibc_refcount); /* 1 ref for the tx */
read_unlock (g_lock);
- koibnal_queue_tx (tx, conn);
+ kibnal_queue_tx (tx, conn);
return;
}
read_unlock (g_lock);
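+ /* No connection yet; retake the search under the write lock
+ * so I can atomically schedule a connection attempt if it's
+ * still missing. */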
write_lock_irqsave (g_lock, flags);
- peer = koibnal_find_peer_locked (nid);
+ peer = kibnal_find_peer_locked (nid);
if (peer == NULL) {
write_unlock_irqrestore (g_lock, flags);
tx->tx_status = -EHOSTUNREACH;
- koibnal_tx_done (tx);
+ kibnal_tx_done (tx);
return;
}
- conn = koibnal_find_conn_locked (peer);
+ conn = kibnal_find_conn_locked (peer);
if (conn != NULL) {
/* Connection exists; queue message on it */
CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
atomic_inc (&conn->ibc_refcount); /* 1 ref for the tx */
write_unlock_irqrestore (g_lock, flags);
- koibnal_queue_tx (tx, conn);
+ kibnal_queue_tx (tx, conn);
return;
}
if (!time_after_eq(jiffies, peer->ibp_reconnect_time)) {
write_unlock_irqrestore (g_lock, flags);
tx->tx_status = -EHOSTUNREACH;
- koibnal_tx_done (tx);
+ kibnal_tx_done (tx);
return;
}
peer->ibp_connecting = 1;
atomic_inc (&peer->ibp_refcount); /* extra ref for connd */
- spin_lock (&koibnal_data.koib_connd_lock);
+ spin_lock (&kibnal_data.kib_connd_lock);
list_add_tail (&peer->ibp_connd_list,
- &koibnal_data.koib_connd_peers);
- wake_up (&koibnal_data.koib_connd_waitq);
+ &kibnal_data.kib_connd_peers);
+ wake_up (&kibnal_data.kib_connd_waitq);
- spin_unlock (&koibnal_data.koib_connd_lock);
+ spin_unlock (&kibnal_data.kib_connd_lock);
}
/* A connection is being established; queue the message... */
}
ptl_err_t
-koibnal_start_passive_rdma (int type, ptl_nid_t nid,
+kibnal_start_passive_rdma (int type, ptl_nid_t nid,
lib_msg_t *libmsg, ptl_hdr_t *hdr)
{
int nob = libmsg->md->length;
- koib_tx_t *tx;
- koib_msg_t *oibmsg;
+ kib_tx_t *tx;
+ kib_msg_t *ibmsg;
int rc;
int access;
- LASSERT (type == OPENIBNAL_MSG_PUT_RDMA ||
- type == OPENIBNAL_MSG_GET_RDMA);
+ LASSERT (type == IBNAL_MSG_PUT_RDMA ||
+ type == IBNAL_MSG_GET_RDMA);
LASSERT (nob > 0);
LASSERT (!in_interrupt()); /* Mapping could block */
- if (type == OPENIBNAL_MSG_PUT_RDMA) {
+ if (type == IBNAL_MSG_PUT_RDMA) {
access = IB_ACCESS_REMOTE_READ;
} else {
access = IB_ACCESS_REMOTE_WRITE |
IB_ACCESS_LOCAL_WRITE;
}
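+ /* For a PUT the peer RDMA-reads the payload out of my
+ * buffer; for a GET the peer RDMA-writes the reply into it,
+ * which needs local as well as remote write access. */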
- tx = koibnal_get_idle_tx (1); /* May block; caller is an app thread */
+ tx = kibnal_get_idle_tx (1); /* May block; caller is an app thread */
LASSERT (tx != NULL);
if ((libmsg->md->options & PTL_MD_KIOV) == 0)
- rc = koibnal_map_iov (tx, access,
- libmsg->md->md_niov,
- libmsg->md->md_iov.iov,
- 0, nob);
+ rc = kibnal_map_iov (tx, access,
+ libmsg->md->md_niov,
+ libmsg->md->md_iov.iov,
+ 0, nob);
else
- rc = koibnal_map_kiov (tx, access,
- libmsg->md->md_niov,
- libmsg->md->md_iov.kiov,
- 0, nob);
+ rc = kibnal_map_kiov (tx, access,
+ libmsg->md->md_niov,
+ libmsg->md->md_iov.kiov,
+ 0, nob);
if (rc != 0) {
CERROR ("Can't map RDMA for "LPX64": %d\n", nid, rc);
goto failed;
}
- if (type == OPENIBNAL_MSG_GET_RDMA) {
+ if (type == IBNAL_MSG_GET_RDMA) {
/* reply gets finalized when tx completes */
- tx->tx_libmsg[1] = lib_create_reply_msg(&koibnal_lib,
+ tx->tx_libmsg[1] = lib_create_reply_msg(&kibnal_lib,
nid, libmsg);
if (tx->tx_libmsg[1] == NULL) {
CERROR ("Can't create reply for GET -> "LPX64"\n",
tx->tx_passive_rdma = 1;
- oibmsg = tx->tx_msg;
+ ibmsg = tx->tx_msg;
- oibmsg->oibm_u.rdma.oibrm_hdr = *hdr;
- oibmsg->oibm_u.rdma.oibrm_cookie = tx->tx_passive_rdma_cookie;
- oibmsg->oibm_u.rdma.oibrm_desc.rd_key = tx->tx_md.md_rkey;
- oibmsg->oibm_u.rdma.oibrm_desc.rd_addr = tx->tx_md.md_addr;
- oibmsg->oibm_u.rdma.oibrm_desc.rd_nob = nob;
+ ibmsg->ibm_u.rdma.ibrm_hdr = *hdr;
+ ibmsg->ibm_u.rdma.ibrm_cookie = tx->tx_passive_rdma_cookie;
+ ibmsg->ibm_u.rdma.ibrm_desc.rd_key = tx->tx_md.md_rkey;
+ ibmsg->ibm_u.rdma.ibrm_desc.rd_addr = tx->tx_md.md_addr;
+ ibmsg->ibm_u.rdma.ibrm_desc.rd_nob = nob;
- koibnal_init_tx_msg (tx, type, sizeof (koib_rdma_msg_t));
+ kibnal_init_tx_msg (tx, type, sizeof (kib_rdma_msg_t));
CDEBUG(D_NET, "Passive: %p cookie "LPX64", key %x, addr "
LPX64", nob %d\n",
/* libmsg gets finalized when tx completes. */
tx->tx_libmsg[0] = libmsg;
- koibnal_launch_tx(tx, nid);
+ kibnal_launch_tx(tx, nid);
return (PTL_OK);
failed:
tx->tx_status = rc;
- koibnal_tx_done (tx);
+ kibnal_tx_done (tx);
return (PTL_FAIL);
}
void
-koibnal_start_active_rdma (int type, int status,
- koib_rx_t *rx, lib_msg_t *libmsg,
+kibnal_start_active_rdma (int type, int status,
+ kib_rx_t *rx, lib_msg_t *libmsg,
unsigned int niov,
struct iovec *iov, ptl_kiov_t *kiov,
size_t offset, size_t nob)
{
- koib_msg_t *rxmsg = rx->rx_msg;
- koib_msg_t *txmsg;
- koib_tx_t *tx;
+ kib_msg_t *rxmsg = rx->rx_msg;
+ kib_msg_t *txmsg;
+ kib_tx_t *tx;
int access;
int rdma_op;
int rc;
/* No data if we're completing with failure */
LASSERT (status == 0 || nob == 0);
- LASSERT (type == OPENIBNAL_MSG_GET_DONE ||
- type == OPENIBNAL_MSG_PUT_DONE);
+ LASSERT (type == IBNAL_MSG_GET_DONE ||
+ type == IBNAL_MSG_PUT_DONE);
/* Flag I'm completing the RDMA. Even if I fail to send the
* completion message, I will have tried my best so further
LASSERT (!rx->rx_rdma);
rx->rx_rdma = 1;
- if (type == OPENIBNAL_MSG_GET_DONE) {
+ if (type == IBNAL_MSG_GET_DONE) {
access = 0;
rdma_op = IB_OP_RDMA_WRITE;
- LASSERT (rxmsg->oibm_type == OPENIBNAL_MSG_GET_RDMA);
+ LASSERT (rxmsg->ibm_type == IBNAL_MSG_GET_RDMA);
} else {
access = IB_ACCESS_LOCAL_WRITE;
rdma_op = IB_OP_RDMA_READ;
- LASSERT (rxmsg->oibm_type == OPENIBNAL_MSG_PUT_RDMA);
+ LASSERT (rxmsg->ibm_type == IBNAL_MSG_PUT_RDMA);
}
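+ /* Completing a GET means RDMA-writing the data into the
+ * peer's advertised buffer; completing a PUT means
+ * RDMA-reading it from there. */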
- tx = koibnal_get_idle_tx (0); /* Mustn't block */
+ tx = kibnal_get_idle_tx (0); /* Mustn't block */
if (tx == NULL) {
CERROR ("tx descs exhausted on RDMA from "LPX64
" completing locally with failure\n",
- rx->rx_conn->ibc_peer->ibp_nid);
- lib_finalize (&koibnal_lib, NULL, libmsg, PTL_NO_SPACE);
+ rx->rx_conn->ibc_peer->ibp_nid);
+ lib_finalize (&kibnal_lib, NULL, libmsg, PTL_NO_SPACE);
return;
}
LASSERT (tx->tx_nsp == 0);
* message is matched) */
if (kiov != NULL)
- rc = koibnal_map_kiov (tx, access,
- niov, kiov, offset, nob);
+ rc = kibnal_map_kiov (tx, access,
+ niov, kiov, offset, nob);
else
- rc = koibnal_map_iov (tx, access,
- niov, iov, offset, nob);
+ rc = kibnal_map_iov (tx, access,
+ niov, iov, offset, nob);
if (rc != 0) {
CERROR ("Can't map RDMA -> "LPX64": %d\n",
};
tx->tx_sp[0] = (struct ib_send_param) {
- .work_request_id = (__u64)((unsigned long)tx),
+ .work_request_id = kibnal_ptr2wreqid(tx, 0),
.op = rdma_op,
.gather_list = &tx->tx_gl[0],
.num_gather_entries = 1,
- .remote_address = rxmsg->oibm_u.rdma.oibrm_desc.rd_addr,
- .rkey = rxmsg->oibm_u.rdma.oibrm_desc.rd_key,
+ .remote_address = rxmsg->ibm_u.rdma.ibrm_desc.rd_addr,
+ .rkey = rxmsg->ibm_u.rdma.ibrm_desc.rd_key,
.device_specific = NULL,
.solicited_event = 0,
.signaled = 1,
txmsg = tx->tx_msg;
- txmsg->oibm_u.completion.oibcm_cookie = rxmsg->oibm_u.rdma.oibrm_cookie;
- txmsg->oibm_u.completion.oibcm_status = status;
+ txmsg->ibm_u.completion.ibcm_cookie = rxmsg->ibm_u.rdma.ibrm_cookie;
+ txmsg->ibm_u.completion.ibcm_status = status;
- koibnal_init_tx_msg(tx, type, sizeof (koib_completion_msg_t));
+ kibnal_init_tx_msg(tx, type, sizeof (kib_completion_msg_t));
if (status == 0 && nob != 0) {
LASSERT (tx->tx_nsp > 1);
LASSERT (tx->tx_nsp == 1);
/* No RDMA: local completion happens now! */
CDEBUG(D_WARNING,"No data: immediate completion\n");
- lib_finalize (&koibnal_lib, NULL, libmsg,
+ lib_finalize (&kibnal_lib, NULL, libmsg,
status == 0 ? PTL_OK : PTL_FAIL);
}
atomic_read (&rx->rx_conn->ibc_refcount));
atomic_inc (&rx->rx_conn->ibc_refcount);
/* ...and queue it up */
- koibnal_queue_tx(tx, rx->rx_conn);
+ kibnal_queue_tx(tx, rx->rx_conn);
}
ptl_err_t
-koibnal_sendmsg(lib_nal_t *nal,
+kibnal_sendmsg(lib_nal_t *nal,
void *private,
lib_msg_t *libmsg,
ptl_hdr_t *hdr,
size_t payload_offset,
size_t payload_nob)
{
- koib_msg_t *oibmsg;
- koib_tx_t *tx;
+ kib_msg_t *ibmsg;
+ kib_tx_t *tx;
int nob;
/* NB 'private' is different depending on what we're sending.... */
case PTL_MSG_REPLY: {
/* reply's 'private' is the incoming receive */
- koib_rx_t *rx = private;
+ kib_rx_t *rx = private;
/* RDMA reply expected? */
- if (rx->rx_msg->oibm_type == OPENIBNAL_MSG_GET_RDMA) {
- koibnal_start_active_rdma(OPENIBNAL_MSG_GET_DONE, 0,
- rx, libmsg, payload_niov,
- payload_iov, payload_kiov,
- payload_offset, payload_nob);
+ if (rx->rx_msg->ibm_type == IBNAL_MSG_GET_RDMA) {
+ kibnal_start_active_rdma(IBNAL_MSG_GET_DONE, 0,
+ rx, libmsg, payload_niov,
+ payload_iov, payload_kiov,
+ payload_offset, payload_nob);
return (PTL_OK);
}
/* Incoming message consistent with immediate reply? */
- if (rx->rx_msg->oibm_type != OPENIBNAL_MSG_IMMEDIATE) {
+ if (rx->rx_msg->ibm_type != IBNAL_MSG_IMMEDIATE) {
CERROR ("REPLY to "LPX64" bad opbm type %d!!!\n",
- nid, rx->rx_msg->oibm_type);
+ nid, rx->rx_msg->ibm_type);
return (PTL_FAIL);
}
/* Will it fit in a message? */
- nob = offsetof(koib_msg_t, oibm_u.immediate.oibim_payload[payload_nob]);
- if (nob >= OPENIBNAL_MSG_SIZE) {
+ nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]);
+ if (nob >= IBNAL_MSG_SIZE) {
CERROR("REPLY for "LPX64" too big (RDMA not requested): %d\n",
nid, payload_nob);
return (PTL_FAIL);
case PTL_MSG_GET:
/* might the REPLY message be big enough to need RDMA? */
- nob = offsetof(koib_msg_t, oibm_u.immediate.oibim_payload[libmsg->md->length]);
- if (nob > OPENIBNAL_MSG_SIZE)
- return (koibnal_start_passive_rdma(OPENIBNAL_MSG_GET_RDMA,
- nid, libmsg, hdr));
+ nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[libmsg->md->length]);
+ if (nob > IBNAL_MSG_SIZE)
+ return (kibnal_start_passive_rdma(IBNAL_MSG_GET_RDMA,
+ nid, libmsg, hdr));
break;
case PTL_MSG_ACK:
case PTL_MSG_PUT:
/* Is the payload big enough to need RDMA? */
- nob = offsetof(koib_msg_t, oibm_u.immediate.oibim_payload[payload_nob]);
- if (nob > OPENIBNAL_MSG_SIZE)
- return (koibnal_start_passive_rdma(OPENIBNAL_MSG_PUT_RDMA,
- nid, libmsg, hdr));
+ nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]);
+ if (nob > IBNAL_MSG_SIZE)
+ return (kibnal_start_passive_rdma(IBNAL_MSG_PUT_RDMA,
+ nid, libmsg, hdr));
break;
}
- tx = koibnal_get_idle_tx(!(type == PTL_MSG_ACK ||
- type == PTL_MSG_REPLY ||
- in_interrupt()));
+ tx = kibnal_get_idle_tx(!(type == PTL_MSG_ACK ||
+ type == PTL_MSG_REPLY ||
+ in_interrupt()));
if (tx == NULL) {
CERROR ("Can't send %d to "LPX64": tx descs exhausted%s\n",
type, nid, in_interrupt() ? " (intr)" : "");
return (PTL_NO_SPACE);
}
- oibmsg = tx->tx_msg;
- oibmsg->oibm_u.immediate.oibim_hdr = *hdr;
+ ibmsg = tx->tx_msg;
+ ibmsg->ibm_u.immediate.ibim_hdr = *hdr;
if (payload_nob > 0) {
if (payload_kiov != NULL)
- lib_copy_kiov2buf(oibmsg->oibm_u.immediate.oibim_payload,
+ lib_copy_kiov2buf(ibmsg->ibm_u.immediate.ibim_payload,
payload_niov, payload_kiov,
payload_offset, payload_nob);
else
- lib_copy_iov2buf(oibmsg->oibm_u.immediate.oibim_payload,
+ lib_copy_iov2buf(ibmsg->ibm_u.immediate.ibim_payload,
payload_niov, payload_iov,
payload_offset, payload_nob);
}
- koibnal_init_tx_msg (tx, OPENIBNAL_MSG_IMMEDIATE,
- offsetof(koib_immediate_msg_t,
- oibim_payload[payload_nob]));
+ kibnal_init_tx_msg (tx, IBNAL_MSG_IMMEDIATE,
+ offsetof(kib_immediate_msg_t,
+ ibim_payload[payload_nob]));
/* libmsg gets finalized when tx completes */
tx->tx_libmsg[0] = libmsg;
- koibnal_launch_tx(tx, nid);
+ kibnal_launch_tx(tx, nid);
return (PTL_OK);
}
ptl_err_t
-koibnal_send (lib_nal_t *nal, void *private, lib_msg_t *cookie,
+kibnal_send (lib_nal_t *nal, void *private, lib_msg_t *cookie,
ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
unsigned int payload_niov, struct iovec *payload_iov,
size_t payload_offset, size_t payload_len)
{
- return (koibnal_sendmsg(nal, private, cookie,
- hdr, type, nid, pid,
- payload_niov, payload_iov, NULL,
- payload_offset, payload_len));
+ return (kibnal_sendmsg(nal, private, cookie,
+ hdr, type, nid, pid,
+ payload_niov, payload_iov, NULL,
+ payload_offset, payload_len));
}
ptl_err_t
-koibnal_send_pages (lib_nal_t *nal, void *private, lib_msg_t *cookie,
+kibnal_send_pages (lib_nal_t *nal, void *private, lib_msg_t *cookie,
ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
unsigned int payload_niov, ptl_kiov_t *payload_kiov,
size_t payload_offset, size_t payload_len)
{
- return (koibnal_sendmsg(nal, private, cookie,
- hdr, type, nid, pid,
- payload_niov, NULL, payload_kiov,
- payload_offset, payload_len));
+ return (kibnal_sendmsg(nal, private, cookie,
+ hdr, type, nid, pid,
+ payload_niov, NULL, payload_kiov,
+ payload_offset, payload_len));
}
ptl_err_t
-koibnal_recvmsg (lib_nal_t *nal, void *private, lib_msg_t *libmsg,
+kibnal_recvmsg (lib_nal_t *nal, void *private, lib_msg_t *libmsg,
unsigned int niov, struct iovec *iov, ptl_kiov_t *kiov,
size_t offset, size_t mlen, size_t rlen)
{
- koib_rx_t *rx = private;
- koib_msg_t *rxmsg = rx->rx_msg;
- int msg_nob;
+ kib_rx_t *rx = private;
+ kib_msg_t *rxmsg = rx->rx_msg;
+ int msg_nob;
LASSERT (mlen <= rlen);
LASSERT (!in_interrupt ());
/* Either all pages or all vaddrs */
LASSERT (!(kiov != NULL && iov != NULL));
- switch (rxmsg->oibm_type) {
+ switch (rxmsg->ibm_type) {
default:
LBUG();
return (PTL_FAIL);
- case OPENIBNAL_MSG_IMMEDIATE:
- msg_nob = offsetof(koib_msg_t, oibm_u.immediate.oibim_payload[rlen]);
- if (msg_nob > OPENIBNAL_MSG_SIZE) {
+ case IBNAL_MSG_IMMEDIATE:
+ msg_nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[rlen]);
+ if (msg_nob > IBNAL_MSG_SIZE) {
CERROR ("Immediate message from "LPX64" too big: %d\n",
- rxmsg->oibm_u.immediate.oibim_hdr.src_nid, rlen);
+ rxmsg->ibm_u.immediate.ibim_hdr.src_nid, rlen);
return (PTL_FAIL);
}
if (kiov != NULL)
lib_copy_buf2kiov(niov, kiov, offset,
- rxmsg->oibm_u.immediate.oibim_payload,
+ rxmsg->ibm_u.immediate.ibim_payload,
mlen);
else
lib_copy_buf2iov(niov, iov, offset,
- rxmsg->oibm_u.immediate.oibim_payload,
+ rxmsg->ibm_u.immediate.ibim_payload,
mlen);
lib_finalize (nal, NULL, libmsg, PTL_OK);
return (PTL_OK);
- case OPENIBNAL_MSG_GET_RDMA:
+ case IBNAL_MSG_GET_RDMA:
/* We get called here just to discard any junk after the
* GET hdr. */
LASSERT (libmsg == NULL);
lib_finalize (nal, NULL, libmsg, PTL_OK);
return (PTL_OK);
- case OPENIBNAL_MSG_PUT_RDMA:
- koibnal_start_active_rdma (OPENIBNAL_MSG_PUT_DONE, 0,
- rx, libmsg,
- niov, iov, kiov, offset, mlen);
+ case IBNAL_MSG_PUT_RDMA:
+ kibnal_start_active_rdma (IBNAL_MSG_PUT_DONE, 0,
+ rx, libmsg,
+ niov, iov, kiov, offset, mlen);
return (PTL_OK);
}
}
ptl_err_t
-koibnal_recv (lib_nal_t *nal, void *private, lib_msg_t *msg,
+kibnal_recv (lib_nal_t *nal, void *private, lib_msg_t *msg,
unsigned int niov, struct iovec *iov,
size_t offset, size_t mlen, size_t rlen)
{
- return (koibnal_recvmsg (nal, private, msg, niov, iov, NULL,
- offset, mlen, rlen));
+ return (kibnal_recvmsg (nal, private, msg, niov, iov, NULL,
+ offset, mlen, rlen));
}
ptl_err_t
-koibnal_recv_pages (lib_nal_t *nal, void *private, lib_msg_t *msg,
+kibnal_recv_pages (lib_nal_t *nal, void *private, lib_msg_t *msg,
unsigned int niov, ptl_kiov_t *kiov,
size_t offset, size_t mlen, size_t rlen)
{
- return (koibnal_recvmsg (nal, private, msg, niov, NULL, kiov,
- offset, mlen, rlen));
+ return (kibnal_recvmsg (nal, private, msg, niov, NULL, kiov,
+ offset, mlen, rlen));
}
int
-koibnal_thread_start (int (*fn)(void *arg), void *arg)
+kibnal_thread_start (int (*fn)(void *arg), void *arg)
{
long pid = kernel_thread (fn, arg, 0);
if (pid < 0)
return ((int)pid);
- atomic_inc (&koibnal_data.koib_nthreads);
+ atomic_inc (&kibnal_data.kib_nthreads);
return (0);
}
void
-koibnal_thread_fini (void)
+kibnal_thread_fini (void)
{
- atomic_dec (&koibnal_data.koib_nthreads);
+ atomic_dec (&kibnal_data.kib_nthreads);
}
void
-koibnal_close_conn_locked (koib_conn_t *conn, int error)
+kibnal_close_conn_locked (kib_conn_t *conn, int error)
{
- /* This just does the immmediate housekeeping, and schedules the
- * connection for the connd to finish off.
+ /* This just does the immediate housekeeping, and schedules the
+ * connection for the connd to finish off.
- * Caller holds koib_global_lock exclusively in irq context */
- koib_peer_t *peer = conn->ibc_peer;
+ * Caller holds kib_global_lock exclusively in irq context */
+ kib_peer_t *peer = conn->ibc_peer;
CDEBUG (error == 0 ? D_NET : D_ERROR,
"closing conn to "LPX64": error %d\n", peer->ibp_nid, error);
- LASSERT (conn->ibc_state == OPENIBNAL_CONN_ESTABLISHED ||
- conn->ibc_state == OPENIBNAL_CONN_CONNECTING);
+ LASSERT (conn->ibc_state == IBNAL_CONN_ESTABLISHED ||
+ conn->ibc_state == IBNAL_CONN_CONNECTING);
- if (conn->ibc_state == OPENIBNAL_CONN_ESTABLISHED) {
- /* koib_connd_conns takes ibc_list's ref */
+ if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) {
+ /* kib_connd_conns takes ibc_list's ref */
list_del (&conn->ibc_list);
} else {
- /* new ref for koib_connd_conns */
+ /* new ref for kib_connd_conns */
CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
atomic_read (&conn->ibc_refcount));
if (list_empty (&peer->ibp_conns) &&
peer->ibp_persistence == 0) {
/* Non-persistent peer with no more conns... */
- koibnal_unlink_peer_locked (peer);
+ kibnal_unlink_peer_locked (peer);
}
- conn->ibc_state = OPENIBNAL_CONN_DEATHROW;
+ conn->ibc_state = IBNAL_CONN_DEATHROW;
/* Schedule conn for closing/destruction */
- spin_lock (&koibnal_data.koib_connd_lock);
+ spin_lock (&kibnal_data.kib_connd_lock);
- list_add_tail (&conn->ibc_list, &koibnal_data.koib_connd_conns);
- wake_up (&koibnal_data.koib_connd_waitq);
+ list_add_tail (&conn->ibc_list, &kibnal_data.kib_connd_conns);
+ wake_up (&kibnal_data.kib_connd_waitq);
- spin_unlock (&koibnal_data.koib_connd_lock);
+ spin_unlock (&kibnal_data.kib_connd_lock);
}
int
-koibnal_close_conn (koib_conn_t *conn, int why)
+kibnal_close_conn (kib_conn_t *conn, int why)
{
unsigned long flags;
int count = 0;
- write_lock_irqsave (&koibnal_data.koib_global_lock, flags);
+ write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
- LASSERT (conn->ibc_state >= OPENIBNAL_CONN_CONNECTING);
+ LASSERT (conn->ibc_state >= IBNAL_CONN_CONNECTING);
- if (conn->ibc_state <= OPENIBNAL_CONN_ESTABLISHED) {
+ if (conn->ibc_state <= IBNAL_CONN_ESTABLISHED) {
count = 1;
- koibnal_close_conn_locked (conn, why);
+ kibnal_close_conn_locked (conn, why);
}
- write_unlock_irqrestore (&koibnal_data.koib_global_lock, flags);
+ write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
return (count);
}
void
-koibnal_peer_connect_failed (koib_peer_t *peer, int active, int rc)
+kibnal_peer_connect_failed (kib_peer_t *peer, int active, int rc)
{
LIST_HEAD (zombies);
- koib_tx_t *tx;
+ kib_tx_t *tx;
unsigned long flags;
LASSERT (rc != 0);
- LASSERT (peer->ibp_reconnect_interval >= OPENIBNAL_MIN_RECONNECT_INTERVAL);
+ LASSERT (peer->ibp_reconnect_interval >= IBNAL_MIN_RECONNECT_INTERVAL);
- write_lock_irqsave (&koibnal_data.koib_global_lock, flags);
+ write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
LASSERT (peer->ibp_connecting != 0);
peer->ibp_connecting--;
if (peer->ibp_connecting != 0) {
/* another connection attempt under way (loopback?)... */
- write_unlock_irqrestore (&koibnal_data.koib_global_lock, flags);
+ write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
return;
}
peer->ibp_reconnect_time = jiffies + peer->ibp_reconnect_interval;
/* Increase reconnection interval */
peer->ibp_reconnect_interval = MIN (peer->ibp_reconnect_interval * 2,
- OPENIBNAL_MAX_RECONNECT_INTERVAL);
+ IBNAL_MAX_RECONNECT_INTERVAL);
- /* Take peer's blocked blocked transmits; I'll complete
- * them with error */
+ /* Take peer's blocked transmits; I'll complete
+ * them with error */
while (!list_empty (&peer->ibp_tx_queue)) {
tx = list_entry (peer->ibp_tx_queue.next,
- koib_tx_t, tx_list);
+ kib_tx_t, tx_list);
list_del (&tx->tx_list);
list_add_tail (&tx->tx_list, &zombies);
}
- if (koibnal_peer_active(peer) &&
+ if (kibnal_peer_active(peer) &&
(peer->ibp_persistence == 0)) {
/* failed connection attempt on non-persistent peer */
- koibnal_unlink_peer_locked (peer);
+ kibnal_unlink_peer_locked (peer);
}
} else {
/* Can't have blocked transmits if there are connections */
LASSERT (list_empty(&peer->ibp_tx_queue));
}
- write_unlock_irqrestore (&koibnal_data.koib_global_lock, flags);
+ write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
if (!list_empty (&zombies))
CERROR ("Deleting messages for "LPX64": connection failed\n",
peer->ibp_nid);
while (!list_empty (&zombies)) {
- tx = list_entry (zombies.next, koib_tx_t, tx_list);
+ tx = list_entry (zombies.next, kib_tx_t, tx_list);
list_del (&tx->tx_list);
/* complete now */
tx->tx_status = -EHOSTUNREACH;
- koibnal_tx_done (tx);
+ kibnal_tx_done (tx);
}
}
void
-koibnal_connreq_done (koib_conn_t *conn, int active, int status)
+kibnal_connreq_done (kib_conn_t *conn, int active, int status)
{
int state = conn->ibc_state;
- koib_peer_t *peer = conn->ibc_peer;
- koib_tx_t *tx;
+ kib_peer_t *peer = conn->ibc_peer;
+ kib_tx_t *tx;
unsigned long flags;
int rc;
int i;
conn->ibc_connreq = NULL;
}
- if (state == OPENIBNAL_CONN_CONNECTING) {
+ if (state == IBNAL_CONN_CONNECTING) {
/* Install common (active/passive) callback for
* disconnect/idle notification if I got as far as getting
* a CM comm_id */
rc = tsIbCmCallbackModify(conn->ibc_comm_id,
- koibnal_conn_callback, conn);
+ kibnal_conn_callback, conn);
LASSERT (rc == 0);
}
- write_lock_irqsave (&koibnal_data.koib_global_lock, flags);
+ write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
LASSERT (peer->ibp_connecting != 0);
if (status == 0) {
/* connection established... */
- LASSERT (state == OPENIBNAL_CONN_CONNECTING);
- conn->ibc_state = OPENIBNAL_CONN_ESTABLISHED;
+ LASSERT (state == IBNAL_CONN_CONNECTING);
+ conn->ibc_state = IBNAL_CONN_ESTABLISHED;
- if (!koibnal_peer_active(peer)) {
+ if (!kibnal_peer_active(peer)) {
/* ...but peer deleted meantime */
status = -ECONNABORTED;
}
} else {
- LASSERT (state == OPENIBNAL_CONN_INIT_QP ||
- state == OPENIBNAL_CONN_CONNECTING);
+ LASSERT (state == IBNAL_CONN_INIT_QP ||
+ state == IBNAL_CONN_CONNECTING);
}
if (status == 0) {
list_add (&conn->ibc_list, &peer->ibp_conns);
/* reset reconnect interval for next attempt */
- peer->ibp_reconnect_interval = OPENIBNAL_MIN_RECONNECT_INTERVAL;
+ peer->ibp_reconnect_interval = IBNAL_MIN_RECONNECT_INTERVAL;
/* post blocked sends to the new connection */
spin_lock (&conn->ibc_lock);
while (!list_empty (&peer->ibp_tx_queue)) {
tx = list_entry (peer->ibp_tx_queue.next,
- koib_tx_t, tx_list);
+ kib_tx_t, tx_list);
list_del (&tx->tx_list);
conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
atomic_read (&conn->ibc_refcount));
atomic_inc (&conn->ibc_refcount);
- koibnal_queue_tx_locked (tx, conn);
+ kibnal_queue_tx_locked (tx, conn);
}
spin_unlock (&conn->ibc_lock);
/* Nuke any dangling conns from a different peer instance... */
- koibnal_close_stale_conns_locked (conn->ibc_peer,
- conn->ibc_incarnation);
+ kibnal_close_stale_conns_locked (conn->ibc_peer,
+ conn->ibc_incarnation);
- write_unlock_irqrestore (&koibnal_data.koib_global_lock, flags);
+ write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
/* queue up all the receives */
- for (i = 0; i < OPENIBNAL_RX_MSGS; i++) {
+ for (i = 0; i < IBNAL_RX_MSGS; i++) {
/* +1 ref for rx desc */
CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
i, &conn->ibc_rxs[i], conn->ibc_rxs[i].rx_msg,
conn->ibc_rxs[i].rx_vaddr);
- koibnal_post_rx (&conn->ibc_rxs[i], 0);
+ kibnal_post_rx (&conn->ibc_rxs[i], 0);
}
- koibnal_check_sends (conn);
+ kibnal_check_sends (conn);
return;
}
/* connection failed */
- if (state == OPENIBNAL_CONN_CONNECTING) {
+ if (state == IBNAL_CONN_CONNECTING) {
/* schedule for connd to close */
- koibnal_close_conn_locked (conn, status);
+ kibnal_close_conn_locked (conn, status);
} else {
/* Don't have a CM comm_id; just wait for refs to drain */
- conn->ibc_state = OPENIBNAL_CONN_ZOMBIE;
+ conn->ibc_state = IBNAL_CONN_ZOMBIE;
}
- write_unlock_irqrestore (&koibnal_data.koib_global_lock, flags);
+ write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
- koibnal_peer_connect_failed (conn->ibc_peer, active, status);
+ kibnal_peer_connect_failed (conn->ibc_peer, active, status);
- if (state != OPENIBNAL_CONN_CONNECTING) {
+ if (state != IBNAL_CONN_CONNECTING) {
/* drop caller's ref if we're not waiting for the
* IB_CM_IDLE callback */
- koibnal_put_conn (conn);
+ kibnal_put_conn (conn);
}
}
int
-koibnal_accept (koib_conn_t **connp, tTS_IB_CM_COMM_ID cid,
+kibnal_accept (kib_conn_t **connp, tTS_IB_CM_COMM_ID cid,
ptl_nid_t nid, __u64 incarnation, int queue_depth)
{
- koib_conn_t *conn = koibnal_create_conn();
- koib_peer_t *peer;
- koib_peer_t *peer2;
+ kib_conn_t *conn = kibnal_create_conn();
+ kib_peer_t *peer;
+ kib_peer_t *peer2;
unsigned long flags;
if (conn == NULL)
return (-ENOMEM);
- if (queue_depth != OPENIBNAL_MSG_QUEUE_SIZE) {
+ if (queue_depth != IBNAL_MSG_QUEUE_SIZE) {
CERROR("Can't accept "LPX64": bad queue depth %d (%d expected)\n",
- nid, queue_depth, OPENIBNAL_MSG_QUEUE_SIZE);
+ nid, queue_depth, IBNAL_MSG_QUEUE_SIZE);
return (-EPROTO);
}
/* assume 'nid' is a new peer */
- peer = koibnal_create_peer (nid);
+ peer = kibnal_create_peer (nid);
if (peer == NULL) {
CDEBUG(D_NET, "--conn[%p] state %d -> "LPX64" (%d)\n",
conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
atomic_read (&conn->ibc_refcount));
atomic_dec (&conn->ibc_refcount);
- koibnal_destroy_conn(conn);
+ kibnal_destroy_conn(conn);
return (-ENOMEM);
}
- write_lock_irqsave (&koibnal_data.koib_global_lock, flags);
+ write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
- peer2 = koibnal_find_peer_locked(nid);
+ peer2 = kibnal_find_peer_locked(nid);
if (peer2 == NULL) {
/* peer table takes my ref on peer */
list_add_tail (&peer->ibp_list,
- koibnal_nid2peerlist(nid));
+ kibnal_nid2peerlist(nid));
} else {
- koibnal_put_peer (peer);
+ kibnal_put_peer (peer);
peer = peer2;
}
atomic_inc (&peer->ibp_refcount);
peer->ibp_connecting++;
- write_unlock_irqrestore (&koibnal_data.koib_global_lock, flags);
+ write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
conn->ibc_peer = peer;
- conn->ibc_state = OPENIBNAL_CONN_CONNECTING;
+ conn->ibc_state = IBNAL_CONN_CONNECTING;
conn->ibc_comm_id = cid;
conn->ibc_incarnation = incarnation;
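+ /* NB I can post IBNAL_MSG_QUEUE_SIZE sends before the peer
+ * must return credits */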
- conn->ibc_credits = OPENIBNAL_MSG_QUEUE_SIZE;
+ conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE;
*connp = conn;
return (0);
}
tTS_IB_CM_CALLBACK_RETURN
-koibnal_idle_conn_callback (tTS_IB_CM_EVENT event,
+kibnal_idle_conn_callback (tTS_IB_CM_EVENT event,
tTS_IB_CM_COMM_ID cid,
void *param,
void *arg)
}
tTS_IB_CM_CALLBACK_RETURN
-koibnal_conn_callback (tTS_IB_CM_EVENT event,
+kibnal_conn_callback (tTS_IB_CM_EVENT event,
tTS_IB_CM_COMM_ID cid,
void *param,
void *arg)
{
- koib_conn_t *conn = arg;
- int rc;
+ kib_conn_t *conn = arg;
+ LIST_HEAD (zombies);
+ struct list_head *tmp;
+ struct list_head *nxt;
+ kib_tx_t *tx;
+ unsigned long flags;
+ int done;
+ int rc;
/* Established Connection Notifier */
default:
CERROR("Connection %p -> "LPX64" ERROR %d\n",
conn, conn->ibc_peer->ibp_nid, event);
- koibnal_close_conn (conn, -ECONNABORTED);
+ kibnal_close_conn (conn, -ECONNABORTED);
break;
case TS_IB_CM_DISCONNECTED:
CDEBUG(D_WARNING, "Connection %p -> "LPX64" DISCONNECTED.\n",
conn, conn->ibc_peer->ibp_nid);
- koibnal_close_conn (conn, 0);
+ kibnal_close_conn (conn, 0);
break;
case TS_IB_CM_IDLE:
CDEBUG(D_NET, "Connection %p -> "LPX64" IDLE.\n",
conn, conn->ibc_peer->ibp_nid);
- koibnal_put_conn (conn); /* Lose CM's ref */
+ kibnal_put_conn (conn); /* Lose CM's ref */
/* LASSERT (no further callbacks) */
rc = tsIbCmCallbackModify(cid,
- koibnal_idle_conn_callback, conn);
+ kibnal_idle_conn_callback, conn);
LASSERT (rc == 0);
+
+ /* NB we wait until the connection has closed before
+ * completing outstanding passive RDMAs so we can be sure
+ * the network can't touch the mapped memory any more. */
+
+ spin_lock_irqsave (&conn->ibc_lock, flags);
+
+ /* grab passive RDMAs not waiting for the tx callback */
+ list_for_each_safe (tmp, nxt, &conn->ibc_active_txs) {
+ tx = list_entry (tmp, kib_tx_t, tx_list);
+
+ LASSERT (tx->tx_passive_rdma ||
+ !tx->tx_passive_rdma_wait);
+
+ LASSERT (tx->tx_passive_rdma_wait ||
+ tx->tx_sending != 0);
+
+ /* still waiting for tx callback? */
+ if (!tx->tx_passive_rdma_wait)
+ continue;
+
+ tx->tx_status = -ECONNABORTED;
+ tx->tx_passive_rdma_wait = 0;
+ done = (tx->tx_sending == 0);
+
+ if (!done)
+ continue;
+
+ list_del (&tx->tx_list);
+ list_add (&tx->tx_list, &zombies);
+ }
+
+ /* grab all blocked transmits */
+ list_for_each_safe (tmp, nxt, &conn->ibc_tx_queue) {
+ tx = list_entry (tmp, kib_tx_t, tx_list);
+
+ list_del (&tx->tx_list);
+ list_add (&tx->tx_list, &zombies);
+ }
+
+ spin_unlock_irqrestore (&conn->ibc_lock, flags);
+
+ while (!list_empty(&zombies)) {
+ tx = list_entry (zombies.next, kib_tx_t, tx_list);
+
+ list_del(&tx->tx_list);
+ kibnal_tx_done (tx);
+ }
break;
}
}
tTS_IB_CM_CALLBACK_RETURN
-koibnal_passive_conn_callback (tTS_IB_CM_EVENT event,
+kibnal_passive_conn_callback (tTS_IB_CM_EVENT event,
tTS_IB_CM_COMM_ID cid,
void *param,
void *arg)
{
- koib_conn_t *conn = arg;
+ kib_conn_t *conn = arg;
int rc;
switch (event) {
CERROR ("Unexpected event %p -> "LPX64": %d\n",
conn, conn->ibc_peer->ibp_nid, event);
- koibnal_connreq_done (conn, 0, -ECONNABORTED);
+ kibnal_connreq_done (conn, 0, -ECONNABORTED);
break;
case TS_IB_CM_REQ_RECEIVED: {
struct ib_cm_req_received_param *req = param;
- koib_wire_connreq_t *wcr = req->remote_private_data;
+ kib_wire_connreq_t *wcr = req->remote_private_data;
LASSERT (conn == NULL);
return TS_IB_CM_CALLBACK_ABORT;
}
- if (wcr->wcr_magic != cpu_to_le32(OPENIBNAL_MSG_MAGIC)) {
+ if (wcr->wcr_magic != cpu_to_le32(IBNAL_MSG_MAGIC)) {
CERROR ("Can't accept LID %04x: bad magic %08x\n",
req->dlid, le32_to_cpu(wcr->wcr_magic));
return TS_IB_CM_CALLBACK_ABORT;
}
- if (wcr->wcr_version != cpu_to_le16(OPENIBNAL_MSG_VERSION)) {
+ if (wcr->wcr_version != cpu_to_le16(IBNAL_MSG_VERSION)) {
CERROR ("Can't accept LID %04x: bad version %d\n",
req->dlid, le16_to_cpu(wcr->wcr_magic));
return TS_IB_CM_CALLBACK_ABORT;
}
- rc = koibnal_accept(&conn,
- cid,
- le64_to_cpu(wcr->wcr_nid),
- le64_to_cpu(wcr->wcr_incarnation),
- le16_to_cpu(wcr->wcr_queue_depth));
+ rc = kibnal_accept(&conn,
+ cid,
+ le64_to_cpu(wcr->wcr_nid),
+ le64_to_cpu(wcr->wcr_incarnation),
+ le16_to_cpu(wcr->wcr_queue_depth));
if (rc != 0) {
CERROR ("Can't accept "LPX64": %d\n",
le64_to_cpu(wcr->wcr_nid), rc);
/* update 'arg' for next callback */
rc = tsIbCmCallbackModify(cid,
- koibnal_passive_conn_callback, conn);
+ kibnal_passive_conn_callback, conn);
LASSERT (rc == 0);
req->accept_param.qp = conn->ibc_qp;
- *((koib_wire_connreq_t *)req->accept_param.reply_private_data)
- = (koib_wire_connreq_t) {
- .wcr_magic = cpu_to_le32(OPENIBNAL_MSG_MAGIC),
- .wcr_version = cpu_to_le16(OPENIBNAL_MSG_VERSION),
- .wcr_queue_depth = cpu_to_le32(OPENIBNAL_MSG_QUEUE_SIZE),
- .wcr_nid = cpu_to_le64(koibnal_data.koib_nid),
- .wcr_incarnation = cpu_to_le64(koibnal_data.koib_incarnation),
+ *((kib_wire_connreq_t *)req->accept_param.reply_private_data)
+ = (kib_wire_connreq_t) {
+ .wcr_magic = cpu_to_le32(IBNAL_MSG_MAGIC),
+ .wcr_version = cpu_to_le16(IBNAL_MSG_VERSION),
+ .wcr_queue_depth = cpu_to_le32(IBNAL_MSG_QUEUE_SIZE),
+ .wcr_nid = cpu_to_le64(kibnal_data.kib_nid),
+ .wcr_incarnation = cpu_to_le64(kibnal_data.kib_incarnation),
};
- req->accept_param.reply_private_data_len = sizeof(koib_wire_connreq_t);
- req->accept_param.responder_resources = OPENIBNAL_RESPONDER_RESOURCES;
- req->accept_param.initiator_depth = OPENIBNAL_RESPONDER_RESOURCES;
- req->accept_param.rnr_retry_count = OPENIBNAL_RNR_RETRY;
- req->accept_param.flow_control = OPENIBNAL_FLOW_CONTROL;
+ req->accept_param.reply_private_data_len = sizeof(kib_wire_connreq_t);
+ req->accept_param.responder_resources = IBNAL_RESPONDER_RESOURCES;
+ req->accept_param.initiator_depth = IBNAL_RESPONDER_RESOURCES;
+ req->accept_param.rnr_retry_count = IBNAL_RNR_RETRY;
+ req->accept_param.flow_control = IBNAL_FLOW_CONTROL;
CDEBUG(D_NET, "Proceeding\n");
break;
CDEBUG(D_WARNING, "Connection %p -> "LPX64" ESTABLISHED.\n",
conn, conn->ibc_peer->ibp_nid);
- koibnal_connreq_done (conn, 0, 0);
+ kibnal_connreq_done (conn, 0, 0);
break;
}
- /* NB if the connreq is done, we switch to koibnal_conn_callback */
+ /* NB if the connreq is done, we switch to kibnal_conn_callback */
return TS_IB_CM_CALLBACK_PROCEED;
}
tTS_IB_CM_CALLBACK_RETURN
-koibnal_active_conn_callback (tTS_IB_CM_EVENT event,
+kibnal_active_conn_callback (tTS_IB_CM_EVENT event,
tTS_IB_CM_COMM_ID cid,
void *param,
void *arg)
{
- koib_conn_t *conn = arg;
+ kib_conn_t *conn = arg;
switch (event) {
case TS_IB_CM_REP_RECEIVED: {
struct ib_cm_rep_received_param *rep = param;
- koib_wire_connreq_t *wcr = rep->remote_private_data;
+ kib_wire_connreq_t *wcr = rep->remote_private_data;
if (rep->remote_private_data_len < sizeof (*wcr)) {
CERROR ("Short reply from "LPX64": %d\n",
conn->ibc_peer->ibp_nid,
rep->remote_private_data_len);
- koibnal_connreq_done (conn, 1, -EPROTO);
+ kibnal_connreq_done (conn, 1, -EPROTO);
break;
}
- if (wcr->wcr_magic != cpu_to_le32(OPENIBNAL_MSG_MAGIC)) {
+ if (wcr->wcr_magic != cpu_to_le32(IBNAL_MSG_MAGIC)) {
CERROR ("Can't connect "LPX64": bad magic %08x\n",
conn->ibc_peer->ibp_nid, le32_to_cpu(wcr->wcr_magic));
- koibnal_connreq_done (conn, 1, -EPROTO);
+ kibnal_connreq_done (conn, 1, -EPROTO);
break;
}
- if (wcr->wcr_version != cpu_to_le16(OPENIBNAL_MSG_VERSION)) {
+ if (wcr->wcr_version != cpu_to_le16(IBNAL_MSG_VERSION)) {
CERROR ("Can't connect "LPX64": bad version %d\n",
conn->ibc_peer->ibp_nid, le16_to_cpu(wcr->wcr_magic));
- koibnal_connreq_done (conn, 1, -EPROTO);
+ kibnal_connreq_done (conn, 1, -EPROTO);
break;
}
- if (wcr->wcr_queue_depth != cpu_to_le16(OPENIBNAL_MSG_QUEUE_SIZE)) {
+ if (wcr->wcr_queue_depth != cpu_to_le16(IBNAL_MSG_QUEUE_SIZE)) {
CERROR ("Can't connect "LPX64": bad queue depth %d\n",
conn->ibc_peer->ibp_nid, le16_to_cpu(wcr->wcr_queue_depth));
- koibnal_connreq_done (conn, 1, -EPROTO);
+ kibnal_connreq_done (conn, 1, -EPROTO);
break;
}
if (le64_to_cpu(wcr->wcr_nid) != conn->ibc_peer->ibp_nid) {
CERROR ("Unexpected NID "LPX64" from "LPX64"\n",
le64_to_cpu(wcr->wcr_nid), conn->ibc_peer->ibp_nid);
- koibnal_connreq_done (conn, 1, -EPROTO);
+ kibnal_connreq_done (conn, 1, -EPROTO);
break;
}
conn, conn->ibc_peer->ibp_nid);
conn->ibc_incarnation = le64_to_cpu(wcr->wcr_incarnation);
- conn->ibc_credits = OPENIBNAL_MSG_QUEUE_SIZE;
+ conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE;
break;
}
CDEBUG(D_WARNING, "Connection %p -> "LPX64" Established\n",
conn, conn->ibc_peer->ibp_nid);
- koibnal_connreq_done (conn, 1, 0);
+ kibnal_connreq_done (conn, 1, 0);
break;
case TS_IB_CM_IDLE:
CERROR("Connection %p -> "LPX64" IDLE\n",
conn, conn->ibc_peer->ibp_nid);
/* Back out state change: I'm disengaged from CM */
- conn->ibc_state = OPENIBNAL_CONN_INIT_QP;
+ conn->ibc_state = IBNAL_CONN_INIT_QP;
- koibnal_connreq_done (conn, 1, -ECONNABORTED);
+ kibnal_connreq_done (conn, 1, -ECONNABORTED);
break;
default:
CERROR("Connection %p -> "LPX64" ERROR %d\n",
conn, conn->ibc_peer->ibp_nid, event);
- koibnal_connreq_done (conn, 1, -ECONNABORTED);
+ kibnal_connreq_done (conn, 1, -ECONNABORTED);
break;
}
- /* NB if the connreq is done, we switch to koibnal_conn_callback */
+ /* NB if the connreq is done, we switch to kibnal_conn_callback */
return TS_IB_CM_CALLBACK_PROCEED;
}
int
-koibnal_pathreq_callback (tTS_IB_CLIENT_QUERY_TID tid, int status,
+kibnal_pathreq_callback (tTS_IB_CLIENT_QUERY_TID tid, int status,
struct ib_path_record *resp, int remaining,
void *arg)
{
- koib_conn_t *conn = arg;
+ kib_conn_t *conn = arg;
if (status != 0) {
CERROR ("status %d\n", status);
- koibnal_connreq_done (conn, 1, status);
+ kibnal_connreq_done (conn, 1, status);
goto out;
}
conn->ibc_connreq->cr_path = *resp;
- conn->ibc_connreq->cr_wcr = (koib_wire_connreq_t) {
- .wcr_magic = cpu_to_le32(OPENIBNAL_MSG_MAGIC),
- .wcr_version = cpu_to_le16(OPENIBNAL_MSG_VERSION),
- .wcr_queue_depth = cpu_to_le16(OPENIBNAL_MSG_QUEUE_SIZE),
- .wcr_nid = cpu_to_le64(koibnal_data.koib_nid),
- .wcr_incarnation = cpu_to_le64(koibnal_data.koib_incarnation),
+ conn->ibc_connreq->cr_wcr = (kib_wire_connreq_t) {
+ .wcr_magic = cpu_to_le32(IBNAL_MSG_MAGIC),
+ .wcr_version = cpu_to_le16(IBNAL_MSG_VERSION),
+ .wcr_queue_depth = cpu_to_le16(IBNAL_MSG_QUEUE_SIZE),
+ .wcr_nid = cpu_to_le64(kibnal_data.kib_nid),
+ .wcr_incarnation = cpu_to_le64(kibnal_data.kib_incarnation),
};
conn->ibc_connreq->cr_connparam = (struct ib_cm_active_param) {
.qp = conn->ibc_qp,
.req_private_data = &conn->ibc_connreq->cr_wcr,
.req_private_data_len = sizeof(conn->ibc_connreq->cr_wcr),
- .responder_resources = OPENIBNAL_RESPONDER_RESOURCES,
- .initiator_depth = OPENIBNAL_RESPONDER_RESOURCES,
- .retry_count = OPENIBNAL_RETRY,
- .rnr_retry_count = OPENIBNAL_RNR_RETRY,
- .cm_response_timeout = koibnal_tunables.koib_io_timeout,
- .max_cm_retries = OPENIBNAL_CM_RETRY,
- .flow_control = OPENIBNAL_FLOW_CONTROL,
+ .responder_resources = IBNAL_RESPONDER_RESOURCES,
+ .initiator_depth = IBNAL_RESPONDER_RESOURCES,
+ .retry_count = IBNAL_RETRY,
+ .rnr_retry_count = IBNAL_RNR_RETRY,
+ .cm_response_timeout = kibnal_tunables.kib_io_timeout,
+ .max_cm_retries = IBNAL_CM_RETRY,
+ .flow_control = IBNAL_FLOW_CONTROL,
};
/* XXX set timeout just like SDP!!! */
conn->ibc_connreq->cr_path.packet_life = 13;
/* Flag I'm getting involved with the CM... */
- conn->ibc_state = OPENIBNAL_CONN_CONNECTING;
+ conn->ibc_state = IBNAL_CONN_CONNECTING;
CDEBUG(D_NET, "Connecting to, service id "LPX64", on "LPX64"\n",
conn->ibc_connreq->cr_service.service_id,
- *koibnal_service_nid_field(&conn->ibc_connreq->cr_service));
+ *kibnal_service_nid_field(&conn->ibc_connreq->cr_service));
- /* koibnal_connect_callback gets my conn ref */
+ /* kibnal_connect_callback gets my conn ref */
status = ib_cm_connect (&conn->ibc_connreq->cr_connparam,
&conn->ibc_connreq->cr_path, NULL,
conn->ibc_connreq->cr_service.service_id, 0,
- koibnal_active_conn_callback, conn,
+ kibnal_active_conn_callback, conn,
&conn->ibc_comm_id);
if (status != 0) {
CERROR ("Connect: %d\n", status);
/* Back out state change: I've not got a CM comm_id yet... */
- conn->ibc_state = OPENIBNAL_CONN_INIT_QP;
- koibnal_connreq_done (conn, 1, status);
+ conn->ibc_state = IBNAL_CONN_INIT_QP;
+ kibnal_connreq_done (conn, 1, status);
}
out:
}
void
-koibnal_service_get_callback (tTS_IB_CLIENT_QUERY_TID tid, int status,
- struct ib_common_attrib_service *resp, void *arg)
+kibnal_service_get_callback (tTS_IB_CLIENT_QUERY_TID tid, int status,
+ struct ib_common_attrib_service *resp, void *arg)
{
- koib_conn_t *conn = arg;
+ kib_conn_t *conn = arg;
if (status != 0) {
CERROR ("status %d\n", status);
- koibnal_connreq_done (conn, 1, status);
+ kibnal_connreq_done (conn, 1, status);
return;
}
CDEBUG(D_NET, "Got status %d, service id "LPX64", on "LPX64"\n",
status, resp->service_id,
- *koibnal_service_nid_field(resp));
+ *kibnal_service_nid_field(resp));
conn->ibc_connreq->cr_service = *resp;
- status = ib_cached_gid_get(koibnal_data.koib_device,
- koibnal_data.koib_port, 0,
+ status = ib_cached_gid_get(kibnal_data.kib_device,
+ kibnal_data.kib_port, 0,
conn->ibc_connreq->cr_gid);
LASSERT (status == 0);
- /* koibnal_pathreq_callback gets my conn ref */
- status = tsIbPathRecordRequest (koibnal_data.koib_device,
- koibnal_data.koib_port,
+ /* kibnal_pathreq_callback gets my conn ref */
+ status = tsIbPathRecordRequest (kibnal_data.kib_device,
+ kibnal_data.kib_port,
conn->ibc_connreq->cr_gid,
conn->ibc_connreq->cr_service.service_gid,
conn->ibc_connreq->cr_service.service_pkey,
0,
- koibnal_tunables.koib_io_timeout * HZ,
+ kibnal_tunables.kib_io_timeout * HZ,
0,
- koibnal_pathreq_callback, conn,
+ kibnal_pathreq_callback, conn,
&conn->ibc_connreq->cr_tid);
if (status == 0)
return;
CERROR ("Path record request: %d\n", status);
- koibnal_connreq_done (conn, 1, status);
+ kibnal_connreq_done (conn, 1, status);
}
void
-koibnal_connect_peer (koib_peer_t *peer)
+kibnal_connect_peer (kib_peer_t *peer)
{
- koib_conn_t *conn = koibnal_create_conn();
+ kib_conn_t *conn = kibnal_create_conn();
int rc;
LASSERT (peer->ibp_connecting != 0);
if (conn == NULL) {
CERROR ("Can't allocate conn\n");
- koibnal_peer_connect_failed (peer, 1, -ENOMEM);
+ kibnal_peer_connect_failed (peer, 1, -ENOMEM);
return;
}
PORTAL_ALLOC (conn->ibc_connreq, sizeof (*conn->ibc_connreq));
if (conn->ibc_connreq == NULL) {
CERROR ("Can't allocate connreq\n");
- koibnal_connreq_done (conn, 1, -ENOMEM);
+ kibnal_connreq_done (conn, 1, -ENOMEM);
return;
}
memset(conn->ibc_connreq, 0, sizeof (*conn->ibc_connreq));
- koibnal_set_service_keys(&conn->ibc_connreq->cr_service, peer->ibp_nid);
+ kibnal_set_service_keys(&conn->ibc_connreq->cr_service, peer->ibp_nid);
- /* koibnal_service_get_callback gets my conn ref */
- rc = ib_service_get (koibnal_data.koib_device,
- koibnal_data.koib_port,
+ /* kibnal_service_get_callback gets my conn ref */
+ rc = ib_service_get (kibnal_data.kib_device,
+ kibnal_data.kib_port,
&conn->ibc_connreq->cr_service,
- KOIBNAL_SERVICE_KEY_MASK,
- koibnal_tunables.koib_io_timeout * HZ,
- koibnal_service_get_callback, conn,
+ KIBNAL_SERVICE_KEY_MASK,
+ kibnal_tunables.kib_io_timeout * HZ,
+ kibnal_service_get_callback, conn,
&conn->ibc_connreq->cr_tid);
if (rc == 0)
return;
CERROR ("ib_service_get: %d\n", rc);
- koibnal_connreq_done (conn, 1, rc);
+ kibnal_connreq_done (conn, 1, rc);
}
int
-koibnal_conn_timed_out (koib_conn_t *conn)
+kibnal_conn_timed_out (kib_conn_t *conn)
{
- koib_tx_t *tx;
+ kib_tx_t *tx;
struct list_head *ttmp;
unsigned long flags;
- int rc = 0;
spin_lock_irqsave (&conn->ibc_lock, flags);
- list_for_each (ttmp, &conn->ibc_rdma_queue) {
- tx = list_entry (ttmp, koib_tx_t, tx_list);
+ list_for_each (ttmp, &conn->ibc_tx_queue) {
+ tx = list_entry (ttmp, kib_tx_t, tx_list);
- LASSERT (tx->tx_passive_rdma);
- LASSERT (tx->tx_passive_rdma_wait);
+ LASSERT (!tx->tx_passive_rdma_wait);
+ LASSERT (tx->tx_sending == 0);
- if (time_after_eq (jiffies, tx->tx_passive_rdma_deadline)) {
- rc = 1;
- break;
+ if (time_after_eq (jiffies, tx->tx_deadline)) {
+ spin_unlock_irqrestore (&conn->ibc_lock, flags);
+ return 1;
}
}
+
+ list_for_each (ttmp, &conn->ibc_active_txs) {
+ tx = list_entry (ttmp, kib_tx_t, tx_list);
+
+ LASSERT (tx->tx_passive_rdma ||
+ !tx->tx_passive_rdma_wait);
+
+ LASSERT (tx->tx_passive_rdma_wait ||
+ tx->tx_sending != 0);
+
+ if (time_after_eq (jiffies, tx->tx_deadline)) {
+ spin_unlock_irqrestore (&conn->ibc_lock, flags);
+ return 1;
+ }
+ }
+
spin_unlock_irqrestore (&conn->ibc_lock, flags);
- return rc;
+ return 0;
}
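The split above mirrors the new queue discipline: ibc_tx_queue now holds only descriptors that have not been posted yet (so they can be neither waiting on a passive RDMA nor partway through a send), while ibc_active_txs holds everything in flight, where a descriptor must be either awaiting passive RDMA completion or mid-send; the paired LASSERTs state exactly those invariants.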
void
-koibnal_check_conns (int idx)
+kibnal_check_conns (int idx)
{
- struct list_head *peers = &koibnal_data.koib_peers[idx];
+ struct list_head *peers = &kibnal_data.kib_peers[idx];
struct list_head *ptmp;
- koib_peer_t *peer;
- koib_conn_t *conn;
+ kib_peer_t *peer;
+ kib_conn_t *conn;
struct list_head *ctmp;
again:
/* NB. We expect to have a look at all the peers and not find any
* rdmas to time out, so we just use a shared lock while we
* take a look... */
- read_lock (&koibnal_data.koib_global_lock);
+ read_lock (&kibnal_data.kib_global_lock);
list_for_each (ptmp, peers) {
- peer = list_entry (ptmp, koib_peer_t, ibp_list);
+ peer = list_entry (ptmp, kib_peer_t, ibp_list);
list_for_each (ctmp, &peer->ibp_conns) {
- conn = list_entry (ctmp, koib_conn_t, ibc_list);
+ conn = list_entry (ctmp, kib_conn_t, ibc_list);
- LASSERT (conn->ibc_state == OPENIBNAL_CONN_ESTABLISHED);
+
+ LASSERT (conn->ibc_state == IBNAL_CONN_ESTABLISHED);
/* In case we have enough credits to return via a
* NOOP, but there were no non-blocking tx descs
* free to do it last time... */
- koibnal_check_sends(conn);
+ kibnal_check_sends(conn);
- if (!koibnal_conn_timed_out(conn))
+ if (!kibnal_conn_timed_out(conn))
continue;
CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
atomic_read (&conn->ibc_refcount));
atomic_inc (&conn->ibc_refcount);
- read_unlock (&koibnal_data.koib_global_lock);
+ read_unlock (&kibnal_data.kib_global_lock);
CERROR("Timed out RDMA with "LPX64"\n",
peer->ibp_nid);
- koibnal_close_conn (conn, -ETIMEDOUT);
- koibnal_put_conn (conn);
+ kibnal_close_conn (conn, -ETIMEDOUT);
+ kibnal_put_conn (conn);
/* start again now I've dropped the lock */
goto again;
}
}
- read_unlock (&koibnal_data.koib_global_lock);
+ read_unlock (&kibnal_data.kib_global_lock);
}
void
-koibnal_terminate_conn (koib_conn_t *conn)
+kibnal_terminate_conn (kib_conn_t *conn)
{
- unsigned long flags;
int rc;
- int done;
CDEBUG(D_NET, "conn %p\n", conn);
- LASSERT (conn->ibc_state == OPENIBNAL_CONN_DEATHROW);
- conn->ibc_state = OPENIBNAL_CONN_ZOMBIE;
+ LASSERT (conn->ibc_state == IBNAL_CONN_DEATHROW);
+ conn->ibc_state = IBNAL_CONN_ZOMBIE;
rc = ib_cm_disconnect (conn->ibc_comm_id);
if (rc != 0)
CERROR ("Error %d disconnecting conn %p -> "LPX64"\n",
rc, conn, conn->ibc_peer->ibp_nid);
-
- /* complete blocked passive RDMAs */
- spin_lock_irqsave (&conn->ibc_lock, flags);
-
- while (!list_empty (&conn->ibc_rdma_queue)) {
- koib_tx_t *tx = list_entry (conn->ibc_rdma_queue.next,
- koib_tx_t, tx_list);
-
- LASSERT (tx->tx_passive_rdma);
- LASSERT (tx->tx_passive_rdma_wait);
-
- list_del (&tx->tx_list);
-
- tx->tx_passive_rdma_wait = 0;
- done = (tx->tx_sending == 0);
-
- tx->tx_status = -ECONNABORTED;
-
- spin_unlock_irqrestore (&conn->ibc_lock, flags);
-
- if (done)
- koibnal_tx_done (tx);
-
- spin_lock_irqsave (&conn->ibc_lock, flags);
- }
-
- spin_unlock_irqrestore (&conn->ibc_lock, flags);
-
- /* Complete all blocked transmits */
- koibnal_check_sends(conn);
}
int
-koibnal_connd (void *arg)
+kibnal_connd (void *arg)
{
wait_queue_t wait;
unsigned long flags;
- koib_conn_t *conn;
- koib_peer_t *peer;
+ kib_conn_t *conn;
+ kib_peer_t *peer;
int timeout;
int i;
int peer_index = 0;
unsigned long deadline = jiffies;
- kportal_daemonize ("koibnal_connd");
+ kportal_daemonize ("kibnal_connd");
kportal_blockallsigs ();
init_waitqueue_entry (&wait, current);
- spin_lock_irqsave (&koibnal_data.koib_connd_lock, flags);
+ spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
for (;;) {
- if (!list_empty (&koibnal_data.koib_connd_conns)) {
- conn = list_entry (koibnal_data.koib_connd_conns.next,
- koib_conn_t, ibc_list);
+ if (!list_empty (&kibnal_data.kib_connd_conns)) {
+ conn = list_entry (kibnal_data.kib_connd_conns.next,
+ kib_conn_t, ibc_list);
list_del (&conn->ibc_list);
- spin_unlock_irqrestore (&koibnal_data.koib_connd_lock, flags);
+ spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
switch (conn->ibc_state) {
- case OPENIBNAL_CONN_DEATHROW:
+ case IBNAL_CONN_DEATHROW:
LASSERT (conn->ibc_comm_id != TS_IB_CM_COMM_ID_INVALID);
/* Disconnect: conn becomes a zombie in the
* callback and last ref reschedules it
* here... */
- koibnal_terminate_conn(conn);
- koibnal_put_conn (conn);
+ kibnal_terminate_conn(conn);
+ kibnal_put_conn (conn);
break;
- case OPENIBNAL_CONN_ZOMBIE:
- koibnal_destroy_conn (conn);
+ case IBNAL_CONN_ZOMBIE:
+ kibnal_destroy_conn (conn);
break;
default:
LBUG();
}
- spin_lock_irqsave (&koibnal_data.koib_connd_lock, flags);
+ spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
continue;
}
- if (!list_empty (&koibnal_data.koib_connd_peers)) {
- peer = list_entry (koibnal_data.koib_connd_peers.next,
- koib_peer_t, ibp_connd_list);
+ if (!list_empty (&kibnal_data.kib_connd_peers)) {
+ peer = list_entry (kibnal_data.kib_connd_peers.next,
+ kib_peer_t, ibp_connd_list);
list_del_init (&peer->ibp_connd_list);
- spin_unlock_irqrestore (&koibnal_data.koib_connd_lock, flags);
+ spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
- koibnal_connect_peer (peer);
- koibnal_put_peer (peer);
+ kibnal_connect_peer (peer);
+ kibnal_put_peer (peer);
- spin_lock_irqsave (&koibnal_data.koib_connd_lock, flags);
+ spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
}
/* shut down and nobody left to reap... */
- if (koibnal_data.koib_shutdown &&
- atomic_read(&koibnal_data.koib_nconns) == 0)
+ if (kibnal_data.kib_shutdown &&
+ atomic_read(&kibnal_data.kib_nconns) == 0)
break;
- spin_unlock_irqrestore (&koibnal_data.koib_connd_lock, flags);
+ spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
/* careful with the jiffy wrap... */
while ((timeout = (int)(deadline - jiffies)) <= 0) {
const int n = 4;
const int p = 1;
- int chunk = koibnal_data.koib_peer_hash_size;
+ int chunk = kibnal_data.kib_peer_hash_size;
/* Time to check for RDMA timeouts on a few more
* peers: I do checks every 'p' seconds on a
* connection within (n+1)/n times the timeout
* interval. */
- if (koibnal_tunables.koib_io_timeout > n * p)
+ if (kibnal_tunables.kib_io_timeout > n * p)
chunk = (chunk * n * p) /
- koibnal_tunables.koib_io_timeout;
+ kibnal_tunables.kib_io_timeout;
if (chunk == 0)
chunk = 1;
for (i = 0; i < chunk; i++) {
- koibnal_check_conns (peer_index);
+ kibnal_check_conns (peer_index);
peer_index = (peer_index + 1) %
- koibnal_data.koib_peer_hash_size;
+ kibnal_data.kib_peer_hash_size;
}
deadline += p * HZ;
}
- koibnal_data.koib_connd_waketime = jiffies + timeout;
+ kibnal_data.kib_connd_waketime = jiffies + timeout;
set_current_state (TASK_INTERRUPTIBLE);
- add_wait_queue (&koibnal_data.koib_connd_waitq, &wait);
+ add_wait_queue (&kibnal_data.kib_connd_waitq, &wait);
- if (!koibnal_data.koib_shutdown &&
- list_empty (&koibnal_data.koib_connd_conns) &&
- list_empty (&koibnal_data.koib_connd_peers))
+ if (!kibnal_data.kib_shutdown &&
+ list_empty (&kibnal_data.kib_connd_conns) &&
+ list_empty (&kibnal_data.kib_connd_peers))
schedule_timeout (timeout);
set_current_state (TASK_RUNNING);
- remove_wait_queue (&koibnal_data.koib_connd_waitq, &wait);
+ remove_wait_queue (&kibnal_data.kib_connd_waitq, &wait);
- spin_lock_irqsave (&koibnal_data.koib_connd_lock, flags);
+ spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
}
- spin_unlock_irqrestore (&koibnal_data.koib_connd_lock, flags);
+ spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
- koibnal_thread_fini ();
+ kibnal_thread_fini ();
return (0);
}
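To make the timeout-scan arithmetic in the loop above concrete: every p-second wakeup checks chunk = hash_size * n * p / kib_io_timeout peer buckets, so a full sweep of the table takes roughly (hash_size / chunk) * p = kib_io_timeout / n seconds. Taking illustrative values (not from this patch) of a 101-bucket hash, n = 4, p = 1 and a 60-second timeout, chunk works out to 6 buckets per wakeup and a complete sweep to about 17 seconds, so a timed-out RDMA is noticed at most ~timeout/n after its deadline, i.e. within the (n+1)/n times the timeout interval that the comment promises.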
int
-koibnal_scheduler(void *arg)
+kibnal_scheduler(void *arg)
{
long id = (long)arg;
char name[16];
- koib_rx_t *rx;
- koib_tx_t *tx;
+ kib_rx_t *rx;
+ kib_tx_t *tx;
unsigned long flags;
int rc;
int counter = 0;
int did_something;
- snprintf(name, sizeof(name), "koibnal_sd_%02ld", id);
+ snprintf(name, sizeof(name), "kibnal_sd_%02ld", id);
kportal_daemonize(name);
kportal_blockallsigs();
- spin_lock_irqsave(&koibnal_data.koib_sched_lock, flags);
+ spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
for (;;) {
did_something = 0;
- while (!list_empty(&koibnal_data.koib_sched_txq)) {
- tx = list_entry(koibnal_data.koib_sched_txq.next,
- koib_tx_t, tx_list);
+ while (!list_empty(&kibnal_data.kib_sched_txq)) {
+ tx = list_entry(kibnal_data.kib_sched_txq.next,
+ kib_tx_t, tx_list);
list_del(&tx->tx_list);
- spin_unlock_irqrestore(&koibnal_data.koib_sched_lock,
+ spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
flags);
- koibnal_tx_done(tx);
+ kibnal_tx_done(tx);
- spin_lock_irqsave(&koibnal_data.koib_sched_lock,
+ spin_lock_irqsave(&kibnal_data.kib_sched_lock,
flags);
}
- if (!list_empty(&koibnal_data.koib_sched_rxq)) {
- rx = list_entry(koibnal_data.koib_sched_rxq.next,
- koib_rx_t, rx_list);
+ if (!list_empty(&kibnal_data.kib_sched_rxq)) {
+ rx = list_entry(kibnal_data.kib_sched_rxq.next,
+ kib_rx_t, rx_list);
list_del(&rx->rx_list);
- spin_unlock_irqrestore(&koibnal_data.koib_sched_lock,
+ spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
flags);
- koibnal_rx(rx);
+ kibnal_rx(rx);
did_something = 1;
- spin_lock_irqsave(&koibnal_data.koib_sched_lock,
+ spin_lock_irqsave(&kibnal_data.kib_sched_lock,
flags);
}
/* shut down and no receives to complete... */
- if (koibnal_data.koib_shutdown &&
- atomic_read(&koibnal_data.koib_nconns) == 0)
+ if (kibnal_data.kib_shutdown &&
+ atomic_read(&kibnal_data.kib_nconns) == 0)
break;
/* nothing to do or hogging CPU */
- if (!did_something || counter++ == OPENIBNAL_RESCHED) {
- spin_unlock_irqrestore(&koibnal_data.koib_sched_lock,
+ if (!did_something || counter++ == IBNAL_RESCHED) {
+ spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
flags);
counter = 0;
if (!did_something) {
rc = wait_event_interruptible(
- koibnal_data.koib_sched_waitq,
- !list_empty(&koibnal_data.koib_sched_txq) ||
- !list_empty(&koibnal_data.koib_sched_rxq) ||
- (koibnal_data.koib_shutdown &&
- atomic_read (&koibnal_data.koib_nconns) == 0));
+ kibnal_data.kib_sched_waitq,
+ !list_empty(&kibnal_data.kib_sched_txq) ||
+ !list_empty(&kibnal_data.kib_sched_rxq) ||
+ (kibnal_data.kib_shutdown &&
+ atomic_read (&kibnal_data.kib_nconns) == 0));
} else {
our_cond_resched();
}
- spin_lock_irqsave(&koibnal_data.koib_sched_lock,
+ spin_lock_irqsave(&kibnal_data.kib_sched_lock,
flags);
}
}
- spin_unlock_irqrestore(&koibnal_data.koib_sched_lock, flags);
+ spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags);
- koibnal_thread_fini();
+ kibnal_thread_fini();
return (0);
}
-lib_nal_t koibnal_lib = {
- libnal_data: &koibnal_data, /* NAL private data */
- libnal_send: koibnal_send,
- libnal_send_pages: koibnal_send_pages,
- libnal_recv: koibnal_recv,
- libnal_recv_pages: koibnal_recv_pages,
- libnal_dist: koibnal_dist
+lib_nal_t kibnal_lib = {
+ libnal_data: &kibnal_data, /* NAL private data */
+ libnal_send: kibnal_send,
+ libnal_send_pages: kibnal_send_pages,
+ libnal_recv: kibnal_recv,
+ libnal_recv_pages: kibnal_recv_pages,
+ libnal_dist: kibnal_dist
};
#define QSWNAL_SYSCTL 201
#define QSWNAL_SYSCTL_OPTIMIZED_GETS 1
-#define QSWNAL_SYSCTL_COPY_SMALL_FWD 2
+#define QSWNAL_SYSCTL_OPTIMIZED_PUTS 2
static ctl_table kqswnal_ctl_table[] = {
- {QSWNAL_SYSCTL_OPTIMIZED_GETS, "optimized_puts",
+ {QSWNAL_SYSCTL_OPTIMIZED_PUTS, "optimized_puts",
&kqswnal_tunables.kqn_optimized_puts, sizeof (int),
0644, NULL, &proc_dointvec},
{QSWNAL_SYSCTL_OPTIMIZED_GETS, "optimized_gets",
kqswnal_shutdown(nal_t *nal)
{
unsigned long flags;
+ kqswnal_tx_t *ktx;
+ kqswnal_rx_t *krx;
int do_lib_fini = 0;
/* NB The first ref was this module! */
* ep_dvma_release() get fixed (and releases any mappings in the
* region), we can delete all the code from here --------> */
- if (kqswnal_data.kqn_txds != NULL) {
- int i;
+ for (ktx = kqswnal_data.kqn_txds; ktx != NULL; ktx = ktx->ktx_alloclist) {
+ /* If ktx has a buffer, it got mapped; unmap now. NB only
+ * the pre-mapped stuff is still mapped since all tx descs
+ * must be idle */
- for (i = 0; i < KQSW_NTXMSGS + KQSW_NNBLK_TXMSGS; i++) {
- kqswnal_tx_t *ktx = &kqswnal_data.kqn_txds[i];
-
- /* If ktx has a buffer, it got mapped; unmap now.
- * NB only the pre-mapped stuff is still mapped
- * since all tx descs must be idle */
-
- if (ktx->ktx_buffer != NULL)
- ep_dvma_unload(kqswnal_data.kqn_ep,
- kqswnal_data.kqn_ep_tx_nmh,
- &ktx->ktx_ebuffer);
- }
+ if (ktx->ktx_buffer != NULL)
+ ep_dvma_unload(kqswnal_data.kqn_ep,
+ kqswnal_data.kqn_ep_tx_nmh,
+ &ktx->ktx_ebuffer);
}
- if (kqswnal_data.kqn_rxds != NULL) {
- int i;
-
- for (i = 0; i < KQSW_NRXMSGS_SMALL + KQSW_NRXMSGS_LARGE; i++) {
- kqswnal_rx_t *krx = &kqswnal_data.kqn_rxds[i];
-
- /* If krx_kiov[0].kiov_page got allocated, it got mapped.
- * NB subsequent pages get merged */
+ for (krx = kqswnal_data.kqn_rxds; krx != NULL; krx = krx->krx_alloclist) {
+ /* If krx_kiov[0].kiov_page got allocated, it got mapped.
+ * NB subsequent pages get merged */
- if (krx->krx_kiov[0].kiov_page != NULL)
- ep_dvma_unload(kqswnal_data.kqn_ep,
- kqswnal_data.kqn_ep_rx_nmh,
- &krx->krx_elanbuffer);
- }
+ if (krx->krx_kiov[0].kiov_page != NULL)
+ ep_dvma_unload(kqswnal_data.kqn_ep,
+ kqswnal_data.kqn_ep_rx_nmh,
+ &krx->krx_elanbuffer);
}
/* <----------- to here */
}
#endif
- if (kqswnal_data.kqn_txds != NULL)
- {
- int i;
+ while (kqswnal_data.kqn_txds != NULL) {
+ ktx = kqswnal_data.kqn_txds;
- for (i = 0; i < KQSW_NTXMSGS + KQSW_NNBLK_TXMSGS; i++)
- {
- kqswnal_tx_t *ktx = &kqswnal_data.kqn_txds[i];
-
- if (ktx->ktx_buffer != NULL)
- PORTAL_FREE(ktx->ktx_buffer,
- KQSW_TX_BUFFER_SIZE);
- }
+ if (ktx->ktx_buffer != NULL)
+ PORTAL_FREE(ktx->ktx_buffer, KQSW_TX_BUFFER_SIZE);
- PORTAL_FREE(kqswnal_data.kqn_txds,
- sizeof (kqswnal_tx_t) * (KQSW_NTXMSGS +
- KQSW_NNBLK_TXMSGS));
+ kqswnal_data.kqn_txds = ktx->ktx_alloclist;
+ PORTAL_FREE(ktx, sizeof(*ktx));
}
- if (kqswnal_data.kqn_rxds != NULL)
- {
- int i;
- int j;
+ while (kqswnal_data.kqn_rxds != NULL) {
+ int i;
- for (i = 0; i < KQSW_NRXMSGS_SMALL + KQSW_NRXMSGS_LARGE; i++)
- {
- kqswnal_rx_t *krx = &kqswnal_data.kqn_rxds[i];
+ krx = kqswnal_data.kqn_rxds;
+ for (i = 0; i < krx->krx_npages; i++)
+ if (krx->krx_kiov[i].kiov_page != NULL)
+ __free_page (krx->krx_kiov[i].kiov_page);
- for (j = 0; j < krx->krx_npages; j++)
- if (krx->krx_kiov[j].kiov_page != NULL)
- __free_page (krx->krx_kiov[j].kiov_page);
- }
-
- PORTAL_FREE(kqswnal_data.kqn_rxds,
- sizeof(kqswnal_rx_t) * (KQSW_NRXMSGS_SMALL +
- KQSW_NRXMSGS_LARGE));
+ kqswnal_data.kqn_rxds = krx->krx_alloclist;
+ PORTAL_FREE(krx, sizeof (*krx));
}
/* resets flags, pointers to NULL etc */
#endif
int rc;
int i;
+ kqswnal_rx_t *krx;
+ kqswnal_tx_t *ktx;
int elan_page_idx;
ptl_process_id_t my_process_id;
int pkmem = atomic_read(&portal_kmemory);
/**********************************************************************/
/* Allocate/Initialise transmit descriptors */
- PORTAL_ALLOC(kqswnal_data.kqn_txds,
- sizeof(kqswnal_tx_t) * (KQSW_NTXMSGS + KQSW_NNBLK_TXMSGS));
- if (kqswnal_data.kqn_txds == NULL)
- {
- kqswnal_shutdown (nal);
- return (PTL_NO_SPACE);
- }
-
- /* clear flags, null pointers etc */
- memset(kqswnal_data.kqn_txds, 0,
- sizeof(kqswnal_tx_t) * (KQSW_NTXMSGS + KQSW_NNBLK_TXMSGS));
+ kqswnal_data.kqn_txds = NULL;
for (i = 0; i < (KQSW_NTXMSGS + KQSW_NNBLK_TXMSGS); i++)
{
int premapped_pages;
- kqswnal_tx_t *ktx = &kqswnal_data.kqn_txds[i];
int basepage = i * KQSW_NTXMSGPAGES;
+ PORTAL_ALLOC (ktx, sizeof(*ktx));
+ if (ktx == NULL) {
+ kqswnal_shutdown (nal);
+ return (PTL_NO_SPACE);
+ }
+
+ memset(ktx, 0, sizeof(*ktx)); /* NULL pointers; zero flags */
+ ktx->ktx_alloclist = kqswnal_data.kqn_txds;
+ kqswnal_data.kqn_txds = ktx;
+
PORTAL_ALLOC (ktx->ktx_buffer, KQSW_TX_BUFFER_SIZE);
if (ktx->ktx_buffer == NULL)
{
/**********************************************************************/
/* Allocate/Initialise receive descriptors */
-
- PORTAL_ALLOC (kqswnal_data.kqn_rxds,
- sizeof (kqswnal_rx_t) * (KQSW_NRXMSGS_SMALL + KQSW_NRXMSGS_LARGE));
- if (kqswnal_data.kqn_rxds == NULL)
- {
- kqswnal_shutdown (nal);
- return (PTL_NO_SPACE);
- }
-
- memset(kqswnal_data.kqn_rxds, 0, /* clear flags, null pointers etc */
- sizeof(kqswnal_rx_t) * (KQSW_NRXMSGS_SMALL+KQSW_NRXMSGS_LARGE));
-
+ kqswnal_data.kqn_rxds = NULL;
elan_page_idx = 0;
for (i = 0; i < KQSW_NRXMSGS_SMALL + KQSW_NRXMSGS_LARGE; i++)
{
E3_Addr elanbuffer;
#endif
int j;
- kqswnal_rx_t *krx = &kqswnal_data.kqn_rxds[i];
+
+ PORTAL_ALLOC(krx, sizeof(*krx));
+ if (krx == NULL) {
+ kqswnal_shutdown(nal);
+ return (PTL_NO_SPACE);
+ }
+
+ memset(krx, 0, sizeof(*krx)); /* clear flags, null pointers etc */
+ krx->krx_alloclist = kqswnal_data.kqn_rxds;
+ kqswnal_data.kqn_rxds = krx;
if (i < KQSW_NRXMSGS_SMALL)
{
/**********************************************************************/
/* Queue receives, now that it's OK to run their completion callbacks */
- for (i = 0; i < KQSW_NRXMSGS_SMALL + KQSW_NRXMSGS_LARGE; i++)
- {
- kqswnal_rx_t *krx = &kqswnal_data.kqn_rxds[i];
-
+ for (krx = kqswnal_data.kqn_rxds; krx != NULL; krx = krx->krx_alloclist) {
/* NB this enqueue can allocate/sleep (attr == 0) */
krx->krx_state = KRX_POSTED;
#if MULTIRAIL_EKC
#define KQSW_TX_MAXCONTIG (1<<10) /* largest payload that gets made contiguous on transmit */
#define KQSW_NTXMSGS 8 /* # normal transmit messages */
-#define KQSW_NNBLK_TXMSGS 256 /* # reserved transmit messages if can't block */
+#define KQSW_NNBLK_TXMSGS 512 /* # reserved transmit messages if can't block */
#define KQSW_NRXMSGS_LARGE 64 /* # large receive buffers */
-#define KQSW_EP_ENVELOPES_LARGE 128 /* # large ep envelopes */
+#define KQSW_EP_ENVELOPES_LARGE 256 /* # large ep envelopes */
#define KQSW_NRXMSGS_SMALL 256 /* # small receive buffers */
#define KQSW_EP_ENVELOPES_SMALL 2048 /* # small ep envelopes */
#endif
} kqswnal_remotemd_t;
-typedef struct
+typedef struct kqswnal_rx
{
struct list_head krx_list; /* enqueue -> thread */
+ struct kqswnal_rx *krx_alloclist; /* stack in kqn_rxds */
EP_RCVR *krx_eprx; /* port to post receives to */
EP_RXD *krx_rxd; /* receive descriptor (for repost) */
#if MULTIRAIL_EKC
#define KRX_COMPLETING 3 /* waiting to be completed */
-typedef struct
+typedef struct kqswnal_tx
{
struct list_head ktx_list; /* enqueue idle/active */
struct list_head ktx_delayed_list; /* enqueue delayedtxds */
+ struct kqswnal_tx *ktx_alloclist; /* stack in kqn_txds */
unsigned int ktx_isnblk:1; /* reserved descriptor? */
unsigned int ktx_state:7; /* What I'm doing */
unsigned int ktx_firsttmpfrag:1; /* ktx_frags[0] is in my ebuffer ? 0 : 1 */
char kqn_shuttingdown; /* I'm trying to shut down */
atomic_t kqn_nthreads; /* # threads running */
- kqswnal_rx_t *kqn_rxds; /* all the receive descriptors */
- kqswnal_tx_t *kqn_txds; /* all the transmit descriptors */
+ kqswnal_rx_t *kqn_rxds; /* stack of all the receive descriptors */
+ kqswnal_tx_t *kqn_txds; /* stack of all the transmit descriptors */
struct list_head kqn_idletxds; /* transmit descriptors free to use */
struct list_head kqn_nblk_idletxds; /* reserved free transmit descriptors */
}
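The descriptor changes above replace the old contiguous kqn_txds/kqn_rxds arrays with singly linked stacks threaded through the new *_alloclist fields, so shutdown can unwind however many descriptors startup actually managed to allocate. A minimal standalone sketch of the pattern, using hypothetical desc_t names rather than the kqswnal types:

    #include <stdlib.h>

    typedef struct desc {
            struct desc *d_alloclist;       /* stack link, newest first */
            /* ... payload fields ... */
    } desc_t;

    static desc_t *all_descs;               /* head of allocation stack */

    /* Allocate one descriptor and push it onto the stack; on a later
     * failure the stack already records everything to unwind. */
    static desc_t *desc_alloc(void)
    {
            desc_t *d = calloc(1, sizeof(*d));

            if (d != NULL) {
                    d->d_alloclist = all_descs;
                    all_descs = d;
            }
            return d;
    }

    /* Tear down from any intermediate state: pop until empty. */
    static void desc_free_all(void)
    {
            while (all_descs != NULL) {
                    desc_t *d = all_descs;

                    all_descs = d->d_alloclist;
                    free(d);
            }
    }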
kscimacnal_data.ksci_nid = (ptl_nid_t)(ntohl(mac_physaddr));
- process_id.pid = requested_pid;
+ process_id.pid = 0;
process_id.nid = kscimacnal_data.ksci_nid;
CDEBUG(D_NET, "calling lib_init with nid "LPX64"\n",
conn2->ksnc_type != conn->ksnc_type ||
conn2->ksnc_incarnation != incarnation)
continue;
-
+
CWARN("Not creating duplicate connection to "
- "%u.%u.%u.%u type %d\n",
+ "%u.%u.%u.%u type %d\n",
HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_type);
rc = -EALREADY;
goto failed_2;
break;
}
+ /* Give conn a ref on sock->file since we're going to return success */
+ get_file(sock->file);
+
conn->ksnc_peer = peer; /* conn takes my ref on peer */
conn->ksnc_incarnation = incarnation;
peer->ksnp_last_alive = jiffies;
ksocknal_putconnsock(conn);
}
- CWARN("New conn nid:"LPX64" [type:%d] %u.%u.%u.%u -> %u.%u.%u.%u/%d"
+ CWARN("New conn nid:"LPX64" %u.%u.%u.%u -> %u.%u.%u.%u/%d"
" incarnation:"LPX64" sched[%d]/%d\n",
- nid, conn->ksnc_type, HIPQUAD(conn->ksnc_myipaddr),
+ nid, HIPQUAD(conn->ksnc_myipaddr),
HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port, incarnation,
(int)(conn->ksnc_scheduler - ksocknal_data.ksnd_schedulers), irq);
rc = -EINVAL;
break;
}
- if (rc != 0)
- fput (sock->file);
+ fput (sock->file);
break;
}
case NAL_CMD_CLOSE_CONNECTION: {
#include <portals/lib-p30.h>
#include <portals/nal.h>
#include <portals/socknal.h>
-#include <linux/lustre_idl.h>
-#include <linux/lustre_idl.h>
#define SOCKNAL_N_AUTOCONNECTD 4 /* # socknal autoconnect daemons */
#define SOCKNAL_MIN_RECONNECT_INTERVAL HZ /* first failed connection retry... */
return (0);
}
-int
-ksocknal_connect_peer (ksock_route_t *route, int type)
+static int
+ksocknal_connect_sock(struct socket **sockp, int *may_retry,
+ ksock_route_t *route, int local_port)
{
- struct sockaddr_in ipaddr;
- mm_segment_t oldmm = get_fs();
- struct timeval tv;
- int fd;
+ struct sockaddr_in locaddr;
+ struct sockaddr_in srvaddr;
struct socket *sock;
int rc;
-
+ int option;
+ mm_segment_t oldmm = get_fs();
+ struct timeval tv;
+
+ memset(&locaddr, 0, sizeof(locaddr));
+ locaddr.sin_family = AF_INET;
+ locaddr.sin_port = htons(local_port);
+ locaddr.sin_addr.s_addr =
+ (route->ksnr_myipaddr != 0) ? htonl(route->ksnr_myipaddr)
+ : INADDR_ANY;
+
+ memset (&srvaddr, 0, sizeof (srvaddr));
+ srvaddr.sin_family = AF_INET;
+ srvaddr.sin_port = htons (route->ksnr_port);
+ srvaddr.sin_addr.s_addr = htonl (route->ksnr_ipaddr);
+
+ *may_retry = 0;
+
rc = sock_create (PF_INET, SOCK_STREAM, 0, &sock);
+ *sockp = sock;
if (rc != 0) {
CERROR ("Can't create autoconnect socket: %d\n", rc);
return (rc);
* from userspace. And we actually need the sock->file refcounting
* that this gives you :) */
- fd = sock_map_fd (sock);
- if (fd < 0) {
+ rc = sock_map_fd (sock);
+ if (rc < 0) {
sock_release (sock);
- CERROR ("sock_map_fd error %d\n", fd);
- return (fd);
+ CERROR ("sock_map_fd error %d\n", rc);
+ return (rc);
}
- /* NB the fd now owns the ref on sock->file */
+ /* NB the file descriptor (rc) now owns the ref on sock->file */
LASSERT (sock->file != NULL);
LASSERT (file_count(sock->file) == 1);
+ get_file(sock->file); /* extra ref makes sock->file */
+ sys_close(rc); /* survive this close */
+
+ /* Still got a single ref on sock->file */
+ LASSERT (file_count(sock->file) == 1);
+
/* Set the socket timeouts, so our connection attempt completes in
* finite time */
tv.tv_sec = ksocknal_tunables.ksnd_io_timeout;
if (rc != 0) {
CERROR ("Can't set send timeout %d: %d\n",
ksocknal_tunables.ksnd_io_timeout, rc);
- goto out;
+ goto failed;
}
set_fs (KERNEL_DS);
if (rc != 0) {
CERROR ("Can't set receive timeout %d: %d\n",
ksocknal_tunables.ksnd_io_timeout, rc);
- goto out;
+ goto failed;
}
- if (route->ksnr_myipaddr != 0) {
- /* Bind to the local IP address */
- memset (&ipaddr, 0, sizeof (ipaddr));
- ipaddr.sin_family = AF_INET;
- ipaddr.sin_port = htons (0); /* ANY */
- ipaddr.sin_addr.s_addr = htonl(route->ksnr_myipaddr);
+ set_fs (KERNEL_DS);
+ option = 1;
+ rc = sock_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR,
+ (char *)&option, sizeof (option));
+ set_fs (oldmm);
+ if (rc != 0) {
+ CERROR("Can't set SO_REUSEADDR for socket: %d\n", rc);
+ goto failed;
+ }
- rc = sock->ops->bind (sock, (struct sockaddr *)&ipaddr,
- sizeof (ipaddr));
- if (rc != 0) {
- CERROR ("Can't bind to local IP %u.%u.%u.%u: %d\n",
- HIPQUAD(route->ksnr_myipaddr), rc);
- goto out;
- }
+ rc = sock->ops->bind(sock,
+ (struct sockaddr *)&locaddr, sizeof(locaddr));
+ if (rc == -EADDRINUSE) {
+ CDEBUG(D_NET, "Port %d already in use\n", local_port);
+ *may_retry = 1;
+ goto failed;
}
-
- memset (&ipaddr, 0, sizeof (ipaddr));
- ipaddr.sin_family = AF_INET;
- ipaddr.sin_port = htons (route->ksnr_port);
- ipaddr.sin_addr.s_addr = htonl (route->ksnr_ipaddr);
-
- rc = sock->ops->connect (sock, (struct sockaddr *)&ipaddr,
- sizeof (ipaddr), sock->file->f_flags);
if (rc != 0) {
- CERROR ("Can't connect to nid "LPX64
- " local IP: %u.%u.%u.%u,"
- " remote IP: %u.%u.%u.%u/%d: %d\n",
- route->ksnr_peer->ksnp_nid,
- HIPQUAD(route->ksnr_myipaddr),
- HIPQUAD(route->ksnr_ipaddr),
- route->ksnr_port, rc);
- goto out;
+ CERROR("Error trying to bind to reserved port %d: %d\n",
+ local_port, rc);
+ goto failed;
}
- rc = ksocknal_create_conn (route, sock, type);
- if (rc == 0) {
- /* Take an extra ref on sock->file to compensate for the
- * upcoming close which will lose fd's ref on it. */
- get_file (sock->file);
+ rc = sock->ops->connect(sock,
+ (struct sockaddr *)&srvaddr, sizeof(srvaddr),
+ sock->file->f_flags);
+ if (rc == 0)
+ return 0;
+
+ /* EADDRNOTAVAIL probably means we're already connected to the same
+ * peer/port on the same local port on a differently typed
+ * connection. Let our caller retry with a different local
+ * port... */
+ *may_retry = (rc == -EADDRNOTAVAIL);
+
+ CDEBUG(*may_retry ? D_NET : D_ERROR,
+ "Error %d connecting %u.%u.%u.%u/%d -> %u.%u.%u.%u/%d\n", rc,
+ HIPQUAD(route->ksnr_myipaddr), local_port,
+ HIPQUAD(route->ksnr_ipaddr), route->ksnr_port);
+
+ failed:
+ fput(sock->file);
+ return rc;
+}
+
+int
+ksocknal_connect_peer (ksock_route_t *route, int type)
+{
+ struct socket *sock;
+ int rc;
+ int port;
+ int may_retry;
+
+ /* Iterate through reserved ports. When typed connections are
+ * used, we will need to bind to multiple ports, but we only know
+ * this at connect time. But, by that time we've already called
+ * bind() so we need a new socket. */
+
+ for (port = 1023; port > 512; --port) {
+
+ rc = ksocknal_connect_sock(&sock, &may_retry, route, port);
+
+ if (rc == 0) {
+ rc = ksocknal_create_conn(route, sock, type);
+ fput(sock->file);
+ return rc;
+ }
+
+ if (!may_retry)
+ return rc;
}
- out:
- sys_close (fd);
- return (rc);
+ CERROR("Out of ports trying to bind to a reserved port\n");
+ return (-EADDRINUSE);
}
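Walking the bind attempts down from 1023 is deliberate: ports below IPPORT_RESERVED (1024) can only be bound with root privilege (CAP_NET_BIND_SERVICE on Linux), so the passive side can treat a connection arriving from a reserved source port as coming from a privileged peer; the matching acceptor-side check is added later in this patch.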
void
LASSERT (type < SOCKNAL_CONN_NTYPES);
rc = ksocknal_connect_peer (route, type);
-
if (rc != 0)
break;
#endif
unsigned int portal_subsystem_debug = ~0 - (S_PORTALS | S_QSWNAL | S_SOCKNAL |
- S_GMNAL | S_OPENIBNAL);
+ S_GMNAL | S_IBNAL);
EXPORT_SYMBOL(portal_subsystem_debug);
unsigned int portal_debug = (D_WARNING | D_DLMTRACE | D_ERROR | D_EMERG | D_HA |
snprintf(debug_file_name, sizeof(debug_file_path) - 1,
"%s.%ld.%ld", debug_file_path, CURRENT_SECONDS, (long)arg);
+ printk(KERN_ALERT "LustreError: dumping log to %s\n", debug_file_name);
tracefile_dump_all_pages(debug_file_name);
current->journal_info = journal_info;
int portals_debug_mark_buffer(char *text)
{
CDEBUG(D_TRACE,"***************************************************\n");
- CWARN("DEBUG MARKER: %s\n", text);
+ CDEBUG(D_WARNING, "DEBUG MARKER: %s\n", text);
CDEBUG(D_TRACE,"***************************************************\n");
return 0;
char *portals_nid2str(int nal, ptl_nid_t nid, char *str)
{
if (nid == PTL_NID_ANY) {
- snprintf(str, PTL_NALFMT_SIZE - 1, "%s",
- "PTL_NID_ANY");
+ snprintf(str, PTL_NALFMT_SIZE, "%s", "PTL_NID_ANY");
return str;
}
switch(nal){
/* XXX this could be a nal method of some sort, 'cept it's config
* dependent whether (say) socknal NIDs are actually IP addresses... */
-#ifndef CRAY_PORTALS
+#if !CRAY_PORTALS
case TCPNAL:
/* userspace NAL */
+ case IIBNAL:
case OPENIBNAL:
case SOCKNAL:
- snprintf(str, PTL_NALFMT_SIZE - 1, "%u:%u.%u.%u.%u",
+ snprintf(str, PTL_NALFMT_SIZE, "%u:%u.%u.%u.%u",
(__u32)(nid >> 32), HIPQUAD(nid));
break;
case QSWNAL:
case GMNAL:
- snprintf(str, PTL_NALFMT_SIZE - 1, "%u:%u",
+ snprintf(str, PTL_NALFMT_SIZE, "%u:%u",
(__u32)(nid >> 32), (__u32)nid);
break;
#endif
default:
- snprintf(str, PTL_NALFMT_SIZE - 1, "?%d? %llx",
+ snprintf(str, PTL_NALFMT_SIZE, "?%x? %llx",
nal, (long long)nid);
break;
}
return str;
}
-/* bug #4615 */
+
char *portals_id2str(int nal, ptl_process_id_t id, char *str)
{
- switch(nal){
-#ifndef CRAY_PORTALS
- case TCPNAL:
- /* userspace NAL */
- case OPENIBNAL:
- case SOCKNAL:
- snprintf(str, PTL_NALFMT_SIZE - 1, "%u:%u.%u.%u.%u,%u",
- (__u32)(id.nid >> 32), HIPQUAD((id.nid)) , id.pid);
- break;
- case QSWNAL:
- case GMNAL:
- snprintf(str, PTL_NALFMT_SIZE - 1, "%u:%u,%u",
- (__u32)(id.nid >> 32), (__u32)id.nid, id.pid);
- break;
-#endif
- default:
- snprintf(str, PTL_NALFMT_SIZE - 1, "?%d? %llx,%lx",
- nal, (long long)id.nid, (long)id.pid );
- break;
- }
+ int len;
+
+ portals_nid2str(nal, id.nid, str);
+ len = strlen(str);
+ snprintf(str + len, PTL_NALFMT_SIZE, "-%u", id.pid);
return str;
}
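As a concrete example of the new composition: a socknal id with nid (3ULL << 32) | 10.0.0.1 and pid 1234 now renders as "3:10.0.0.1-1234"; the pid suffix is appended uniformly instead of being re-formatted inside every per-NAL case as before.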
-
#ifdef __KERNEL__
char stack_backtrace[LUSTRE_TRACE_SIZE];
spinlock_t stack_backtrace_lock = SPIN_LOCK_UNLOCKED;
CDEBUG(D_IOCTL, "calling handler nal: %d, cmd: %d\n", nal,
pcfg->pcfg_command);
rc = cmd->nch_handler(pcfg, cmd->nch_private);
+ } else {
+ CERROR("invalid nal: %d, cmd: %d\n", nal, pcfg->pcfg_command);
}
up(&nal_cmd_sem);
portals_debug_mark_buffer(data->ioc_inlbuf1);
RETURN(0);
#if LWT_SUPPORT
- case IOC_PORTAL_LWT_CONTROL:
+ case IOC_PORTAL_LWT_CONTROL:
err = lwt_control (data->ioc_flags, data->ioc_misc);
break;
-
+
case IOC_PORTAL_LWT_SNAPSHOT: {
cycles_t now;
int ncpu;
int total_size;
-
+
err = lwt_snapshot (&now, &ncpu, &total_size,
data->ioc_pbuf1, data->ioc_plen1);
data->ioc_nid = now;
data->ioc_misc = total_size;
/* Hedge against broken user/kernel typedefs (e.g. cycles_t) */
- data->ioc_nid = sizeof(lwt_event_t);
- data->ioc_nid2 = offsetof(lwt_event_t, lwte_where);
+ data->ioc_nid2 = sizeof(lwt_event_t);
+ data->ioc_nid3 = offsetof(lwt_event_t, lwte_where);
if (err == 0 &&
copy_to_user((char *)arg, data, sizeof (*data)))
err = -EFAULT;
break;
}
-
+
case IOC_PORTAL_LWT_LOOKUP_STRING:
err = lwt_lookup_string (&data->ioc_count, data->ioc_pbuf1,
data->ioc_pbuf2, data->ioc_plen2);
break;
}
- if (copy_from_user(&pcfg, (void *)data->ioc_pbuf1,
+ if (copy_from_user(&pcfg, (void *)data->ioc_pbuf1,
sizeof(pcfg))) {
err = -EFAULT;
break;
err = libcfs_nal_cmd(&pcfg);
if (err == 0 &&
- copy_to_user((char *)data->ioc_pbuf1, &pcfg,
+ copy_to_user((char *)data->ioc_pbuf1, &pcfg,
sizeof (pcfg)))
err = -EFAULT;
break;
#include <linux/kp30.h>
#include <linux/portals_compat25.h>
-#include <linux/lustre_compat25.h>
#include <linux/libcfs.h>
#define TCD_MAX_PAGES 1280
prefix = "Lustre";
ptype = KERN_INFO;
}
-
+
printk("%s%s: %d:%d:(%s:%d:%s()) %.*s", ptype, prefix, hdr->ph_pid,
hdr->ph_extern_pid, file, hdr->ph_line_num, fn, len, buf);
}
if (IS_ERR(filp)) {
rc = PTR_ERR(filp);
printk(KERN_ERR "LustreError: can't open %s for dump: rc %d\n",
- filename, rc);
+ filename, rc);
goto out;
}
"(%lu).\n", max * smp_num_cpus, num_physpages / 5 * 4);
return count;
}
+
for (i = 0; i < NR_CPUS; i++) {
struct trace_cpu_data *tcd;
tcd = &trace_data[i].tcd;
me->match_id.nid != src_nid)
continue;
- CDEBUG(D_NET,"match_id.pid [%x], src_pid [%x]\n", me->match_id.pid, src_pid);
+ CDEBUG(D_NET, "match_id.pid [%x], src_pid [%x]\n",
+ me->match_id.pid, src_pid);
if (me->match_id.pid != PTL_PID_ANY &&
me->match_id.pid != src_pid)
CDEBUG (D_IOCTL, "Getting nid for nal [%d]\n", data->ioc_nal);
- err = PtlNIInit(data->ioc_nal, LUSTRE_SRV_PTL_PID, NULL, NULL, &nih);
+ err = PtlNIInit(data->ioc_nal, LUSTRE_SRV_PTL_PID, NULL,
+ NULL, &nih);
if (!(err == PTL_OK || err == PTL_IFACE_DUP))
RETURN (-EINVAL);
CDEBUG (D_IOCTL, "fail nid: [%d] "LPU64" count %d\n",
data->ioc_nal, data->ioc_nid, data->ioc_count);
- err = PtlNIInit(data->ioc_nal, LUSTRE_SRV_PTL_PID, NULL, NULL, &nih);
+ err = PtlNIInit(data->ioc_nal, LUSTRE_SRV_PTL_PID, NULL,
+ NULL, &nih);
if (!(err == PTL_OK || err == PTL_IFACE_DUP))
return (-EINVAL);
*start = page + prd->skip;
user_len = -prd->skip;
- for (; prd->curr != &kpr_routes; prd->curr = prd->curr->next) {
+ while ((prd->curr != NULL) && (prd->curr != &kpr_routes)) {
re = list_entry(prd->curr, kpr_route_entry_t, kpre_list);
ge = re->kpre_gateway;
chunk_len += line_len;
user_len += line_len;
- /* The route table will exceed one page */
- if ((chunk_len > (PAGE_SIZE - 80)) || (user_len > count)) {
- prd->curr = prd->curr->next;
- break;
+ /* Abort if the route list changed */
+ if (prd->curr->next == NULL) {
+ prd->curr = NULL;
+ read_unlock(&kpr_rwlock);
+ return sprintf(page, "\nError: Routes Changed\n");
}
+
+ prd->curr = prd->curr->next;
+
+ /* The route table will exceed one page, break the while loop
+ * so the function can be re-called with a new page.
+ */
+ if ((chunk_len > (PAGE_SIZE - 80)) || (user_len > count))
+ break;
}
*eof = 0;
{
connection conn;
struct sockaddr_in addr;
+ struct sockaddr_in locaddr;
unsigned int id[2];
struct timeval tv;
__u64 incarnation;
+ int fd;
+ int option;
+ int rc;
+ int rport;
+ ptl_nid_t peernid = PTL_NID_ANY;
+
port = tcpnal_acceptor_port;
id[0] = ip;
pthread_mutex_lock(&m->conn_lock);
conn = hash_table_find(m->connections, id);
- if (!conn) {
- int fd;
- int option;
- ptl_nid_t peernid = PTL_NID_ANY;
-
- bzero((char *) &addr, sizeof(addr));
- addr.sin_family = AF_INET;
- addr.sin_addr.s_addr = htonl(ip);
- addr.sin_port = htons(port);
-
- if ((fd = socket(AF_INET, SOCK_STREAM, 0)) < 0) {
- perror("tcpnal socket failed");
- exit(-1);
- }
- if (connect(fd, (struct sockaddr *)&addr,
- sizeof(struct sockaddr_in))) {
- perror("tcpnal connect");
- return(0);
- }
+ if (conn)
+ goto out;
+ memset(&addr, 0, sizeof(addr));
+ addr.sin_family = AF_INET;
+ addr.sin_addr.s_addr = htonl(ip);
+ addr.sin_port = htons(port);
+
+ memset(&locaddr, 0, sizeof(locaddr));
+ locaddr.sin_family = AF_INET;
+ locaddr.sin_addr.s_addr = INADDR_ANY;
+
+ for (rport = IPPORT_RESERVED - 1; rport > IPPORT_RESERVED / 2; --rport) {
+ fd = socket(AF_INET, SOCK_STREAM, 0);
+ if (fd < 0) {
+ perror("tcpnal socket failed");
+ goto out;
+ }
+
+ option = 1;
+ rc = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR,
+ &option, sizeof(option));
+ if (rc != 0) {
+ perror ("Can't set SO_REUSEADDR for socket");
+ close(fd);
+ goto out;
+ }
+
+ locaddr.sin_port = htons(rport);
+ rc = bind(fd, (struct sockaddr *)&locaddr, sizeof(locaddr));
+ if (rc == 0 || errno == EACCES) {
+ rc = connect(fd, (struct sockaddr *)&addr,
+ sizeof(struct sockaddr_in));
+ if (rc == 0) {
+ break;
+ } else if (errno != EADDRINUSE && errno != EADDRNOTAVAIL) {
+ perror("Error connecting to remote host");
+ close(fd);
+ goto out;
+ }
+ } else if (errno != EADDRINUSE) {
+ perror("Error binding to privileged port");
+ close(fd);
+ goto out;
+ }
+ close(fd);
+ }
+
+ if (rport == IPPORT_RESERVED / 2) {
+ fprintf(stderr, "Out of ports trying to bind to a reserved port\n");
+ goto out;
+ }
+
#if 1
- option = 1;
- setsockopt(fd, SOL_TCP, TCP_NODELAY, &option, sizeof(option));
- option = 1<<20;
- setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &option, sizeof(option));
- option = 1<<20;
- setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &option, sizeof(option));
+ option = 1;
+ setsockopt(fd, SOL_TCP, TCP_NODELAY, &option, sizeof(option));
+ option = 1<<20;
+ setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &option, sizeof(option));
+ option = 1<<20;
+ setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &option, sizeof(option));
#endif
- gettimeofday(&tv, NULL);
- incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
+ gettimeofday(&tv, NULL);
+ incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
- /* say hello */
- if (tcpnal_hello(fd, &peernid, SOCKNAL_CONN_ANY, incarnation))
+ /* say hello */
+ if (tcpnal_hello(fd, &peernid, SOCKNAL_CONN_ANY, incarnation))
exit(-1);
+
+ conn = allocate_connection(m, ip, port, fd);
+
+ /* let nal thread know this event right away */
+ if (conn)
+ procbridge_wakeup_nal(pb);
- conn = allocate_connection(m, ip, port, fd);
-
- /* let nal thread know this event right away */
- if (conn)
- procbridge_wakeup_nal(pb);
- }
-
+out:
pthread_mutex_unlock(&m->conn_lock);
return (conn);
}
void init_unix_timer(void);
void select_timer_block(when until);
when now(void);
+
+/*
+ * hack for CFS-internal MPI testing
+ */
+#if !CRAY_PORTALS
+#define ENABLE_SELECT_DISPATCH
+#endif
ptl_nid_t tcpnal_mynid;
+#ifdef ENABLE_SELECT_DISPATCH
+procbridge __global_procbridge = NULL;
+#endif
+
/* Function: procbridge_startup
*
* Arguments: pid: requested process id (port offset)
return PTL_FAIL;
}
+#ifdef ENABLE_SELECT_DISPATCH
+ __global_procbridge = p;
+#endif
+
/* create nal thread */
if (pthread_create(&p->t, NULL, nal_thread, &args)) {
perror("nal_init: pthread_create");
#include <sys/time.h>
#include <sys/types.h>
#include <stdlib.h>
+#include <syscall.h>
+#include <pthread.h>
+#include <errno.h>
#include <pqtimer.h>
#include <dispatch.h>
+#include <procbridge.h>
static struct timeval beginning_of_epoch;
i->disabled=1;
}
-static void set_flag(io_handler n,fd_set *fds)
+static void set_flag(io_handler n,fd_set *r, fd_set *w, fd_set *e)
{
- if (n->type & READ_HANDLER) FD_SET(n->fd, &fds[0]);
- if (n->type & WRITE_HANDLER) FD_SET(n->fd,&fds[1]);
- if (n->type & EXCEPTION_HANDLER) FD_SET(n->fd, &fds[2]);
+ if (n->type & READ_HANDLER) FD_SET(n->fd, r);
+ if (n->type & WRITE_HANDLER) FD_SET(n->fd, w);
+ if (n->type & EXCEPTION_HANDLER) FD_SET(n->fd, e);
}
-
-/* Function: select_timer_block
- * Arguments: until: an absolute time when the select should return
- *
- * This function dispatches the various file descriptors' handler
- * functions, if the kernel indicates there is io available.
- */
-void select_timer_block(when until)
+static int prepare_fd_sets(fd_set *r, fd_set *w, fd_set *e)
{
- fd_set fds[3];
- struct timeval timeout;
- struct timeval *timeout_pointer;
- int result;
io_handler j;
io_handler *k;
+ int max = 0;
- /* TODO: loop until the entire interval is expired*/
- if (until){
- when interval=until-now();
- timeout.tv_sec=(interval>>32);
- timeout.tv_usec=((interval<<32)/1000000)>>32;
- timeout_pointer=&timeout;
- } else timeout_pointer=0;
-
- FD_ZERO(&fds[0]);
- FD_ZERO(&fds[1]);
- FD_ZERO(&fds[2]);
+ FD_ZERO(r);
+ FD_ZERO(w);
+ FD_ZERO(e);
for (k=&io_handlers;*k;){
if ((*k)->disabled){
j=*k;
free(j);
}
if (*k) {
- set_flag(*k,fds);
+ set_flag(*k,r,w,e);
+ if ((*k)->fd > max)
+ max = (*k)->fd;
k=&(*k)->next;
}
}
+ return max + 1;
+}
+
+static int execute_callbacks(fd_set *r, fd_set *w, fd_set *e)
+{
+ io_handler j;
+ int n = 0, t;
+
+ for (j = io_handlers; j; j = j->next) {
+ if (j->disabled)
+ continue;
+
+ t = 0;
+ if (FD_ISSET(j->fd, r) && (j->type & READ_HANDLER)) {
+ FD_CLR(j->fd, r);
+ t++;
+ }
+ if (FD_ISSET(j->fd, w) && (j->type & WRITE_HANDLER)) {
+ FD_CLR(j->fd, w);
+ t++;
+ }
+ if (FD_ISSET(j->fd, e) && (j->type & EXCEPTION_HANDLER)) {
+ FD_CLR(j->fd, e);
+ t++;
+ }
+ if (t == 0)
+ continue;
+
+ if (!(*j->function)(j->argument))
+ j->disabled = 1;
+
+ n += t;
+ }
+
+ return n;
+}
- result=select(FD_SETSIZE, &fds[0], &fds[1], &fds[2], timeout_pointer);
+#ifdef ENABLE_SELECT_DISPATCH
- if (result > 0)
- for (j=io_handlers;j;j=j->next){
- if (!(j->disabled) &&
- ((FD_ISSET(j->fd, &fds[0]) && (j->type & READ_HANDLER)) ||
- (FD_ISSET(j->fd, &fds[1]) && (j->type & WRITE_HANDLER)) ||
- (FD_ISSET(j->fd, &fds[2]) && (j->type & EXCEPTION_HANDLER)))){
- if (!(*j->function)(j->argument))
- j->disabled=1;
+static struct {
+ pthread_mutex_t mutex;
+ pthread_cond_t cond;
+ int submitted;
+ int nready;
+ int maxfd;
+ fd_set *rset;
+ fd_set *wset;
+ fd_set *eset;
+ struct timeval *timeout;
+ struct timeval submit_time;
+} fd_extra = {
+ PTHREAD_MUTEX_INITIALIZER,
+ PTHREAD_COND_INITIALIZER,
+ 0, 0, 0,
+ NULL, NULL, NULL, NULL,
+};
+
+extern int liblustre_wait_event(int timeout);
+extern procbridge __global_procbridge;
+
+/*
+ * this will intercept syscall select() of user apps
+ * such as MPI libs.
+ */
+int select(int n, fd_set *rset, fd_set *wset, fd_set *eset,
+ struct timeval *timeout)
+{
+ LASSERT(fd_extra.submitted == 0);
+
+ fd_extra.nready = 0;
+ fd_extra.maxfd = n;
+ fd_extra.rset = rset;
+ fd_extra.wset = wset;
+ fd_extra.eset = eset;
+ fd_extra.timeout = timeout;
+
+ liblustre_wait_event(0);
+ pthread_mutex_lock(&fd_extra.mutex);
+ gettimeofday(&fd_extra.submit_time, NULL);
+ fd_extra.submitted = 1;
+ LASSERT(__global_procbridge);
+ procbridge_wakeup_nal(__global_procbridge);
+
+again:
+ if (fd_extra.submitted)
+ pthread_cond_wait(&fd_extra.cond, &fd_extra.mutex);
+ pthread_mutex_unlock(&fd_extra.mutex);
+
+ liblustre_wait_event(0);
+
+ pthread_mutex_lock(&fd_extra.mutex);
+ if (fd_extra.submitted)
+ goto again;
+ pthread_mutex_unlock(&fd_extra.mutex);
+
+ LASSERT(fd_extra.nready >= 0);
+ LASSERT(fd_extra.submitted == 0);
+ return fd_extra.nready;
+}
+
+static int merge_fds(int max, fd_set *rset, fd_set *wset, fd_set *eset)
+{
+ int i;
+
+ LASSERT(rset);
+ LASSERT(wset);
+ LASSERT(eset);
+
+ for (i = 0; i < __FD_SETSIZE/__NFDBITS; i++) {
+ LASSERT(!fd_extra.rset ||
+ !(__FDS_BITS(rset)[i] & __FDS_BITS(fd_extra.rset)[i]));
+ LASSERT(!fd_extra.wset ||
+ !(__FDS_BITS(wset)[i] & __FDS_BITS(fd_extra.wset)[i]));
+ LASSERT(!fd_extra.eset ||
+ !(__FDS_BITS(eset)[i] & __FDS_BITS(fd_extra.eset)[i]));
+
+ if (fd_extra.rset && __FDS_BITS(fd_extra.rset)[i])
+ __FDS_BITS(rset)[i] |= __FDS_BITS(fd_extra.rset)[i];
+ if (fd_extra.wset && __FDS_BITS(fd_extra.wset)[i])
+ __FDS_BITS(wset)[i] |= __FDS_BITS(fd_extra.wset)[i];
+ if (fd_extra.eset && __FDS_BITS(fd_extra.eset)[i])
+ __FDS_BITS(eset)[i] |= __FDS_BITS(fd_extra.eset)[i];
+ }
+
+ return (fd_extra.maxfd > max ? fd_extra.maxfd : max);
+}
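The LASSERTs above encode the scheme's key invariant: the application's fd sets and the NAL's own must be disjoint. A single underlying select() runs on the merged sets, and if a bit were shared the ready results could not be attributed to one caller or the other when they are split back apart.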
+
+static inline
+int timeval_ge(struct timeval *tv1, struct timeval *tv2)
+{
+ LASSERT(tv1 && tv2);
+ return ((tv1->tv_sec - tv2->tv_sec) * 1000000 +
+ (tv1->tv_usec - tv2->tv_usec) >= 0);
+}
+
+/*
+ * choose the nearer (smaller) of two timeout values
+ */
+static struct timeval *choose_timeout(struct timeval *tv1,
+ struct timeval *tv2)
+{
+ if (!tv1)
+ return tv2;
+ else if (!tv2)
+ return tv1;
+
+ if (timeval_ge(tv1, tv2))
+ return tv2;
+ else
+ return tv1;
+}
+
+/* Function: select_timer_block
+ * Arguments: until: an absolute time when the select should return
+ *
+ * This function dispatches the various file descriptors' handler
+ * functions, if the kernel indicates there is io available.
+ */
+void select_timer_block(when until)
+{
+ fd_set fds[3];
+ struct timeval timeout;
+ struct timeval *timeout_pointer, *select_timeout;
+ int max, nready, nexec;
+ int fd_handling;
+
+again:
+ if (until) {
+ when interval;
+
+ interval = until - now();
+ timeout.tv_sec = (interval >> 32);
+ timeout.tv_usec = ((interval << 32) / 1000000) >> 32;
+ timeout_pointer = &timeout;
+ } else
+ timeout_pointer = NULL;
+
+ fd_handling = 0;
+ max = prepare_fd_sets(&fds[0], &fds[1], &fds[2]);
+ select_timeout = timeout_pointer;
+
+ pthread_mutex_lock(&fd_extra.mutex);
+ fd_handling = fd_extra.submitted;
+ pthread_mutex_unlock(&fd_extra.mutex);
+ if (fd_handling) {
+ max = merge_fds(max, &fds[0], &fds[1], &fds[2]);
+ select_timeout = choose_timeout(timeout_pointer, fd_extra.timeout);
+ }
+
+ /* XXX Linux only: pick the select syscall for this word size */
+#if __WORDSIZE == 64
+ nready = syscall(SYS_select, max, &fds[0], &fds[1], &fds[2],
+ select_timeout);
+#else
+ nready = syscall(SYS__newselect, max, &fds[0], &fds[1], &fds[2],
+ select_timeout);
+#endif
+ if (nready < 0) {
+ CERROR("select return err %d, errno %d\n", nready, errno);
+ return;
+ }
+
+ if (nready) {
+ nexec = execute_callbacks(&fds[0], &fds[1], &fds[2]);
+ nready -= nexec;
+ } else
+ nexec = 0;
+
+ /* even if both nready & nexec are 0, we still need to try to wake
+ * up the upper thread since it may have timed out
+ */
+ if (fd_handling) {
+ LASSERT(nready >= 0);
+
+ pthread_mutex_lock(&fd_extra.mutex);
+ if (nready) {
+ if (fd_extra.rset)
+ *fd_extra.rset = fds[0];
+ if (fd_extra.wset)
+ *fd_extra.wset = fds[1];
+ if (fd_extra.eset)
+ *fd_extra.eset = fds[2];
+ fd_extra.nready = nready;
+ fd_extra.submitted = 0;
+ } else {
+ struct timeval t;
+
+ fd_extra.nready = 0;
+ if (fd_extra.timeout) {
+ gettimeofday(&t, NULL);
+ if (timeval_ge(&t, &fd_extra.submit_time))
+ fd_extra.submitted = 0;
}
}
+
+ pthread_cond_signal(&fd_extra.cond);
+ pthread_mutex_unlock(&fd_extra.mutex);
+ }
+
+ /* no portals event found; go back to the loop if the time
+ * has not expired yet */
+ if (!nexec) {
+ if (timeout_pointer == NULL || now() < until)
+ goto again;
+ }
+}
+
+#else /* !ENABLE_SELECT_DISPATCH */
+
+/* Function: select_timer_block
+ * Arguments: until: an absolute time when the select should return
+ *
+ * This function dispatches the various file descriptors' handler
+ * functions, if the kernel indicates there is io available.
+ */
+void select_timer_block(when until)
+{
+ fd_set fds[3];
+ struct timeval timeout;
+ struct timeval *timeout_pointer;
+ int max, nready;
+
+again:
+ if (until) {
+ when interval;
+ interval = until - now();
+ timeout.tv_sec = (interval >> 32);
+ timeout.tv_usec = ((interval << 32) / 1000000) >> 32;
+ timeout_pointer = &timeout;
+ } else
+ timeout_pointer = NULL;
+
+ max = prepare_fd_sets(&fds[0], &fds[1], &fds[2]);
+
+ nready = select(max, &fds[0], &fds[1], &fds[2], timeout_pointer);
+ if (nready > 0)
+ execute_callbacks(&fds[0], &fds[1], &fds[2]);
}
+#endif /* ENABLE_SELECT_DISPATCH */
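Taken together, the intercepted select() and select_timer_block() above implement a cross-thread select proxy: the application thread parks its fd sets in fd_extra and sleeps on the condition variable, while the NAL dispatch thread merges them into its own select() call and signals completion. A stripped-down standalone sketch of just that handoff, under the simplifying assumptions of a single read set, generic proxy_* names, and a 1-second dispatcher poll standing in for procbridge_wakeup_nal():

    #include <pthread.h>
    #include <sys/select.h>

    static struct {
            pthread_mutex_t mutex;
            pthread_cond_t  cond;
            int             submitted;      /* request parked? */
            int             nready;         /* result for the submitter */
            int             maxfd;
            fd_set          rset;           /* in: fds to watch; out: ready fds */
    } proxy = { PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, 0, 0, 0 };

    /* Application side: park the request and sleep until serviced. */
    int proxy_select(int maxfd, fd_set *rset)
    {
            int n;

            pthread_mutex_lock(&proxy.mutex);
            proxy.maxfd = maxfd;
            proxy.rset = *rset;
            proxy.submitted = 1;
            while (proxy.submitted)
                    pthread_cond_wait(&proxy.cond, &proxy.mutex);
            *rset = proxy.rset;
            n = proxy.nready;
            pthread_mutex_unlock(&proxy.mutex);
            return n;
    }

    /* Dispatcher side: one loop iteration folds any parked request
     * into its own select() and hands the result back. */
    void proxy_dispatch_once(void)
    {
            struct timeval tv = { 1, 0 };   /* poll, so new requests get seen */
            fd_set rset;
            int    maxfd = 0;
            int    n;

            FD_ZERO(&rset);
            pthread_mutex_lock(&proxy.mutex);
            if (proxy.submitted) {          /* merge the submitter's fds */
                    rset = proxy.rset;
                    maxfd = proxy.maxfd;
            }
            pthread_mutex_unlock(&proxy.mutex);

            /* a real dispatcher would also OR in its own fds here */
            n = select(maxfd, &rset, NULL, NULL, &tv);

            pthread_mutex_lock(&proxy.mutex);
            if (proxy.submitted && n > 0) { /* complete the parked request */
                    proxy.rset = rset;
                    proxy.nready = n;
                    proxy.submitted = 0;
                    pthread_cond_signal(&proxy.cond);
            }
            pthread_mutex_unlock(&proxy.mutex);
    }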
/* Function: init_unix_timer()
* is called to initialize the library
newly created junk */
return(PTL_NAL_FAILED);
}
- /* XXX cfs hack */
-// b->lib_nal->libnal_ni.ni_pid.pid=0;
b->lower=m;
return(PTL_OK);
}
void
usage (char *myname)
{
- fprintf (stderr, "Usage: %s [-N nal_id] port\n", myname);
+ fprintf (stderr,
+ "Usage: %s [-N nal_id] [-p] [-l] port\n\n"
+ " -l\tKeep stdin/stdout open\n"
+ " -p\tAllow connections from non-privileged ports\n",
+ myname);
exit (1);
}
int c;
int noclose = 0;
int nal = SOCKNAL;
+ int rport;
+ int require_privports = 1;
- while ((c = getopt (argc, argv, "N:l")) != -1)
- switch (c)
- {
- case 'l':
- noclose = 1;
- break;
-
+ while ((c = getopt (argc, argv, "N:lp")) != -1) {
+ switch (c) {
case 'N':
if (sscanf(optarg, "%d", &nal) != 1 ||
nal < 0 || nal > NAL_MAX_NR)
usage(argv[0]);
break;
-
+ case 'l':
+ noclose = 1;
+ break;
+ case 'p':
+ require_privports = 0;
+ break;
default:
usage (argv[0]);
break;
}
+ }
if (optind >= argc)
usage (argv[0]);
exit(1);
}
- rc = daemon(1, noclose);
+ rc = daemon(0, noclose);
if (rc < 0) {
perror("daemon(): ");
exit(1);
struct portals_cfg pcfg;
#ifdef HAVE_LIBWRAP
struct request_info request;
- char addrstr[INET_ADDRSTRLEN];
#endif
+ char addrstr[INET_ADDRSTRLEN];
cfd = accept(fd, (struct sockaddr *)&clntaddr, &len);
if ( cfd < 0 ) {
continue;
}
#endif
+
+ if (require_privports && ntohs(clntaddr.sin_port) >= IPPORT_RESERVED) {
+ inet_ntop(AF_INET, &clntaddr.sin_addr,
+ addrstr, INET_ADDRSTRLEN);
+ syslog(LOG_ERR, "Closing non-privileged connection from %s:%d\n",
+ addrstr, ntohs(clntaddr.sin_port));
+ rc = close(cfd);
+ if (rc)
+ perror ("close un-privileged client failed");
+ continue;
+ }
+
show_connection (cfd, clntaddr.sin_addr.s_addr);
PCFG_INIT(pcfg, NAL_CMD_REGISTER_PEER_FD);
#include <portals/list.h>
#include <stdio.h>
+#ifdef HAVE_NETDB_H
#include <netdb.h>
+#endif
#include <stdlib.h>
#include <string.h>
+#include "ioctl.h"
#include <fcntl.h>
#include <errno.h>
#include <unistd.h>
#include <sys/stat.h>
#include <sys/mman.h>
+#ifdef HAVE_LINUX_VERSION_H
#include <linux/version.h>
#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
#define BUG() /* workaround for module.h includes */
#include <linux/module.h>
#endif
+#endif /* !HAVE_LINUX_VERSION_H */
+
#include <sys/utsname.h>
#include <portals/api-support.h>
static char rawbuf[8192];
static char *buf = rawbuf;
static int max = 8192;
-//static int g_pfd = -1;
+/*static int g_pfd = -1;*/
static int subsystem_mask = ~0;
static int debug_mask = ~0;
{"undefined", "mdc", "mds", "osc", "ost", "class", "log", "llite",
"rpc", "mgmt", "portals", "libcfs", "socknal", "qswnal", "pinger",
"filter", "ptlbd", "echo", "ldlm", "lov", "gmnal", "router", "cobd",
- "openibnal", "lmv", "smfs", "cmobd", NULL};
+ "ibnal", NULL};
static const char *portal_debug_masks[] =
{"trace", "inode", "super", "ext2", "malloc", "cache", "info", "ioctl",
"blocks", "net", "warning", "buffs", "other", "dentry", "portals",
fprintf(stderr, "usage: %s [file] [raw]\n", argv[0]);
return 0;
}
- sprintf(filename, "%s.%lu.%u", argc > 1 ? argv[1] : "/tmp/lustre-log",
- time(NULL), getpid());
- if (argc > 2)
+ if (argc > 2) {
raw = atoi(argv[2]);
+ } else if (argc > 1 && (argv[1][0] == '0' || argv[1][0] == '1')) {
+ raw = atoi(argv[1]);
+ argc--;
+ } else {
+ sprintf(filename, "%s.%lu.%u", argc > 1 ? argv[1] :
+ "/tmp/lustre-log", time(NULL), getpid());
+ }
+
unlink(filename);
fd = open("/proc/sys/portals/dump_kernel", O_WRONLY);
if (fd < 0) {
+ if (errno == ENOENT) /* no dump file created */
+ return 0;
+
fprintf(stderr, "open(dump_kernel) failed: %s\n",
strerror(errno));
return 1;
int jt_dbg_debug_daemon(int argc, char **argv)
{
int rc, fd;
-
+
if (argc <= 1) {
fprintf(stderr, debug_daemon_usage);
return 0;
}
-
+
fd = open("/proc/sys/portals/daemon_file", O_WRONLY);
if (fd < 0) {
fprintf(stderr, "open(daemon_file) failed: %s\n",
strerror(errno));
return 1;
}
-
+
if (strcasecmp(argv[1], "start") == 0) {
if (argc != 3) {
fprintf(stderr, debug_daemon_usage);
return 1;
}
-
+
rc = write(fd, argv[2], strlen(argv[2]));
if (rc != strlen(argv[2])) {
fprintf(stderr, "write(%s) failed: %s\n", argv[2],
fprintf(stderr, debug_daemon_usage);
return 1;
}
-
+
close(fd);
return 0;
}
{"obdfilter", "lustre/obdfilter"},
{"extN", "lustre/extN"},
{"lov", "lustre/lov"},
- {"lmv", "lustre/lmv"},
{"fsfilt_ext3", "lustre/lvfs"},
{"fsfilt_extN", "lustre/lvfs"},
{"fsfilt_reiserfs", "lustre/lvfs"},
{"ptlbd", "lustre/ptlbd"},
{"mgmt_svc", "lustre/mgmt"},
{"mgmt_cli", "lustre/mgmt"},
- {"cobd", "lustre/cobd"},
- {"cmobd", "lustre/cmobd"},
+ {"conf_obd", "lustre/obdclass"},
{NULL, NULL}
};
static int jt_dbg_modules_2_4(int argc, char **argv)
{
+#ifdef HAVE_LINUX_VERSION_H
#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
struct mod_paths *mp;
char *path = "..";
}
return 0;
-#else /* Headers are 2.6-only */
+#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0) */
+#endif /* HAVE_LINUX_VERSION_H */
return -EINVAL;
-#endif
}
static int jt_dbg_modules_2_5(int argc, char **argv)
#include <stdio.h>
#include <sys/types.h>
+#ifdef HAVE_NETDB_H
#include <netdb.h>
+#endif
#include <sys/socket.h>
+#ifdef HAVE_NETINET_TCP_H
#include <netinet/tcp.h>
-#include <netdb.h>
+#endif
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
+#include "ioctl.h"
#include <sys/ioctl.h>
#include <errno.h>
#include <unistd.h>
static unsigned int g_nal = 0;
-static int g_socket_txmem = 0;
-static int g_socket_rxmem = 0;
-static int g_socket_nonagle = 1;
-
typedef struct
{
char *name;
{"elan", QSWNAL},
{"gm", GMNAL},
{"openib", OPENIBNAL},
+ {"iib", IIBNAL},
{NULL, -1}
};
return ((e == NULL) ? "???" : e->name);
}
+#ifdef HAVE_GETHOSTBYNAME
static struct hostent *
ptl_gethostbyname(char * hname) {
struct hostent *he;
}
return he;
}
+#endif
int
ptl_parse_port (int *port, char *str)
int
ptl_parse_ipaddr (__u32 *ipaddrp, char *str)
{
+#ifdef HAVE_GETHOSTBYNAME
struct hostent *he;
+#endif
if (!strcmp (str, "_all_"))
{
if (ptl_parse_ipquad(ipaddrp, str) == 0)
return (0);
-
+
+#ifdef HAVE_GETHOSTBYNAME
if ((('a' <= str[0] && str[0] <= 'z') ||
('A' <= str[0] && str[0] <= 'Z')) &&
(he = ptl_gethostbyname (str)) != NULL)
*ipaddrp = ntohl(addr); /* HOST byte order */
return (0);
}
+#endif
return (-1);
}
char *
ptl_ipaddr_2_str (__u32 ipaddr, char *str, int lookup)
{
+#ifdef HAVE_GETHOSTBYNAME
__u32 net_ip;
struct hostent *he;
return (str);
}
}
-
+#endif
+
sprintf (str, "%d.%d.%d.%d",
(ipaddr >> 24) & 0xff, (ipaddr >> 16) & 0xff,
(ipaddr >> 8) & 0xff, ipaddr & 0xff);
ptl_nid2str (char *buffer, ptl_nid_t nid)
{
__u64 nid64 = ptl_nid2u64(nid);
+#ifdef HAVE_GETHOSTBYNAME
struct hostent *he = 0;
/* Don't try to resolve NIDs that are e.g. Elan host IDs. Assume
if (he != NULL)
sprintf(buffer, "%#x:%s", (int)(nid64 >> 32), he->h_name);
else
+#endif /* HAVE_GETHOSTBYNAME */
sprintf(buffer, LPX64, nid64);
return (buffer);
return (-1);
}
-
int
jt_ptl_print_interfaces (int argc, char **argv)
{
__u32 ipaddr;
int rc;
__u32 netmask = 0xffffff00;
+ int i;
+ int count;
+ char *end;
if (argc < 2 || argc > 3) {
fprintf (stderr, "usage: %s ipaddr [netmask]\n", argv[0]);
fprintf (stderr, "Can't parse ip: %s\n", argv[1]);
return -1;
}
-
- if (argc > 2 &&
- ptl_parse_ipquad(&netmask, argv[2]) != 0) {
- fprintf (stderr, "Can't parse netmask: %s\n", argv[2]);
- return -1;
+
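+ /* the optional netmask argument is either a dotted quad or a CIDR prefix length */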
+ if (argc > 2) {
+ count = strtol(argv[2], &end, 0);
+ if (count > 0 && count < 32 && *end == 0) {
+ netmask = 0;
+ for (i = count; i > 0; i--)
+ netmask |= 1U << (32 - i);
+ } else if (ptl_parse_ipquad(&netmask, argv[2]) != 0) {
+ fprintf (stderr, "Can't parse netmask: %s\n", argv[2]);
+ return -1;
+ }
}
-
+
PCFG_INIT(pcfg, NAL_CMD_ADD_INTERFACE);
pcfg.pcfg_id = ipaddr;
pcfg.pcfg_misc = netmask;
strerror (errno));
return -1;
}
-
+
return 0;
}
strerror (errno));
return -1;
}
-
+
return 0;
}
-int
+int
jt_ptl_print_peers (int argc, char **argv)
{
struct portals_cfg pcfg;
int index;
int rc;
- if (!g_nal_is_compatible (argv[0], SOCKNAL, OPENIBNAL, 0))
+ if (!g_nal_is_compatible (argv[0], SOCKNAL, OPENIBNAL, IIBNAL, 0))
return -1;
for (index = 0;;index++) {
int port = 0;
int rc;
- if (!g_nal_is_compatible (argv[0], SOCKNAL, OPENIBNAL, 0))
+ if (!g_nal_is_compatible (argv[0], SOCKNAL, OPENIBNAL, IIBNAL, 0))
return -1;
if (g_nal_is_compatible(NULL, SOCKNAL, 0)) {
return 0;
}
} else if (argc != 2) {
- fprintf (stderr, "usage(openib): %s nid\n", argv[0]);
+ fprintf (stderr, "usage(openib,iib): %s nid\n", argv[0]);
return 0;
}
int argidx;
int rc;
- if (!g_nal_is_compatible (argv[0], SOCKNAL, OPENIBNAL, 0))
+ if (!g_nal_is_compatible (argv[0], SOCKNAL, OPENIBNAL, IIBNAL, 0))
return -1;
if (g_nal_is_compatible(NULL, SOCKNAL, 0)) {
}
if (argc > argidx) {
- if (!strcmp (argv[3], "single_share")) {
+ if (!strcmp (argv[argidx], "single_share")) {
single_share = 1;
} else {
fprintf (stderr, "Unrecognised arg %s'\n", argv[3]);
int index;
int rc;
- if (!g_nal_is_compatible (argv[0], SOCKNAL, OPENIBNAL, 0))
+ if (!g_nal_is_compatible (argv[0], SOCKNAL, OPENIBNAL, IIBNAL, 0))
return -1;
for (index = 0;;index++) {
int jt_ptl_connect(int argc, char **argv)
{
+#ifndef HAVE_CONNECT
+ /* no connect() support */
+ return -1;
+#else /* HAVE_CONNECT */
struct portals_cfg pcfg;
struct sockaddr_in srvaddr;
+ struct sockaddr_in locaddr;
__u32 ipaddr;
char *flag;
int fd, rc;
int type = SOCKNAL_CONN_ANY;
- int port;
+ int port, rport;
+ int o;
if (argc < 3) {
fprintf(stderr, "usage: %s ip port [type]\n", argv[0]);
return (-1);
}
+ memset(&locaddr, 0, sizeof(locaddr));
+ locaddr.sin_family = AF_INET;
+ locaddr.sin_addr.s_addr = INADDR_ANY;
+
memset(&srvaddr, 0, sizeof(srvaddr));
srvaddr.sin_family = AF_INET;
srvaddr.sin_port = htons(port);
srvaddr.sin_addr.s_addr = htonl(ipaddr);
- fd = socket(PF_INET, SOCK_STREAM, 0);
- if ( fd < 0 ) {
- fprintf(stderr, "socket() failed: %s\n", strerror(errno));
- return -1;
+
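+ /* scan down from IPPORT_RESERVED - 1 looking for a free privileged local port */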
+ for (rport = IPPORT_RESERVED - 1; rport > IPPORT_RESERVED / 2; --rport) {
+ fd = socket(PF_INET, SOCK_STREAM, 0);
+ if ( fd < 0 ) {
+ fprintf(stderr, "socket() failed: %s\n", strerror(errno));
+ return -1;
+ }
+
+ o = 1;
+ rc = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR,
+ &o, sizeof(o));
+
+ locaddr.sin_port = htons(rport);
+ rc = bind(fd, (struct sockaddr *)&locaddr, sizeof(locaddr));
+ if (rc == 0 || errno == EACCES) {
+ rc = connect(fd, (struct sockaddr *)&srvaddr, sizeof(srvaddr));
+ if (rc == 0) {
+ break;
+ } else if (errno != EADDRINUSE) {
+ fprintf(stderr, "Error connecting to host: %s\n", strerror(errno));
+ close(fd);
+ return -1;
+ }
+ } else if (errno != EADDRINUSE) {
+ fprintf(stderr, "Error binding to port %d: %d: %s\n", port, errno, strerror(errno));
+ close(fd);
+ return -1;
+ }
+
+ /* the port was in use; release this socket before retrying the next one */
+ close(fd);
}
- rc = connect(fd, (struct sockaddr *)&srvaddr, sizeof(srvaddr));
- if ( rc == -1 ) {
- fprintf(stderr, "connect() failed: %s\n", strerror(errno));
+ if (rport == IPPORT_RESERVED / 2) {
+ fprintf(stderr,
+ "Error: all privileged ports are in use\n");
return -1;
}
fprintf(stderr, "close failed: %d\n", rc);
return 0;
+#endif /* HAVE_CONNECT */
}
int jt_ptl_disconnect(int argc, char **argv)
return 0;
}
- if (!g_nal_is_compatible (NULL, SOCKNAL, OPENIBNAL, 0))
+ if (!g_nal_is_compatible (NULL, SOCKNAL, OPENIBNAL, IIBNAL, 0))
return 0;
if (argc >= 2 &&
}
/* crappy overloads */
- if (data.ioc_nid != sizeof(lwt_event_t) ||
- data.ioc_nid2 != offsetof(lwt_event_t, lwte_where)) {
+ if (data.ioc_nid2 != sizeof(lwt_event_t) ||
+ data.ioc_nid3 != offsetof(lwt_event_t, lwte_where)) {
fprintf(stderr,"kernel/user LWT event mismatch %d(%d),%d(%d)\n",
- (int)data.ioc_nid, sizeof(lwt_event_t),
- (int)data.ioc_nid2,
+ (int)data.ioc_nid2, (int)sizeof(lwt_event_t),
+ (int)data.ioc_nid3,
(int)offsetof(lwt_event_t, lwte_where));
return (-1);
}
static int
lwt_print(FILE *f, cycles_t t0, cycles_t tlast, double mhz, int cpu, lwt_event_t *e)
{
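+/* XFMT: print a pointer-sized value as fixed-width hex, 8 or 16 digits wide */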
+#ifndef __WORDSIZE
+# error "__WORDSIZE not defined"
+#elif __WORDSIZE == 32
+# define XFMT "%#010lx"
+#elif __WORDSIZE == 64
+# define XFMT "%#018lx"
+#else
+# error "Unexpected __WORDSIZE"
+#endif
char *where = lwt_get_string(e->lwte_where);
if (where == NULL)
return (-1);
- fprintf(f, "%#010lx %#010lx %#010lx %#010lx: %#010lx %1d %10.6f %10.2f %s\n",
+ fprintf(f, XFMT" "XFMT" "XFMT" "XFMT": "XFMT" %2d %10.6f %10.2f %s\n",
e->lwte_p1, e->lwte_p2, e->lwte_p3, e->lwte_p4,
(long)e->lwte_task, cpu, (e->lwte_when - t0) / (mhz * 1000000.0),
(t0 == e->lwte_when) ? 0.0 : (e->lwte_when - tlast) / mhz,
lwt_put_string(where);
return (0);
+#undef XFMT
}
double
# lconf is the main driver script for starting and stopping
# lustre filesystem services.
#
-# Based in part on the XML obdctl modifications done by Brian Behlendorf
+# Based in part on the XML obdctl modifications done by Brian Behlendorf
import sys, getopt, types
import string, os, stat, popen2, socket, time, random, fcntl, select
PORTALS_DIR = 'portals'
# Needed to call lconf --record
-CONFIG_FILE = ""
+CONFIG_FILE = ""
# Please keep these in sync with the values in portals/kp30.h
-ptldebug_names = {
+ptldebug_names = {
"trace" : (1 << 0),
"inode" : (1 << 1),
"super" : (1 << 2),
"rpctrace" : (1 << 20),
"vfstrace" : (1 << 21),
"reada" : (1 << 22),
- "config" : (1 << 23),
- "mmap" : (1 << 24),
+ "mmap" : (1 << 23),
+ "config" : (1 << 24),
}
subsystem_names = {
"gmnal" : (1 << 19),
"ptlrouter" : (1 << 20),
"cobd" : (1 << 21),
- "openibnal" : (1 << 22),
- "cmobd" : (1 << 23),
+ "ibnal" : (1 << 22),
+ "sm" : (1 << 23),
+ "asobd" : (1 << 24),
+ "lmv" : (1 << 25),
+ "cmobd" : (1 << 26),
}
if not first_cleanup_error:
first_cleanup_error = rc
-# ============================================================
+# ============================================================
# debugging and error funcs
def fixme(msg = "this feature"):
return pid
except IOError:
return 0
-
+
def clean_pidfile(self):
""" Remove a stale pidfile """
log("removing stale pidfile:", self.pidfile())
os.unlink(self.pidfile())
except OSError, e:
log(self.pidfile(), e)
-
+
class AcceptorHandler(DaemonHandler):
def __init__(self, port, net_type):
DaemonHandler.__init__(self, "acceptor")
return "/var/run/%s-%d.pid" % (self.command, self.port)
def command_line(self):
- return string.join(map(str,(self.flags, self.port)))
-
+ return string.join(map(str,(self.flags, self.port)))
+
acceptors = {}
# start the acceptors
if not daemon.running():
daemon.start()
else:
- panic("run_one_acceptor: No acceptor defined for port:", port)
-
+ panic("run_one_acceptor: No acceptor defined for port:", port)
+
def stop_acceptor(port):
if acceptors.has_key(port):
daemon = acceptors[port]
if daemon.running():
daemon.stop()
-
+
# ============================================================
# handle lctl interface
def use_save_file(self, file):
self.save_file = file
-
+
def record(self, dev_name, logname):
log("Recording log", logname, "on", dev_name)
self.record_device = dev_name
device $%s
record %s
%s""" % (self.record_device, self.record_log, cmds)
-
+
debug("+", cmd_line, cmds)
if config.noexec: return (0, [])
raise CommandError(self.lctl, out, rc)
return rc, out
-
+
def clear_log(self, dev, log):
""" clear an existing log """
cmds = """
quit """ % (dev, log)
self.run(cmds)
+ def root_squash(self, name, uid, nid):
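+ # pass root_squash parameters (uid, nid) through to the named device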
+ cmds = """
+ device $%s
+ root_squash %s %s
+ quit""" % (name, uid, nid)
+ self.run(cmds)
+
def network(self, net, nid):
""" set mynid """
cmds = """
quit """ % (net, nid)
self.run(cmds)
- def root_squash(self, name, uid, nid):
+ # add an interface
+ def add_interface(self, net, ip, netmask = ""):
+ """ add an interface """
cmds = """
- device $%s
- root_squash %s %s
- quit""" % (name, uid, nid)
+ network %s
+ add_interface %s %s
+ quit """ % (net, ip, netmask)
+ self.run(cmds)
+
+ # delete an interface
+ def del_interface(self, net, ip):
+ """ delete an interface """
+ cmds = """
+ network %s
+ del_interface %s
+ quit """ % (net, ip)
self.run(cmds)
# create a new connection
cmds = "\n add_uuid %s %s %s" %(uuid, nid, net_type)
self.run(cmds)
- def add_peer(self, net_type, nid, hostaddr, port):
- if net_type in ('tcp',) and not config.lctl_dump:
+ def add_peer(self, net_type, nid, hostaddr, port):
+ if net_type in ('tcp',) and not config.lctl_dump:
cmds = """
network %s
add_peer %s %s %d
quit""" % (net_type,
nid, hostaddr, port )
self.run(cmds)
- elif net_type in ('openib',) and not config.lctl_dump:
+ elif net_type in ('openib','iib',) and not config.lctl_dump:
cmds = """
network %s
add_peer %s
quit""" % (net_type,
- nid)
- self.run(cmds)
-
+ nid )
+ self.run(cmds)
+
def connect(self, srv):
self.add_uuid(srv.net_type, srv.nid_uuid, srv.nid)
- if srv.net_type in ('tcp','openib',) and not config.lctl_dump:
- self.add_peer(srv.net_type, srv.nid, srv.hostaddr, srv.port)
+ if srv.net_type in ('tcp','openib','iib',) and not config.lctl_dump:
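+ # hostaddr entries are "ip[/netmask]" strings; add_peer only needs the ip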
+ if srv.hostaddr[0]:
+ hostaddr = string.split(srv.hostaddr[0], '/')[0]
+ self.add_peer(srv.net_type, srv.nid, hostaddr, srv.port)
# Recover a device
def recover(self, dev_name, new_conn):
device $%s
recover %s""" %(dev_name, new_conn)
self.run(cmds)
-
+
# add a route to a range
def add_route(self, net, gw, lo, hi):
cmds = """
except CommandError, e:
log ("ignore: ")
e.dump()
-
+
def del_route(self, net, gw, lo, hi):
cmds = """
ignore_errors
quit """ % (net, gw, tgt)
self.run(cmds)
+
def del_peer(self, net_type, nid, hostaddr):
if net_type in ('tcp',) and not config.lctl_dump:
cmds = """
quit""" % (net_type,
nid, hostaddr)
self.run(cmds)
- elif net_type in ('openib',) and not config.lctl_dump:
+ elif net_type in ('openib','iib',) and not config.lctl_dump:
cmds = """
ignore_errors
network %s
quit""" % (net_type,
nid)
self.run(cmds)
-
+
# disconnect one connection
def disconnect(self, srv):
self.del_uuid(srv.nid_uuid)
- if srv.net_type in ('tcp','openib',) and not config.lctl_dump:
- self.del_peer(srv.net_type, srv.nid, srv.hostaddr)
+ if srv.net_type in ('tcp','openib','iib',) and not config.lctl_dump:
+ if srv.hostaddr[0]:
+ hostaddr = string.split(srv.hostaddr[0], '/')[0]
+ self.del_peer(srv.net_type, srv.nid, hostaddr)
def del_uuid(self, uuid):
cmds = """
setup %s
quit""" % (name, setup)
self.run(cmds)
-
+
def add_conn(self, name, conn_uuid):
cmds = """
cfg_device %s
except CommandError, e:
self.cleanup(name, uuid, 0)
raise e
-
+
# cleanup a device
def cleanup(self, name, uuid, force, failover = 0):
cmds = """
attach lov %s %s
lov_setup %s %d %d %d %s %s
- quit""" % (name, uuid, desc_uuid, stripe_cnt, stripe_sz, stripe_off,
- pattern, devlist)
+ quit""" % (name, uuid, desc_uuid, stripe_cnt, stripe_sz, stripe_off,
+ pattern, devlist)
self.run(cmds)
# add an OBD to a LOV
modbase = src_dir +'/'+ dev_dir +'/'+ modname
for modext in '.ko', '.o':
module = modbase + modext
- try:
+ try:
if os.access(module, os.R_OK):
return module
except OSError:
i=i+1
return ''
-
-
# build fs according to type
# fixme: dangerous
def mkfs(dev, devsize, fstype, jsize, isize, mkfsoptions, isblock=1):
# ext3 journal size is in megabytes
# but don't set jsize if mkfsoptions indicates a separate journal device
if jsize == 0 and jdev(mkfsoptions) == '':
- if devsize == 0:
+ if devsize == 0:
if not is_block(dev):
ret, out = runcmd("ls -l %s" %dev)
devsize = int(string.split(out[0])[4]) / 1024
else:
# sfdisk -s will fail for too large block device,
# then, read the size of partition from /proc/partitions
-
+
# get the realpath of the device
# it may be the real device, such as /dev/hda7
# or the hardlink created via mknod for a device
real_dev = os.path.join(os.path.dirname(real_dev), dev_link)
if link_count > 19:
panic("Entountered too many symbolic links resolving block device:", dev)
-
+
# get the major and minor number of the realpath via ls
- # it seems python(os.stat) does not return
+ # it seems python(os.stat) does not return
# the st_rdev member of the stat structure
ret, out = runcmd("ls -l %s" %real_dev)
major = string.split(string.split(out[0])[4], ",")[0]
minor = string.split(out[0])[5]
-
+
# get the devsize from /proc/partitions with the major and minor number
ret, out = runcmd("cat /proc/partitions")
for line in out:
if devsize > 1024 * 1024:
jsize = ((devsize / 102400) * 4)
if jsize > 400:
- jsize = 400
+ jsize = 400
if jsize: jopt = "-J size=%d" %(jsize,)
if isize: iopt = "-I %d" %(isize,)
mkfs = 'mkfs.ext2 -j -b 4096 '
jmkfs = 'mkfs.ext2 -b 4096 -O journal_dev '
if config.force:
jmkfs = jmkfs + '-F '
- jmkfs = jmkfs + jdev(mkfsoptions)
+ jmkfs = jmkfs + jdev(mkfsoptions)
(ret, out) = run (jmkfs)
if ret:
panic("Unable format journal device:", jdev(mkfsoptions), string.join(out))
-
elif fstype == 'reiserfs':
# reiserfs journal size is in blocks
if jsize: jopt = "--journal_size %d" %(jsize,)
# determine if dev is formatted as a <fstype> filesystem
def need_format(fstype, dev):
- # FIXME don't know how to implement this
+ # FIXME don't know how to implement this
return 0
# initialize a block device if needed
# panic("device:", dev,
# "not prepared, and autoformat is not set.\n",
# "Rerun with --reformat option to format ALL filesystems")
-
+
return dev
def if2addr(iface):
else:
local = sys_get_local_address(net_type, wildcard, cluster_id)
return local
-
+
def sys_get_local_address(net_type, wildcard, cluster_id):
"""Return the local address for the network type."""
local = ""
- if net_type in ('tcp','openib',):
+ if net_type in ('tcp','openib','iib',):
if ':' in wildcard:
iface, star = string.split(wildcard, ':')
local = if2addr(iface)
elan_id = a[1]
break
try:
- nid = my_int(cluster_id) + my_int(elan_id)
+ nid = my_int(cluster_id) + my_int(elan_id)
local = "%d" % (nid)
except ValueError, e:
local = elan_id
except IOError, e:
log(e)
return 0
-
+
class kmod:
"""Manage kernel modules"""
continue
log ('loading module:', mod, 'srcdir', src_dir, 'devdir', dev_dir)
if src_dir:
- module = find_module(src_dir, dev_dir, mod)
+ module = find_module(src_dir, dev_dir, mod)
if not module:
panic('module not found:', mod)
(rc, out) = run('/sbin/insmod', module)
self._server = None
self._connected = 0
self.kmod = kmod(config.lustre, config.portals)
-
+
def info(self, *args):
msg = string.join(map(str,args))
print self.module_name + ":", self.name, self.uuid, msg
log(self.module_name, "cleanup failed: ", self.name)
e.dump()
cleanup_error(e.rc)
-
+
def add_portals_module(self, dev_dir, modname):
"""Append a module to list of modules to load."""
self.kmod.add_portals_module(dev_dir, modname)
def load_module(self):
"""Load all the modules in the list in the order they appear."""
self.kmod.load_module()
-
+
def cleanup_module(self):
"""Unload the modules in the list in reverse order."""
if self.safe_to_clean():
def safe_to_clean(self):
return 1
-
+
def safe_to_clean_modules(self):
return self.safe_to_clean()
-
+
class Network(Module):
def __init__(self,db):
Module.__init__(self, 'NETWORK', db)
self.nid_uuid = self.nid_to_uuid(self.nid)
- self.hostaddr = self.db.get_val('hostaddr', self.nid)
- if '*' in self.hostaddr:
- self.hostaddr = sys_get_local_address(self.net_type, self.hostaddr, self.cluster_id)
- if not self.hostaddr:
- panic("unable to set hostaddr for", self.net_type, self.hostaddr, self.cluster_id)
- debug("hostaddr:", self.hostaddr)
+ self.hostaddr = self.db.get_hostaddr()
+ if len(self.hostaddr) == 0:
+ self.hostaddr.append(self.nid)
+ if '*' in self.hostaddr[0]:
+ self.hostaddr[0] = sys_get_local_address(self.net_type, self.hostaddr[0], self.cluster_id)
+ if not self.hostaddr[0]:
+ panic("unable to set hostaddr for", self.net_type, self.hostaddr[0], self.cluster_id)
+ debug("hostaddr:", self.hostaddr[0])
self.add_portals_module("libcfs", 'libcfs')
self.add_portals_module("portals", 'portals')
self.add_portals_module("knals/gmnal", 'kgmnal')
if self.net_type == 'openib':
self.add_portals_module("knals/openibnal", 'kopenibnal')
+ if self.net_type == 'iib':
+ self.add_portals_module("knals/iibnal", 'kiibnal')
def nid_to_uuid(self, nid):
return "NID_%s_UUID" %(nid,)
lctl.network(self.net_type, self.nid)
if self.net_type == 'tcp':
sys_tweak_socknal()
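+ # register each configured interface ("ip[/netmask]") with the socknal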
+ for hostaddr in self.db.get_hostaddr():
+ ip = string.split(hostaddr, '/')[0]
+ if len(string.split(hostaddr, '/')) == 2:
+ netmask = string.split(hostaddr, '/')[1]
+ else:
+ netmask = ""
+ lctl.add_interface(self.net_type, ip, netmask)
if self.net_type == 'elan':
sys_optimize_elan()
if self.port and node_is_router():
stop_acceptor(self.port)
if node_is_router():
self.disconnect_peer_gateways()
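+ # remove the interfaces that prepare() registered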
+ if self.net_type == 'tcp':
+ for hostaddr in self.db.get_hostaddr():
+ ip = string.split(hostaddr, '/')[0]
+ lctl.del_interface(self.net_type, ip)
def correct_level(self, level, op=None):
return level
def server_for_route(self, net_type, gw, gw_cluster_id, tgt_cluster_id,
lo, hi):
- # only setup connections for tcp and openib NALs
- srvdb = None
-
- if not net_type in ('tcp','openib'):
+ # only setup connections for tcp, openib, and iib NALs
+ srvdb = None
+ if not net_type in ('tcp','openib','iib',):
return None
# connect to target if route is to single node and this node is the gw
return None
return Network(srvdb)
-
+
def prepare(self):
if not config.record and is_network_prepared():
return
self.devlist = self.db.get_lov_tgts('lov_tgt')
self.stripe_cnt = self.db.get_val_int('stripecount', len(self.devlist))
self.osclist = []
- self.obdlist = []
+ self.obdlist = []
self.desc_uuid = self.uuid
self.uuid = generate_client_uuid(self.name)
self.fs_name = fs_name
self.mdclist.append(mdc)
else:
panic('mdc not found:', mds_uuid)
-
+
def prepare(self):
if is_prepared(self.name):
return
if not self.lmv:
panic("No LMV initialized and not lovconfig_uuid found")
-
+
lovconfig_uuid = self.lmv.get_first_ref('lovconfig')
lovconfig = self.lmv.lookup(lovconfig_uuid)
lov_uuid = lovconfig.get_first_ref('lov')
stripe_count = lov.stripe_cnt
else:
stripe_count = len(lov.devlist)
-
- if stripe_count > 77:
+ if stripe_count > 77:
self.inode_size = 4096
elif stripe_count > 35:
self.inode_size = 2048
if self.fstype == 'smfs':
self.add_lustre_module('smfs', 'smfs')
-
+
if self.fstype == 'ldiskfs':
self.add_lustre_module('ldiskfs', 'ldiskfs')
if self.fstype:
self.add_lustre_module('lvfs', 'fsfilt_%s' % (self.fstype))
-
- # if fstype is smfs, then we should also take care about backing
+
+ # if fstype is smfs, then we should also take care about backing
# store fs.
if self.fstype == 'smfs':
self.add_lustre_module('lvfs', 'fsfilt_%s' % (self.backfstype))
blkdev = block_dev(self.devpath, self.size, self.fstype, 0,
self.format, self.journal_size, self.inode_size,
self.mkfsoptions, self.backfstype, self.backdevpath)
-
+
if not is_prepared('MDT'):
lctl.newdev("mdt", 'MDT', 'MDT_UUID', setup ="")
- try:
+ try:
mountfsoptions = def_mount_options(self.fstype, 'mds')
-
+
if config.mountfsoptions:
if mountfsoptions:
mountfsoptions = mountfsoptions + ',' + config.mountfsoptions
mountfsoptions = mountfsoptions + ',' + self.mountfsoptions
else:
mountfsoptions = self.mountfsoptions
-
+
if self.fstype == 'smfs':
realdev = self.fstype
-
+
if mountfsoptions:
- mountfsoptions = "%s,type=%s,dev=%s" % (mountfsoptions,
- self.backfstype,
+ mountfsoptions = "%s,type=%s,dev=%s" % (mountfsoptions,
+ self.backfstype,
blkdev)
else:
- mountfsoptions = "type=%s,dev=%s" % (self.backfstype,
+ mountfsoptions = "type=%s,dev=%s" % (self.backfstype,
blkdev)
else:
realdev = blkdev
-
+
print 'MDS mount options: ' + mountfsoptions
-
+
if not self.master_mds:
- self.master_mds = 'dumb'
+ self.master_mds = 'dumb'
if not self.cachetype:
self.cachetype = 'dumb'
lctl.newdev("mds", self.name, self.uuid,
- setup ="%s %s %s %s %s %s" %(realdev, self.fstype,
+ setup ="%s %s %s %s %s %s" %(realdev, self.fstype,
self.name, mountfsoptions,
self.master_mds, self.cachetype))
if self.fstype == 'smfs':
realdev = self.fstype
-
+
if mountfsoptions:
- mountfsoptions = "%s,type=%s,dev=%s" % (mountfsoptions,
- self.backfstype,
+ mountfsoptions = "%s,type=%s,dev=%s" % (mountfsoptions,
+ self.backfstype,
blkdev)
else:
- mountfsoptions = "type=%s,dev=%s" % (self.backfstype,
+ mountfsoptions = "type=%s,dev=%s" % (self.backfstype,
blkdev)
else:
realdev = blkdev
print 'MDS mount options: ' + mountfsoptions
- # As mount options are passed by 4th param to config tool, we need
+ # As mount options are passed by 4th param to config tool, we need
# to pass something in 3rd param. But we do not want this 3rd param
# be counted as a profile name for reading log on MDS setup, thus,
- # we pass there some predefined sign like 'dumb', which will be
+ # we pass there some predefined sign like 'dumb', which will be
# checked in MDS code and skipped. Probably there is more nice way
# like pass empty string and check it in config tool and pass null
# as 4th param.
lctl.newdev("mds", self.name, self.uuid,
- setup ="%s %s %s %s" %(realdev, self.fstype,
+ setup ="%s %s %s %s" %(realdev, self.fstype,
'dumb', mountfsoptions))
do_cleanup = 1
# this is ugly, should be organized nice later.
target_uuid = self.db.get_first_ref('target')
mds = self.db.lookup(target_uuid)
-
+
lovconfig_uuid = mds.get_first_ref('lovconfig')
if lovconfig_uuid:
lovconfig = mds.lookup(lovconfig_uuid)
obd_uuid = lovconfig.get_first_ref('lov')
else:
obd_uuid = fs.get_first_ref('obd')
-
+
client_uuid = generate_client_uuid(self.name)
client = VOSC(self.db.lookup(obd_uuid), client_uuid, self.name,
self.name)
print "cleanup failed: ", self.name
e.dump()
cleanup_error(e.rc)
-
+
if self.fstype == 'smfs':
clean_loop(self.backdevpath)
else:
self.active = 0
if self.active and config.group and config.group != ost.get_val('group'):
self.active = 0
-
+
self.target_dev_uuid = self.uuid
self.uuid = target_uuid
# modules
self.backdevpath)
mountfsoptions = def_mount_options(self.fstype, 'ost')
-
+
if config.mountfsoptions:
if mountfsoptions:
mountfsoptions = mountfsoptions + ',' + config.mountfsoptions
mountfsoptions = mountfsoptions + ',' + self.mountfsoptions
else:
mountfsoptions = self.mountfsoptions
-
+
if self.fstype == 'smfs':
realdev = self.fstype
-
+
if mountfsoptions:
- mountfsoptions = "%s,type=%s,dev=%s" % (mountfsoptions,
- self.backfstype,
+ mountfsoptions = "%s,type=%s,dev=%s" % (mountfsoptions,
+ self.backfstype,
blkdev)
else:
- mountfsoptions = "type=%s,dev=%s" % (self.backfstype,
+ mountfsoptions = "type=%s,dev=%s" % (self.backfstype,
blkdev)
else:
realdev = blkdev
-
+
print 'OSD mount options: ' + mountfsoptions
-
+
lctl.newdev(self.osdtype, self.name, self.uuid,
setup ="%s %s %s %s" %(realdev, self.fstype,
- self.failover_ost,
+ self.failover_ost,
mountfsoptions))
if not is_prepared('OSS'):
lctl.newdev("ost", 'OSS', 'OSS_UUID', setup ="")
self.db = tgtdb
self.active = 1
self.backup_targets = []
-
+
self.tgt_dev_uuid = get_active_target(tgtdb)
if not self.tgt_dev_uuid:
panic("No target device found for target(1):", self.target_name)
if name_override != None:
self.name = "lov_%s" % name_override
self.add_lustre_module('lov', 'lov')
- self.stripe_sz = 65536
- self.stripe_off = 0
+ self.stripe_sz = 65536
+ self.stripe_off = 0
self.pattern = 0
- self.stripe_cnt = 1
+ self.stripe_cnt = 1
self.desc_uuid = self.uuid
self.uuid = generate_client_uuid(self.name)
self.fs_name = fs_name
self.osc = get_osc(db, self.uuid, fs_name)
- if not self.osc:
+ if not self.osc:
panic('osc not found:', self.uuid)
if config_only:
self.config_only = 1
self.stripe_sz, self.stripe_off, self.pattern)
target_uuid = self.osc.target_uuid
try:
- self.osc.active = 1
+ self.osc.active = 1
self.osc.prepare(ignore_connect_failure=0)
except CommandError, e:
print "Error preparing OSC %s\n" % osc.uuid
class CMOBD(Module):
def __init__(self,db):
Module.__init__(self, 'CMOBD', db)
- self.name = self.db.getName();
+ self.name = self.db.getName();
self.uuid = generate_client_uuid(self.name)
self.master_uuid = self.db.get_first_ref('masterobd')
self.cache_uuid = self.db.get_first_ref('cacheobd')
panic('cache obd not found:', self.cache_uuid)
if master_obd.get_class() == 'ost':
- self.client_uuid = generate_client_uuid(self.name)
- self.master= VLOV(master_obd, self.client_uuid, self.name,
+ self.client_uuid = generate_client_uuid(self.name)
+ self.master= VLOV(master_obd, self.client_uuid, self.name,
"%s_master" % (self.name))
self.master_uuid = self.master.get_uuid()
else:
- self.master = get_mdc(db, self.name, self.master_uuid)
+ self.master = get_mdc(db, self.name, self.master_uuid)
# need to check /proc/mounts and /etc/mtab before
# formatting anything.
# FIXME: check if device is already formatted.
def cleanup_module(self):
Module.cleanup_module(self)
self.master.cleanup_module()
-
+
def correct_level(self, level, op=None):
return level
class COBD(Module):
def __init__(self, db, uuid, name, type, name_override = None):
Module.__init__(self, 'COBD', db)
- self.name = self.db.getName();
+ self.name = self.db.getName();
self.uuid = generate_client_uuid(self.name)
self.real_uuid = self.db.get_first_ref('realobd')
self.cache_uuid = self.db.get_first_ref('cacheobd')
if not cache_obd:
panic('cache obd not found:', self.cache_uuid)
if type == 'obd':
- self.real = LOV(real_obd, self.real_uuid, name,
+ self.real = LOV(real_obd, self.real_uuid, name,
"%s_real" % (self.name));
- self.cache = LOV(cache_obd, self.cache_uuid, name,
+ self.cache = LOV(cache_obd, self.cache_uuid, name,
"%s_cache" % (self.name));
else:
- self.real = get_mdc(db, name, self.real_uuid)
- self.cache = get_mdc(db, name, self.cache_uuid)
+ self.real = get_mdc(db, name, self.real_uuid)
+ self.cache = get_mdc(db, name, self.cache_uuid)
# need to check /proc/mounts and /etc/mtab before
# formatting anything.
# FIXME: check if device is already formatted.
ost = self.db.lookup(self.obd_uuid)
if not ost:
panic("no ost: ", self.obd_uuid)
-
+
mds = self.db.lookup(self.mds_uuid)
if not mds:
panic("no mds: ", self.mds_uuid)
-
+
self.add_lustre_module('mdc', 'mdc')
self.add_lustre_module('lmv', 'lmv')
self.add_lustre_module('llite', 'llite')
-
+
self.vosc = VOSC(ost, client_uuid, self.name)
self.vmdc = VMDC(mds, client_uuid, self.name)
-
+
def prepare(self):
if not config.record and fs_is_mounted(self.path):
log(self.path, "already mounted.")
self.clientoptions = ',' + self.clientoptions
# Linux kernel will deal with async and not pass it to ll_fill_super,
# so replace it with Lustre async
- self.clientoptions = string.replace(self.clientoptions, "async",
+ self.clientoptions = string.replace(self.clientoptions, "async",
"lasync")
cmd = "mount -t lustre_lite -o osc=%s,mdc=%s%s %s %s" % \
- (self.vosc.get_name(), vmdc_name, self.clientoptions,
+ (self.vosc.get_name(), vmdc_name, self.clientoptions,
config.config, self.path)
run("mkdir", self.path)
ret, val = run(cmd)
if ret:
- self.vmdc.cleanup()
+ self.vmdc.cleanup()
self.vosc.cleanup()
panic("mount failed:", self.path, ":", string.join(val))
return srv_list
-# the order of iniitailization is based on level.
+# the order of initialization is based on level.
def getServiceLevel(self):
type = self.get_class()
ret=0;
elif type in ('lmv',):
ret = 45
elif type in ('cmobd',):
- ret = 50
+ ret = 50
elif type in ('mountpoint', 'echoclient'):
ret = 70
else:
############################################################
# routing ("rooting")
+
# list of (nettype, cluster_id, nid)
local_clusters = []
if srv.port > 0:
if acceptors.has_key(srv.port):
panic("duplicate port:", srv.port)
- acceptors[srv.port] = AcceptorHandler(srv.port, srv.net_type)
+ acceptors[srv.port] = AcceptorHandler(srv.port, srv.net_type)
# This node is a gateway.
is_router = 0
if (r[3] <= to and to <= r[4]) and cluster_id == r[2]:
result.append((srv, r))
return result
-
+
def get_active_target(db):
target_uuid = db.getUUID()
target_name = db.getName()
net = Network(n)
if net.nid_uuid == nid_uuid:
return net
-
+
############################################################
# lconf level logic
#
# Prepare the system to run lustre using a particular profile
-# in a the configuration.
+# in the configuration.
# * load & the modules
# * setup networking for the current node
# * make sure partitions are in place and prepared
prof_db = db.lookup(prof_uuid)
if not prof_db:
panic("profile:", prof_uuid, "not found.")
- services = getServices(prof_db)
+ services = getServices(prof_db)
operation(services)
def magic_get_osc(db, rec, lov):
n.cleanup_module()
#
-# Load profile for
+# Load profile for
def doHost(lustreDB, hosts):
global is_router, local_node_name
node_db = None
timeout = node_db.get_val_int('timeout', 0)
ptldebug = node_db.get_val('ptldebug', '')
subsystem = node_db.get_val('subsystem', '')
-
+
find_local_clusters(node_db)
if not is_router:
find_local_routes(lustreDB)
base = os.path.dirname(cmd)
if development_mode():
if not config.lustre:
- debug('using objdir module paths')
+ debug('using objdir module paths')
config.lustre = (os.path.join(base, ".."))
# normalize the portals dir, using command line arg if set
if config.portals:
debug('config.portals', config.portals)
elif config.lustre and config.portals:
# production mode
- # if --lustre and --portals, normalize portals
+ # if --lustre and --portals, normalize portals
# can ignore POTRALS_DIR here, since it is probly useless here
config.portals = os.path.join(config.lustre, config.portals)
debug('config.portals B', config.portals)
fp = open(path, 'w')
fp.write('%d\n' %(max))
fp.close()
-
-
+
+
def sys_make_devices():
if not os.access('/dev/portals', os.R_OK):
run('mknod /dev/portals c 10 240')
if new_dir in syspath:
return
os.environ['PATH'] = os.environ['PATH'] + ':' + new_dir
-
+
def default_debug_path():
path = '/tmp/lustre-log'
if os.path.isdir('/r'):
PARAM),
('minlevel', "Minimum level of services to configure/cleanup",
INTPARAM, 0),
- ('maxlevel', """Maximum level of services to configure/cleanup
+ ('maxlevel', """Maximum level of services to configure/cleanup
Levels are aproximatly like:
10 - netwrk
20 - device, ldlm
('inactive', """The name of an inactive service, to be ignored during
mounting (currently OST-only). Can be repeated.""",
PARAMLIST),
- ]
+ ]
def main():
global lctl, config, toplustreDB, CONFIG_FILE
# in the upcall this is set to SIG_IGN
signal.signal(signal.SIGCHLD, signal.SIG_DFL)
-
+
cl = Lustre.Options("lconf", "config.xml", lconf_options)
try:
config, args = cl.parse(sys.argv[1:])
random.seed(seed)
sanitise_path()
-
+
init_select(config.select)
if len(args) > 0:
--add net
--node node_name
--nid nid
- --cluster_id
- --nettype tcp|elan|gm|openib
- --hostaddr addr
+ --cluster_id
+ --nettype tcp|elan|gm|openib|iib
+ --hostaddr ip[/netmask]
--port port
--tcpbuf size
--irq_affinity 0|1
"""
PARAM = Lustre.Options.PARAM
+PARAMLIST = Lustre.Options.PARAMLIST
lmc_options = [
# lmc input/output options
('reference', "Print short reference for commands."),
('ptldebug', "Set the portals debug level", PARAM),
('subsystem', "Specify which Lustre subsystems have debug output recorded in the log", PARAM),
- # network
- ('nettype', "Specify the network type. This can be tcp/elan/gm/openib.", PARAM),
+ # network
+ ('nettype', "Specify the network type. This can be tcp/elan/gm/openib/iib.", PARAM),
('nid', "Give the network ID, e.g ElanID/IP Address as used by portals.", PARAM),
('port', "Optional argument to specify the TCP port number.", PARAM, DEFAULT_PORT),
- ('hostaddr', "", PARAM,""),
+ ('hostaddr', "Optional argument to specify the host address.", PARAMLIST),
('cluster_id', "Specify the cluster ID", PARAM, "0"),
# routes
network.setAttribute("nettype", net);
self.addElement(network, "nid", nid)
self.addElement(network, "clusterid", cluster_id)
- if hostaddr:
- self.addElement(network, "hostaddr", hostaddr)
+ for host in hostaddr:
+ self.addElement(network, "hostaddr", host)
if port:
self.addElement(network, "port", "%d" %(port))
if net_type in ('tcp',):
port = get_option_int(options, 'port')
- elif net_type in ('elan', 'gm', 'openib'):
+ elif net_type in ('elan', 'gm', 'openib','iib'):
port = 0
else:
print "Unknown net_type: ", net_type
return rc;
}
+
int jt_llog_check(int argc, char **argv)
{
struct obd_ioctl_data data;
return rc;
}
+
int jt_obd_reint_sync(int argc, char **argv)
{
struct obd_ioctl_data data;
return rc;
}
+
int jt_obd_cache_off(int argc, char **argv)
{
struct obd_ioctl_data data;
rc);
return rc;
}
+
int jt_obd_snap_add(int argc, char **argv)
{
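+ /* stubbed out until the missing ioctl #defines exist; see the #error below */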
+#if 1
+ return -1;
+#else
+# error "FIX the missing #defines before committing"
struct obd_ioctl_data data;
int rc = 0;
if (rc)
fprintf(stderr, "OBD_IOC_SNAP_ADD failed: rc=%d\n", rc);
return rc;
+#endif
}
+
static void signal_server(int sig)
{
if (sig == SIGINT) {