From fb1d86804d2e9f82045c5198b2a9850321c64fb9 Mon Sep 17 00:00:00 2001
From: eeb
Date: Fri, 29 Oct 2004 15:06:22 +0000
Subject: [PATCH] * landed unified portals (b_hd_cleanup_merge_singleportals) on HEAD

---
 lnet/archdep.m4 | 167 +-
 lnet/build.m4 | 9 +
 lnet/include/linux/.cvsignore | 2 +
 lnet/include/linux/kp30.h | 56 +-
 lnet/include/linux/libcfs.h | 56 +-
 lnet/include/linux/portals_compat25.h | 2 +
 lnet/include/lnet/.cvsignore | 2 +
 lnet/include/lnet/build_check.h | 2 +-
 lnet/include/lnet/lnetctl.h | 8 -
 lnet/include/lnet/ptlctl.h | 8 -
 lnet/klnds/Makefile.in | 1 +
 lnet/klnds/autoMakefile.am | 2 +-
 lnet/klnds/iiblnd/.cvsignore | 10 +
 lnet/klnds/iiblnd/Makefile.in | 6 +
 lnet/klnds/iiblnd/Makefile.mk | 10 +
 lnet/klnds/iiblnd/autoMakefile.am | 15 +
 lnet/klnds/iiblnd/iiblnd.c | 1713 +++++++++++
 lnet/klnds/iiblnd/iiblnd.h | 892 ++++++
 lnet/klnds/iiblnd/iiblnd_cb.c | 3018 ++++++++++++++++++++
 lnet/klnds/openiblnd/.cvsignore | 10 +
 lnet/klnds/openiblnd/openiblnd.c | 971 ++++---
 lnet/klnds/openiblnd/openiblnd.h | 480 ++--
 lnet/klnds/openiblnd/openiblnd_cb.c | 1387 ++++-----
 lnet/klnds/qswlnd/qswlnd.c | 145 +-
 lnet/klnds/qswlnd/qswlnd.h | 14 +-
 lnet/klnds/scimaclnd/scimacnal.c | 2 +-
 lnet/klnds/socklnd/socklnd.c | 14 +-
 lnet/klnds/socklnd/socklnd.h | 2 -
 lnet/klnds/socklnd/socklnd_cb.c | 156 +-
 lnet/libcfs/debug.c | 45 +-
 lnet/libcfs/module.c | 18 +-
 lnet/libcfs/tracefile.c | 6 +-
 lnet/lnet/lib-move.c | 3 +-
 lnet/lnet/module.c | 6 +-
 lnet/router/proc.c | 19 +-
 lnet/ulnds/connection.c | 112 +-
 lnet/ulnds/dispatch.h | 7 +
 lnet/ulnds/procapi.c | 8 +
 lnet/ulnds/select.c | 327 ++-
 lnet/ulnds/socklnd/connection.c | 112 +-
 lnet/ulnds/socklnd/dispatch.h | 7 +
 lnet/ulnds/socklnd/procapi.c | 8 +
 lnet/ulnds/socklnd/select.c | 327 ++-
 lnet/ulnds/socklnd/tcplnd.c | 2 -
 lnet/ulnds/tcplnd.c | 2 -
 lnet/utils/acceptor.c | 41 +-
 lnet/utils/debug.c | 44 +-
 lnet/utils/portals.c | 138 +-
 lustre/configure.in | 2 +
 .../patches/kksymoops-2.4.24.vanilla.patch | 2 +-
 lustre/portals/archdep.m4 | 167 +-
 lustre/portals/build.m4 | 9 +
 lustre/portals/include/linux/.cvsignore | 2 +
 lustre/portals/include/linux/kp30.h | 56 +-
 lustre/portals/include/linux/libcfs.h | 56 +-
 lustre/portals/include/linux/portals_compat25.h | 2 +
 lustre/portals/include/portals/.cvsignore | 2 +
 lustre/portals/include/portals/build_check.h | 2 +-
 lustre/portals/include/portals/ptlctl.h | 8 -
 lustre/portals/knals/Makefile.in | 1 +
 lustre/portals/knals/autoMakefile.am | 2 +-
 lustre/portals/knals/iibnal/.cvsignore | 10 +
 lustre/portals/knals/iibnal/Makefile.in | 6 +
 lustre/portals/knals/iibnal/Makefile.mk | 10 +
 lustre/portals/knals/iibnal/autoMakefile.am | 15 +
 lustre/portals/knals/iibnal/iibnal.c | 1713 +++++++++++
 lustre/portals/knals/iibnal/iibnal.h | 892 ++++++
 lustre/portals/knals/iibnal/iibnal_cb.c | 3018 ++++++++++++++++++++
 lustre/portals/knals/openibnal/.cvsignore | 10 +
 lustre/portals/knals/openibnal/openibnal.c | 971 ++++---
 lustre/portals/knals/openibnal/openibnal.h | 480 ++--
 lustre/portals/knals/openibnal/openibnal_cb.c | 1387 ++++-----
 lustre/portals/knals/qswnal/qswnal.c | 145 +-
 lustre/portals/knals/qswnal/qswnal.h | 14 +-
 lustre/portals/knals/scimacnal/scimacnal.c | 2 +-
 lustre/portals/knals/socknal/socknal.c | 14 +-
 lustre/portals/knals/socknal/socknal.h | 2 -
 lustre/portals/knals/socknal/socknal_cb.c | 156 +-
 lustre/portals/libcfs/debug.c | 45 +-
 lustre/portals/libcfs/module.c | 18 +-
 lustre/portals/libcfs/tracefile.c | 6 +-
 lustre/portals/portals/lib-move.c | 3 +-
 lustre/portals/portals/module.c | 6 +-
 lustre/portals/router/proc.c | 19
+- lustre/portals/unals/connection.c | 112 +- lustre/portals/unals/dispatch.h | 7 + lustre/portals/unals/procapi.c | 8 + lustre/portals/unals/select.c | 327 ++- lustre/portals/unals/tcpnal.c | 2 - lustre/portals/utils/acceptor.c | 41 +- lustre/portals/utils/debug.c | 44 +- lustre/portals/utils/portals.c | 138 +- lustre/utils/lconf | 347 ++- lustre/utils/lmc | 19 +- lustre/utils/obd.c | 10 + 95 files changed, 16856 insertions(+), 3844 deletions(-) create mode 100644 lnet/include/linux/.cvsignore create mode 100644 lnet/include/lnet/.cvsignore create mode 100644 lnet/klnds/iiblnd/.cvsignore create mode 100644 lnet/klnds/iiblnd/Makefile.in create mode 100644 lnet/klnds/iiblnd/Makefile.mk create mode 100644 lnet/klnds/iiblnd/autoMakefile.am create mode 100644 lnet/klnds/iiblnd/iiblnd.c create mode 100644 lnet/klnds/iiblnd/iiblnd.h create mode 100644 lnet/klnds/iiblnd/iiblnd_cb.c create mode 100644 lnet/klnds/openiblnd/.cvsignore create mode 100644 lustre/portals/include/linux/.cvsignore create mode 100644 lustre/portals/include/portals/.cvsignore create mode 100644 lustre/portals/knals/iibnal/.cvsignore create mode 100644 lustre/portals/knals/iibnal/Makefile.in create mode 100644 lustre/portals/knals/iibnal/Makefile.mk create mode 100644 lustre/portals/knals/iibnal/autoMakefile.am create mode 100644 lustre/portals/knals/iibnal/iibnal.c create mode 100644 lustre/portals/knals/iibnal/iibnal.h create mode 100644 lustre/portals/knals/iibnal/iibnal_cb.c create mode 100644 lustre/portals/knals/openibnal/.cvsignore diff --git a/lnet/archdep.m4 b/lnet/archdep.m4 index d2bd1a1..021fa68 100644 --- a/lnet/archdep.m4 +++ b/lnet/archdep.m4 @@ -14,26 +14,107 @@ AC_MSG_RESULT([$enable_inkernel]) AM_CONDITIONAL(INKERNEL, test x$enable_inkernel = xyes) # -------- are we building against an external portals? ------- -AC_MSG_CHECKING([if Cray portals should be used]) +AC_MSG_CHECKING([for Cray portals]) AC_ARG_WITH([cray-portals], AC_HELP_STRING([--with-cray-portals=path], [path to cray portals]), [ if test "$with_cray_portals" != no; then - if test -r $with_cray_portals/include/portals/api.h ; then - CRAY_PORTALS_PATH=$with_cray_portals - CRAY_PORTALS_INCLUDE="-I$with_cray_portals/include" - AC_DEFINE(CRAY_PORTALS, 1, [Building with Cray Portals]) - else - AC_MSG_ERROR([--with-cray-portals specified badly]) - fi - fi + CRAY_PORTALS_PATH=$with_cray_portals + CRAY_PORTALS_INCLUDES="$with_cray_portals/include" + CRAY_PORTALS_LIBS="$with_cray_portals" + fi ],[with_cray_portals=no]) AC_SUBST(CRAY_PORTALS_PATH) -AC_MSG_RESULT([$with_cray_portals]) +AC_MSG_RESULT([$CRAY_PORTALS_PATH]) + +AC_MSG_CHECKING([for Cray portals includes]) +AC_ARG_WITH([cray-portals-includes], + AC_HELP_STRING([--with-cray-portals-includes=path], + [path to cray portals includes]), + [ + if test "$with_cray_portals_includes" != no; then + CRAY_PORTALS_INCLUDES="$with_cray_portals_includes" + fi + ]) +AC_SUBST(CRAY_PORTALS_INCLUDES) +AC_MSG_RESULT([$CRAY_PORTALS_INCLUDES]) + +AC_MSG_CHECKING([for Cray portals libs]) +AC_ARG_WITH([cray-portals-libs], + AC_HELP_STRING([--with-cray-portals-libs=path], + [path to cray portals libs]), + [ + if test "$with_cray_portals_libs" != no; then + CRAY_PORTALS_LIBS="$with_cray_portals_libs" + fi + ]) +AC_SUBST(CRAY_PORTALS_LIBS) +AC_MSG_RESULT([$CRAY_PORTALS_LIBS]) + +if test x$CRAY_PORTALS_INCLUDES != x ; then + if test ! -r $CRAY_PORTALS_INCLUDES/portals/api.h ; then + AC_MSG_ERROR([Cray portals headers were not found in $CRAY_PORTALS_INCLUDES. 
Please check the paths passed to --with-cray-portals or --with-cray-portals-includes.]) + fi +fi +if test x$CRAY_PORTALS_LIBS != x ; then + if test ! -r $CRAY_PORTALS_LIBS/libportals.a ; then + AC_MSG_ERROR([Cray portals libraries were not found in $CRAY_PORTALS_LIBS. Please check the paths passed to --with-cray-portals or --with-cray-portals-libs.]) + fi +fi +AC_MSG_CHECKING([whether to use Cray portals]) +if test x$CRAY_PORTALS_INCLUDES != x -a x$CRAY_PORTALS_LIBS != x ; then + with_cray_portals=yes + AC_DEFINE(CRAY_PORTALS, 1, [Building with Cray Portals]) + CRAY_PORTALS_INCLUDES="-I$CRAY_PORTALS_INCLUDES" +else + with_cray_portals=no +fi +AC_MSG_RESULT([$with_cray_portals]) AM_CONDITIONAL(CRAY_PORTALS, test x$with_cray_portals != xno) +# ---------------------------------------- +# some tests for catamount-like systems +# ---------------------------------------- +AC_ARG_ENABLE([sysio_init], + AC_HELP_STRING([--disable-sysio-init], + [call sysio init functions when initializing liblustre]), + [],[enable_sysio_init=yes]) +AC_MSG_CHECKING([whether to initialize libsysio]) +AC_MSG_RESULT([$enable_sysio_init]) +if test x$enable_sysio_init != xno ; then + AC_DEFINE([INIT_SYSIO], 1, [call sysio init functions]) +fi + +AC_ARG_ENABLE([urandom], + AC_HELP_STRING([--disable-urandom], + [disable use of /dev/urandom for liblustre]), + [],[enable_urandom=yes]) +AC_MSG_CHECKING([whether to use /dev/urandom for liblustre]) +AC_MSG_RESULT([$enable_urandom]) +if test x$enable_urandom != xno ; then + AC_DEFINE([LIBLUSTRE_USE_URANDOM], 1, [use /dev/urandom for random data]) +fi + +# -------- check for -lcap and -lpthread ---- +if test x$enable_liblustre = xyes ; then + AC_CHECK_LIB([cap], [cap_get_proc], + [ + CAP_LIBS="-lcap" + AC_DEFINE([HAVE_LIBCAP], 1, [use libcap]) + ], + [CAP_LIBS=""]) + AC_SUBST(CAP_LIBS) + AC_CHECK_LIB([pthread], [pthread_create], + [ + PTHREAD_LIBS="-lpthread" + AC_DEFINE([HAVE_LIBPTHREAD], 1, [use libpthread]) + ], + [PTHREAD_LIBS=""]) + AC_SUBST(PTHREAD_LIBS) +fi + # -------- enable tests and utils? ------- if test x$enable_tests = xno ; then AC_MSG_NOTICE([disabling tests]) @@ -128,7 +209,7 @@ AM_CONDITIONAL(USE_QUILT, test x$QUILT != xno) # ------- Makeflags ------------------ -CPPFLAGS="$CRAY_PORTALS_INCLUDE $CRAY_PORTALS_COMMANDLINE -I\$(top_srcdir)/include -I\$(top_srcdir)/portals/include" +CPPFLAGS="$CPPFLAGS $CRAY_PORTALS_INCLUDES -I\$(top_srcdir)/include -I\$(top_srcdir)/portals/include" # liblustre are all the same LLCPPFLAGS="-D__arch_lib__ -D_LARGEFILE64_SOURCE=1" @@ -146,7 +227,7 @@ if test x$enable_ldiskfs = xyes ; then AC_DEFINE(CONFIG_LDISKFS_FS_SECURITY, 1, [enable fs security]) fi -EXTRA_KCFLAGS="-g $CRAY_PORTALS_INCLUDE $CRAY_PORTALS_COMMANDLINE -I$PWD/portals/include -I$PWD/include" +EXTRA_KCFLAGS="-g $CRAY_PORTALS_INCLUDES -I$PWD/portals/include -I$PWD/include" # these are like AC_TRY_COMPILE, but try to build modules against the # kernel, inside the kernel-tests directory @@ -408,6 +489,35 @@ if test x$enable_modules != xno ; then AC_SUBST(OPENIBCPPFLAGS) AC_SUBST(OPENIBNAL) + #### Infinicon IB + AC_MSG_CHECKING([if Infinicon IB kernel headers are present]) + # for how the only infinicon ib build has headers in /usr/include/iba + IIBCPPFLAGS="-I/usr/include -DIN_TREE_BUILD" + EXTRA_KCFLAGS_save="$EXTRA_KCFLAGS" + EXTRA_KCFLAGS="$EXTRA_KCFLAGS $IIBCPPFLAGS" + LUSTRE_MODULE_TRY_COMPILE( + [ + #include + ],[ + IBT_INTERFACE_UNION interfaces; + FSTATUS rc; + + rc = IbtGetInterfaceByVersion(IBT_INTERFACE_VERSION_2, + &interfaces); + + return rc == FSUCCESS ? 
0 : 1; + ],[ + AC_MSG_RESULT([yes]) + IIBNAL="iibnal" + ],[ + AC_MSG_RESULT([no]) + IIBNAL="" + IIBCPPFLAGS="" + ]) + EXTRA_KCFLAGS="$EXTRA_KCFLAGS_save" + AC_SUBST(IIBCPPFLAGS) + AC_SUBST(IIBNAL) + # ---------- Red Hat 2.4.18 has iobuf->dovary -------------- # But other kernels don't @@ -667,15 +777,34 @@ fi AM_CONDITIONAL(BUILD_QSWNAL, test x$QSWNAL = "xqswnal") AM_CONDITIONAL(BUILD_GMNAL, test x$GMNAL = "xgmnal") AM_CONDITIONAL(BUILD_OPENIBNAL, test x$OPENIBNAL = "xopenibnal") +AM_CONDITIONAL(BUILD_IIBNAL, test x$IIBNAL = "xiibnal") + +# portals/utils/portals.c +AC_CHECK_HEADERS([netdb.h netinet/tcp.h asm/types.h]) +AC_CHECK_FUNCS([gethostbyname socket connect]) + +# portals/utils/debug.c +AC_CHECK_HEADERS([linux/version.h]) + +# include/liblustre.h +AC_CHECK_HEADERS([asm/page.h sys/user.h stdint.h]) + +# liblustre/llite_lib.h +AC_CHECK_HEADERS([xtio.h file.h]) + +# liblustre/dir.c +AC_CHECK_HEADERS([linux/types.h sys/types.h linux/unistd.h unistd.h]) + +# liblustre/lutil.c +AC_CHECK_HEADERS([netinet/in.h arpa/inet.h catamount/data.h]) +AC_CHECK_FUNCS([inet_ntoa]) CPPFLAGS="-include \$(top_builddir)/include/config.h $CPPFLAGS" EXTRA_KCFLAGS="-include $PWD/include/config.h $EXTRA_KCFLAGS" AC_SUBST(EXTRA_KCFLAGS) -#echo "KCPPFLAGS: $KCPPFLAGS" -#echo "KCFLAGS: $KCFLAGS" -#echo "LLCPPFLAGS: $LLCPPFLAGS" -#echo "LLCFLAGS: $LLCFLAGS" -#echo "MOD_LINK: $MOD_LINK" -#echo "CFLAGS: $CFLAGS" -#echo "CPPFLAGS: $CPPFLAGS" +echo "CPPFLAGS: $CPPFLAGS" +echo "LLCPPFLAGS: $LLCPPFLAGS" +echo "CFLAGS: $CFLAGS" +echo "EXTRA_KCFLAGS: $EXTRA_KCFLAGS" +echo "LLCFLAGS: $LLCFLAGS" diff --git a/lnet/build.m4 b/lnet/build.m4 index 861bb4a..f158396 100644 --- a/lnet/build.m4 +++ b/lnet/build.m4 @@ -61,6 +61,13 @@ case "$CC_VERSION" in "gcc version 2.96 20000731 (Mandrake Linux 8.1 2.96-0.62mdk)") bad_cc ;; + # unpatched 'gcc' on rh9. miscompiles a + # struct = (type) { .member = value, }; + # asignment in the iibnal where the struct is a mix + # of u64 and u32 bit-fields. + "gcc version 3.2.2 20030222 (Red Hat Linux 3.2.2-5)") + bad_cc + ;; *) AC_MSG_RESULT([no known problems]) ;; @@ -116,3 +123,5 @@ else LIBWRAP="" fi AC_SUBST(LIBWRAP) + +AC_SUBST(LIBS) diff --git a/lnet/include/linux/.cvsignore b/lnet/include/linux/.cvsignore new file mode 100644 index 0000000..282522d --- /dev/null +++ b/lnet/include/linux/.cvsignore @@ -0,0 +1,2 @@ +Makefile +Makefile.in diff --git a/lnet/include/linux/kp30.h b/lnet/include/linux/kp30.h index db63a08..4e24c71d 100644 --- a/lnet/include/linux/kp30.h +++ b/lnet/include/linux/kp30.h @@ -294,7 +294,6 @@ extern void kportal_blockallsigs (void); # include # include # include -# include # ifndef DEBUG_SUBSYSTEM # define DEBUG_SUBSYSTEM S_UNDEFINED # endif @@ -320,6 +319,11 @@ void portals_debug_dumplog(void); printf("%02x:%06x (@%lu %s:%s,l. %d %d %lu): " format, \ (subsys), (mask), (long)time(0), file, fn, line, \ getpid() , stack, ## a); + +#undef CWARN +#undef CERROR +#define CWARN(format, a...) CDEBUG(D_WARNING, format, ## a) +#define CERROR(format, a...) CDEBUG(D_ERROR, format, ## a) #endif /* support decl needed both by kernel and liblustre */ @@ -338,6 +342,16 @@ char *portals_id2str(int nal, ptl_process_id_t nid, char *str); #define LWT_MEMORY (16<<20) #if !KLWT_SUPPORT +# if defined(__KERNEL__) +# if !defined(BITS_PER_LONG) +# error "BITS_PER_LONG not defined" +# endif +# elif !defined(__WORDSIZE) +# error "__WORDSIZE not defined" +# else +# define BITS_PER_LONG __WORDSIZE +# endif + /* kernel hasn't defined this? 
*/ typedef struct { long long lwte_when; @@ -572,49 +586,42 @@ static inline int portal_ioctl_getdata(char *buf, char *end, void *arg) data = (struct portal_ioctl_data *)buf; err = copy_from_user(buf, (void *)arg, sizeof(*hdr)); - if ( err ) { - EXIT; - return err; - } + if (err) + RETURN(err); if (hdr->ioc_version != PORTAL_IOCTL_VERSION) { - CERROR ("PORTALS: version mismatch kernel vs application\n"); - return -EINVAL; + CERROR("PORTALS: version mismatch kernel vs application\n"); + RETURN(-EINVAL); } if (hdr->ioc_len + buf >= end) { - CERROR ("PORTALS: user buffer exceeds kernel buffer\n"); - return -EINVAL; + CERROR("PORTALS: user buffer exceeds kernel buffer\n"); + RETURN(-EINVAL); } if (hdr->ioc_len < sizeof(struct portal_ioctl_data)) { - CERROR ("PORTALS: user buffer too small for ioctl\n"); - return -EINVAL; + CERROR("PORTALS: user buffer too small for ioctl\n"); + RETURN(-EINVAL); } err = copy_from_user(buf, (void *)arg, hdr->ioc_len); - if ( err ) { - EXIT; - return err; - } + if (err) + RETURN(err); if (portal_ioctl_is_invalid(data)) { - CERROR ("PORTALS: ioctl not correctly formatted\n"); - return -EINVAL; + CERROR("PORTALS: ioctl not correctly formatted\n"); + RETURN(-EINVAL); } - if (data->ioc_inllen1) { + if (data->ioc_inllen1) data->ioc_inlbuf1 = &data->ioc_bulk[0]; - } - if (data->ioc_inllen2) { + if (data->ioc_inllen2) data->ioc_inlbuf2 = &data->ioc_bulk[0] + size_round(data->ioc_inllen1); - } - EXIT; - return 0; + RETURN(0); } #endif @@ -645,10 +652,11 @@ enum { TCPNAL = 5, ROUTER = 6, OPENIBNAL = 7, + IIBNAL = 8, NAL_ENUM_END_MARKER }; -#define PTL_NALFMT_SIZE 30 /* %u:%u.%u.%u.%u,%u (10+4+4+4+3+4+1) */ +#define PTL_NALFMT_SIZE 32 /* %u:%u.%u.%u.%u,%u (10+4+4+4+3+5+1) */ #define NAL_MAX_NR (NAL_ENUM_END_MARKER - 1) diff --git a/lnet/include/linux/libcfs.h b/lnet/include/linux/libcfs.h index d1a5c44..8317f14 100644 --- a/lnet/include/linux/libcfs.h +++ b/lnet/include/linux/libcfs.h @@ -4,7 +4,11 @@ #ifndef _LIBCFS_H #define _LIBCFS_H +#ifdef HAVE_ASM_TYPES_H #include +#else +#include "types.h" +#endif #ifdef __KERNEL__ # include @@ -62,7 +66,6 @@ extern unsigned int portal_stack; extern unsigned int portal_debug; extern unsigned int portal_printk; -#include struct ptldebug_header { __u32 ph_len; __u32 ph_flags; @@ -102,7 +105,7 @@ struct ptldebug_header { #define S_GMNAL 0x00080000 #define S_PTLROUTER 0x00100000 #define S_COBD 0x00200000 -#define S_OPENIBNAL 0x00400000 +#define S_IBNAL 0x00400000 /* All IB NALs */ #define S_SM 0x00800000 #define S_ASOBD 0x01000000 #define S_LMV 0x02000000 @@ -185,8 +188,40 @@ do { \ CDEBUG_STACK, format, ## a); \ } while (0) -#define CWARN(format, a...) CDEBUG(D_WARNING, format, ## a) -#define CERROR(format, a...) CDEBUG(D_ERROR, format, ## a) +#define CDEBUG_MAX_LIMIT 600 +#define CDEBUG_LIMIT(cdebug_mask, cdebug_format, a...) \ +do { \ + static unsigned long cdebug_next; \ + static int cdebug_count, cdebug_delay = 1; \ + \ + CHECK_STACK(CDEBUG_STACK); \ + if (time_after(jiffies, cdebug_next)) { \ + portals_debug_msg(DEBUG_SUBSYSTEM, cdebug_mask, __FILE__, \ + __FUNCTION__, __LINE__, CDEBUG_STACK, \ + cdebug_format, ## a); \ + if (cdebug_count) { \ + portals_debug_msg(DEBUG_SUBSYSTEM, cdebug_mask, \ + __FILE__, __FUNCTION__, __LINE__, \ + CDEBUG_STACK, cdebug_format, ## a); \ + cdebug_count = 0; \ + } \ + if (time_after(jiffies, cdebug_next+(CDEBUG_MAX_LIMIT+10)*HZ))\ + cdebug_delay = cdebug_delay > 8 ? 
cdebug_delay/8 : 1; \ + else \ + cdebug_delay = cdebug_delay*2 >= CDEBUG_MAX_LIMIT*HZ ?\ + CDEBUG_MAX_LIMIT*HZ : cdebug_delay*2; \ + cdebug_next = jiffies + cdebug_delay; \ + } else { \ + portals_debug_msg(DEBUG_SUBSYSTEM, \ + portal_debug & ~(D_EMERG|D_ERROR|D_WARNING),\ + __FILE__, __FUNCTION__, __LINE__, \ + CDEBUG_STACK, cdebug_format, ## a); \ + cdebug_count++; \ + } \ +} while (0) + +#define CWARN(format, a...) CDEBUG_LIMIT(D_WARNING, format, ## a) +#define CERROR(format, a...) CDEBUG_LIMIT(D_ERROR, format, ## a) #define CEMERG(format, a...) CDEBUG(D_EMERG, format, ## a) #define GOTO(label, rc) \ @@ -229,14 +264,13 @@ do { \ /* initial pid */ # if CRAY_PORTALS /* + * 1) ptl_pid_t in cray portals is only 16 bits, not 32 bits, therefore this + * is too big. * - * 1) ptl_pid_t in cray portals is only 16 bits, not 32 bits, therefore this is too - * big. - * - * 2) the implementation of ernal in cray portals further restricts the pid space - * that may be used to 0 <= pid <= 255 (an 8 bit value). Returns an error at nal - * init time for any pid outside this range. Other nals in cray portals don't have - * this restriction. + * 2) the implementation of ernal in cray portals further restricts the pid + * space that may be used to 0 <= pid <= 255 (an 8 bit value). Returns + * an error at nal init time for any pid outside this range. Other nals + * in cray portals don't have this restriction. * */ #define LUSTRE_PTL_PID 9 # else diff --git a/lnet/include/linux/portals_compat25.h b/lnet/include/linux/portals_compat25.h index 7fe6dfc..5a43a45 100644 --- a/lnet/include/linux/portals_compat25.h +++ b/lnet/include/linux/portals_compat25.h @@ -28,6 +28,8 @@ call_usermodehelper(path, argv, envp, 1) # define RECALC_SIGPENDING recalc_sigpending() # define CURRENT_SECONDS get_seconds() +# define smp_num_cpus NR_CPUS + #elif defined(CONFIG_RH_2_4_20) /* RH 2.4.x */ diff --git a/lnet/include/lnet/.cvsignore b/lnet/include/lnet/.cvsignore new file mode 100644 index 0000000..282522d --- /dev/null +++ b/lnet/include/lnet/.cvsignore @@ -0,0 +1,2 @@ +Makefile +Makefile.in diff --git a/lnet/include/lnet/build_check.h b/lnet/include/lnet/build_check.h index 5db1352..c219d2a 100644 --- a/lnet/include/lnet/build_check.h +++ b/lnet/include/lnet/build_check.h @@ -1,7 +1,7 @@ #ifndef _BUILD_CHECK_H #define _BUILD_CHECK_H -#ifdef CRAY_PORTALS +#if CRAY_PORTALS #error "an application got to me instead of cray's includes" #endif diff --git a/lnet/include/lnet/lnetctl.h b/lnet/include/lnet/lnetctl.h index a81a371..cfddde2 100644 --- a/lnet/include/lnet/lnetctl.h +++ b/lnet/include/lnet/lnetctl.h @@ -31,8 +31,6 @@ #define PORTALS_DEV_PATH "/dev/portals" #define OBD_DEV_ID 1 #define OBD_DEV_PATH "/dev/obd" -#define SMFS_DEV_ID 2 -#define SMFS_DEV_PATH "/dev/snapdev" int ptl_name2nal(char *str); int ptl_parse_ipaddr (__u32 *ipaddrp, char *str); @@ -41,9 +39,6 @@ char * ptl_nid2str (char *buffer, ptl_nid_t nid); int ptl_initialize(int argc, char **argv); int jt_ptl_network(int argc, char **argv); -int jt_ptl_print_autoconnects (int argc, char **argv); -int jt_ptl_add_autoconnect (int argc, char **argv); -int jt_ptl_del_autoconnect (int argc, char **argv); int jt_ptl_print_interfaces(int argc, char **argv); int jt_ptl_add_interface(int argc, char **argv); int jt_ptl_del_interface(int argc, char **argv); @@ -62,9 +57,6 @@ int jt_ptl_add_uuid(int argc, char **argv); int jt_ptl_add_uuid_old(int argc, char **argv); /* backwards compatibility */ int jt_ptl_close_uuid(int argc, char **argv); int jt_ptl_del_uuid(int argc, char 
**argv); -int jt_ptl_rxmem (int argc, char **argv); -int jt_ptl_txmem (int argc, char **argv); -int jt_ptl_nagle (int argc, char **argv); int jt_ptl_add_route (int argc, char **argv); int jt_ptl_del_route (int argc, char **argv); int jt_ptl_notify_router (int argc, char **argv); diff --git a/lnet/include/lnet/ptlctl.h b/lnet/include/lnet/ptlctl.h index a81a371..cfddde2 100644 --- a/lnet/include/lnet/ptlctl.h +++ b/lnet/include/lnet/ptlctl.h @@ -31,8 +31,6 @@ #define PORTALS_DEV_PATH "/dev/portals" #define OBD_DEV_ID 1 #define OBD_DEV_PATH "/dev/obd" -#define SMFS_DEV_ID 2 -#define SMFS_DEV_PATH "/dev/snapdev" int ptl_name2nal(char *str); int ptl_parse_ipaddr (__u32 *ipaddrp, char *str); @@ -41,9 +39,6 @@ char * ptl_nid2str (char *buffer, ptl_nid_t nid); int ptl_initialize(int argc, char **argv); int jt_ptl_network(int argc, char **argv); -int jt_ptl_print_autoconnects (int argc, char **argv); -int jt_ptl_add_autoconnect (int argc, char **argv); -int jt_ptl_del_autoconnect (int argc, char **argv); int jt_ptl_print_interfaces(int argc, char **argv); int jt_ptl_add_interface(int argc, char **argv); int jt_ptl_del_interface(int argc, char **argv); @@ -62,9 +57,6 @@ int jt_ptl_add_uuid(int argc, char **argv); int jt_ptl_add_uuid_old(int argc, char **argv); /* backwards compatibility */ int jt_ptl_close_uuid(int argc, char **argv); int jt_ptl_del_uuid(int argc, char **argv); -int jt_ptl_rxmem (int argc, char **argv); -int jt_ptl_txmem (int argc, char **argv); -int jt_ptl_nagle (int argc, char **argv); int jt_ptl_add_route (int argc, char **argv); int jt_ptl_del_route (int argc, char **argv); int jt_ptl_notify_router (int argc, char **argv); diff --git a/lnet/klnds/Makefile.in b/lnet/klnds/Makefile.in index 2a01119..9763d14 100644 --- a/lnet/klnds/Makefile.in +++ b/lnet/klnds/Makefile.in @@ -1,5 +1,6 @@ @BUILD_GMNAL_TRUE@subdir-m += gmnal @BUILD_OPENIBNAL_TRUE@subdir-m += openibnal +@BUILD_IIBNAL_TRUE@subdir-m += iibnal @BUILD_QSWNAL_TRUE@subdir-m += qswnal subdir-m += socknal diff --git a/lnet/klnds/autoMakefile.am b/lnet/klnds/autoMakefile.am index 002c169..0090364 100644 --- a/lnet/klnds/autoMakefile.am +++ b/lnet/klnds/autoMakefile.am @@ -3,4 +3,4 @@ # This code is issued under the GNU General Public License. # See the file COPYING in this distribution -SUBDIRS = gmnal openibnal qswnal socknal +SUBDIRS = gmnal iibnal openibnal qswnal socknal diff --git a/lnet/klnds/iiblnd/.cvsignore b/lnet/klnds/iiblnd/.cvsignore new file mode 100644 index 0000000..5ed596b --- /dev/null +++ b/lnet/klnds/iiblnd/.cvsignore @@ -0,0 +1,10 @@ +.deps +Makefile +.*.cmd +autoMakefile.in +autoMakefile +*.ko +*.mod.c +.*.flags +.tmp_versions +.depend diff --git a/lnet/klnds/iiblnd/Makefile.in b/lnet/klnds/iiblnd/Makefile.in new file mode 100644 index 0000000..e7934e2 --- /dev/null +++ b/lnet/klnds/iiblnd/Makefile.in @@ -0,0 +1,6 @@ +MODULES := kiibnal +kiibnal-objs := iibnal.o iibnal_cb.o + +EXTRA_POST_CFLAGS := @IIBCPPFLAGS@ + +@INCLUDE_RULES@ diff --git a/lnet/klnds/iiblnd/Makefile.mk b/lnet/klnds/iiblnd/Makefile.mk new file mode 100644 index 0000000..0459a20 --- /dev/null +++ b/lnet/klnds/iiblnd/Makefile.mk @@ -0,0 +1,10 @@ +# Copyright (C) 2001 Cluster File Systems, Inc. +# +# This code is issued under the GNU General Public License. 
+# See the file COPYING in this distribution + +include $(src)/../../Kernelenv + +obj-y += kiibnal.o +kiibnal-objs := iibnal.o iibnal_cb.o + diff --git a/lnet/klnds/iiblnd/autoMakefile.am b/lnet/klnds/iiblnd/autoMakefile.am new file mode 100644 index 0000000..251df66 --- /dev/null +++ b/lnet/klnds/iiblnd/autoMakefile.am @@ -0,0 +1,15 @@ +# Copyright (C) 2001 Cluster File Systems, Inc. +# +# This code is issued under the GNU General Public License. +# See the file COPYING in this distribution + +if MODULES +if !CRAY_PORTALS +if BUILD_IIBNAL +modulenet_DATA = kiibnal$(KMODEXT) +endif +endif +endif + +MOSTLYCLEANFILES = *.o *.ko *.mod.c +DIST_SOURCES = $(kiibnal-objs:%.o=%.c) iibnal.h diff --git a/lnet/klnds/iiblnd/iiblnd.c b/lnet/klnds/iiblnd/iiblnd.c new file mode 100644 index 0000000..09908c9 --- /dev/null +++ b/lnet/klnds/iiblnd/iiblnd.c @@ -0,0 +1,1713 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2004 Cluster File Systems, Inc. + * Author: Eric Barton + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +#include "iibnal.h" + +nal_t kibnal_api; +ptl_handle_ni_t kibnal_ni; +kib_tunables_t kibnal_tunables; + +kib_data_t kibnal_data = { + .kib_service_id = IBNAL_SERVICE_NUMBER, +}; + +#ifdef CONFIG_SYSCTL +#define IBNAL_SYSCTL 202 + +#define IBNAL_SYSCTL_TIMEOUT 1 + +static ctl_table kibnal_ctl_table[] = { + {IBNAL_SYSCTL_TIMEOUT, "timeout", + &kibnal_tunables.kib_io_timeout, sizeof (int), + 0644, NULL, &proc_dointvec}, + { 0 } +}; + +static ctl_table kibnal_top_ctl_table[] = { + {IBNAL_SYSCTL, "iibnal", NULL, 0, 0555, kibnal_ctl_table}, + { 0 } +}; +#endif + +#ifdef unused +void +print_service(IB_SERVICE_RECORD *service, char *tag, int rc) +{ + char name[32]; + + if (service == NULL) + { + CWARN("tag : %s\n" + "status : %d (NULL)\n", tag, rc); + return; + } + strncpy (name, service->ServiceName, sizeof(name)-1); + name[sizeof(name)-1] = 0; + + CWARN("tag : %s\n" + "status : %d\n" + "service id: "LPX64"\n" + "name : %s\n" + "NID : "LPX64"\n", tag, rc, + service->RID.ServiceID, name, + *kibnal_service_nid_field(service)); +} +#endif + +static void +kibnal_service_setunset_done (void *arg, FABRIC_OPERATION_DATA *fod, + FSTATUS frc, uint32 madrc) +{ + *(FSTATUS *)arg = frc; + up (&kibnal_data.kib_nid_signal); +} + +#if IBNAL_CHECK_ADVERT +static void +kibnal_service_query_done (void *arg, QUERY *qry, + QUERY_RESULT_VALUES *qry_result) +{ + FSTATUS frc = qry_result->Status; + + if (frc != FSUCCESS && + qry_result->ResultDataSize == 0) + frc = FERROR; + + *(FSTATUS *)arg = frc; + up (&kibnal_data.kib_nid_signal); +} + +static void +kibnal_check_advert (void) +{ + QUERY *qry; + IB_SERVICE_RECORD *svc; + FSTATUS frc; + FSTATUS frc2; + + PORTAL_ALLOC(qry, sizeof(*qry)); + if (qry == NULL) + return; + + memset (qry, 0, sizeof(*qry)); + qry->InputType = InputTypeServiceRecord; + 
qry->OutputType = OutputTypeServiceRecord; + qry->InputValue.ServiceRecordValue.ComponentMask = KIBNAL_SERVICE_KEY_MASK; + svc = &qry->InputValue.ServiceRecordValue.ServiceRecord; + kibnal_set_service_keys(svc, kibnal_data.kib_nid); + + frc = iibt_sd_query_port_fabric_information(kibnal_data.kib_sd, + kibnal_data.kib_port_guid, + qry, + kibnal_service_query_done, + NULL, &frc2); + if (frc != FSUCCESS && frc != FPENDING) { + CERROR ("Immediate error %d checking SM service\n", frc); + } else { + down (&kibnal_data.kib_nid_signal); + frc = frc2; + + if (frc != 0) + CERROR ("Error %d checking SM service\n", rc); + } + + return (rc); +} +#endif + +static void fill_fod(FABRIC_OPERATION_DATA *fod, FABRIC_OPERATION_TYPE type) +{ + IB_SERVICE_RECORD *svc; + + memset (fod, 0, sizeof(*fod)); + fod->Type = type; + + svc = &fod->Value.ServiceRecordValue.ServiceRecord; + svc->RID.ServiceID = kibnal_data.kib_service_id; + svc->RID.ServiceGID.Type.Global.InterfaceID = kibnal_data.kib_port_guid; + svc->RID.ServiceGID.Type.Global.SubnetPrefix = DEFAULT_SUBNET_PREFIX; + svc->RID.ServiceP_Key = kibnal_data.kib_port_pkey; + svc->ServiceLease = 0xffffffff; + + kibnal_set_service_keys(svc, kibnal_data.kib_nid); +} + +static int +kibnal_advertise (void) +{ + FABRIC_OPERATION_DATA *fod; + IB_SERVICE_RECORD *svc; + FSTATUS frc; + FSTATUS frc2; + + LASSERT (kibnal_data.kib_nid != PTL_NID_ANY); + + PORTAL_ALLOC(fod, sizeof(*fod)); + if (fod == NULL) + return (-ENOMEM); + + fill_fod(fod, FabOpSetServiceRecord); + svc = &fod->Value.ServiceRecordValue.ServiceRecord; + + CDEBUG(D_NET, "Advertising service id "LPX64" %s:"LPX64"\n", + svc->RID.ServiceID, + svc->ServiceName, *kibnal_service_nid_field(svc)); + + frc = iibt_sd_port_fabric_operation(kibnal_data.kib_sd, + kibnal_data.kib_port_guid, + fod, kibnal_service_setunset_done, + NULL, &frc2); + + if (frc != FSUCCESS && frc != FPENDING) { + CERROR ("Immediate error %d advertising NID "LPX64"\n", + frc, kibnal_data.kib_nid); + goto out; + } + + down (&kibnal_data.kib_nid_signal); + + frc = frc2; + if (frc != FSUCCESS) + CERROR ("Error %d advertising BUD "LPX64"\n", + frc, kibnal_data.kib_nid); +out: + PORTAL_FREE(fod, sizeof(*fod)); + return (frc == FSUCCESS) ? 
0 : -EINVAL; +} + +static void +kibnal_unadvertise (int expect_success) +{ + FABRIC_OPERATION_DATA *fod; + IB_SERVICE_RECORD *svc; + FSTATUS frc; + FSTATUS frc2; + + LASSERT (kibnal_data.kib_nid != PTL_NID_ANY); + + PORTAL_ALLOC(fod, sizeof(*fod)); + if (fod == NULL) + return; + + fill_fod(fod, FabOpDeleteServiceRecord); + svc = &fod->Value.ServiceRecordValue.ServiceRecord; + + CDEBUG(D_NET, "Unadvertising service %s:"LPX64"\n", + svc->ServiceName, *kibnal_service_nid_field(svc)); + + frc = iibt_sd_port_fabric_operation(kibnal_data.kib_sd, + kibnal_data.kib_port_guid, + fod, kibnal_service_setunset_done, + NULL, &frc2); + + if (frc != FSUCCESS && frc != FPENDING) { + CERROR ("Immediate error %d unadvertising NID "LPX64"\n", + frc, kibnal_data.kib_nid); + goto out; + } + + down (&kibnal_data.kib_nid_signal); + + if ((frc2 == FSUCCESS) == !!expect_success) + goto out; + + if (expect_success) + CERROR("Error %d unadvertising NID "LPX64"\n", + frc2, kibnal_data.kib_nid); + else + CWARN("Removed conflicting NID "LPX64"\n", + kibnal_data.kib_nid); + out: + PORTAL_FREE(fod, sizeof(*fod)); +} + +static int +kibnal_set_mynid(ptl_nid_t nid) +{ + struct timeval tv; + lib_ni_t *ni = &kibnal_lib.libnal_ni; + int rc; + FSTATUS frc; + + CDEBUG(D_IOCTL, "setting mynid to "LPX64" (old nid="LPX64")\n", + nid, ni->ni_pid.nid); + + do_gettimeofday(&tv); + + down (&kibnal_data.kib_nid_mutex); + + if (nid == kibnal_data.kib_nid) { + /* no change of NID */ + up (&kibnal_data.kib_nid_mutex); + return (0); + } + + CDEBUG(D_NET, "NID "LPX64"("LPX64")\n", + kibnal_data.kib_nid, nid); + + if (kibnal_data.kib_nid != PTL_NID_ANY) { + + kibnal_unadvertise (1); + + frc = iibt_cm_cancel(kibnal_data.kib_cep); + if (frc != FSUCCESS && frc != FPENDING) + CERROR ("Error %d stopping listener\n", frc); + + frc = iibt_cm_destroy_cep(kibnal_data.kib_cep); + if (frc != FSUCCESS) + CERROR ("Error %d destroying CEP\n", frc); + + kibnal_data.kib_cep = NULL; + } + + kibnal_data.kib_nid = ni->ni_pid.nid = nid; + kibnal_data.kib_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec; + + /* Delete all existing peers and their connections after new + * NID/incarnation set to ensure no old connections in our brave + * new world. 
*/ + kibnal_del_peer (PTL_NID_ANY, 0); + + if (kibnal_data.kib_nid == PTL_NID_ANY) { + /* No new NID to install */ + up (&kibnal_data.kib_nid_mutex); + return (0); + } + + /* remove any previous advert (crashed node etc) */ + kibnal_unadvertise(0); + + kibnal_data.kib_cep = iibt_cm_create_cep(CM_RC_TYPE); + if (kibnal_data.kib_cep == NULL) { + CERROR ("Can't create CEP\n"); + rc = -ENOMEM; + } else { + CM_LISTEN_INFO info; + memset (&info, 0, sizeof(info)); + info.ListenAddr.EndPt.SID = kibnal_data.kib_service_id; + + frc = iibt_cm_listen(kibnal_data.kib_cep, &info, + kibnal_listen_callback, NULL); + if (frc != FSUCCESS && frc != FPENDING) { + CERROR ("iibt_cm_listen error: %d\n", frc); + rc = -EINVAL; + } else { + rc = 0; + } + } + + if (rc == 0) { + rc = kibnal_advertise(); + if (rc == 0) { +#if IBNAL_CHECK_ADVERT + kibnal_check_advert(); +#endif + up (&kibnal_data.kib_nid_mutex); + return (0); + } + + iibt_cm_cancel (kibnal_data.kib_cep); + iibt_cm_destroy_cep (kibnal_data.kib_cep); + /* remove any peers that sprung up while I failed to + * advertise myself */ + kibnal_del_peer (PTL_NID_ANY, 0); + } + + kibnal_data.kib_nid = PTL_NID_ANY; + up (&kibnal_data.kib_nid_mutex); + return (rc); +} + +kib_peer_t * +kibnal_create_peer (ptl_nid_t nid) +{ + kib_peer_t *peer; + + LASSERT (nid != PTL_NID_ANY); + + PORTAL_ALLOC (peer, sizeof (*peer)); + if (peer == NULL) + return (NULL); + + memset(peer, 0, sizeof(*peer)); /* zero flags etc */ + + peer->ibp_nid = nid; + atomic_set (&peer->ibp_refcount, 1); /* 1 ref for caller */ + + INIT_LIST_HEAD (&peer->ibp_list); /* not in the peer table yet */ + INIT_LIST_HEAD (&peer->ibp_conns); + INIT_LIST_HEAD (&peer->ibp_tx_queue); + + peer->ibp_reconnect_time = jiffies; + peer->ibp_reconnect_interval = IBNAL_MIN_RECONNECT_INTERVAL; + + atomic_inc (&kibnal_data.kib_npeers); + return (peer); +} + +void +kibnal_destroy_peer (kib_peer_t *peer) +{ + + LASSERT (atomic_read (&peer->ibp_refcount) == 0); + LASSERT (peer->ibp_persistence == 0); + LASSERT (!kibnal_peer_active(peer)); + LASSERT (peer->ibp_connecting == 0); + LASSERT (list_empty (&peer->ibp_conns)); + LASSERT (list_empty (&peer->ibp_tx_queue)); + + PORTAL_FREE (peer, sizeof (*peer)); + + /* NB a peer's connections keep a reference on their peer until + * they are destroyed, so we can be assured that _all_ state to do + * with this peer has been cleaned up when its refcount drops to + * zero. */ + atomic_dec (&kibnal_data.kib_npeers); +} + +/* the caller is responsible for accounting for the additional reference + * that this creates */ +kib_peer_t * +kibnal_find_peer_locked (ptl_nid_t nid) +{ + struct list_head *peer_list = kibnal_nid2peerlist (nid); + struct list_head *tmp; + kib_peer_t *peer; + + list_for_each (tmp, peer_list) { + + peer = list_entry (tmp, kib_peer_t, ibp_list); + + LASSERT (peer->ibp_persistence != 0 || /* persistent peer */ + peer->ibp_connecting != 0 || /* creating conns */ + !list_empty (&peer->ibp_conns)); /* active conn */ + + if (peer->ibp_nid != nid) + continue; + + CDEBUG(D_NET, "got peer [%p] -> "LPX64" (%d)\n", + peer, nid, atomic_read (&peer->ibp_refcount)); + return (peer); + } + return (NULL); +} + +kib_peer_t * +kibnal_get_peer (ptl_nid_t nid) +{ + kib_peer_t *peer; + + read_lock (&kibnal_data.kib_global_lock); + peer = kibnal_find_peer_locked (nid); + if (peer != NULL) /* +1 ref for caller? 
*/ + kib_peer_addref(peer); + read_unlock (&kibnal_data.kib_global_lock); + + return (peer); +} + +void +kibnal_unlink_peer_locked (kib_peer_t *peer) +{ + LASSERT (peer->ibp_persistence == 0); + LASSERT (list_empty(&peer->ibp_conns)); + + LASSERT (kibnal_peer_active(peer)); + list_del_init (&peer->ibp_list); + /* lose peerlist's ref */ + kib_peer_decref(peer); +} + +static int +kibnal_get_peer_info (int index, ptl_nid_t *nidp, int *persistencep) +{ + kib_peer_t *peer; + struct list_head *ptmp; + int i; + + read_lock (&kibnal_data.kib_global_lock); + + for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) { + + list_for_each (ptmp, &kibnal_data.kib_peers[i]) { + + peer = list_entry (ptmp, kib_peer_t, ibp_list); + LASSERT (peer->ibp_persistence != 0 || + peer->ibp_connecting != 0 || + !list_empty (&peer->ibp_conns)); + + if (index-- > 0) + continue; + + *nidp = peer->ibp_nid; + *persistencep = peer->ibp_persistence; + + read_unlock (&kibnal_data.kib_global_lock); + return (0); + } + } + + read_unlock (&kibnal_data.kib_global_lock); + return (-ENOENT); +} + +static int +kibnal_add_persistent_peer (ptl_nid_t nid) +{ + unsigned long flags; + kib_peer_t *peer; + kib_peer_t *peer2; + + if (nid == PTL_NID_ANY) + return (-EINVAL); + + peer = kibnal_create_peer (nid); + if (peer == NULL) + return (-ENOMEM); + + write_lock_irqsave (&kibnal_data.kib_global_lock, flags); + + peer2 = kibnal_find_peer_locked (nid); + if (peer2 != NULL) { + kib_peer_decref (peer); + peer = peer2; + } else { + /* peer table takes existing ref on peer */ + list_add_tail (&peer->ibp_list, + kibnal_nid2peerlist (nid)); + } + + peer->ibp_persistence++; + + write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); + return (0); +} + +static void +kibnal_del_peer_locked (kib_peer_t *peer, int single_share) +{ + struct list_head *ctmp; + struct list_head *cnxt; + kib_conn_t *conn; + + if (!single_share) + peer->ibp_persistence = 0; + else if (peer->ibp_persistence > 0) + peer->ibp_persistence--; + + if (peer->ibp_persistence != 0) + return; + + list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) { + conn = list_entry(ctmp, kib_conn_t, ibc_list); + + kibnal_close_conn_locked (conn, 0); + } + + /* NB peer unlinks itself when last conn is closed */ +} + +int +kibnal_del_peer (ptl_nid_t nid, int single_share) +{ + unsigned long flags; + struct list_head *ptmp; + struct list_head *pnxt; + kib_peer_t *peer; + int lo; + int hi; + int i; + int rc = -ENOENT; + + write_lock_irqsave (&kibnal_data.kib_global_lock, flags); + + if (nid != PTL_NID_ANY) + lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers; + else { + lo = 0; + hi = kibnal_data.kib_peer_hash_size - 1; + } + + for (i = lo; i <= hi; i++) { + list_for_each_safe (ptmp, pnxt, &kibnal_data.kib_peers[i]) { + peer = list_entry (ptmp, kib_peer_t, ibp_list); + LASSERT (peer->ibp_persistence != 0 || + peer->ibp_connecting != 0 || + !list_empty (&peer->ibp_conns)); + + if (!(nid == PTL_NID_ANY || peer->ibp_nid == nid)) + continue; + + kibnal_del_peer_locked (peer, single_share); + rc = 0; /* matched something */ + + if (single_share) + goto out; + } + } + out: + write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); + + return (rc); +} + +static kib_conn_t * +kibnal_get_conn_by_idx (int index) +{ + kib_peer_t *peer; + struct list_head *ptmp; + kib_conn_t *conn; + struct list_head *ctmp; + int i; + + read_lock (&kibnal_data.kib_global_lock); + + for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) { + list_for_each (ptmp, &kibnal_data.kib_peers[i]) { + + peer = list_entry 
(ptmp, kib_peer_t, ibp_list); + LASSERT (peer->ibp_persistence > 0 || + peer->ibp_connecting != 0 || + !list_empty (&peer->ibp_conns)); + + list_for_each (ctmp, &peer->ibp_conns) { + if (index-- > 0) + continue; + + conn = list_entry (ctmp, kib_conn_t, ibc_list); + CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", + conn, conn->ibc_state, conn->ibc_peer->ibp_nid, + atomic_read (&conn->ibc_refcount)); + atomic_inc (&conn->ibc_refcount); + read_unlock (&kibnal_data.kib_global_lock); + return (conn); + } + } + } + + read_unlock (&kibnal_data.kib_global_lock); + return (NULL); +} + +kib_conn_t * +kibnal_create_conn (void) +{ + kib_conn_t *conn; + int i; + __u64 vaddr = 0; + __u64 vaddr_base; + int page_offset; + int ipage; + int rc; + FSTATUS frc; + union { + IB_QP_ATTRIBUTES_CREATE qp_create; + IB_QP_ATTRIBUTES_MODIFY qp_attr; + } params; + + PORTAL_ALLOC (conn, sizeof (*conn)); + if (conn == NULL) { + CERROR ("Can't allocate connection\n"); + return (NULL); + } + + /* zero flags, NULL pointers etc... */ + memset (conn, 0, sizeof (*conn)); + + INIT_LIST_HEAD (&conn->ibc_tx_queue); + INIT_LIST_HEAD (&conn->ibc_active_txs); + spin_lock_init (&conn->ibc_lock); + + atomic_inc (&kibnal_data.kib_nconns); + /* well not really, but I call destroy() on failure, which decrements */ + + PORTAL_ALLOC (conn->ibc_rxs, IBNAL_RX_MSGS * sizeof (kib_rx_t)); + if (conn->ibc_rxs == NULL) + goto failed; + memset (conn->ibc_rxs, 0, IBNAL_RX_MSGS * sizeof(kib_rx_t)); + + rc = kibnal_alloc_pages(&conn->ibc_rx_pages, IBNAL_RX_MSG_PAGES, 1); + if (rc != 0) + goto failed; + + vaddr_base = vaddr = conn->ibc_rx_pages->ibp_vaddr; + + for (i = ipage = page_offset = 0; i < IBNAL_RX_MSGS; i++) { + struct page *page = conn->ibc_rx_pages->ibp_pages[ipage]; + kib_rx_t *rx = &conn->ibc_rxs[i]; + + rx->rx_conn = conn; + rx->rx_msg = (kib_msg_t *)(((char *)page_address(page)) + + page_offset); + + if (kibnal_whole_mem()) + rx->rx_vaddr = kibnal_page2phys(page) + + page_offset + + kibnal_data.kib_md.md_addr; + else + rx->rx_vaddr = vaddr; + + vaddr += IBNAL_MSG_SIZE; + LASSERT (vaddr <= vaddr_base + IBNAL_RX_MSG_BYTES); + + page_offset += IBNAL_MSG_SIZE; + LASSERT (page_offset <= PAGE_SIZE); + + if (page_offset == PAGE_SIZE) { + page_offset = 0; + ipage++; + LASSERT (ipage <= IBNAL_RX_MSG_PAGES); + } + } + + params.qp_create = (IB_QP_ATTRIBUTES_CREATE) { + .Type = QPTypeReliableConnected, + .SendQDepth = IBNAL_TX_MAX_SG * + IBNAL_MSG_QUEUE_SIZE, + .RecvQDepth = IBNAL_MSG_QUEUE_SIZE, + .SendDSListDepth = 1, + .RecvDSListDepth = 1, + .SendCQHandle = kibnal_data.kib_cq, + .RecvCQHandle = kibnal_data.kib_cq, + .PDHandle = kibnal_data.kib_pd, + .SendSignaledCompletions = TRUE, + }; + frc = iibt_qp_create(kibnal_data.kib_hca, ¶ms.qp_create, NULL, + &conn->ibc_qp, &conn->ibc_qp_attrs); + if (rc != 0) { + CERROR ("Failed to create queue pair: %d\n", rc); + goto failed; + } + + /* Mark QP created */ + conn->ibc_state = IBNAL_CONN_INIT_QP; + + params.qp_attr = (IB_QP_ATTRIBUTES_MODIFY) { + .RequestState = QPStateInit, + .Attrs = (IB_QP_ATTR_PORTGUID | + IB_QP_ATTR_PKEYINDEX | + IB_QP_ATTR_ACCESSCONTROL), + .PortGUID = kibnal_data.kib_port_guid, + .PkeyIndex = 0, + .AccessControl = { + .s = { + .RdmaWrite = 1, + .RdmaRead = 1, + }, + }, + }; + rc = iibt_qp_modify(conn->ibc_qp, ¶ms.qp_attr, NULL); + if (rc != 0) { + CERROR ("Failed to modify queue pair: %d\n", rc); + goto failed; + } + + /* 1 ref for caller */ + atomic_set (&conn->ibc_refcount, 1); + return (conn); + + failed: + kibnal_destroy_conn (conn); + return (NULL); +} + +void 
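/* Descriptive note on the function that follows: kibnal_destroy_conn() is the
 * final teardown path for a connection.  It runs only once the refcount has
 * dropped to zero, so it can destroy the QP and CEP (depending on how far
 * ibc_state progressed), free the receive buffers, drop the peer reference
 * and release the connection unconditionally; if this was the last connection
 * during shutdown it wakes the scheduler and connd threads so they can exit. */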
+kibnal_destroy_conn (kib_conn_t *conn) +{ + int rc; + FSTATUS frc; + + CDEBUG (D_NET, "connection %p\n", conn); + + LASSERT (atomic_read (&conn->ibc_refcount) == 0); + LASSERT (list_empty(&conn->ibc_tx_queue)); + LASSERT (list_empty(&conn->ibc_active_txs)); + LASSERT (conn->ibc_nsends_posted == 0); + LASSERT (conn->ibc_connreq == NULL); + + switch (conn->ibc_state) { + case IBNAL_CONN_DISCONNECTED: + /* called after connection sequence initiated */ + /* fall through */ + + case IBNAL_CONN_INIT_QP: + /* _destroy includes an implicit Reset of the QP which + * discards posted work */ + rc = iibt_qp_destroy(conn->ibc_qp); + if (rc != 0) + CERROR("Can't destroy QP: %d\n", rc); + /* fall through */ + + case IBNAL_CONN_INIT_NOTHING: + break; + + default: + LASSERT (0); + } + + if (conn->ibc_cep != NULL) { + frc = iibt_cm_destroy_cep(conn->ibc_cep); + if (frc != 0) + CERROR("Can't destroy CEP %p: %d\n", conn->ibc_cep, + frc); + } + + if (conn->ibc_rx_pages != NULL) + kibnal_free_pages(conn->ibc_rx_pages); + + if (conn->ibc_rxs != NULL) + PORTAL_FREE(conn->ibc_rxs, + IBNAL_RX_MSGS * sizeof(kib_rx_t)); + + if (conn->ibc_peer != NULL) + kib_peer_decref(conn->ibc_peer); + + PORTAL_FREE(conn, sizeof (*conn)); + + atomic_dec(&kibnal_data.kib_nconns); + + if (atomic_read (&kibnal_data.kib_nconns) == 0 && + kibnal_data.kib_shutdown) { + /* I just nuked the last connection on shutdown; wake up + * everyone so they can exit. */ + wake_up_all(&kibnal_data.kib_sched_waitq); + wake_up_all(&kibnal_data.kib_connd_waitq); + } +} + +void +kibnal_put_conn (kib_conn_t *conn) +{ + unsigned long flags; + + CDEBUG (D_NET, "putting conn[%p] state %d -> "LPX64" (%d)\n", + conn, conn->ibc_state, conn->ibc_peer->ibp_nid, + atomic_read (&conn->ibc_refcount)); + + LASSERT (atomic_read (&conn->ibc_refcount) > 0); + if (!atomic_dec_and_test (&conn->ibc_refcount)) + return; + + /* must disconnect before dropping the final ref */ + LASSERT (conn->ibc_state == IBNAL_CONN_DISCONNECTED); + + spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags); + + list_add (&conn->ibc_list, &kibnal_data.kib_connd_conns); + wake_up (&kibnal_data.kib_connd_waitq); + + spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags); +} + +static int +kibnal_close_peer_conns_locked (kib_peer_t *peer, int why) +{ + kib_conn_t *conn; + struct list_head *ctmp; + struct list_head *cnxt; + int count = 0; + + list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) { + conn = list_entry (ctmp, kib_conn_t, ibc_list); + + count++; + kibnal_close_conn_locked (conn, why); + } + + return (count); +} + +int +kibnal_close_stale_conns_locked (kib_peer_t *peer, __u64 incarnation) +{ + kib_conn_t *conn; + struct list_head *ctmp; + struct list_head *cnxt; + int count = 0; + + list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) { + conn = list_entry (ctmp, kib_conn_t, ibc_list); + + if (conn->ibc_incarnation == incarnation) + continue; + + CDEBUG(D_NET, "Closing stale conn nid:"LPX64" incarnation:"LPX64"("LPX64")\n", + peer->ibp_nid, conn->ibc_incarnation, incarnation); + + count++; + kibnal_close_conn_locked (conn, -ESTALE); + } + + return (count); +} + +static int +kibnal_close_matching_conns (ptl_nid_t nid) +{ + unsigned long flags; + kib_peer_t *peer; + struct list_head *ptmp; + struct list_head *pnxt; + int lo; + int hi; + int i; + int count = 0; + + write_lock_irqsave (&kibnal_data.kib_global_lock, flags); + + if (nid != PTL_NID_ANY) + lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers; + else { + lo = 0; + hi = kibnal_data.kib_peer_hash_size - 1; + } + + for (i = 
lo; i <= hi; i++) { + list_for_each_safe (ptmp, pnxt, &kibnal_data.kib_peers[i]) { + + peer = list_entry (ptmp, kib_peer_t, ibp_list); + LASSERT (peer->ibp_persistence != 0 || + peer->ibp_connecting != 0 || + !list_empty (&peer->ibp_conns)); + + if (!(nid == PTL_NID_ANY || nid == peer->ibp_nid)) + continue; + + count += kibnal_close_peer_conns_locked (peer, 0); + } + } + + write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); + + /* wildcards always succeed */ + if (nid == PTL_NID_ANY) + return (0); + + return (count == 0 ? -ENOENT : 0); +} + +static int +kibnal_cmd(struct portals_cfg *pcfg, void * private) +{ + int rc = -EINVAL; + ENTRY; + + LASSERT (pcfg != NULL); + + switch(pcfg->pcfg_command) { + case NAL_CMD_GET_PEER: { + ptl_nid_t nid = 0; + int share_count = 0; + + rc = kibnal_get_peer_info(pcfg->pcfg_count, + &nid, &share_count); + pcfg->pcfg_nid = nid; + pcfg->pcfg_size = 0; + pcfg->pcfg_id = 0; + pcfg->pcfg_misc = 0; + pcfg->pcfg_count = 0; + pcfg->pcfg_wait = share_count; + break; + } + case NAL_CMD_ADD_PEER: { + rc = kibnal_add_persistent_peer (pcfg->pcfg_nid); + break; + } + case NAL_CMD_DEL_PEER: { + rc = kibnal_del_peer (pcfg->pcfg_nid, + /* flags == single_share */ + pcfg->pcfg_flags != 0); + break; + } + case NAL_CMD_GET_CONN: { + kib_conn_t *conn = kibnal_get_conn_by_idx (pcfg->pcfg_count); + + if (conn == NULL) + rc = -ENOENT; + else { + rc = 0; + pcfg->pcfg_nid = conn->ibc_peer->ibp_nid; + pcfg->pcfg_id = 0; + pcfg->pcfg_misc = 0; + pcfg->pcfg_flags = 0; + kibnal_put_conn (conn); + } + break; + } + case NAL_CMD_CLOSE_CONNECTION: { + rc = kibnal_close_matching_conns (pcfg->pcfg_nid); + break; + } + case NAL_CMD_REGISTER_MYNID: { + if (pcfg->pcfg_nid == PTL_NID_ANY) + rc = -EINVAL; + else + rc = kibnal_set_mynid (pcfg->pcfg_nid); + break; + } + } + + RETURN(rc); +} + +void +kibnal_free_pages (kib_pages_t *p) +{ + int npages = p->ibp_npages; + int rc; + int i; + + if (p->ibp_mapped) { + rc = iibt_deregister_memory(p->ibp_handle); + if (rc != 0) + CERROR ("Deregister error: %d\n", rc); + } + + for (i = 0; i < npages; i++) + if (p->ibp_pages[i] != NULL) + __free_page(p->ibp_pages[i]); + + PORTAL_FREE (p, offsetof(kib_pages_t, ibp_pages[npages])); +} + +int +kibnal_alloc_pages (kib_pages_t **pp, int npages, int allow_write) +{ + kib_pages_t *p; + __u64 *phys_pages; + int i; + FSTATUS frc; + IB_ACCESS_CONTROL access; + + memset(&access, 0, sizeof(access)); + access.s.MWBindable = 1; + access.s.LocalWrite = 1; + access.s.RdmaRead = 1; + access.s.RdmaWrite = 1; + + PORTAL_ALLOC(p, offsetof(kib_pages_t, ibp_pages[npages])); + if (p == NULL) { + CERROR ("Can't allocate buffer %d\n", npages); + return (-ENOMEM); + } + + memset (p, 0, offsetof(kib_pages_t, ibp_pages[npages])); + p->ibp_npages = npages; + + for (i = 0; i < npages; i++) { + p->ibp_pages[i] = alloc_page (GFP_KERNEL); + if (p->ibp_pages[i] == NULL) { + CERROR ("Can't allocate page %d of %d\n", i, npages); + kibnal_free_pages(p); + return (-ENOMEM); + } + } + + if (kibnal_whole_mem()) + goto out; + + PORTAL_ALLOC(phys_pages, npages * sizeof(*phys_pages)); + if (phys_pages == NULL) { + CERROR ("Can't allocate physarray for %d pages\n", npages); + /* XXX free ibp_pages? 
*/ + kibnal_free_pages(p); + return (-ENOMEM); + } + + /* if we were using the _contig_ registration variant we would have + * an array of PhysAddr/Length pairs, but the discontiguous variant + * just takes the PhysAddr */ + for (i = 0; i < npages; i++) + phys_pages[i] = kibnal_page2phys(p->ibp_pages[i]); + + frc = iibt_register_physical_memory(kibnal_data.kib_hca, + 0, /* requested vaddr */ + phys_pages, npages, + 0, /* offset */ + kibnal_data.kib_pd, + access, + &p->ibp_handle, &p->ibp_vaddr, + &p->ibp_lkey, &p->ibp_rkey); + + PORTAL_FREE(phys_pages, npages * sizeof(*phys_pages)); + + if (frc != FSUCCESS) { + CERROR ("Error %d mapping %d pages\n", frc, npages); + kibnal_free_pages(p); + return (-ENOMEM); + } + + CDEBUG(D_NET, "registered %d pages; handle: %p vaddr "LPX64" " + "lkey %x rkey %x\n", npages, p->ibp_handle, + p->ibp_vaddr, p->ibp_lkey, p->ibp_rkey); + + p->ibp_mapped = 1; +out: + *pp = p; + return (0); +} + +static int +kibnal_setup_tx_descs (void) +{ + int ipage = 0; + int page_offset = 0; + __u64 vaddr; + __u64 vaddr_base; + struct page *page; + kib_tx_t *tx; + int i; + int rc; + + /* pre-mapped messages are not bigger than 1 page */ + LASSERT (IBNAL_MSG_SIZE <= PAGE_SIZE); + + /* No fancy arithmetic when we do the buffer calculations */ + LASSERT (PAGE_SIZE % IBNAL_MSG_SIZE == 0); + + rc = kibnal_alloc_pages(&kibnal_data.kib_tx_pages, IBNAL_TX_MSG_PAGES, + 0); + if (rc != 0) + return (rc); + + /* ignored for the whole_mem case */ + vaddr = vaddr_base = kibnal_data.kib_tx_pages->ibp_vaddr; + + for (i = 0; i < IBNAL_TX_MSGS; i++) { + page = kibnal_data.kib_tx_pages->ibp_pages[ipage]; + tx = &kibnal_data.kib_tx_descs[i]; + + memset (tx, 0, sizeof(*tx)); /* zero flags etc */ + + tx->tx_msg = (kib_msg_t *)(((char *)page_address(page)) + + page_offset); + + if (kibnal_whole_mem()) + tx->tx_vaddr = kibnal_page2phys(page) + + page_offset + + kibnal_data.kib_md.md_addr; + else + tx->tx_vaddr = vaddr; + + tx->tx_isnblk = (i >= IBNAL_NTX); + tx->tx_mapped = KIB_TX_UNMAPPED; + + CDEBUG(D_NET, "Tx[%d] %p->%p - "LPX64"\n", + i, tx, tx->tx_msg, tx->tx_vaddr); + + if (tx->tx_isnblk) + list_add (&tx->tx_list, + &kibnal_data.kib_idle_nblk_txs); + else + list_add (&tx->tx_list, + &kibnal_data.kib_idle_txs); + + vaddr += IBNAL_MSG_SIZE; + LASSERT (vaddr <= vaddr_base + IBNAL_TX_MSG_BYTES); + + page_offset += IBNAL_MSG_SIZE; + LASSERT (page_offset <= PAGE_SIZE); + + if (page_offset == PAGE_SIZE) { + page_offset = 0; + ipage++; + LASSERT (ipage <= IBNAL_TX_MSG_PAGES); + } + } + + return (0); +} + +static void +kibnal_api_shutdown (nal_t *nal) +{ + int i; + int rc; + + if (nal->nal_refct != 0) { + /* This module got the first ref */ + PORTAL_MODULE_UNUSE; + return; + } + + CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n", + atomic_read (&portal_kmemory)); + + LASSERT(nal == &kibnal_api); + + switch (kibnal_data.kib_init) { + default: + CERROR ("Unexpected state %d\n", kibnal_data.kib_init); + LBUG(); + + case IBNAL_INIT_ALL: + /* stop calls to nal_cmd */ + libcfs_nal_cmd_unregister(IIBNAL); + /* No new peers */ + + /* resetting my NID to unadvertises me, removes my + * listener and nukes all current peers */ + kibnal_set_mynid (PTL_NID_ANY); + + /* Wait for all peer state to clean up (crazy) */ + i = 2; + while (atomic_read (&kibnal_data.kib_npeers) != 0) { + i++; + CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? 
*/ + "waiting for %d peers to disconnect (can take a few seconds)\n", + atomic_read (&kibnal_data.kib_npeers)); + set_current_state (TASK_UNINTERRUPTIBLE); + schedule_timeout (HZ); + } + /* fall through */ + + case IBNAL_INIT_CQ: + rc = iibt_cq_destroy(kibnal_data.kib_cq); + if (rc != 0) + CERROR ("Destroy CQ error: %d\n", rc); + /* fall through */ + + case IBNAL_INIT_TXD: + kibnal_free_pages (kibnal_data.kib_tx_pages); + /* fall through */ + + case IBNAL_INIT_MR: + if (kibnal_data.kib_md.md_handle != NULL) { + rc = iibt_deregister_memory(kibnal_data.kib_md.md_handle); + if (rc != FSUCCESS) + CERROR ("Deregister memory: %d\n", rc); + } + /* fall through */ + +#if IBNAL_FMR + case IBNAL_INIT_FMR: + rc = ib_fmr_pool_destroy (kibnal_data.kib_fmr_pool); + if (rc != 0) + CERROR ("Destroy FMR pool error: %d\n", rc); + /* fall through */ +#endif + case IBNAL_INIT_PD: + rc = iibt_pd_free(kibnal_data.kib_pd); + if (rc != 0) + CERROR ("Destroy PD error: %d\n", rc); + /* fall through */ + + case IBNAL_INIT_SD: + rc = iibt_sd_deregister(kibnal_data.kib_sd); + if (rc != 0) + CERROR ("Deregister SD error: %d\n", rc); + /* fall through */ + + case IBNAL_INIT_PORT: + /* XXX ??? */ + /* fall through */ + + case IBNAL_INIT_PORTATTRS: + PORTAL_FREE(kibnal_data.kib_hca_attrs.PortAttributesList, + kibnal_data.kib_hca_attrs.PortAttributesListSize); + /* fall through */ + + case IBNAL_INIT_HCA: + rc = iibt_close_hca(kibnal_data.kib_hca); + if (rc != 0) + CERROR ("Close HCA error: %d\n", rc); + /* fall through */ + + case IBNAL_INIT_LIB: + lib_fini(&kibnal_lib); + /* fall through */ + + case IBNAL_INIT_DATA: + /* Module refcount only gets to zero when all peers + * have been closed so all lists must be empty */ + LASSERT (atomic_read (&kibnal_data.kib_npeers) == 0); + LASSERT (kibnal_data.kib_peers != NULL); + for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) { + LASSERT (list_empty (&kibnal_data.kib_peers[i])); + } + LASSERT (atomic_read (&kibnal_data.kib_nconns) == 0); + LASSERT (list_empty (&kibnal_data.kib_sched_rxq)); + LASSERT (list_empty (&kibnal_data.kib_sched_txq)); + LASSERT (list_empty (&kibnal_data.kib_connd_conns)); + LASSERT (list_empty (&kibnal_data.kib_connd_peers)); + + /* flag threads to terminate; wake and wait for them to die */ + kibnal_data.kib_shutdown = 1; + wake_up_all (&kibnal_data.kib_sched_waitq); + wake_up_all (&kibnal_data.kib_connd_waitq); + + i = 2; + while (atomic_read (&kibnal_data.kib_nthreads) != 0) { + i++; + CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? 
*/ + "Waiting for %d threads to terminate\n", + atomic_read (&kibnal_data.kib_nthreads)); + set_current_state (TASK_INTERRUPTIBLE); + schedule_timeout (HZ); + } + /* fall through */ + + case IBNAL_INIT_NOTHING: + break; + } + + if (kibnal_data.kib_tx_descs != NULL) + PORTAL_FREE (kibnal_data.kib_tx_descs, + IBNAL_TX_MSGS * sizeof(kib_tx_t)); + + if (kibnal_data.kib_peers != NULL) + PORTAL_FREE (kibnal_data.kib_peers, + sizeof (struct list_head) * + kibnal_data.kib_peer_hash_size); + + CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n", + atomic_read (&portal_kmemory)); + printk(KERN_INFO "Lustre: Infinicon IB NAL unloaded (final mem %d)\n", + atomic_read(&portal_kmemory)); + + kibnal_data.kib_init = IBNAL_INIT_NOTHING; +} + +#define roundup_power(val, power) \ + ( (val + (__u64)(power - 1)) & ~((__u64)(power - 1)) ) + +/* this isn't very portable or sturdy in the face of funny mem/bus configs */ +static __u64 max_phys_mem(IB_CA_ATTRIBUTES *ca_attr) +{ + struct sysinfo si; + __u64 ret; + + /* XXX we don't bother with first-gen cards */ + if (ca_attr->VendorId == 0xd0b7 && ca_attr->DeviceId == 0x3101) + return 0ULL; + + si_meminfo(&si); + ret = (__u64)max(si.totalram, max_mapnr) * si.mem_unit; + return roundup_power(ret, 128 * 1024 * 1024); +} +#undef roundup_power + +static int +kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid, + ptl_ni_limits_t *requested_limits, + ptl_ni_limits_t *actual_limits) +{ + ptl_process_id_t process_id; + int pkmem = atomic_read(&portal_kmemory); + IB_PORT_ATTRIBUTES *pattr; + FSTATUS frc; + int rc; + int n; + int i; + + LASSERT (nal == &kibnal_api); + + if (nal->nal_refct != 0) { + if (actual_limits != NULL) + *actual_limits = kibnal_lib.libnal_ni.ni_actual_limits; + /* This module got the first ref */ + PORTAL_MODULE_USE; + return (PTL_OK); + } + + LASSERT (kibnal_data.kib_init == IBNAL_INIT_NOTHING); + + frc = IbtGetInterfaceByVersion(IBT_INTERFACE_VERSION_2, + &kibnal_data.kib_interfaces); + if (frc != FSUCCESS) { + CERROR("IbtGetInterfaceByVersion(IBT_INTERFACE_VERSION_2) = %d\n", + frc); + return -ENOSYS; + } + + init_MUTEX (&kibnal_data.kib_nid_mutex); + init_MUTEX_LOCKED (&kibnal_data.kib_nid_signal); + kibnal_data.kib_nid = PTL_NID_ANY; + + rwlock_init(&kibnal_data.kib_global_lock); + + kibnal_data.kib_peer_hash_size = IBNAL_PEER_HASH_SIZE; + PORTAL_ALLOC (kibnal_data.kib_peers, + sizeof (struct list_head) * kibnal_data.kib_peer_hash_size); + if (kibnal_data.kib_peers == NULL) { + goto failed; + } + for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) + INIT_LIST_HEAD(&kibnal_data.kib_peers[i]); + + spin_lock_init (&kibnal_data.kib_connd_lock); + INIT_LIST_HEAD (&kibnal_data.kib_connd_peers); + INIT_LIST_HEAD (&kibnal_data.kib_connd_conns); + init_waitqueue_head (&kibnal_data.kib_connd_waitq); + + spin_lock_init (&kibnal_data.kib_sched_lock); + INIT_LIST_HEAD (&kibnal_data.kib_sched_txq); + INIT_LIST_HEAD (&kibnal_data.kib_sched_rxq); + init_waitqueue_head (&kibnal_data.kib_sched_waitq); + + spin_lock_init (&kibnal_data.kib_tx_lock); + INIT_LIST_HEAD (&kibnal_data.kib_idle_txs); + INIT_LIST_HEAD (&kibnal_data.kib_idle_nblk_txs); + init_waitqueue_head(&kibnal_data.kib_idle_tx_waitq); + + PORTAL_ALLOC (kibnal_data.kib_tx_descs, + IBNAL_TX_MSGS * sizeof(kib_tx_t)); + if (kibnal_data.kib_tx_descs == NULL) { + CERROR ("Can't allocate tx descs\n"); + goto failed; + } + + /* lists/ptrs/locks initialised */ + kibnal_data.kib_init = IBNAL_INIT_DATA; + /*****************************************************/ + + process_id.pid = 0; + process_id.nid = 
kibnal_data.kib_nid; + + rc = lib_init(&kibnal_lib, nal, process_id, + requested_limits, actual_limits); + if (rc != PTL_OK) { + CERROR("lib_init failed: error %d\n", rc); + goto failed; + } + + /* lib interface initialised */ + kibnal_data.kib_init = IBNAL_INIT_LIB; + /*****************************************************/ + + for (i = 0; i < IBNAL_N_SCHED; i++) { + rc = kibnal_thread_start (kibnal_scheduler, (void *)i); + if (rc != 0) { + CERROR("Can't spawn iibnal scheduler[%d]: %d\n", + i, rc); + goto failed; + } + } + + rc = kibnal_thread_start (kibnal_connd, NULL); + if (rc != 0) { + CERROR ("Can't spawn iibnal connd: %d\n", rc); + goto failed; + } + + n = sizeof(kibnal_data.kib_hca_guids) / + sizeof(kibnal_data.kib_hca_guids[0]); + frc = iibt_get_hca_guids(&n, kibnal_data.kib_hca_guids); + if (frc != FSUCCESS) { + CERROR ("Can't get channel adapter guids: %d\n", frc); + goto failed; + } + if (n == 0) { + CERROR ("No channel adapters found\n"); + goto failed; + } + + /* Infinicon has per-HCA rather than per CQ completion handlers */ + frc = iibt_open_hca(kibnal_data.kib_hca_guids[0], + kibnal_ca_callback, + kibnal_ca_async_callback, + &kibnal_data.kib_hca, + &kibnal_data.kib_hca); + if (frc != FSUCCESS) { + CERROR ("Can't open CA[0]: %d\n", frc); + goto failed; + } + + /* Channel Adapter opened */ + kibnal_data.kib_init = IBNAL_INIT_HCA; + /*****************************************************/ + + kibnal_data.kib_hca_attrs.PortAttributesList = NULL; + kibnal_data.kib_hca_attrs.PortAttributesListSize = 0; + frc = iibt_query_hca(kibnal_data.kib_hca, + &kibnal_data.kib_hca_attrs, NULL); + if (frc != FSUCCESS) { + CERROR ("Can't size port attrs: %d\n", frc); + goto failed; + } + + PORTAL_ALLOC(kibnal_data.kib_hca_attrs.PortAttributesList, + kibnal_data.kib_hca_attrs.PortAttributesListSize); + if (kibnal_data.kib_hca_attrs.PortAttributesList == NULL) + goto failed; + + /* Port attrs allocated */ + kibnal_data.kib_init = IBNAL_INIT_PORTATTRS; + /*****************************************************/ + + frc = iibt_query_hca(kibnal_data.kib_hca, &kibnal_data.kib_hca_attrs, + NULL); + if (frc != FSUCCESS) { + CERROR ("Can't get port attrs for CA 0: %d\n", frc); + goto failed; + } + + for (i = 0, pattr = kibnal_data.kib_hca_attrs.PortAttributesList; + pattr != NULL; + i++, pattr = pattr->Next) { + switch (pattr->PortState) { + default: + CERROR("Unexpected port[%d] state %d\n", + i, pattr->PortState); + continue; + case PortStateDown: + CDEBUG(D_NET, "port[%d] Down\n", i); + continue; + case PortStateInit: + CDEBUG(D_NET, "port[%d] Init\n", i); + continue; + case PortStateArmed: + CDEBUG(D_NET, "port[%d] Armed\n", i); + continue; + + case PortStateActive: + CDEBUG(D_NET, "port[%d] Active\n", i); + kibnal_data.kib_port = i; + kibnal_data.kib_port_guid = pattr->GUID; + kibnal_data.kib_port_pkey = pattr->PkeyTable[0]; + break; + } + break; + } + + if (pattr == NULL) { + CERROR ("Can't find an active port\n"); + goto failed; + } + + CDEBUG(D_NET, "got guid "LPX64"\n", kibnal_data.kib_port_guid); + + /* Active port found */ + kibnal_data.kib_init = IBNAL_INIT_PORT; + /*****************************************************/ + + frc = iibt_sd_register(&kibnal_data.kib_sd, NULL); + if (frc != FSUCCESS) { + CERROR ("Can't register with SD: %d\n", frc); + goto failed; + } + + /* Registered with SD OK */ + kibnal_data.kib_init = IBNAL_INIT_SD; + /*****************************************************/ + + frc = iibt_pd_allocate(kibnal_data.kib_hca, 0, &kibnal_data.kib_pd); + if (frc != FSUCCESS) { + 
CERROR ("Can't create PD: %d\n", rc); + goto failed; + } + + /* flag PD initialised */ + kibnal_data.kib_init = IBNAL_INIT_PD; + /*****************************************************/ + +#if IBNAL_FMR + { + const int pool_size = IBNAL_NTX + IBNAL_NTX_NBLK; + struct ib_fmr_pool_param params = { + .max_pages_per_fmr = PTL_MTU/PAGE_SIZE, + .access = (IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_WRITE | + IB_ACCESS_REMOTE_READ), + .pool_size = pool_size, + .dirty_watermark = (pool_size * 3)/4, + .flush_function = NULL, + .flush_arg = NULL, + .cache = 1, + }; + rc = ib_fmr_pool_create(kibnal_data.kib_pd, ¶ms, + &kibnal_data.kib_fmr_pool); + if (rc != 0) { + CERROR ("Can't create FMR pool size %d: %d\n", + pool_size, rc); + goto failed; + } + } + + /* flag FMR pool initialised */ + kibnal_data.kib_init = IBNAL_INIT_FMR; +#endif + /*****************************************************/ + if (IBNAL_WHOLE_MEM) { + IB_MR_PHYS_BUFFER phys; + IB_ACCESS_CONTROL access; + kib_md_t *md = &kibnal_data.kib_md; + + memset(&access, 0, sizeof(access)); + access.s.MWBindable = 1; + access.s.LocalWrite = 1; + access.s.RdmaRead = 1; + access.s.RdmaWrite = 1; + + phys.PhysAddr = 0; + phys.Length = max_phys_mem(&kibnal_data.kib_hca_attrs); + if (phys.Length == 0) { + CERROR ("couldn't determine the end of phys mem\n"); + goto failed; + } + + rc = iibt_register_contig_physical_memory(kibnal_data.kib_hca, + 0, + &phys, 1, + 0, + kibnal_data.kib_pd, + access, + &md->md_handle, + &md->md_addr, + &md->md_lkey, + &md->md_rkey); + if (rc != FSUCCESS) { + CERROR("registering physical memory failed: %d\n", + rc); + CERROR("falling back to registration per-rdma\n"); + md->md_handle = NULL; + } else { + CDEBUG(D_NET, "registered "LPU64" bytes of mem\n", + phys.Length); + kibnal_data.kib_init = IBNAL_INIT_MR; + } + } + + /*****************************************************/ + + rc = kibnal_setup_tx_descs(); + if (rc != 0) { + CERROR ("Can't register tx descs: %d\n", rc); + goto failed; + } + + /* flag TX descs initialised */ + kibnal_data.kib_init = IBNAL_INIT_TXD; + /*****************************************************/ + + { + uint32 nentries; + + frc = iibt_cq_create(kibnal_data.kib_hca, IBNAL_CQ_ENTRIES, + &kibnal_data.kib_cq, &kibnal_data.kib_cq, + &nentries); + if (frc != FSUCCESS) { + CERROR ("Can't create RX CQ: %d\n", frc); + goto failed; + } + + /* flag CQ initialised */ + kibnal_data.kib_init = IBNAL_INIT_CQ; + + if (nentries < IBNAL_CQ_ENTRIES) { + CERROR ("CQ only has %d entries, need %d\n", + nentries, IBNAL_CQ_ENTRIES); + goto failed; + } + + rc = iibt_cq_rearm(kibnal_data.kib_cq, CQEventSelNextWC); + if (rc != 0) { + CERROR ("Failed to re-arm completion queue: %d\n", rc); + goto failed; + } + } + + /*****************************************************/ + + rc = libcfs_nal_cmd_register(IIBNAL, &kibnal_cmd, NULL); + if (rc != 0) { + CERROR ("Can't initialise command interface (rc = %d)\n", rc); + goto failed; + } + + /* flag everything initialised */ + kibnal_data.kib_init = IBNAL_INIT_ALL; + /*****************************************************/ + + printk(KERN_INFO "Lustre: Infinicon IB NAL loaded " + "(initial mem %d)\n", pkmem); + + return (PTL_OK); + + failed: + kibnal_api_shutdown (&kibnal_api); + return (PTL_FAIL); +} + +void __exit +kibnal_module_fini (void) +{ +#ifdef CONFIG_SYSCTL + if (kibnal_tunables.kib_sysctl != NULL) + unregister_sysctl_table (kibnal_tunables.kib_sysctl); +#endif + PtlNIFini(kibnal_ni); + + ptl_unregister_nal(IIBNAL); +} + +int __init +kibnal_module_init (void) +{ + int rc; + 
+ if (sizeof(kib_wire_connreq_t) > CM_REQUEST_INFO_USER_LEN) { + CERROR("sizeof(kib_wire_connreq_t) > CM_REQUEST_INFO_USER_LEN\n"); + return -EINVAL; + } + + /* the following must be sizeof(int) for proc_dointvec() */ + if (sizeof (kibnal_tunables.kib_io_timeout) != sizeof (int)) { + CERROR("sizeof (kibnal_tunables.kib_io_timeout) != sizeof (int)\n"); + return -EINVAL; + } + + kibnal_api.nal_ni_init = kibnal_api_startup; + kibnal_api.nal_ni_fini = kibnal_api_shutdown; + + /* Initialise dynamic tunables to defaults once only */ + kibnal_tunables.kib_io_timeout = IBNAL_IO_TIMEOUT; + + rc = ptl_register_nal(IIBNAL, &kibnal_api); + if (rc != PTL_OK) { + CERROR("Can't register IBNAL: %d\n", rc); + return (-ENOMEM); /* or something... */ + } + + /* Pure gateways want the NAL started up at module load time... */ + rc = PtlNIInit(IIBNAL, 0, NULL, NULL, &kibnal_ni); + if (rc != PTL_OK && rc != PTL_IFACE_DUP) { + ptl_unregister_nal(IIBNAL); + return (-ENODEV); + } + +#ifdef CONFIG_SYSCTL + /* Press on regardless even if registering sysctl doesn't work */ + kibnal_tunables.kib_sysctl = + register_sysctl_table (kibnal_top_ctl_table, 0); +#endif + return (0); +} + +MODULE_AUTHOR("Cluster File Systems, Inc. "); +MODULE_DESCRIPTION("Kernel Infinicon IB NAL v0.01"); +MODULE_LICENSE("GPL"); + +module_init(kibnal_module_init); +module_exit(kibnal_module_fini); + diff --git a/lnet/klnds/iiblnd/iiblnd.h b/lnet/klnds/iiblnd/iiblnd.h new file mode 100644 index 0000000..0a25a9a --- /dev/null +++ b/lnet/klnds/iiblnd/iiblnd.h @@ -0,0 +1,892 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2004 Cluster File Systems, Inc. + * Author: Eric Barton + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +#ifndef EXPORT_SYMTAB +# define EXPORT_SYMTAB +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#define DEBUG_SUBSYSTEM S_IBNAL + +#include +#include +#include +#include + +#include + +#define GCC_VERSION (__GNUC__ * 10000 \ + + __GNUC_MINOR__ * 100 \ + + __GNUC_PATCHLEVEL__) + +/* Test for GCC > 3.2.2 */ +#if GCC_VERSION <= 30202 +/* GCC 3.2.2, and presumably several versions before it, will + * miscompile this driver. See + * http://gcc.gnu.org/bugzilla/show_bug.cgi?id=9853. */ +#error Invalid GCC version. Must use GCC >= 3.2.3 +#endif + +#define IBNAL_SERVICE_NAME "iibnal" +#define IBNAL_SERVICE_NUMBER 0x11b9a1 + +#if CONFIG_SMP +# define IBNAL_N_SCHED num_online_cpus() /* # schedulers */ +#else +# define IBNAL_N_SCHED 1 /* # schedulers */ +#endif + +#define IBNAL_MIN_RECONNECT_INTERVAL HZ /* first failed connection retry... 
*/ +#define IBNAL_MAX_RECONNECT_INTERVAL (60*HZ) /* ...exponentially increasing to this */ + +#define IBNAL_MSG_SIZE (4<<10) /* max size of queued messages (inc hdr) */ + +#define IBNAL_MSG_QUEUE_SIZE 8 /* # messages/RDMAs in-flight */ +#define IBNAL_CREDIT_HIGHWATER 7 /* when to eagerly return credits */ +/* 7 indicates infinite retry attempts, Infinicon recommended 5 */ +#define IBNAL_RETRY 5 /* # times to retry */ +#define IBNAL_RNR_RETRY 5 /* */ +#define IBNAL_CM_RETRY 5 /* # times to retry connection */ +#define IBNAL_FLOW_CONTROL 1 +#define IBNAL_ACK_TIMEOUT 20 /* supposedly 4 secs */ + +#define IBNAL_NTX 64 /* # tx descs */ +/* this had to be dropped down so that we only register < 255 pages per + * region. this will change if we register all memory. */ +#define IBNAL_NTX_NBLK 128 /* # reserved tx descs */ + +#define IBNAL_PEER_HASH_SIZE 101 /* # peer lists */ + +#define IBNAL_RESCHED 100 /* # scheduler loops before reschedule */ + +#define IBNAL_CONCURRENT_PEERS 1000 /* # nodes all talking at once to me */ + +/* default vals for runtime tunables */ +#define IBNAL_IO_TIMEOUT 50 /* default comms timeout (seconds) */ + +/************************/ +/* derived constants... */ + +/* TX messages (shared by all connections) */ +#define IBNAL_TX_MSGS (IBNAL_NTX + IBNAL_NTX_NBLK) +#define IBNAL_TX_MSG_BYTES (IBNAL_TX_MSGS * IBNAL_MSG_SIZE) +#define IBNAL_TX_MSG_PAGES ((IBNAL_TX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE) + +#define IBNAL_TX_MAX_SG (PTL_MD_MAX_IOV + 1) + +/* RX messages (per connection) */ +#define IBNAL_RX_MSGS IBNAL_MSG_QUEUE_SIZE +#define IBNAL_RX_MSG_BYTES (IBNAL_RX_MSGS * IBNAL_MSG_SIZE) +#define IBNAL_RX_MSG_PAGES ((IBNAL_RX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE) + + +/* we may have up to 2 completions per transmit + + 1 completion per receive, per connection */ +#define IBNAL_CQ_ENTRIES ((2*IBNAL_TX_MSGS) + \ + (IBNAL_RX_MSGS * IBNAL_CONCURRENT_PEERS)) + +#define IBNAL_RDMA_BASE 0x0eeb0000 +#define IBNAL_FMR 0 +#define IBNAL_WHOLE_MEM 1 +#define IBNAL_CKSUM 0 +//#define IBNAL_CALLBACK_CTXT IB_CQ_CALLBACK_PROCESS +#define IBNAL_CALLBACK_CTXT IB_CQ_CALLBACK_INTERRUPT + +/* XXX I have no idea. */ +#define IBNAL_STARTING_PSN 1 + +typedef struct +{ + int kib_io_timeout; /* comms timeout (seconds) */ + struct ctl_table_header *kib_sysctl; /* sysctl interface */ +} kib_tunables_t; + +/* some of these have specific types in the stack that just map back + * to the uFOO types, like IB_{L,R}_KEY. */ +typedef struct +{ + int ibp_npages; /* # pages */ + int ibp_mapped; /* mapped? */ + __u64 ibp_vaddr; /* mapped region vaddr */ + __u32 ibp_lkey; /* mapped region lkey */ + __u32 ibp_rkey; /* mapped region rkey */ + IB_HANDLE ibp_handle; /* mapped region handle */ + struct page *ibp_pages[0]; +} kib_pages_t; + +typedef struct +{ + IB_HANDLE md_handle; + __u32 md_lkey; + __u32 md_rkey; + __u64 md_addr; +} kib_md_t __attribute__((packed)); + +typedef struct +{ + int kib_init; /* initialisation state */ + __u64 kib_incarnation; /* which one am I */ + int kib_shutdown; /* shut down? 
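set once during shutdown so waiters (e.g. in kibnal_get_idle_tx) know to give up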
*/ + atomic_t kib_nthreads; /* # live threads */ + + __u64 kib_service_id; /* service number I listen on */ + __u64 kib_port_guid; /* my GUID (lo 64 of GID)*/ + __u16 kib_port_pkey; /* my pkey, whatever that is */ + ptl_nid_t kib_nid; /* my NID */ + struct semaphore kib_nid_mutex; /* serialise NID ops */ + struct semaphore kib_nid_signal; /* signal completion */ + IB_HANDLE kib_cep; /* connection end point */ + + rwlock_t kib_global_lock; /* stabilize peer/conn ops */ + + struct list_head *kib_peers; /* hash table of all my known peers */ + int kib_peer_hash_size; /* size of kib_peers */ + atomic_t kib_npeers; /* # peers extant */ + atomic_t kib_nconns; /* # connections extant */ + + struct list_head kib_connd_conns; /* connections to progress */ + struct list_head kib_connd_peers; /* peers waiting for a connection */ + wait_queue_head_t kib_connd_waitq; /* connection daemons sleep here */ + unsigned long kib_connd_waketime; /* when connd will wake */ + spinlock_t kib_connd_lock; /* serialise */ + + wait_queue_head_t kib_sched_waitq; /* schedulers sleep here */ + struct list_head kib_sched_txq; /* tx requiring attention */ + struct list_head kib_sched_rxq; /* rx requiring attention */ + spinlock_t kib_sched_lock; /* serialise */ + + struct kib_tx *kib_tx_descs; /* all the tx descriptors */ + kib_pages_t *kib_tx_pages; /* premapped tx msg pages */ + + struct list_head kib_idle_txs; /* idle tx descriptors */ + struct list_head kib_idle_nblk_txs; /* idle reserved tx descriptors */ + wait_queue_head_t kib_idle_tx_waitq; /* block here for tx descriptor */ + __u64 kib_next_tx_cookie; /* RDMA completion cookie */ + spinlock_t kib_tx_lock; /* serialise */ + + IB_HANDLE kib_hca; /* The HCA */ + int kib_port; /* port on the device */ + IB_HANDLE kib_pd; /* protection domain */ + IB_HANDLE kib_sd; /* SD handle */ + IB_HANDLE kib_cq; /* completion queue */ + kib_md_t kib_md; /* full-mem registration */ + + void *kib_listen_handle; /* where I listen for connections */ + + IBT_INTERFACE_UNION kib_interfaces; /* The Infinicon IBT interface */ + + uint64 kib_hca_guids[8]; /* all the HCA guids */ + IB_CA_ATTRIBUTES kib_hca_attrs; /* where to get HCA attrs */ + FABRIC_OPERATION_DATA kib_fabopdata; /* (un)advertise service record */ +} kib_data_t; + +#define IBNAL_INIT_NOTHING 0 +#define IBNAL_INIT_DATA 1 +#define IBNAL_INIT_LIB 2 +#define IBNAL_INIT_HCA 3 +#define IBNAL_INIT_PORTATTRS 4 +#define IBNAL_INIT_PORT 5 +#define IBNAL_INIT_SD 6 +#define IBNAL_INIT_PD 7 +#define IBNAL_INIT_FMR 8 +#define IBNAL_INIT_MR 9 +#define IBNAL_INIT_TXD 10 +#define IBNAL_INIT_CQ 11 +#define IBNAL_INIT_ALL 12 + +/************************************************************************ + * Wire message structs. + * These are sent in sender's byte order (i.e. receiver flips). + * CAVEAT EMPTOR: other structs communicated between nodes (e.g. MAD + * private data and SM service info), is LE on the wire. + */ + +/* also kib_md_t above */ + +typedef struct +{ + __u32 rd_key; /* remote key */ + __u32 rd_nob; /* # of bytes */ + __u64 rd_addr; /* remote io vaddr */ +} kib_rdma_desc_t __attribute__((packed)); + +typedef struct +{ + ptl_hdr_t ibim_hdr; /* portals header */ + char ibim_payload[0]; /* piggy-backed payload */ +} kib_immediate_msg_t __attribute__((packed)); + +/* these arrays serve two purposes during rdma. they are built on the passive + * side and sent to the active side as remote arguments. On the active side + * the descs are used as a data structure on the way to local gather items. 
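(the passive side fills rd_key with its rkey for the peer to use; the active side refills the same slots with its own lkey for the local gather list)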
+ * the different roles result in split local/remote meaning of desc->rd_key */ +typedef struct +{ + ptl_hdr_t ibrm_hdr; /* portals header */ + __u64 ibrm_cookie; /* opaque completion cookie */ + __u32 ibrm_num_descs; /* how many descs */ + kib_rdma_desc_t ibrm_desc[0]; /* where to suck/blow */ +} kib_rdma_msg_t __attribute__((packed)); + +#define kib_rdma_msg_len(num_descs) \ + offsetof(kib_msg_t, ibm_u.rdma.ibrm_desc[num_descs]) + +typedef struct +{ + __u64 ibcm_cookie; /* opaque completion cookie */ + __u32 ibcm_status; /* completion status */ +} kib_completion_msg_t __attribute__((packed)); + +typedef struct +{ + __u32 ibm_magic; /* I'm an openibnal message */ + __u16 ibm_version; /* this is my version number */ + __u8 ibm_type; /* msg type */ + __u8 ibm_credits; /* returned credits */ +#if IBNAL_CKSUM + __u32 ibm_nob; + __u32 ibm_cksum; +#endif + union { + kib_immediate_msg_t immediate; + kib_rdma_msg_t rdma; + kib_completion_msg_t completion; + } ibm_u __attribute__((packed)); +} kib_msg_t __attribute__((packed)); + +#define IBNAL_MSG_MAGIC 0x0be91b91 /* unique magic */ +#define IBNAL_MSG_VERSION 1 /* current protocol version */ + +#define IBNAL_MSG_NOOP 0xd0 /* nothing (just credits) */ +#define IBNAL_MSG_IMMEDIATE 0xd1 /* portals hdr + payload */ +#define IBNAL_MSG_PUT_RDMA 0xd2 /* portals PUT hdr + source rdma desc */ +#define IBNAL_MSG_PUT_DONE 0xd3 /* signal PUT rdma completion */ +#define IBNAL_MSG_GET_RDMA 0xd4 /* portals GET hdr + sink rdma desc */ +#define IBNAL_MSG_GET_DONE 0xd5 /* signal GET rdma completion */ + +/***********************************************************************/ + +typedef struct kib_rx /* receive message */ +{ + struct list_head rx_list; /* queue for attention */ + struct kib_conn *rx_conn; /* owning conn */ + int rx_rdma; /* RDMA completion posted? */ + int rx_posted; /* posted? */ + __u64 rx_vaddr; /* pre-mapped buffer (hca vaddr) */ + kib_msg_t *rx_msg; /* pre-mapped buffer (host vaddr) */ + IB_WORK_REQ rx_wrq; + IB_LOCAL_DATASEGMENT rx_gl; /* and it's memory */ +} kib_rx_t; + +typedef struct kib_tx /* transmit message */ +{ + struct list_head tx_list; /* queue on idle_txs ibc_tx_queue etc. */ + int tx_isnblk; /* I'm reserved for non-blocking sends */ + struct kib_conn *tx_conn; /* owning conn */ + int tx_mapped; /* mapped for RDMA? */ + int tx_sending; /* # tx callbacks outstanding */ + int tx_status; /* completion status */ + unsigned long tx_deadline; /* completion deadline */ + int tx_passive_rdma; /* peer sucks/blows */ + int tx_passive_rdma_wait; /* waiting for peer to complete */ + __u64 tx_passive_rdma_cookie; /* completion cookie */ + lib_msg_t *tx_libmsg[2]; /* lib msgs to finalize on completion */ + kib_md_t tx_md; /* RDMA mapping (active/passive) */ + __u64 tx_vaddr; /* pre-mapped buffer (hca vaddr) */ + kib_msg_t *tx_msg; /* pre-mapped buffer (host vaddr) */ + int tx_nsp; /* # send work items */ + IB_WORK_REQ tx_wrq[IBNAL_TX_MAX_SG]; /* send work items... 
*/ + IB_LOCAL_DATASEGMENT tx_gl[IBNAL_TX_MAX_SG]; /* ...and their memory */ +} kib_tx_t; + +#define KIB_TX_UNMAPPED 0 +#define KIB_TX_MAPPED 1 +#define KIB_TX_MAPPED_FMR 2 + +typedef struct kib_wire_connreq +{ + __u32 wcr_magic; /* I'm an openibnal connreq */ + __u16 wcr_version; /* this is my version number */ + __u16 wcr_queue_depth; /* this is my receive queue size */ + __u64 wcr_nid; /* peer's NID */ + __u64 wcr_incarnation; /* peer's incarnation */ +} kib_wire_connreq_t; + +typedef struct kib_gid +{ + __u64 hi, lo; +} kib_gid_t; + +typedef struct kib_connreq +{ + /* connection-in-progress */ + struct kib_conn *cr_conn; + kib_wire_connreq_t cr_wcr; + __u64 cr_tid; + IB_SERVICE_RECORD cr_service; + kib_gid_t cr_gid; + IB_PATH_RECORD cr_path; + CM_REQUEST_INFO cr_cmreq; + CM_CONN_INFO cr_discarded; + CM_REJECT_INFO cr_rej_info; +} kib_connreq_t; + +typedef struct kib_conn +{ + struct kib_peer *ibc_peer; /* owning peer */ + struct list_head ibc_list; /* stash on peer's conn list */ + __u64 ibc_incarnation; /* which instance of the peer */ + atomic_t ibc_refcount; /* # users */ + int ibc_state; /* what's happening */ + atomic_t ibc_nob; /* # bytes buffered */ + int ibc_nsends_posted; /* # uncompleted sends */ + int ibc_credits; /* # credits I have */ + int ibc_outstanding_credits; /* # credits to return */ + int ibc_rcvd_disconnect;/* received discon request */ + int ibc_sent_disconnect;/* sent discon request */ + struct list_head ibc_tx_queue; /* send queue */ + struct list_head ibc_active_txs; /* active tx awaiting completion */ + spinlock_t ibc_lock; /* serialise */ + kib_rx_t *ibc_rxs; /* the rx descs */ + kib_pages_t *ibc_rx_pages; /* premapped rx msg pages */ + IB_HANDLE ibc_qp; /* queue pair */ + IB_HANDLE ibc_cep; /* connection ID? */ + IB_QP_ATTRIBUTES_QUERY ibc_qp_attrs; /* QP attrs */ + kib_connreq_t *ibc_connreq; /* connection request state */ +} kib_conn_t; + +#define IBNAL_CONN_INIT_NOTHING 0 /* initial state */ +#define IBNAL_CONN_INIT_QP 1 /* ibc_qp set up */ +#define IBNAL_CONN_CONNECTING 2 /* started to connect */ +#define IBNAL_CONN_ESTABLISHED 3 /* connection established */ +#define IBNAL_CONN_SEND_DREQ 4 /* to send disconnect req */ +#define IBNAL_CONN_DREQ 5 /* sent disconnect req */ +#define IBNAL_CONN_DREP 6 /* sent disconnect rep */ +#define IBNAL_CONN_DISCONNECTED 7 /* no more QP or CM traffic */ + +#define KIB_ASSERT_CONN_STATE(conn, state) do { \ + LASSERTF((conn)->ibc_state == state, "%d\n", conn->ibc_state); \ +} while (0) + +#define KIB_ASSERT_CONN_STATE_RANGE(conn, low, high) do { \ + LASSERTF(low <= high, "%d %d\n", low, high); \ + LASSERTF((conn)->ibc_state >= low && (conn)->ibc_state <= high, \ + "%d\n", conn->ibc_state); \ +} while (0) + +typedef struct kib_peer +{ + struct list_head ibp_list; /* stash on global peer list */ + struct list_head ibp_connd_list; /* schedule on kib_connd_peers */ + ptl_nid_t ibp_nid; /* who's on the other end(s) */ + atomic_t ibp_refcount; /* # users */ + int ibp_persistence; /* "known" peer refs */ + struct list_head ibp_conns; /* all active connections */ + struct list_head ibp_tx_queue; /* msgs waiting for a conn */ + int ibp_connecting; /* connecting+accepting */ + unsigned long ibp_reconnect_time; /* when reconnect may be attempted */ + unsigned long ibp_reconnect_interval; /* exponential backoff */ +} kib_peer_t; + + +extern lib_nal_t kibnal_lib; +extern kib_data_t kibnal_data; +extern kib_tunables_t kibnal_tunables; + +/******************************************************************************/ +/* Infinicon IBT 
interface wrappers */ +#define IIBT_IF (kibnal_data.kib_interfaces.ver2) + +static inline FSTATUS +iibt_get_hca_guids(uint32 *hca_count, EUI64 *hca_guid_list) +{ + return IIBT_IF.GetCaGuids(hca_count, hca_guid_list); +} + +static inline FSTATUS +iibt_open_hca(EUI64 hca_guid, + IB_COMPLETION_CALLBACK completion_callback, + IB_ASYNC_EVENT_CALLBACK async_event_callback, + void *arg, + IB_HANDLE *handle) +{ + return IIBT_IF.Vpi.OpenCA(hca_guid, completion_callback, + async_event_callback, arg, handle); +} + +static inline FSTATUS +iibt_query_hca(IB_HANDLE hca_handle, IB_CA_ATTRIBUTES *attrs, void **argp) +{ + return IIBT_IF.Vpi.QueryCA(hca_handle, attrs, argp); +} + +static inline FSTATUS +iibt_close_hca(IB_HANDLE hca_handle) +{ + return IIBT_IF.Vpi.CloseCA(hca_handle); +} + +static inline FSTATUS +iibt_pd_allocate(IB_HANDLE hca_handle, __u32 max_avs, IB_HANDLE *pd_handle) +{ + return IIBT_IF.Vpi.AllocatePD(hca_handle, max_avs, pd_handle); +} + +static inline FSTATUS +iibt_pd_free(IB_HANDLE pd_handle) +{ + return IIBT_IF.Vpi.FreePD(pd_handle); +} + +static inline FSTATUS +iibt_register_physical_memory(IB_HANDLE hca_handle, + IB_VIRT_ADDR requested_io_va, + void *phys_buffers, uint64 nphys_buffers, + uint32 io_va_offset, IB_HANDLE pd_handle, + IB_ACCESS_CONTROL access, + IB_HANDLE *mem_handle, + IB_VIRT_ADDR *actual_io_va, + IB_L_KEY *lkey, IB_R_KEY *rkey) +{ + return IIBT_IF.Vpi.RegisterPhysMemRegion(hca_handle, requested_io_va, + phys_buffers, nphys_buffers, + io_va_offset, pd_handle, + access, + mem_handle, actual_io_va, + lkey, rkey); +} + +static inline FSTATUS +iibt_register_contig_physical_memory(IB_HANDLE hca_handle, + IB_VIRT_ADDR requested_io_va, + IB_MR_PHYS_BUFFER *phys_buffers, + uint64 nphys_buffers, + uint32 io_va_offset, IB_HANDLE pd_handle, + IB_ACCESS_CONTROL access, + IB_HANDLE *mem_handle, + IB_VIRT_ADDR *actual_io_va, + IB_L_KEY *lkey, IB_R_KEY *rkey) +{ + return IIBT_IF.Vpi.RegisterContigPhysMemRegion(hca_handle, + requested_io_va, + phys_buffers, + nphys_buffers, + io_va_offset, pd_handle, + access, + mem_handle, actual_io_va, + lkey, rkey); +} + +static inline FSTATUS +iibt_register_memory(IB_HANDLE hca_handle, + void *virt_addr, unsigned int length, + IB_HANDLE pd_handle, + IB_ACCESS_CONTROL access, + IB_HANDLE *mem_handle, + IB_L_KEY *lkey, IB_R_KEY *rkey) +{ + return IIBT_IF.Vpi.RegisterMemRegion(hca_handle, + virt_addr, length, + pd_handle, + access, + mem_handle, + lkey, rkey); +} + +static inline FSTATUS +iibt_deregister_memory(IB_HANDLE mem_handle) +{ + return IIBT_IF.Vpi.DeregisterMemRegion(mem_handle); +} + +static inline FSTATUS +iibt_cq_create(IB_HANDLE hca_handle, uint32 requested_size, + void *arg, IB_HANDLE *cq_handle, uint32 *actual_size) +{ + return IIBT_IF.Vpi.CreateCQ(hca_handle, requested_size, + arg, cq_handle, actual_size); +} + +static inline FSTATUS +iibt_cq_poll(IB_HANDLE cq_handle, IB_WORK_COMPLETION *wc) +{ + return IIBT_IF.Vpi.PollCQ(cq_handle, wc); +} + +static inline FSTATUS +iibt_cq_rearm(IB_HANDLE cq_handle, IB_CQ_EVENT_SELECT select) +{ + return IIBT_IF.Vpi.RearmCQ(cq_handle, select); +} + +static inline FSTATUS +iibt_cq_destroy(IB_HANDLE cq_handle) +{ + return IIBT_IF.Vpi.DestroyCQ(cq_handle); +} + +static inline FSTATUS +iibt_qp_create(IB_HANDLE hca_handle, IB_QP_ATTRIBUTES_CREATE *create_attr, + void *arg, IB_HANDLE *cq_handle, + IB_QP_ATTRIBUTES_QUERY *query_attr) +{ + return IIBT_IF.Vpi.CreateQP(hca_handle, create_attr, arg, cq_handle, + query_attr); +} + +static inline FSTATUS +iibt_qp_query(IB_HANDLE qp_handle, 
IB_QP_ATTRIBUTES_QUERY *query_attr, + void **arg_ptr) +{ + return IIBT_IF.Vpi.QueryQP(qp_handle, query_attr, arg_ptr); +} + +static inline FSTATUS +iibt_qp_modify(IB_HANDLE qp_handle, IB_QP_ATTRIBUTES_MODIFY *modify_attr, + IB_QP_ATTRIBUTES_QUERY *query_attr) +{ + return IIBT_IF.Vpi.ModifyQP(qp_handle, modify_attr, query_attr); +} + +static inline FSTATUS +iibt_qp_destroy(IB_HANDLE qp_handle) +{ + return IIBT_IF.Vpi.DestroyQP(qp_handle); +} + +static inline FSTATUS +iibt_postrecv(IB_HANDLE qp_handle, IB_WORK_REQ *work_req) +{ + return IIBT_IF.Vpi.PostRecv(qp_handle, work_req); +} + +static inline FSTATUS +iibt_postsend(IB_HANDLE qp_handle, IB_WORK_REQ *work_req) +{ + return IIBT_IF.Vpi.PostSend(qp_handle, work_req); +} + +static inline FSTATUS +iibt_sd_register(IB_HANDLE *sd_handle, CLIENT_CONTROL_PARAMETERS *p) +{ + return IIBT_IF.Sdi.Register(sd_handle, p); +} + +static inline FSTATUS +iibt_sd_deregister(IB_HANDLE sd_handle) +{ + return IIBT_IF.Sdi.Deregister(sd_handle); +} + +static inline FSTATUS +iibt_sd_port_fabric_operation(IB_HANDLE sd_handle, EUI64 port_guid, + FABRIC_OPERATION_DATA *fod, + PFABRIC_OPERATION_CALLBACK callback, + COMMAND_CONTROL_PARAMETERS *p, void *arg) +{ + return IIBT_IF.Sdi.PortFabricOperation(sd_handle, port_guid, + fod, callback, p, arg); +} + +static inline FSTATUS +iibt_sd_query_port_fabric_information(IB_HANDLE sd_handle, EUI64 port_guid, + QUERY *qry, + PQUERY_CALLBACK callback, + COMMAND_CONTROL_PARAMETERS *p, void *arg) +{ + return IIBT_IF.Sdi.QueryPortFabricInformation(sd_handle, port_guid, + qry, callback, p, arg); +} + +static inline IB_HANDLE +iibt_cm_create_cep(CM_CEP_TYPE type) +{ + return IIBT_IF.Cmi.CmCreateCEP(type); +} + +static inline FSTATUS +iibt_cm_modify_cep(IB_HANDLE cep, uint32 attr, char* value, uint32 len, + uint32 offset) +{ + return IIBT_IF.Cmi.CmModifyCEP(cep, attr, value, len, offset); +} + +static inline FSTATUS +iibt_cm_destroy_cep(IB_HANDLE cep_handle) +{ + return IIBT_IF.Cmi.CmDestroyCEP(cep_handle); +} + +static inline FSTATUS +iibt_cm_listen(IB_HANDLE cep, CM_LISTEN_INFO *info, + PFN_CM_CALLBACK callback, void *arg) +{ + return IIBT_IF.Cmi.CmListen(cep, info, callback, arg); +} + +static inline FSTATUS +iibt_cm_cancel(IB_HANDLE cep) +{ + return IIBT_IF.Cmi.CmCancel(cep); +} + +static inline FSTATUS +iibt_cm_accept(IB_HANDLE cep, + CM_CONN_INFO *send_info, CM_CONN_INFO *recv_info, + PFN_CM_CALLBACK callback, void *arg, + IB_HANDLE *new_cep) +{ + return IIBT_IF.Cmi.CmAccept(cep, + send_info, recv_info, + callback, arg, new_cep); +} + +static inline FSTATUS +iibt_cm_reject(IB_HANDLE cep, CM_REJECT_INFO *rej) +{ + return IIBT_IF.Cmi.CmReject(cep, rej); +} + +static inline FSTATUS +iibt_cm_disconnect(IB_HANDLE cep, CM_DREQUEST_INFO *req, + CM_DREPLY_INFO *reply) +{ + return IIBT_IF.Cmi.CmDisconnect(cep, req, reply); +} + +static inline FSTATUS +iibt_cm_connect (IB_HANDLE cep, CM_REQUEST_INFO *req, + PFN_CM_CALLBACK callback, void *arg) +{ + return IIBT_IF.Cmi.CmConnect (cep, req, callback, arg); +} + +static inline int wrq_signals_completion(IB_WORK_REQ *wrq) +{ + return wrq->Req.SendRC.Options.s.SignaledCompletion == 1; +} + + +/******************************************************************************/ + +/* these are purposely avoiding using local vars so they don't increase + * stack consumption. 
*/ + +#define kib_peer_addref(peer) do { \ + LASSERTF(atomic_read(&peer->ibp_refcount) > 0, "%d\n", \ + atomic_read(&peer->ibp_refcount)); \ + CDEBUG(D_NET, "++peer[%p] -> "LPX64" (%d)\n", \ + peer, peer->ibp_nid, atomic_read (&peer->ibp_refcount)); \ + atomic_inc(&peer->ibp_refcount); \ +} while (0) + +#define kib_peer_decref(peer) do { \ + LASSERTF(atomic_read(&peer->ibp_refcount) > 0, "%d\n", \ + atomic_read(&peer->ibp_refcount)); \ + CDEBUG(D_NET, "--peer[%p] -> "LPX64" (%d)\n", \ + peer, peer->ibp_nid, atomic_read (&peer->ibp_refcount)); \ + if (atomic_dec_and_test (&peer->ibp_refcount)) { \ + CDEBUG (D_NET, "destroying peer "LPX64" %p\n", \ + peer->ibp_nid, peer); \ + kibnal_destroy_peer (peer); \ + } \ +} while (0) + +/******************************************************************************/ + +static inline struct list_head * +kibnal_nid2peerlist (ptl_nid_t nid) +{ + unsigned int hash = ((unsigned int)nid) % kibnal_data.kib_peer_hash_size; + + return (&kibnal_data.kib_peers [hash]); +} + +static inline int +kibnal_peer_active(kib_peer_t *peer) +{ + /* Am I in the peer hash table? */ + return (!list_empty(&peer->ibp_list)); +} + +static inline void +kibnal_queue_tx_locked (kib_tx_t *tx, kib_conn_t *conn) +{ + /* CAVEAT EMPTOR: tx takes caller's ref on conn */ + + LASSERT (tx->tx_nsp > 0); /* work items set up */ + LASSERT (tx->tx_conn == NULL); /* only set here */ + + tx->tx_conn = conn; + tx->tx_deadline = jiffies + kibnal_tunables.kib_io_timeout * HZ; + list_add_tail(&tx->tx_list, &conn->ibc_tx_queue); +} + +#define KIBNAL_SERVICE_KEY_MASK (IB_SERVICE_RECORD_COMP_SERVICENAME | \ + IB_SERVICE_RECORD_COMP_SERVICEDATA8_1 | \ + IB_SERVICE_RECORD_COMP_SERVICEDATA8_2 | \ + IB_SERVICE_RECORD_COMP_SERVICEDATA8_3 | \ + IB_SERVICE_RECORD_COMP_SERVICEDATA8_4 | \ + IB_SERVICE_RECORD_COMP_SERVICEDATA8_5 | \ + IB_SERVICE_RECORD_COMP_SERVICEDATA8_6 | \ + IB_SERVICE_RECORD_COMP_SERVICEDATA8_7 | \ + IB_SERVICE_RECORD_COMP_SERVICEDATA8_8) + +static inline __u64* +kibnal_service_nid_field(IB_SERVICE_RECORD *srv) +{ + /* must be consistent with KIBNAL_SERVICE_KEY_MASK */ + return (__u64 *)srv->ServiceData8; +} + + +static inline void +kibnal_set_service_keys(IB_SERVICE_RECORD *srv, ptl_nid_t nid) +{ + LASSERT (strlen(IBNAL_SERVICE_NAME) < sizeof(srv->ServiceName)); + memset (srv->ServiceName, 0, sizeof(srv->ServiceName)); + strcpy (srv->ServiceName, IBNAL_SERVICE_NAME); + + *kibnal_service_nid_field(srv) = cpu_to_le64(nid); +} + +#if 0 +static inline void +kibnal_show_rdma_attr (kib_conn_t *conn) +{ + struct ib_qp_attribute qp_attr; + int rc; + + memset (&qp_attr, 0, sizeof(qp_attr)); + rc = ib_qp_query(conn->ibc_qp, &qp_attr); + if (rc != 0) { + CERROR ("Can't get qp attrs: %d\n", rc); + return; + } + + CWARN ("RDMA CAPABILITY: write %s read %s\n", + (qp_attr.valid_fields & TS_IB_QP_ATTRIBUTE_RDMA_ATOMIC_ENABLE) ? + (qp_attr.enable_rdma_write ? "enabled" : "disabled") : "invalid", + (qp_attr.valid_fields & TS_IB_QP_ATTRIBUTE_RDMA_ATOMIC_ENABLE) ? + (qp_attr.enable_rdma_read ? "enabled" : "disabled") : "invalid"); +} +#endif + +#if CONFIG_X86 +static inline __u64 +kibnal_page2phys (struct page *p) +{ + __u64 page_number = p - mem_map; + + return (page_number << PAGE_SHIFT); +} +#else +# error "no page->phys" +#endif + +/* CAVEAT EMPTOR: + * We rely on tx/rx descriptor alignment to allow us to use the lowest bit + * of the work request id as a flag to determine if the completion is for a + * transmit or a receive. 
It seems that that the CQ entry's 'op' field + * isn't always set correctly on completions that occur after QP teardown. */ + +static inline __u64 +kibnal_ptr2wreqid (void *ptr, int isrx) +{ + unsigned long lptr = (unsigned long)ptr; + + LASSERT ((lptr & 1) == 0); + return (__u64)(lptr | (isrx ? 1 : 0)); +} + +static inline void * +kibnal_wreqid2ptr (__u64 wreqid) +{ + return (void *)(((unsigned long)wreqid) & ~1UL); +} + +static inline int +kibnal_wreqid_is_rx (__u64 wreqid) +{ + return (wreqid & 1) != 0; +} + +static inline int +kibnal_whole_mem(void) +{ + return kibnal_data.kib_md.md_handle != NULL; +} + +extern kib_peer_t *kibnal_create_peer (ptl_nid_t nid); +extern void kibnal_destroy_peer (kib_peer_t *peer); +extern int kibnal_del_peer (ptl_nid_t nid, int single_share); +extern kib_peer_t *kibnal_find_peer_locked (ptl_nid_t nid); +extern void kibnal_unlink_peer_locked (kib_peer_t *peer); +extern int kibnal_close_stale_conns_locked (kib_peer_t *peer, + __u64 incarnation); +extern kib_conn_t *kibnal_create_conn (void); +extern void kibnal_put_conn (kib_conn_t *conn); +extern void kibnal_destroy_conn (kib_conn_t *conn); +void kibnal_listen_callback(IB_HANDLE cep, CM_CONN_INFO *info, void *arg); + +extern int kibnal_alloc_pages (kib_pages_t **pp, int npages, int access); +extern void kibnal_free_pages (kib_pages_t *p); + +extern void kibnal_check_sends (kib_conn_t *conn); +extern void kibnal_close_conn_locked (kib_conn_t *conn, int error); +extern void kibnal_destroy_conn (kib_conn_t *conn); +extern int kibnal_thread_start (int (*fn)(void *arg), void *arg); +extern int kibnal_scheduler(void *arg); +extern int kibnal_connd (void *arg); +extern void kibnal_init_tx_msg (kib_tx_t *tx, int type, int body_nob); +extern void kibnal_close_conn (kib_conn_t *conn, int why); +extern void kibnal_start_active_rdma (int type, int status, + kib_rx_t *rx, lib_msg_t *libmsg, + unsigned int niov, + struct iovec *iov, ptl_kiov_t *kiov, + size_t offset, size_t nob); + +void kibnal_ca_async_callback (void *ca_arg, IB_EVENT_RECORD *ev); +void kibnal_ca_callback (void *ca_arg, void *cq_arg); diff --git a/lnet/klnds/iiblnd/iiblnd_cb.c b/lnet/klnds/iiblnd/iiblnd_cb.c new file mode 100644 index 0000000..a827ba5 --- /dev/null +++ b/lnet/klnds/iiblnd/iiblnd_cb.c @@ -0,0 +1,3018 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2004 Cluster File Systems, Inc. + * Author: Eric Barton + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ * + */ + +#include "iibnal.h" + +/* + * LIB functions follow + * + */ +static void +kibnal_schedule_tx_done (kib_tx_t *tx) +{ + unsigned long flags; + + spin_lock_irqsave (&kibnal_data.kib_sched_lock, flags); + + list_add_tail(&tx->tx_list, &kibnal_data.kib_sched_txq); + wake_up (&kibnal_data.kib_sched_waitq); + + spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags); +} + +static void +kibnal_tx_done (kib_tx_t *tx) +{ + ptl_err_t ptlrc = (tx->tx_status == 0) ? PTL_OK : PTL_FAIL; + unsigned long flags; + int i; + FSTATUS frc; + + LASSERT (tx->tx_sending == 0); /* mustn't be awaiting callback */ + LASSERT (!tx->tx_passive_rdma_wait); /* mustn't be awaiting RDMA */ + + switch (tx->tx_mapped) { + default: + LBUG(); + + case KIB_TX_UNMAPPED: + break; + + case KIB_TX_MAPPED: + if (in_interrupt()) { + /* can't deregister memory in IRQ context... */ + kibnal_schedule_tx_done(tx); + return; + } + frc = iibt_deregister_memory(tx->tx_md.md_handle); + LASSERT (frc == FSUCCESS); + tx->tx_mapped = KIB_TX_UNMAPPED; + break; + +#if IBNAL_FMR + case KIB_TX_MAPPED_FMR: + if (in_interrupt() && tx->tx_status != 0) { + /* can't flush FMRs in IRQ context... */ + kibnal_schedule_tx_done(tx); + return; + } + + rc = ib_fmr_deregister(tx->tx_md.md_handle.fmr); + LASSERT (rc == 0); + + if (tx->tx_status != 0) + ib_fmr_pool_force_flush(kibnal_data.kib_fmr_pool); + tx->tx_mapped = KIB_TX_UNMAPPED; + break; +#endif + } + + for (i = 0; i < 2; i++) { + /* tx may have up to 2 libmsgs to finalise */ + if (tx->tx_libmsg[i] == NULL) + continue; + + lib_finalize (&kibnal_lib, NULL, tx->tx_libmsg[i], ptlrc); + tx->tx_libmsg[i] = NULL; + } + + if (tx->tx_conn != NULL) { + kibnal_put_conn (tx->tx_conn); + tx->tx_conn = NULL; + } + + tx->tx_nsp = 0; + tx->tx_passive_rdma = 0; + tx->tx_status = 0; + + spin_lock_irqsave (&kibnal_data.kib_tx_lock, flags); + + if (tx->tx_isnblk) { + list_add_tail (&tx->tx_list, &kibnal_data.kib_idle_nblk_txs); + } else { + list_add_tail (&tx->tx_list, &kibnal_data.kib_idle_txs); + wake_up (&kibnal_data.kib_idle_tx_waitq); + } + + spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags); +} + +static kib_tx_t * +kibnal_get_idle_tx (int may_block) +{ + unsigned long flags; + kib_tx_t *tx = NULL; + ENTRY; + + for (;;) { + spin_lock_irqsave (&kibnal_data.kib_tx_lock, flags); + + /* "normal" descriptor is free */ + if (!list_empty (&kibnal_data.kib_idle_txs)) { + tx = list_entry (kibnal_data.kib_idle_txs.next, + kib_tx_t, tx_list); + break; + } + + if (!may_block) { + /* may dip into reserve pool */ + if (list_empty (&kibnal_data.kib_idle_nblk_txs)) { + CERROR ("reserved tx desc pool exhausted\n"); + break; + } + + tx = list_entry (kibnal_data.kib_idle_nblk_txs.next, + kib_tx_t, tx_list); + break; + } + + /* block for idle tx */ + spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags); + + wait_event (kibnal_data.kib_idle_tx_waitq, + !list_empty (&kibnal_data.kib_idle_txs) || + kibnal_data.kib_shutdown); + } + + if (tx != NULL) { + list_del (&tx->tx_list); + + /* Allocate a new passive RDMA completion cookie. It might + * not be needed, but we've got a lock right now and we're + * unlikely to wrap... 
*/ + tx->tx_passive_rdma_cookie = kibnal_data.kib_next_tx_cookie++; + + LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED); + LASSERT (tx->tx_nsp == 0); + LASSERT (tx->tx_sending == 0); + LASSERT (tx->tx_status == 0); + LASSERT (tx->tx_conn == NULL); + LASSERT (!tx->tx_passive_rdma); + LASSERT (!tx->tx_passive_rdma_wait); + LASSERT (tx->tx_libmsg[0] == NULL); + LASSERT (tx->tx_libmsg[1] == NULL); + } + + spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags); + + RETURN(tx); +} + +static int +kibnal_dist(lib_nal_t *nal, ptl_nid_t nid, unsigned long *dist) +{ + /* I would guess that if kibnal_get_peer (nid) == NULL, + and we're not routing, then 'nid' is very distant :) */ + if ( nal->libnal_ni.ni_pid.nid == nid ) { + *dist = 0; + } else { + *dist = 1; + } + + return 0; +} + +static void +kibnal_complete_passive_rdma(kib_conn_t *conn, __u64 cookie, int status) +{ + struct list_head *ttmp; + unsigned long flags; + int idle; + + spin_lock_irqsave (&conn->ibc_lock, flags); + + list_for_each (ttmp, &conn->ibc_active_txs) { + kib_tx_t *tx = list_entry(ttmp, kib_tx_t, tx_list); + + LASSERT (tx->tx_passive_rdma || + !tx->tx_passive_rdma_wait); + + LASSERT (tx->tx_passive_rdma_wait || + tx->tx_sending != 0); + + if (!tx->tx_passive_rdma_wait || + tx->tx_passive_rdma_cookie != cookie) + continue; + + CDEBUG(D_NET, "Complete %p "LPD64": %d\n", tx, cookie, status); + + tx->tx_status = status; + tx->tx_passive_rdma_wait = 0; + idle = (tx->tx_sending == 0); + + if (idle) + list_del (&tx->tx_list); + + spin_unlock_irqrestore (&conn->ibc_lock, flags); + + /* I could be racing with tx callbacks. It's whoever + * _makes_ tx idle that frees it */ + if (idle) + kibnal_tx_done (tx); + return; + } + + spin_unlock_irqrestore (&conn->ibc_lock, flags); + + CERROR ("Unmatched (late?) 
RDMA completion "LPX64" from "LPX64"\n", + cookie, conn->ibc_peer->ibp_nid); +} + +static __u32 +kibnal_lkey(kib_pages_t *ibp) +{ + if (kibnal_whole_mem()) + return kibnal_data.kib_md.md_lkey; + + return ibp->ibp_lkey; +} + +static void +kibnal_post_rx (kib_rx_t *rx, int do_credits) +{ + kib_conn_t *conn = rx->rx_conn; + int rc = 0; + unsigned long flags; + FSTATUS frc; + ENTRY; + + rx->rx_gl = (IB_LOCAL_DATASEGMENT) { + .Address = rx->rx_vaddr, + .Length = IBNAL_MSG_SIZE, + .Lkey = kibnal_lkey(conn->ibc_rx_pages), + }; + + rx->rx_wrq = (IB_WORK_REQ) { + .Operation = WROpRecv, + .DSListDepth = 1, + .MessageLen = IBNAL_MSG_SIZE, + .WorkReqId = kibnal_ptr2wreqid(rx, 1), + .DSList = &rx->rx_gl, + }; + + KIB_ASSERT_CONN_STATE_RANGE(conn, IBNAL_CONN_ESTABLISHED, + IBNAL_CONN_DREP); + LASSERT (!rx->rx_posted); + rx->rx_posted = 1; + mb(); + + if (conn->ibc_state != IBNAL_CONN_ESTABLISHED) + rc = -ECONNABORTED; + else { + frc = iibt_postrecv(conn->ibc_qp, &rx->rx_wrq); + if (frc != FSUCCESS) { + CDEBUG(D_NET, "post failed %d\n", frc); + rc = -EINVAL; + } + CDEBUG(D_NET, "posted rx %p\n", &rx->rx_wrq); + } + + if (rc == 0) { + if (do_credits) { + spin_lock_irqsave(&conn->ibc_lock, flags); + conn->ibc_outstanding_credits++; + spin_unlock_irqrestore(&conn->ibc_lock, flags); + + kibnal_check_sends(conn); + } + EXIT; + return; + } + + if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) { + CERROR ("Error posting receive -> "LPX64": %d\n", + conn->ibc_peer->ibp_nid, rc); + kibnal_close_conn (rx->rx_conn, rc); + } else { + CDEBUG (D_NET, "Error posting receive -> "LPX64": %d\n", + conn->ibc_peer->ibp_nid, rc); + } + + /* Drop rx's ref */ + kibnal_put_conn (conn); + EXIT; +} + +#if IBNAL_CKSUM +static inline __u32 kibnal_cksum (void *ptr, int nob) +{ + char *c = ptr; + __u32 sum = 0; + + while (nob-- > 0) + sum = ((sum << 1) | (sum >> 31)) + *c++; + + return (sum); +} +#endif + +static void hexdump(char *string, void *ptr, int len) +{ + unsigned char *c = ptr; + int i; + + return; + + if (len < 0 || len > 2048) { + printk("XXX what the hell? %d\n",len); + return; + } + + printk("%d bytes of '%s' from 0x%p\n", len, string, ptr); + + for (i = 0; i < len;) { + printk("%02x",*(c++)); + i++; + if (!(i & 15)) { + printk("\n"); + } else if (!(i&1)) { + printk(" "); + } + } + + if(len & 15) { + printk("\n"); + } +} + +static void +kibnal_rx_callback (IB_WORK_COMPLETION *wc) +{ + kib_rx_t *rx = (kib_rx_t *)kibnal_wreqid2ptr(wc->WorkReqId); + kib_msg_t *msg = rx->rx_msg; + kib_conn_t *conn = rx->rx_conn; + int nob = wc->Length; + const int base_nob = offsetof(kib_msg_t, ibm_u); + int credits; + int flipped; + unsigned long flags; + __u32 i; +#if IBNAL_CKSUM + __u32 msg_cksum; + __u32 computed_cksum; +#endif + + /* we set the QP to erroring after we've finished disconnecting, + * maybe we should do so sooner. 
*/ + KIB_ASSERT_CONN_STATE_RANGE(conn, IBNAL_CONN_ESTABLISHED, + IBNAL_CONN_DISCONNECTED); + + CDEBUG(D_NET, "rx %p conn %p\n", rx, conn); + LASSERT (rx->rx_posted); + rx->rx_posted = 0; + mb(); + + /* receives complete with error in any case after we've started + * disconnecting */ + if (conn->ibc_state > IBNAL_CONN_ESTABLISHED) + goto failed; + + if (wc->Status != WRStatusSuccess) { + CERROR("Rx from "LPX64" failed: %d\n", + conn->ibc_peer->ibp_nid, wc->Status); + goto failed; + } + + if (nob < base_nob) { + CERROR ("Short rx from "LPX64": %d < expected %d\n", + conn->ibc_peer->ibp_nid, nob, base_nob); + goto failed; + } + + hexdump("rx", rx->rx_msg, sizeof(kib_msg_t)); + + /* Receiver does any byte flipping if necessary... */ + + if (msg->ibm_magic == IBNAL_MSG_MAGIC) { + flipped = 0; + } else { + if (msg->ibm_magic != __swab32(IBNAL_MSG_MAGIC)) { + CERROR ("Unrecognised magic: %08x from "LPX64"\n", + msg->ibm_magic, conn->ibc_peer->ibp_nid); + goto failed; + } + flipped = 1; + __swab16s (&msg->ibm_version); + LASSERT (sizeof(msg->ibm_type) == 1); + LASSERT (sizeof(msg->ibm_credits) == 1); + } + + if (msg->ibm_version != IBNAL_MSG_VERSION) { + CERROR ("Incompatible msg version %d (%d expected)\n", + msg->ibm_version, IBNAL_MSG_VERSION); + goto failed; + } + +#if IBNAL_CKSUM + if (nob != msg->ibm_nob) { + CERROR ("Unexpected # bytes %d (%d expected)\n", nob, msg->ibm_nob); + goto failed; + } + + msg_cksum = le32_to_cpu(msg->ibm_cksum); + msg->ibm_cksum = 0; + computed_cksum = kibnal_cksum (msg, nob); + + if (msg_cksum != computed_cksum) { + CERROR ("Checksum failure %d: (%d expected)\n", + computed_cksum, msg_cksum); +// goto failed; + } + CDEBUG(D_NET, "cksum %x, nob %d\n", computed_cksum, nob); +#endif + + /* Have I received credits that will let me send? 
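credits are piggy-backed on every message; bank them and give kibnal_check_sends() a chance to drain the send queue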
*/ + credits = msg->ibm_credits; + if (credits != 0) { + spin_lock_irqsave(&conn->ibc_lock, flags); + conn->ibc_credits += credits; + spin_unlock_irqrestore(&conn->ibc_lock, flags); + + kibnal_check_sends(conn); + } + + switch (msg->ibm_type) { + case IBNAL_MSG_NOOP: + kibnal_post_rx (rx, 1); + return; + + case IBNAL_MSG_IMMEDIATE: + if (nob < base_nob + sizeof (kib_immediate_msg_t)) { + CERROR ("Short IMMEDIATE from "LPX64": %d\n", + conn->ibc_peer->ibp_nid, nob); + goto failed; + } + break; + + case IBNAL_MSG_PUT_RDMA: + case IBNAL_MSG_GET_RDMA: + if (nob < base_nob + sizeof (kib_rdma_msg_t)) { + CERROR ("Short RDMA msg from "LPX64": %d\n", + conn->ibc_peer->ibp_nid, nob); + goto failed; + } + if (flipped) + __swab32(msg->ibm_u.rdma.ibrm_num_descs); + + CDEBUG(D_NET, "%d RDMA: cookie "LPX64":\n", + msg->ibm_type, msg->ibm_u.rdma.ibrm_cookie); + + if ((msg->ibm_u.rdma.ibrm_num_descs > PTL_MD_MAX_IOV) || + (kib_rdma_msg_len(msg->ibm_u.rdma.ibrm_num_descs) > + min(nob, IBNAL_MSG_SIZE))) { + CERROR ("num_descs %d too large\n", + msg->ibm_u.rdma.ibrm_num_descs); + goto failed; + } + + for(i = 0; i < msg->ibm_u.rdma.ibrm_num_descs; i++) { + kib_rdma_desc_t *desc = &msg->ibm_u.rdma.ibrm_desc[i]; + + if (flipped) { + __swab32(desc->rd_key); + __swab32(desc->rd_nob); + __swab64(desc->rd_addr); + } + + CDEBUG(D_NET, " key %x, " "addr "LPX64", nob %u\n", + desc->rd_key, desc->rd_addr, desc->rd_nob); + } + break; + + case IBNAL_MSG_PUT_DONE: + case IBNAL_MSG_GET_DONE: + if (nob < base_nob + sizeof (kib_completion_msg_t)) { + CERROR ("Short COMPLETION msg from "LPX64": %d\n", + conn->ibc_peer->ibp_nid, nob); + goto failed; + } + if (flipped) + __swab32s(&msg->ibm_u.completion.ibcm_status); + + CDEBUG(D_NET, "%d DONE: cookie "LPX64", status %d\n", + msg->ibm_type, msg->ibm_u.completion.ibcm_cookie, + msg->ibm_u.completion.ibcm_status); + + kibnal_complete_passive_rdma (conn, + msg->ibm_u.completion.ibcm_cookie, + msg->ibm_u.completion.ibcm_status); + kibnal_post_rx (rx, 1); + return; + + default: + CERROR ("Can't parse type from "LPX64": %d\n", + conn->ibc_peer->ibp_nid, msg->ibm_type); + goto failed; + } + + /* schedule for kibnal_rx() in thread context */ + spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags); + + list_add_tail (&rx->rx_list, &kibnal_data.kib_sched_rxq); + wake_up (&kibnal_data.kib_sched_waitq); + + spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags); + return; + + failed: + CDEBUG(D_NET, "rx %p conn %p\n", rx, conn); + kibnal_close_conn(conn, -ECONNABORTED); + + /* Don't re-post rx & drop its ref on conn */ + kibnal_put_conn(conn); +} + +void +kibnal_rx (kib_rx_t *rx) +{ + kib_msg_t *msg = rx->rx_msg; + + /* Clear flag so I can detect if I've sent an RDMA completion */ + rx->rx_rdma = 0; + + switch (msg->ibm_type) { + case IBNAL_MSG_GET_RDMA: + lib_parse(&kibnal_lib, &msg->ibm_u.rdma.ibrm_hdr, rx); + /* If the incoming get was matched, I'll have initiated the + * RDMA and the completion message... */ + if (rx->rx_rdma) + break; + + /* Otherwise, I'll send a failed completion now to prevent + * the peer's GET blocking for the full timeout. 
*/ + CERROR ("Completing unmatched RDMA GET from "LPX64"\n", + rx->rx_conn->ibc_peer->ibp_nid); + kibnal_start_active_rdma (IBNAL_MSG_GET_DONE, -EIO, + rx, NULL, 0, NULL, NULL, 0, 0); + break; + + case IBNAL_MSG_PUT_RDMA: + lib_parse(&kibnal_lib, &msg->ibm_u.rdma.ibrm_hdr, rx); + if (rx->rx_rdma) + break; + /* This is most unusual, since even if lib_parse() didn't + * match anything, it should have asked us to read (and + * discard) the payload. The portals header must be + * inconsistent with this message type, so it's the + * sender's fault for sending garbage and she can time + * herself out... */ + CERROR ("Uncompleted RMDA PUT from "LPX64"\n", + rx->rx_conn->ibc_peer->ibp_nid); + break; + + case IBNAL_MSG_IMMEDIATE: + lib_parse(&kibnal_lib, &msg->ibm_u.immediate.ibim_hdr, rx); + LASSERT (!rx->rx_rdma); + break; + + default: + LBUG(); + break; + } + + kibnal_post_rx (rx, 1); +} + +static struct page * +kibnal_kvaddr_to_page (unsigned long vaddr) +{ + struct page *page; + + if (vaddr >= VMALLOC_START && + vaddr < VMALLOC_END) + page = vmalloc_to_page ((void *)vaddr); +#if CONFIG_HIGHMEM + else if (vaddr >= PKMAP_BASE && + vaddr < (PKMAP_BASE + LAST_PKMAP * PAGE_SIZE)) + page = vmalloc_to_page ((void *)vaddr); + /* in 2.4 ^ just walks the page tables */ +#endif + else + page = virt_to_page (vaddr); + + if (!VALID_PAGE (page)) + page = NULL; + + return page; +} + +static void +kibnal_fill_ibrm(kib_tx_t *tx, struct page *page, unsigned long page_offset, + unsigned long len, int active) +{ + kib_rdma_msg_t *ibrm = &tx->tx_msg->ibm_u.rdma; + kib_rdma_desc_t *desc; + + LASSERTF(ibrm->ibrm_num_descs < PTL_MD_MAX_IOV, "%u\n", + ibrm->ibrm_num_descs); + + desc = &ibrm->ibrm_desc[ibrm->ibrm_num_descs]; + if (active) + desc->rd_key = kibnal_data.kib_md.md_lkey; + else + desc->rd_key = kibnal_data.kib_md.md_rkey; + desc->rd_nob = len; /*PAGE_SIZE - kiov->kiov_offset; */ + desc->rd_addr = kibnal_page2phys(page) + page_offset + + kibnal_data.kib_md.md_addr; + + ibrm->ibrm_num_descs++; +} + +static int +kibnal_map_rdma_iov(kib_tx_t *tx, unsigned long vaddr, int nob, int active) +{ + struct page *page; + int page_offset, len; + + while (nob > 0) { + page = kibnal_kvaddr_to_page(vaddr); + if (page == NULL) + return -EFAULT; + + page_offset = vaddr & (PAGE_SIZE - 1); + len = min(nob, (int)PAGE_SIZE - page_offset); + + kibnal_fill_ibrm(tx, page, page_offset, len, active); + nob -= len; + vaddr += len; + } + return 0; +} + +static int +kibnal_map_iov (kib_tx_t *tx, IB_ACCESS_CONTROL access, + int niov, struct iovec *iov, int offset, int nob, int active) + +{ + void *vaddr; + FSTATUS frc; + + LASSERT (nob > 0); + LASSERT (niov > 0); + LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED); + + while (offset >= iov->iov_len) { + offset -= iov->iov_len; + niov--; + iov++; + LASSERT (niov > 0); + } + + if (nob > iov->iov_len - offset) { + CERROR ("Can't map multiple vaddr fragments\n"); + return (-EMSGSIZE); + } + + /* our large contiguous iov could be backed by multiple physical + * pages. 
*/ + if (kibnal_whole_mem()) { + int rc; + tx->tx_msg->ibm_u.rdma.ibrm_num_descs = 0; + rc = kibnal_map_rdma_iov(tx, (unsigned long)iov->iov_base + + offset, nob, active); + if (rc != 0) { + CERROR ("Can't map iov: %d\n", rc); + return rc; + } + return 0; + } + + vaddr = (void *)(((unsigned long)iov->iov_base) + offset); + tx->tx_md.md_addr = (__u64)((unsigned long)vaddr); + + frc = iibt_register_memory(kibnal_data.kib_hca, vaddr, nob, + kibnal_data.kib_pd, access, + &tx->tx_md.md_handle, &tx->tx_md.md_lkey, + &tx->tx_md.md_rkey); + if (frc != 0) { + CERROR ("Can't map vaddr %p: %d\n", vaddr, frc); + return -EINVAL; + } + + tx->tx_mapped = KIB_TX_MAPPED; + return (0); +} + +static int +kibnal_map_kiov (kib_tx_t *tx, IB_ACCESS_CONTROL access, + int nkiov, ptl_kiov_t *kiov, + int offset, int nob, int active) +{ + __u64 *phys = NULL; + int page_offset; + int nphys; + int resid; + int phys_size = 0; + FSTATUS frc; + int i, rc = 0; + + CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob); + + LASSERT (nob > 0); + LASSERT (nkiov > 0); + LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED); + + while (offset >= kiov->kiov_len) { + offset -= kiov->kiov_len; + nkiov--; + kiov++; + LASSERT (nkiov > 0); + } + + page_offset = kiov->kiov_offset + offset; + nphys = 1; + + if (!kibnal_whole_mem()) { + phys_size = nkiov * sizeof (*phys); + PORTAL_ALLOC(phys, phys_size); + if (phys == NULL) { + CERROR ("Can't allocate tmp phys\n"); + return (-ENOMEM); + } + + phys[0] = kibnal_page2phys(kiov->kiov_page); + } else { + tx->tx_msg->ibm_u.rdma.ibrm_num_descs = 0; + kibnal_fill_ibrm(tx, kiov->kiov_page, kiov->kiov_offset, + kiov->kiov_len, active); + } + + resid = nob - (kiov->kiov_len - offset); + + while (resid > 0) { + kiov++; + nkiov--; + LASSERT (nkiov > 0); + + if (kiov->kiov_offset != 0 || + ((resid > PAGE_SIZE) && + kiov->kiov_len < PAGE_SIZE)) { + /* Can't have gaps */ + CERROR ("Can't make payload contiguous in I/O VM:" + "page %d, offset %d, len %d \n", nphys, + kiov->kiov_offset, kiov->kiov_len); + + for (i = -nphys; i < nkiov; i++) + { + CERROR("kiov[%d] %p +%d for %d\n", + i, kiov[i].kiov_page, kiov[i].kiov_offset, kiov[i].kiov_len); + } + + rc = -EINVAL; + goto out; + } + + if (nphys == PTL_MD_MAX_IOV) { + CERROR ("payload too big (%d)\n", nphys); + rc = -EMSGSIZE; + goto out; + } + + if (!kibnal_whole_mem()) { + LASSERT (nphys * sizeof (*phys) < phys_size); + phys[nphys] = kibnal_page2phys(kiov->kiov_page); + } else { + if (kib_rdma_msg_len(nphys) > IBNAL_MSG_SIZE) { + CERROR ("payload too big (%d)\n", nphys); + rc = -EMSGSIZE; + goto out; + } + kibnal_fill_ibrm(tx, kiov->kiov_page, + kiov->kiov_offset, kiov->kiov_len, + active); + } + + nphys ++; + resid -= PAGE_SIZE; + } + + if (kibnal_whole_mem()) + goto out; + +#if 0 + CWARN ("nphys %d, nob %d, page_offset %d\n", nphys, nob, page_offset); + for (i = 0; i < nphys; i++) + CWARN (" [%d] "LPX64"\n", i, phys[i]); +#endif + +#if IBNAL_FMR +#error "iibnal hasn't learned about FMR yet" + rc = ib_fmr_register_physical (kibnal_data.kib_fmr_pool, + phys, nphys, + &tx->tx_md.md_addr, + page_offset, + &tx->tx_md.md_handle.fmr, + &tx->tx_md.md_lkey, + &tx->tx_md.md_rkey); +#else + frc = iibt_register_physical_memory(kibnal_data.kib_hca, + IBNAL_RDMA_BASE, + phys, nphys, + 0, /* offset */ + kibnal_data.kib_pd, + access, + &tx->tx_md.md_handle, + &tx->tx_md.md_addr, + &tx->tx_md.md_lkey, + &tx->tx_md.md_rkey); +#endif + if (frc == FSUCCESS) { + CDEBUG(D_NET, "Mapped %d pages %d bytes @ offset %d: lkey %x, rkey %x\n", + nphys, nob, page_offset, 
tx->tx_md.md_lkey, tx->tx_md.md_rkey); +#if IBNAL_FMR + tx->tx_mapped = KIB_TX_MAPPED_FMR; +#else + tx->tx_mapped = KIB_TX_MAPPED; +#endif + } else { + CERROR ("Can't map phys: %d\n", rc); + rc = -EFAULT; + } + + out: + if (phys != NULL) + PORTAL_FREE(phys, phys_size); + return (rc); +} + +static kib_conn_t * +kibnal_find_conn_locked (kib_peer_t *peer) +{ + struct list_head *tmp; + + /* just return the first connection */ + list_for_each (tmp, &peer->ibp_conns) { + return (list_entry(tmp, kib_conn_t, ibc_list)); + } + + return (NULL); +} + +void +kibnal_check_sends (kib_conn_t *conn) +{ + unsigned long flags; + kib_tx_t *tx; + int rc; + int i; + int done; + int nwork; + ENTRY; + + spin_lock_irqsave (&conn->ibc_lock, flags); + + LASSERT (conn->ibc_nsends_posted <= IBNAL_MSG_QUEUE_SIZE); + + if (list_empty(&conn->ibc_tx_queue) && + conn->ibc_outstanding_credits >= IBNAL_CREDIT_HIGHWATER) { + spin_unlock_irqrestore(&conn->ibc_lock, flags); + + tx = kibnal_get_idle_tx(0); /* don't block */ + if (tx != NULL) + kibnal_init_tx_msg(tx, IBNAL_MSG_NOOP, 0); + + spin_lock_irqsave(&conn->ibc_lock, flags); + + if (tx != NULL) { + atomic_inc(&conn->ibc_refcount); + kibnal_queue_tx_locked(tx, conn); + } + } + + while (!list_empty (&conn->ibc_tx_queue)) { + tx = list_entry (conn->ibc_tx_queue.next, kib_tx_t, tx_list); + + /* We rely on this for QP sizing */ + LASSERT (tx->tx_nsp > 0 && tx->tx_nsp <= IBNAL_TX_MAX_SG); + + LASSERT (conn->ibc_outstanding_credits >= 0); + LASSERT (conn->ibc_outstanding_credits <= IBNAL_MSG_QUEUE_SIZE); + LASSERT (conn->ibc_credits >= 0); + LASSERT (conn->ibc_credits <= IBNAL_MSG_QUEUE_SIZE); + + /* Not on ibc_rdma_queue */ + LASSERT (!tx->tx_passive_rdma_wait); + + if (conn->ibc_nsends_posted == IBNAL_MSG_QUEUE_SIZE) + GOTO(out, 0); + + if (conn->ibc_credits == 0) /* no credits */ + GOTO(out, 1); + + if (conn->ibc_credits == 1 && /* last credit reserved for */ + conn->ibc_outstanding_credits == 0) /* giving back credits */ + GOTO(out, 2); + + list_del (&tx->tx_list); + + if (tx->tx_msg->ibm_type == IBNAL_MSG_NOOP && + (!list_empty(&conn->ibc_tx_queue) || + conn->ibc_outstanding_credits < IBNAL_CREDIT_HIGHWATER)) { + /* redundant NOOP */ + spin_unlock_irqrestore(&conn->ibc_lock, flags); + kibnal_tx_done(tx); + spin_lock_irqsave(&conn->ibc_lock, flags); + continue; + } + + tx->tx_msg->ibm_credits = conn->ibc_outstanding_credits; + conn->ibc_outstanding_credits = 0; + + conn->ibc_nsends_posted++; + conn->ibc_credits--; + + /* we only get a tx completion for the final rdma op */ + tx->tx_sending = min(tx->tx_nsp, 2); + tx->tx_passive_rdma_wait = tx->tx_passive_rdma; + list_add (&tx->tx_list, &conn->ibc_active_txs); +#if IBNAL_CKSUM + tx->tx_msg->ibm_cksum = 0; + tx->tx_msg->ibm_cksum = kibnal_cksum(tx->tx_msg, tx->tx_msg->ibm_nob); + CDEBUG(D_NET, "cksum %x, nob %d\n", tx->tx_msg->ibm_cksum, tx->tx_msg->ibm_nob); +#endif + spin_unlock_irqrestore (&conn->ibc_lock, flags); + + /* NB the gap between removing tx from the queue and sending it + * allows message re-ordering to occur */ + + LASSERT (tx->tx_nsp > 0); + + rc = -ECONNABORTED; + nwork = 0; + if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) { + tx->tx_status = 0; + /* Driver only accepts 1 item at a time */ + for (i = 0; i < tx->tx_nsp; i++) { + hexdump("tx", tx->tx_msg, sizeof(kib_msg_t)); + rc = iibt_postsend(conn->ibc_qp, + &tx->tx_wrq[i]); + if (rc != 0) + break; + if (wrq_signals_completion(&tx->tx_wrq[i])) + nwork++; + CDEBUG(D_NET, "posted tx wrq %p\n", + &tx->tx_wrq[i]); + } + } + + spin_lock_irqsave (&conn->ibc_lock, 
flags); + if (rc != 0) { + /* NB credits are transferred in the actual + * message, which can only be the last work item */ + conn->ibc_outstanding_credits += tx->tx_msg->ibm_credits; + conn->ibc_credits++; + conn->ibc_nsends_posted--; + + tx->tx_status = rc; + tx->tx_passive_rdma_wait = 0; + tx->tx_sending -= tx->tx_nsp - nwork; + + done = (tx->tx_sending == 0); + if (done) + list_del (&tx->tx_list); + + spin_unlock_irqrestore (&conn->ibc_lock, flags); + + if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) + CERROR ("Error %d posting transmit to "LPX64"\n", + rc, conn->ibc_peer->ibp_nid); + else + CDEBUG (D_NET, "Error %d posting transmit to " + LPX64"\n", rc, conn->ibc_peer->ibp_nid); + + kibnal_close_conn (conn, rc); + + if (done) + kibnal_tx_done (tx); + return; + } + + } + + EXIT; +out: + spin_unlock_irqrestore (&conn->ibc_lock, flags); +} + +static void +kibnal_tx_callback (IB_WORK_COMPLETION *wc) +{ + kib_tx_t *tx = (kib_tx_t *)kibnal_wreqid2ptr(wc->WorkReqId); + kib_conn_t *conn; + unsigned long flags; + int idle; + + conn = tx->tx_conn; + LASSERT (conn != NULL); + LASSERT (tx->tx_sending != 0); + + spin_lock_irqsave(&conn->ibc_lock, flags); + + CDEBUG(D_NET, "conn %p tx %p [%d/%d]: %d\n", conn, tx, + tx->tx_sending, tx->tx_nsp, wc->Status); + + /* I could be racing with rdma completion. Whoever makes 'tx' idle + * gets to free it, which also drops its ref on 'conn'. If it's + * not me, then I take an extra ref on conn so it can't disappear + * under me. */ + + tx->tx_sending--; + idle = (tx->tx_sending == 0) && /* This is the final callback */ + (!tx->tx_passive_rdma_wait); /* Not waiting for RDMA completion */ + if (idle) + list_del(&tx->tx_list); + + CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", + conn, conn->ibc_state, conn->ibc_peer->ibp_nid, + atomic_read (&conn->ibc_refcount)); + atomic_inc (&conn->ibc_refcount); + + if (tx->tx_sending == 0) + conn->ibc_nsends_posted--; + + if (wc->Status != WRStatusSuccess && + tx->tx_status == 0) + tx->tx_status = -ECONNABORTED; + + spin_unlock_irqrestore(&conn->ibc_lock, flags); + + if (idle) + kibnal_tx_done (tx); + + if (wc->Status != WRStatusSuccess) { + CERROR ("Tx completion to "LPX64" failed: %d\n", + conn->ibc_peer->ibp_nid, wc->Status); + kibnal_close_conn (conn, -ENETDOWN); + } else { + /* can I shovel some more sends out the door? */ + kibnal_check_sends(conn); + } + + kibnal_put_conn (conn); +} + +void +kibnal_ca_async_callback (void *ca_arg, IB_EVENT_RECORD *ev) +{ + /* XXX flesh out. 
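The queue-draining loop in kibnal_check_sends() above is a credit-based flow-control scheme: each send consumes one of ibc_credits, credits owed to the peer ride back in ibm_credits, the last credit is reserved for a message that can return credits, and a NOOP is generated only when the credits owed reach IBNAL_CREDIT_HIGHWATER with nothing else queued. The stand-alone sketch below condenses just that decision logic; the struct, constants and helper names are invented for illustration and are not part of the patch.

#include <stdio.h>

#define QUEUE_SIZE       8    /* stands in for IBNAL_MSG_QUEUE_SIZE */
#define CREDIT_HIGHWATER 7    /* stands in for IBNAL_CREDIT_HIGHWATER */

struct sketch_conn {
        int nsends_posted;        /* sends currently outstanding on the QP */
        int credits;              /* sends the peer has given me room for */
        int outstanding_credits;  /* receive buffers I owe back to the peer */
        int queued;               /* messages waiting in the tx queue */
};

/* May the next queued message be posted right now? */
static int can_post(const struct sketch_conn *c)
{
        if (c->queued == 0)
                return 0;
        if (c->nsends_posted == QUEUE_SIZE)     /* send queue already full */
                return 0;
        if (c->credits == 0)                    /* peer has no buffer for me */
                return 0;
        if (c->credits == 1 && c->outstanding_credits == 0)
                return 0;                       /* keep the last credit for a
                                                 * message that returns credits */
        return 1;
}

/* With nothing queued, a NOOP is worth sending only to return credits
 * once the amount owed reaches the high-water mark. */
static int should_send_noop(const struct sketch_conn *c)
{
        return c->queued == 0 && c->outstanding_credits >= CREDIT_HIGHWATER;
}

int main(void)
{
        struct sketch_conn c = { 0, 1, 0, 3 };

        printf("post now: %d, noop: %d\n", can_post(&c), should_send_noop(&c));
        return 0;
}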
this seems largely for async errors */ + CERROR("type: %d code: %u\n", ev->EventType, ev->EventCode); +} + +void +kibnal_ca_callback (void *ca_arg, void *cq_arg) +{ + IB_HANDLE cq = *(IB_HANDLE *)cq_arg; + IB_HANDLE ca = *(IB_HANDLE *)ca_arg; + IB_WORK_COMPLETION wc; + int armed = 0; + + CDEBUG(D_NET, "ca %p cq %p\n", ca, cq); + + for(;;) { + while (iibt_cq_poll(cq, &wc) == FSUCCESS) { + if (kibnal_wreqid_is_rx(wc.WorkReqId)) + kibnal_rx_callback(&wc); + else + kibnal_tx_callback(&wc); + } + if (armed) + return; + if (iibt_cq_rearm(cq, CQEventSelNextWC) != FSUCCESS) { + CERROR("rearm failed?\n"); + return; + } + armed = 1; + } +} + +void +kibnal_init_tx_msg (kib_tx_t *tx, int type, int body_nob) +{ + IB_LOCAL_DATASEGMENT *gl = &tx->tx_gl[tx->tx_nsp]; + IB_WORK_REQ *wrq = &tx->tx_wrq[tx->tx_nsp]; + int fence; + int nob = offsetof (kib_msg_t, ibm_u) + body_nob; + + LASSERT (tx->tx_nsp >= 0 && + tx->tx_nsp < sizeof(tx->tx_wrq)/sizeof(tx->tx_wrq[0])); + LASSERT (nob <= IBNAL_MSG_SIZE); + + tx->tx_msg->ibm_magic = IBNAL_MSG_MAGIC; + tx->tx_msg->ibm_version = IBNAL_MSG_VERSION; + tx->tx_msg->ibm_type = type; +#if IBNAL_CKSUM + tx->tx_msg->ibm_nob = nob; +#endif + /* Fence the message if it's bundled with an RDMA read */ + fence = (tx->tx_nsp > 0) && + (type == IBNAL_MSG_PUT_DONE); + + *gl = (IB_LOCAL_DATASEGMENT) { + .Address = tx->tx_vaddr, + .Length = IBNAL_MSG_SIZE, + .Lkey = kibnal_lkey(kibnal_data.kib_tx_pages), + }; + + wrq->WorkReqId = kibnal_ptr2wreqid(tx, 0); + wrq->Operation = WROpSend; + wrq->DSList = gl; + wrq->DSListDepth = 1; + wrq->MessageLen = nob; + wrq->Req.SendRC.ImmediateData = 0; + wrq->Req.SendRC.Options.s.SolicitedEvent = 1; + wrq->Req.SendRC.Options.s.SignaledCompletion = 1; + wrq->Req.SendRC.Options.s.ImmediateData = 0; + wrq->Req.SendRC.Options.s.Fence = fence; + + tx->tx_nsp++; +} + +static void +kibnal_queue_tx (kib_tx_t *tx, kib_conn_t *conn) +{ + unsigned long flags; + + spin_lock_irqsave(&conn->ibc_lock, flags); + + kibnal_queue_tx_locked (tx, conn); + + spin_unlock_irqrestore(&conn->ibc_lock, flags); + + kibnal_check_sends(conn); +} + +static void +kibnal_launch_tx (kib_tx_t *tx, ptl_nid_t nid) +{ + unsigned long flags; + kib_peer_t *peer; + kib_conn_t *conn; + rwlock_t *g_lock = &kibnal_data.kib_global_lock; + + /* If I get here, I've committed to send, so I complete the tx with + * failure on any problems */ + + LASSERT (tx->tx_conn == NULL); /* only set when assigned a conn */ + LASSERT (tx->tx_nsp > 0); /* work items have been set up */ + + read_lock (g_lock); + + peer = kibnal_find_peer_locked (nid); + if (peer == NULL) { + read_unlock (g_lock); + tx->tx_status = -EHOSTUNREACH; + kibnal_tx_done (tx); + return; + } + + conn = kibnal_find_conn_locked (peer); + if (conn != NULL) { + CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", + conn, conn->ibc_state, conn->ibc_peer->ibp_nid, + atomic_read (&conn->ibc_refcount)); + atomic_inc (&conn->ibc_refcount); /* 1 ref for the tx */ + read_unlock (g_lock); + + kibnal_queue_tx (tx, conn); + return; + } + + /* Making one or more connections; I'll need a write lock... 
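kibnal_ca_callback() above drains the completion queue, re-arms it, and then polls once more before returning; the extra pass closes the window where a completion arrives after the last empty poll but before the re-arm takes effect. The same pattern as a stand-alone sketch against a toy queue; cq_poll_one()/cq_rearm() are invented stand-ins for the iibt_cq_* calls.

#include <stdbool.h>
#include <stdio.h>

/* Toy completion queue: just a count of pending entries. */
struct cq { int pending; };

static bool cq_poll_one(struct cq *cq)
{
        if (cq->pending == 0)
                return false;
        cq->pending--;
        return true;
}

static bool cq_rearm(struct cq *cq) { (void)cq; return true; }

static void cq_callback(struct cq *cq)
{
        bool armed = false;

        for (;;) {
                while (cq_poll_one(cq))         /* drain whatever is queued */
                        printf("handled one completion\n");

                if (armed)                      /* drained again after re-arming */
                        return;

                if (!cq_rearm(cq)) {            /* ask for an interrupt on the next entry */
                        fprintf(stderr, "rearm failed\n");
                        return;
                }
                armed = true;                   /* then poll once more to catch entries
                                                 * that slipped in before the re-arm */
        }
}

int main(void)
{
        struct cq cq = { .pending = 3 };
        cq_callback(&cq);
        return 0;
}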
*/ + read_unlock (g_lock); + write_lock_irqsave (g_lock, flags); + + peer = kibnal_find_peer_locked (nid); + if (peer == NULL) { + write_unlock_irqrestore (g_lock, flags); + tx->tx_status = -EHOSTUNREACH; + kibnal_tx_done (tx); + return; + } + + conn = kibnal_find_conn_locked (peer); + if (conn != NULL) { + /* Connection exists; queue message on it */ + CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", + conn, conn->ibc_state, conn->ibc_peer->ibp_nid, + atomic_read (&conn->ibc_refcount)); + atomic_inc (&conn->ibc_refcount); /* 1 ref for the tx */ + write_unlock_irqrestore (g_lock, flags); + + kibnal_queue_tx (tx, conn); + return; + } + + if (peer->ibp_connecting == 0) { + if (!time_after_eq(jiffies, peer->ibp_reconnect_time)) { + write_unlock_irqrestore (g_lock, flags); + tx->tx_status = -EHOSTUNREACH; + kibnal_tx_done (tx); + return; + } + + peer->ibp_connecting = 1; + kib_peer_addref(peer); /* extra ref for connd */ + + spin_lock (&kibnal_data.kib_connd_lock); + + list_add_tail (&peer->ibp_connd_list, + &kibnal_data.kib_connd_peers); + wake_up (&kibnal_data.kib_connd_waitq); + + spin_unlock (&kibnal_data.kib_connd_lock); + } + + /* A connection is being established; queue the message... */ + list_add_tail (&tx->tx_list, &peer->ibp_tx_queue); + + write_unlock_irqrestore (g_lock, flags); +} + +static ptl_err_t +kibnal_start_passive_rdma (int type, ptl_nid_t nid, + lib_msg_t *libmsg, ptl_hdr_t *hdr) +{ + int nob = libmsg->md->length; + kib_tx_t *tx; + kib_msg_t *ibmsg; + int rc; + IB_ACCESS_CONTROL access = {0,}; + + LASSERT (type == IBNAL_MSG_PUT_RDMA || type == IBNAL_MSG_GET_RDMA); + LASSERT (nob > 0); + LASSERT (!in_interrupt()); /* Mapping could block */ + + access.s.MWBindable = 1; + access.s.LocalWrite = 1; + access.s.RdmaRead = 1; + access.s.RdmaWrite = 1; + + tx = kibnal_get_idle_tx (1); /* May block; caller is an app thread */ + LASSERT (tx != NULL); + + if ((libmsg->md->options & PTL_MD_KIOV) == 0) + rc = kibnal_map_iov (tx, access, + libmsg->md->md_niov, + libmsg->md->md_iov.iov, + 0, nob, 0); + else + rc = kibnal_map_kiov (tx, access, + libmsg->md->md_niov, + libmsg->md->md_iov.kiov, + 0, nob, 0); + + if (rc != 0) { + CERROR ("Can't map RDMA for "LPX64": %d\n", nid, rc); + goto failed; + } + + if (type == IBNAL_MSG_GET_RDMA) { + /* reply gets finalized when tx completes */ + tx->tx_libmsg[1] = lib_create_reply_msg(&kibnal_lib, + nid, libmsg); + if (tx->tx_libmsg[1] == NULL) { + CERROR ("Can't create reply for GET -> "LPX64"\n", + nid); + rc = -ENOMEM; + goto failed; + } + } + + tx->tx_passive_rdma = 1; + + ibmsg = tx->tx_msg; + + ibmsg->ibm_u.rdma.ibrm_hdr = *hdr; + ibmsg->ibm_u.rdma.ibrm_cookie = tx->tx_passive_rdma_cookie; + /* map_kiov alrady filled the rdma descs for the whole_mem case */ + if (!kibnal_whole_mem()) { + ibmsg->ibm_u.rdma.ibrm_desc[0].rd_key = tx->tx_md.md_rkey; + ibmsg->ibm_u.rdma.ibrm_desc[0].rd_addr = tx->tx_md.md_addr; + ibmsg->ibm_u.rdma.ibrm_desc[0].rd_nob = nob; + ibmsg->ibm_u.rdma.ibrm_num_descs = 1; + } + + kibnal_init_tx_msg (tx, type, + kib_rdma_msg_len(ibmsg->ibm_u.rdma.ibrm_num_descs)); + + CDEBUG(D_NET, "Passive: %p cookie "LPX64", key %x, addr " + LPX64", nob %d\n", + tx, tx->tx_passive_rdma_cookie, tx->tx_md.md_rkey, + tx->tx_md.md_addr, nob); + + /* libmsg gets finalized when tx completes. 
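kibnal_launch_tx() above looks up the peer and connection under a read lock first, and only when that fails re-takes the global lock for writing and repeats the lookup, since another thread may have created the connection while the lock was dropped. A user-space sketch of that optimistic-read, re-check-under-write-lock pattern using pthreads; the table and entry names are made up.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct entry { int key; struct entry *next; };

static pthread_rwlock_t table_lock = PTHREAD_RWLOCK_INITIALIZER;
static struct entry *table;                      /* one chain, for brevity */

static struct entry *lookup_locked(int key)
{
        struct entry *e;

        for (e = table; e != NULL; e = e->next)
                if (e->key == key)
                        return e;
        return NULL;
}

static struct entry *lookup_or_create(int key)
{
        struct entry *e;

        pthread_rwlock_rdlock(&table_lock);      /* cheap, shared fast path */
        e = lookup_locked(key);
        pthread_rwlock_unlock(&table_lock);
        if (e != NULL)
                return e;

        pthread_rwlock_wrlock(&table_lock);      /* exclusive slow path */
        e = lookup_locked(key);                  /* re-check: someone else may have
                                                  * added it while the lock was dropped */
        if (e == NULL) {
                e = malloc(sizeof(*e));
                e->key = key;
                e->next = table;
                table = e;
        }
        pthread_rwlock_unlock(&table_lock);
        return e;
}

int main(void)
{
        printf("%p %p\n", (void *)lookup_or_create(7), (void *)lookup_or_create(7));
        return 0;
}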
*/ + tx->tx_libmsg[0] = libmsg; + + kibnal_launch_tx(tx, nid); + return (PTL_OK); + + failed: + tx->tx_status = rc; + kibnal_tx_done (tx); + return (PTL_FAIL); +} + +void +kibnal_start_active_rdma (int type, int status, + kib_rx_t *rx, lib_msg_t *libmsg, + unsigned int niov, + struct iovec *iov, ptl_kiov_t *kiov, + size_t offset, size_t nob) +{ + kib_msg_t *rxmsg = rx->rx_msg; + kib_msg_t *txmsg; + kib_tx_t *tx; + IB_ACCESS_CONTROL access = {0,}; + IB_WR_OP rdma_op; + int rc; + __u32 i; + + CDEBUG(D_NET, "type %d, status %d, niov %d, offset %d, nob %d\n", + type, status, niov, offset, nob); + + /* Called by scheduler */ + LASSERT (!in_interrupt ()); + + /* Either all pages or all vaddrs */ + LASSERT (!(kiov != NULL && iov != NULL)); + + /* No data if we're completing with failure */ + LASSERT (status == 0 || nob == 0); + + LASSERT (type == IBNAL_MSG_GET_DONE || + type == IBNAL_MSG_PUT_DONE); + + /* Flag I'm completing the RDMA. Even if I fail to send the + * completion message, I will have tried my best so further + * attempts shouldn't be tried. */ + LASSERT (!rx->rx_rdma); + rx->rx_rdma = 1; + + if (type == IBNAL_MSG_GET_DONE) { + rdma_op = WROpRdmaWrite; + LASSERT (rxmsg->ibm_type == IBNAL_MSG_GET_RDMA); + } else { + access.s.LocalWrite = 1; + rdma_op = WROpRdmaRead; + LASSERT (rxmsg->ibm_type == IBNAL_MSG_PUT_RDMA); + } + + tx = kibnal_get_idle_tx (0); /* Mustn't block */ + if (tx == NULL) { + CERROR ("tx descs exhausted on RDMA from "LPX64 + " completing locally with failure\n", + rx->rx_conn->ibc_peer->ibp_nid); + lib_finalize (&kibnal_lib, NULL, libmsg, PTL_NO_SPACE); + return; + } + LASSERT (tx->tx_nsp == 0); + + if (nob == 0) + GOTO(init_tx, 0); + + /* We actually need to transfer some data (the transfer + * size could get truncated to zero when the incoming + * message is matched) */ + if (kiov != NULL) + rc = kibnal_map_kiov (tx, access, niov, kiov, offset, nob, 1); + else + rc = kibnal_map_iov (tx, access, niov, iov, offset, nob, 1); + + if (rc != 0) { + CERROR ("Can't map RDMA -> "LPX64": %d\n", + rx->rx_conn->ibc_peer->ibp_nid, rc); + /* We'll skip the RDMA and complete with failure. */ + status = rc; + nob = 0; + GOTO(init_tx, rc); + } + + if (!kibnal_whole_mem()) { + tx->tx_msg->ibm_u.rdma.ibrm_desc[0].rd_key = tx->tx_md.md_lkey; + tx->tx_msg->ibm_u.rdma.ibrm_desc[0].rd_addr = tx->tx_md.md_addr; + tx->tx_msg->ibm_u.rdma.ibrm_desc[0].rd_nob = nob; + tx->tx_msg->ibm_u.rdma.ibrm_num_descs = 1; + } + + /* XXX ugh. different page-sized hosts. */ + if (tx->tx_msg->ibm_u.rdma.ibrm_num_descs != + rxmsg->ibm_u.rdma.ibrm_num_descs) { + CERROR("tx descs (%u) != rx descs (%u)\n", + tx->tx_msg->ibm_u.rdma.ibrm_num_descs, + rxmsg->ibm_u.rdma.ibrm_num_descs); + /* We'll skip the RDMA and complete with failure. */ + status = rc; + nob = 0; + GOTO(init_tx, rc); + } + + /* map_kiov filled in the rdma descs which describe our side of the + * rdma transfer. 
*/ + /* ibrm_num_descs was verified in rx_callback */ + for(i = 0; i < rxmsg->ibm_u.rdma.ibrm_num_descs; i++) { + kib_rdma_desc_t *ldesc, *rdesc; /* local, remote */ + IB_LOCAL_DATASEGMENT *ds = &tx->tx_gl[i]; + IB_WORK_REQ *wrq = &tx->tx_wrq[i]; + + ldesc = &tx->tx_msg->ibm_u.rdma.ibrm_desc[i]; + rdesc = &rxmsg->ibm_u.rdma.ibrm_desc[i]; + + ds->Address = ldesc->rd_addr; + ds->Length = ldesc->rd_nob; + ds->Lkey = ldesc->rd_key; + + memset(wrq, 0, sizeof(*wrq)); + wrq->WorkReqId = kibnal_ptr2wreqid(tx, 0); + wrq->Operation = rdma_op; + wrq->DSList = ds; + wrq->DSListDepth = 1; + wrq->MessageLen = ds->Length; + wrq->Req.SendRC.ImmediateData = 0; + wrq->Req.SendRC.Options.s.SolicitedEvent = 0; + wrq->Req.SendRC.Options.s.SignaledCompletion = 0; + wrq->Req.SendRC.Options.s.ImmediateData = 0; + wrq->Req.SendRC.Options.s.Fence = 0; + wrq->Req.SendRC.RemoteDS.Address = rdesc->rd_addr; + wrq->Req.SendRC.RemoteDS.Rkey = rdesc->rd_key; + + /* only the last rdma post triggers tx completion */ + if (i == rxmsg->ibm_u.rdma.ibrm_num_descs - 1) + wrq->Req.SendRC.Options.s.SignaledCompletion = 1; + + tx->tx_nsp++; + } + +init_tx: + txmsg = tx->tx_msg; + + txmsg->ibm_u.completion.ibcm_cookie = rxmsg->ibm_u.rdma.ibrm_cookie; + txmsg->ibm_u.completion.ibcm_status = status; + + kibnal_init_tx_msg(tx, type, sizeof (kib_completion_msg_t)); + + if (status == 0 && nob != 0) { + LASSERT (tx->tx_nsp > 1); + /* RDMA: libmsg gets finalized when the tx completes. This + * is after the completion message has been sent, which in + * turn is after the RDMA has finished. */ + tx->tx_libmsg[0] = libmsg; + } else { + LASSERT (tx->tx_nsp == 1); + /* No RDMA: local completion happens now! */ + CDEBUG(D_WARNING,"No data: immediate completion\n"); + lib_finalize (&kibnal_lib, NULL, libmsg, + status == 0 ? PTL_OK : PTL_FAIL); + } + + /* +1 ref for this tx... */ + CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", + rx->rx_conn, rx->rx_conn->ibc_state, + rx->rx_conn->ibc_peer->ibp_nid, + atomic_read (&rx->rx_conn->ibc_refcount)); + atomic_inc (&rx->rx_conn->ibc_refcount); + /* ...and queue it up */ + kibnal_queue_tx(tx, rx->rx_conn); +} + +static ptl_err_t +kibnal_sendmsg(lib_nal_t *nal, + void *private, + lib_msg_t *libmsg, + ptl_hdr_t *hdr, + int type, + ptl_nid_t nid, + ptl_pid_t pid, + unsigned int payload_niov, + struct iovec *payload_iov, + ptl_kiov_t *payload_kiov, + size_t payload_offset, + size_t payload_nob) +{ + kib_msg_t *ibmsg; + kib_tx_t *tx; + int nob; + + /* NB 'private' is different depending on what we're sending.... */ + + CDEBUG(D_NET, "sending "LPSZ" bytes in %d frags to nid:"LPX64 + " pid %d\n", payload_nob, payload_niov, nid , pid); + + LASSERT (payload_nob == 0 || payload_niov > 0); + LASSERT (payload_niov <= PTL_MD_MAX_IOV); + + /* Thread context if we're sending payload */ + LASSERT (!in_interrupt() || payload_niov == 0); + /* payload is either all vaddrs or all pages */ + LASSERT (!(payload_kiov != NULL && payload_iov != NULL)); + + switch (type) { + default: + LBUG(); + return (PTL_FAIL); + + case PTL_MSG_REPLY: { + /* reply's 'private' is the incoming receive */ + kib_rx_t *rx = private; + + /* RDMA reply expected? */ + if (rx->rx_msg->ibm_type == IBNAL_MSG_GET_RDMA) { + kibnal_start_active_rdma(IBNAL_MSG_GET_DONE, 0, + rx, libmsg, payload_niov, + payload_iov, payload_kiov, + payload_offset, payload_nob); + return (PTL_OK); + } + + /* Incoming message consistent with immediate reply? 
*/ + if (rx->rx_msg->ibm_type != IBNAL_MSG_IMMEDIATE) { + CERROR ("REPLY to "LPX64" bad opbm type %d!!!\n", + nid, rx->rx_msg->ibm_type); + return (PTL_FAIL); + } + + /* Will it fit in a message? */ + nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]); + if (nob >= IBNAL_MSG_SIZE) { + CERROR("REPLY for "LPX64" too big (RDMA not requested): %d\n", + nid, payload_nob); + return (PTL_FAIL); + } + break; + } + + case PTL_MSG_GET: + /* might the REPLY message be big enough to need RDMA? */ + nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[libmsg->md->length]); + if (nob > IBNAL_MSG_SIZE) + return (kibnal_start_passive_rdma(IBNAL_MSG_GET_RDMA, + nid, libmsg, hdr)); + break; + + case PTL_MSG_ACK: + LASSERT (payload_nob == 0); + break; + + case PTL_MSG_PUT: + /* Is the payload big enough to need RDMA? */ + nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]); + if (nob > IBNAL_MSG_SIZE) + return (kibnal_start_passive_rdma(IBNAL_MSG_PUT_RDMA, + nid, libmsg, hdr)); + + break; + } + + tx = kibnal_get_idle_tx(!(type == PTL_MSG_ACK || + type == PTL_MSG_REPLY || + in_interrupt())); + if (tx == NULL) { + CERROR ("Can't send %d to "LPX64": tx descs exhausted%s\n", + type, nid, in_interrupt() ? " (intr)" : ""); + return (PTL_NO_SPACE); + } + + ibmsg = tx->tx_msg; + ibmsg->ibm_u.immediate.ibim_hdr = *hdr; + + if (payload_nob > 0) { + if (payload_kiov != NULL) + lib_copy_kiov2buf(ibmsg->ibm_u.immediate.ibim_payload, + payload_niov, payload_kiov, + payload_offset, payload_nob); + else + lib_copy_iov2buf(ibmsg->ibm_u.immediate.ibim_payload, + payload_niov, payload_iov, + payload_offset, payload_nob); + } + + kibnal_init_tx_msg (tx, IBNAL_MSG_IMMEDIATE, + offsetof(kib_immediate_msg_t, + ibim_payload[payload_nob])); + + /* libmsg gets finalized when tx completes */ + tx->tx_libmsg[0] = libmsg; + + kibnal_launch_tx(tx, nid); + return (PTL_OK); +} + +static ptl_err_t +kibnal_send (lib_nal_t *nal, void *private, lib_msg_t *cookie, + ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, + unsigned int payload_niov, struct iovec *payload_iov, + size_t payload_offset, size_t payload_len) +{ + return (kibnal_sendmsg(nal, private, cookie, + hdr, type, nid, pid, + payload_niov, payload_iov, NULL, + payload_offset, payload_len)); +} + +static ptl_err_t +kibnal_send_pages (lib_nal_t *nal, void *private, lib_msg_t *cookie, + ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, + unsigned int payload_niov, ptl_kiov_t *payload_kiov, + size_t payload_offset, size_t payload_len) +{ + return (kibnal_sendmsg(nal, private, cookie, + hdr, type, nid, pid, + payload_niov, NULL, payload_kiov, + payload_offset, payload_len)); +} + +static ptl_err_t +kibnal_recvmsg (lib_nal_t *nal, void *private, lib_msg_t *libmsg, + unsigned int niov, struct iovec *iov, ptl_kiov_t *kiov, + size_t offset, size_t mlen, size_t rlen) +{ + kib_rx_t *rx = private; + kib_msg_t *rxmsg = rx->rx_msg; + int msg_nob; + + LASSERT (mlen <= rlen); + LASSERT (!in_interrupt ()); + /* Either all pages or all vaddrs */ + LASSERT (!(kiov != NULL && iov != NULL)); + + switch (rxmsg->ibm_type) { + default: + LBUG(); + return (PTL_FAIL); + + case IBNAL_MSG_IMMEDIATE: + msg_nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[rlen]); + if (msg_nob > IBNAL_MSG_SIZE) { + CERROR ("Immediate message from "LPX64" too big: %d\n", + rxmsg->ibm_u.immediate.ibim_hdr.src_nid, rlen); + return (PTL_FAIL); + } + + if (kiov != NULL) + lib_copy_buf2kiov(niov, kiov, offset, + rxmsg->ibm_u.immediate.ibim_payload, + mlen); + else + 
lib_copy_buf2iov(niov, iov, offset, + rxmsg->ibm_u.immediate.ibim_payload, + mlen); + + lib_finalize (nal, NULL, libmsg, PTL_OK); + return (PTL_OK); + + case IBNAL_MSG_GET_RDMA: + /* We get called here just to discard any junk after the + * GET hdr. */ + LASSERT (libmsg == NULL); + lib_finalize (nal, NULL, libmsg, PTL_OK); + return (PTL_OK); + + case IBNAL_MSG_PUT_RDMA: + kibnal_start_active_rdma (IBNAL_MSG_PUT_DONE, 0, + rx, libmsg, + niov, iov, kiov, offset, mlen); + return (PTL_OK); + } +} + +static ptl_err_t +kibnal_recv (lib_nal_t *nal, void *private, lib_msg_t *msg, + unsigned int niov, struct iovec *iov, + size_t offset, size_t mlen, size_t rlen) +{ + return (kibnal_recvmsg (nal, private, msg, niov, iov, NULL, + offset, mlen, rlen)); +} + +static ptl_err_t +kibnal_recv_pages (lib_nal_t *nal, void *private, lib_msg_t *msg, + unsigned int niov, ptl_kiov_t *kiov, + size_t offset, size_t mlen, size_t rlen) +{ + return (kibnal_recvmsg (nal, private, msg, niov, NULL, kiov, + offset, mlen, rlen)); +} + +/***************************************************************************** + * the rest of this file concerns connection management. active connetions + * start with connect_peer, passive connections start with passive_callback. + * active disconnects start with conn_close, cm_callback starts passive + * disconnects and contains the guts of how the disconnect state machine + * progresses. + *****************************************************************************/ + +int +kibnal_thread_start (int (*fn)(void *arg), void *arg) +{ + long pid = kernel_thread (fn, arg, 0); + + if (pid < 0) + return ((int)pid); + + atomic_inc (&kibnal_data.kib_nthreads); + return (0); +} + +static void +kibnal_thread_fini (void) +{ + atomic_dec (&kibnal_data.kib_nthreads); +} + +/* this can be called by anyone at any time to close a connection. if + * the connection is still established it heads to the connd to start + * the disconnection in a safe context. It has no effect if called + * on a connection that is already disconnecting */ +void +kibnal_close_conn_locked (kib_conn_t *conn, int error) +{ + /* This just does the immmediate housekeeping, and schedules the + * connection for the connd to finish off. + * Caller holds kib_global_lock exclusively in irq context */ + kib_peer_t *peer = conn->ibc_peer; + + KIB_ASSERT_CONN_STATE_RANGE(conn, IBNAL_CONN_CONNECTING, + IBNAL_CONN_DISCONNECTED); + + if (conn->ibc_state > IBNAL_CONN_ESTABLISHED) + return; /* already disconnecting */ + + CDEBUG (error == 0 ? D_NET : D_ERROR, + "closing conn to "LPX64": error %d\n", peer->ibp_nid, error); + + if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) { + /* kib_connd_conns takes ibc_list's ref */ + list_del (&conn->ibc_list); + } else { + /* new ref for kib_connd_conns */ + CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", + conn, conn->ibc_state, conn->ibc_peer->ibp_nid, + atomic_read (&conn->ibc_refcount)); + atomic_inc (&conn->ibc_refcount); + } + + if (list_empty (&peer->ibp_conns) && + peer->ibp_persistence == 0) { + /* Non-persistent peer with no more conns... 
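kibnal_sendmsg() and kibnal_recvmsg() above choose between the immediate path and RDMA by testing whether the header plus payload still fits in one pre-posted IBNAL_MSG_SIZE buffer. A stand-alone illustration of that size test follows; the struct layout and the 4096-byte limit are simplified placeholders, not the real kib_msg_t.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define MSG_SIZE 4096                   /* stands in for IBNAL_MSG_SIZE */

struct imm_msg {                        /* simplified stand-in for kib_msg_t */
        uint64_t hdr[12];               /* header and bookkeeping fields */
        char     payload[];             /* flexible payload, like ibim_payload[] */
};

/* Header plus payload must fit in one pre-posted receive buffer; otherwise
 * the transfer is set up as an RDMA.  The patch spells this as
 * offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[nob]). */
static int fits_immediate(size_t payload_nob)
{
        return offsetof(struct imm_msg, payload) + payload_nob <= MSG_SIZE;
}

int main(void)
{
        printf("  100 bytes -> %s\n", fits_immediate(100)  ? "immediate" : "RDMA");
        printf(" 8000 bytes -> %s\n", fits_immediate(8000) ? "immediate" : "RDMA");
        return 0;
}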
*/ + kibnal_unlink_peer_locked (peer); + } + + conn->ibc_state = IBNAL_CONN_SEND_DREQ; + + spin_lock (&kibnal_data.kib_connd_lock); + + list_add_tail (&conn->ibc_list, &kibnal_data.kib_connd_conns); + wake_up (&kibnal_data.kib_connd_waitq); + + spin_unlock (&kibnal_data.kib_connd_lock); +} + +void +kibnal_close_conn (kib_conn_t *conn, int error) +{ + unsigned long flags; + + write_lock_irqsave (&kibnal_data.kib_global_lock, flags); + + kibnal_close_conn_locked (conn, error); + + write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); +} + +static void +kibnal_peer_connect_failed (kib_peer_t *peer, int active, int rc) +{ + LIST_HEAD (zombies); + kib_tx_t *tx; + unsigned long flags; + + LASSERT (rc != 0); + LASSERT (peer->ibp_reconnect_interval >= IBNAL_MIN_RECONNECT_INTERVAL); + + write_lock_irqsave (&kibnal_data.kib_global_lock, flags); + + LASSERT (peer->ibp_connecting != 0); + peer->ibp_connecting--; + + if (peer->ibp_connecting != 0) { + /* another connection attempt under way (loopback?)... */ + write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); + return; + } + + if (list_empty(&peer->ibp_conns)) { + /* Say when active connection can be re-attempted */ + peer->ibp_reconnect_time = jiffies + peer->ibp_reconnect_interval; + /* Increase reconnection interval */ + peer->ibp_reconnect_interval = MIN (peer->ibp_reconnect_interval * 2, + IBNAL_MAX_RECONNECT_INTERVAL); + + /* Take peer's blocked blocked transmits; I'll complete + * them with error */ + while (!list_empty (&peer->ibp_tx_queue)) { + tx = list_entry (peer->ibp_tx_queue.next, + kib_tx_t, tx_list); + + list_del (&tx->tx_list); + list_add_tail (&tx->tx_list, &zombies); + } + + if (kibnal_peer_active(peer) && + (peer->ibp_persistence == 0)) { + /* failed connection attempt on non-persistent peer */ + kibnal_unlink_peer_locked (peer); + } + } else { + /* Can't have blocked transmits if there are connections */ + LASSERT (list_empty(&peer->ibp_tx_queue)); + } + + write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); + + if (!list_empty (&zombies)) + CERROR ("Deleting messages for "LPX64": connection failed\n", + peer->ibp_nid); + + while (!list_empty (&zombies)) { + tx = list_entry (zombies.next, kib_tx_t, tx_list); + + list_del (&tx->tx_list); + /* complete now */ + tx->tx_status = -EHOSTUNREACH; + kibnal_tx_done (tx); + } +} + +static void +kibnal_connreq_done (kib_conn_t *conn, int active, int status) +{ + int state = conn->ibc_state; + kib_peer_t *peer = conn->ibc_peer; + kib_tx_t *tx; + unsigned long flags; + int i; + + /* passive connection has no connreq & vice versa */ + LASSERTF(!active == !(conn->ibc_connreq != NULL), + "%d %p\n", active, conn->ibc_connreq); + if (active) { + PORTAL_FREE (conn->ibc_connreq, sizeof (*conn->ibc_connreq)); + conn->ibc_connreq = NULL; + } + + write_lock_irqsave (&kibnal_data.kib_global_lock, flags); + + LASSERT (peer->ibp_connecting != 0); + + if (status == 0) { + /* connection established... */ + KIB_ASSERT_CONN_STATE(conn, IBNAL_CONN_CONNECTING); + conn->ibc_state = IBNAL_CONN_ESTABLISHED; + + if (!kibnal_peer_active(peer)) { + /* ...but peer deleted meantime */ + status = -ECONNABORTED; + } + } else { + KIB_ASSERT_CONN_STATE_RANGE(conn, IBNAL_CONN_INIT_QP, + IBNAL_CONN_CONNECTING); + } + + if (status == 0) { + /* Everything worked! 
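kibnal_peer_connect_failed() above implements a capped exponential backoff: ibp_reconnect_time is stamped with the earliest time kibnal_launch_tx() may try again, ibp_reconnect_interval doubles up to IBNAL_MAX_RECONNECT_INTERVAL, and a successful connection resets it to the minimum. The same arithmetic isolated into a small runnable sketch; the constants are made up, the real ones live in the header.

#include <stdio.h>

#define MIN_RECONNECT_INTERVAL  1       /* seconds; placeholder values */
#define MAX_RECONNECT_INTERVAL  60

struct peer {
        unsigned long reconnect_interval;  /* current backoff, seconds */
        unsigned long reconnect_time;      /* next allowed attempt, absolute */
};

static void connect_failed(struct peer *p, unsigned long now)
{
        p->reconnect_time = now + p->reconnect_interval;  /* no retry before this */
        p->reconnect_interval *= 2;                       /* wait longer next time... */
        if (p->reconnect_interval > MAX_RECONNECT_INTERVAL)
                p->reconnect_interval = MAX_RECONNECT_INTERVAL;  /* ...up to the cap */
}

static void connect_succeeded(struct peer *p)
{
        p->reconnect_interval = MIN_RECONNECT_INTERVAL;   /* reset for the next outage */
}

int main(void)
{
        struct peer p = { .reconnect_interval = MIN_RECONNECT_INTERVAL };
        unsigned long now = 0;

        for (int i = 0; i < 8; i++) {
                connect_failed(&p, now);
                printf("attempt %d: next try at t=%lu\n", i, p.reconnect_time);
                now = p.reconnect_time;
        }
        connect_succeeded(&p);
        return 0;
}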
*/ + + peer->ibp_connecting--; + + /* +1 ref for ibc_list; caller(== CM)'s ref remains until + * the IB_CM_IDLE callback */ + CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", + conn, conn->ibc_state, conn->ibc_peer->ibp_nid, + atomic_read (&conn->ibc_refcount)); + atomic_inc (&conn->ibc_refcount); + list_add (&conn->ibc_list, &peer->ibp_conns); + + /* reset reconnect interval for next attempt */ + peer->ibp_reconnect_interval = IBNAL_MIN_RECONNECT_INTERVAL; + + /* post blocked sends to the new connection */ + spin_lock (&conn->ibc_lock); + + while (!list_empty (&peer->ibp_tx_queue)) { + tx = list_entry (peer->ibp_tx_queue.next, + kib_tx_t, tx_list); + + list_del (&tx->tx_list); + + /* +1 ref for each tx */ + CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", + conn, conn->ibc_state, conn->ibc_peer->ibp_nid, + atomic_read (&conn->ibc_refcount)); + atomic_inc (&conn->ibc_refcount); + kibnal_queue_tx_locked (tx, conn); + } + + spin_unlock (&conn->ibc_lock); + + /* Nuke any dangling conns from a different peer instance... */ + kibnal_close_stale_conns_locked (conn->ibc_peer, + conn->ibc_incarnation); + + write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); + + /* queue up all the receives */ + for (i = 0; i < IBNAL_RX_MSGS; i++) { + /* +1 ref for rx desc */ + CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", + conn, conn->ibc_state, conn->ibc_peer->ibp_nid, + atomic_read (&conn->ibc_refcount)); + atomic_inc (&conn->ibc_refcount); + + CDEBUG(D_NET, "RX[%d] %p->%p - "LPX64"\n", + i, &conn->ibc_rxs[i], conn->ibc_rxs[i].rx_msg, + conn->ibc_rxs[i].rx_vaddr); + + kibnal_post_rx (&conn->ibc_rxs[i], 0); + } + + kibnal_check_sends (conn); + return; + } + + /* connection failed */ + if (state == IBNAL_CONN_CONNECTING) { + /* schedule for connd to close */ + kibnal_close_conn_locked (conn, status); + } else { + /* Don't have a CM comm_id; just wait for refs to drain */ + conn->ibc_state = IBNAL_CONN_DISCONNECTED; + } + + write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); + + kibnal_peer_connect_failed (conn->ibc_peer, active, status); + + /* If we didn't establish the connection we don't have to pass + * through the disconnect protocol before dropping the CM ref */ + if (state < IBNAL_CONN_CONNECTING) + kibnal_put_conn (conn); +} + +static int +kibnal_accept (kib_conn_t **connp, IB_HANDLE *cep, + ptl_nid_t nid, __u64 incarnation, int queue_depth) +{ + kib_conn_t *conn = kibnal_create_conn(); + kib_peer_t *peer; + kib_peer_t *peer2; + unsigned long flags; + + if (conn == NULL) + return (-ENOMEM); + + if (queue_depth != IBNAL_MSG_QUEUE_SIZE) { + CERROR("Can't accept "LPX64": bad queue depth %d (%d expected)\n", + nid, queue_depth, IBNAL_MSG_QUEUE_SIZE); + atomic_dec (&conn->ibc_refcount); + kibnal_destroy_conn(conn); + return (-EPROTO); + } + + /* assume 'nid' is a new peer */ + peer = kibnal_create_peer (nid); + if (peer == NULL) { + CDEBUG(D_NET, "--conn[%p] state %d -> "LPX64" (%d)\n", + conn, conn->ibc_state, conn->ibc_peer->ibp_nid, + atomic_read (&conn->ibc_refcount)); + atomic_dec (&conn->ibc_refcount); + kibnal_destroy_conn(conn); + return (-ENOMEM); + } + + write_lock_irqsave (&kibnal_data.kib_global_lock, flags); + + peer2 = kibnal_find_peer_locked(nid); + if (peer2 == NULL) { + /* peer table takes my ref on peer */ + list_add_tail (&peer->ibp_list, kibnal_nid2peerlist(nid)); + } else { + kib_peer_decref (peer); + peer = peer2; + } + + kib_peer_addref(peer); /* +1 ref for conn */ + peer->ibp_connecting++; + + write_unlock_irqrestore 
(&kibnal_data.kib_global_lock, flags); + + conn->ibc_peer = peer; + conn->ibc_state = IBNAL_CONN_CONNECTING; + /* conn->ibc_cep is set when cm_accept is called */ + conn->ibc_incarnation = incarnation; + conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE; + + *connp = conn; + return (0); +} + +static void kibnal_set_qp_state(IB_HANDLE *qp, IB_QP_STATE state) +{ + IB_QP_ATTRIBUTES_MODIFY modify_attr = {0,}; + FSTATUS frc; + + modify_attr.RequestState = state; + + frc = iibt_qp_modify(qp, &modify_attr, NULL); + if (frc != FSUCCESS) + CERROR("couldn't set qp state to %d, error %d\n", state, frc); +} + +static void kibnal_flush_pending(kib_conn_t *conn) +{ + LIST_HEAD (zombies); + struct list_head *tmp; + struct list_head *nxt; + kib_tx_t *tx; + unsigned long flags; + int done; + + /* NB we wait until the connection has closed before completing + * outstanding passive RDMAs so we can be sure the network can't + * touch the mapped memory any more. */ + KIB_ASSERT_CONN_STATE(conn, IBNAL_CONN_DISCONNECTED); + + /* set the QP to the error state so that we get flush callbacks + * on our posted receives which can then drop their conn refs */ + kibnal_set_qp_state(conn->ibc_qp, QPStateError); + + spin_lock_irqsave (&conn->ibc_lock, flags); + + /* grab passive RDMAs not waiting for the tx callback */ + list_for_each_safe (tmp, nxt, &conn->ibc_active_txs) { + tx = list_entry (tmp, kib_tx_t, tx_list); + + LASSERT (tx->tx_passive_rdma || + !tx->tx_passive_rdma_wait); + + LASSERT (tx->tx_passive_rdma_wait || + tx->tx_sending != 0); + + /* still waiting for tx callback? */ + if (!tx->tx_passive_rdma_wait) + continue; + + tx->tx_status = -ECONNABORTED; + tx->tx_passive_rdma_wait = 0; + done = (tx->tx_sending == 0); + + if (!done) + continue; + + list_del (&tx->tx_list); + list_add (&tx->tx_list, &zombies); + } + + /* grab all blocked transmits */ + list_for_each_safe (tmp, nxt, &conn->ibc_tx_queue) { + tx = list_entry (tmp, kib_tx_t, tx_list); + + list_del (&tx->tx_list); + list_add (&tx->tx_list, &zombies); + } + + spin_unlock_irqrestore (&conn->ibc_lock, flags); + + while (!list_empty(&zombies)) { + tx = list_entry (zombies.next, kib_tx_t, tx_list); + + list_del(&tx->tx_list); + kibnal_tx_done (tx); + } +} + +static void +kibnal_reject (IB_HANDLE cep, uint16_t reason) +{ + CM_REJECT_INFO *rej; + + PORTAL_ALLOC(rej, sizeof(*rej)); + if (rej == NULL) /* PORTAL_ALLOC() will CERROR on failure */ + return; + + rej->Reason = reason; + iibt_cm_reject(cep, rej); + PORTAL_FREE(rej, sizeof(*rej)); +} + +static FSTATUS +kibnal_qp_rts(IB_HANDLE qp_handle, __u32 qpn, __u8 resp_res, + IB_PATH_RECORD *path, __u8 init_depth, __u32 send_psn) +{ + IB_QP_ATTRIBUTES_MODIFY modify_attr; + FSTATUS frc; + ENTRY; + + modify_attr = (IB_QP_ATTRIBUTES_MODIFY) { + .RequestState = QPStateReadyToRecv, + .RecvPSN = IBNAL_STARTING_PSN, + .DestQPNumber = qpn, + .ResponderResources = resp_res, + .MinRnrTimer = UsecToRnrNakTimer(2000), /* 20 ms */ + .Attrs = (IB_QP_ATTR_RECVPSN | + IB_QP_ATTR_DESTQPNUMBER | + IB_QP_ATTR_RESPONDERRESOURCES | + IB_QP_ATTR_DESTAV | + IB_QP_ATTR_PATHMTU | + IB_QP_ATTR_MINRNRTIMER), + }; + GetAVFromPath(0, path, &modify_attr.PathMTU, NULL, + &modify_attr.DestAV); + + frc = iibt_qp_modify(qp_handle, &modify_attr, NULL); + if (frc != FSUCCESS) + RETURN(frc); + + modify_attr = (IB_QP_ATTRIBUTES_MODIFY) { + .RequestState = QPStateReadyToSend, + .FlowControl = TRUE, + .InitiatorDepth = init_depth, + .SendPSN = send_psn, + .LocalAckTimeout = path->PktLifeTime + 2, /* 2 or 1? 
*/ + .RetryCount = IBNAL_RETRY, + .RnrRetryCount = IBNAL_RNR_RETRY, + .Attrs = (IB_QP_ATTR_FLOWCONTROL | + IB_QP_ATTR_INITIATORDEPTH | + IB_QP_ATTR_SENDPSN | + IB_QP_ATTR_LOCALACKTIMEOUT | + IB_QP_ATTR_RETRYCOUNT | + IB_QP_ATTR_RNRRETRYCOUNT), + }; + + frc = iibt_qp_modify(qp_handle, &modify_attr, NULL); + RETURN(frc); +} + +static void +kibnal_connect_reply (IB_HANDLE cep, CM_CONN_INFO *info, void *arg) +{ + IB_CA_ATTRIBUTES *ca_attr = &kibnal_data.kib_hca_attrs; + kib_conn_t *conn = arg; + kib_wire_connreq_t *wcr; + CM_REPLY_INFO *rep = &info->Info.Reply; + uint16_t reason; + FSTATUS frc; + + wcr = (kib_wire_connreq_t *)info->Info.Reply.PrivateData; + + if (wcr->wcr_magic != cpu_to_le32(IBNAL_MSG_MAGIC)) { + CERROR ("Can't connect "LPX64": bad magic %08x\n", + conn->ibc_peer->ibp_nid, le32_to_cpu(wcr->wcr_magic)); + GOTO(reject, reason = RC_USER_REJ); + } + + if (wcr->wcr_version != cpu_to_le16(IBNAL_MSG_VERSION)) { + CERROR ("Can't connect "LPX64": bad version %d\n", + conn->ibc_peer->ibp_nid, le16_to_cpu(wcr->wcr_magic)); + GOTO(reject, reason = RC_USER_REJ); + } + + if (wcr->wcr_queue_depth != cpu_to_le16(IBNAL_MSG_QUEUE_SIZE)) { + CERROR ("Can't connect "LPX64": bad queue depth %d\n", + conn->ibc_peer->ibp_nid, + le16_to_cpu(wcr->wcr_queue_depth)); + GOTO(reject, reason = RC_USER_REJ); + } + + if (le64_to_cpu(wcr->wcr_nid) != conn->ibc_peer->ibp_nid) { + CERROR ("Unexpected NID "LPX64" from "LPX64"\n", + le64_to_cpu(wcr->wcr_nid), conn->ibc_peer->ibp_nid); + GOTO(reject, reason = RC_USER_REJ); + } + + CDEBUG(D_NET, "Connection %p -> "LPX64" REP_RECEIVED.\n", + conn, conn->ibc_peer->ibp_nid); + + conn->ibc_incarnation = le64_to_cpu(wcr->wcr_incarnation); + conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE; + + frc = kibnal_qp_rts(conn->ibc_qp, rep->QPN, + min_t(__u8, rep->ArbInitiatorDepth, + ca_attr->MaxQPResponderResources), + &conn->ibc_connreq->cr_path, + min_t(__u8, rep->ArbResponderResources, + ca_attr->MaxQPInitiatorDepth), + rep->StartingPSN); + if (frc != FSUCCESS) { + CERROR("Connection %p -> "LPX64" QP RTS/RTR failed: %d\n", + conn, conn->ibc_peer->ibp_nid, frc); + GOTO(reject, reason = RC_NO_QP); + } + + /* the callback arguments are ignored for an active accept */ + conn->ibc_connreq->cr_discarded.Status = FSUCCESS; + frc = iibt_cm_accept(cep, &conn->ibc_connreq->cr_discarded, + NULL, NULL, NULL, NULL); + if (frc != FCM_CONNECT_ESTABLISHED) { + CERROR("Connection %p -> "LPX64" CMAccept failed: %d\n", + conn, conn->ibc_peer->ibp_nid, frc); + kibnal_connreq_done (conn, 1, -ECONNABORTED); + /* XXX don't call reject after accept fails? */ + return; + } + + CDEBUG(D_NET, "Connection %p -> "LPX64" Established\n", + conn, conn->ibc_peer->ibp_nid); + + kibnal_connreq_done (conn, 1, 0); + return; + +reject: + kibnal_reject(cep, reason); + kibnal_connreq_done (conn, 1, -EPROTO); +} + +/* ib_cm.h has a wealth of information on the CM procedures */ +static void +kibnal_cm_callback(IB_HANDLE cep, CM_CONN_INFO *info, void *arg) +{ + kib_conn_t *conn = arg; + + CDEBUG(D_NET, "status 0x%x\n", info->Status); + + /* Established Connection Notifier */ + switch (info->Status) { + default: + CERROR("unknown status %d on Connection %p -> "LPX64"\n", + info->Status, conn, conn->ibc_peer->ibp_nid); + LBUG(); + break; + + case FCM_CONNECT_REPLY: + kibnal_connect_reply(cep, info, arg); + break; + + case FCM_DISCONNECT_REQUEST: + /* XXX lock around these state management bits? 
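kibnal_connect_reply() above (and kibnal_listen_callback() further on) carry a small connection-request blob in the CM private data, stored little-endian on the wire and checked for magic, version and queue depth before the connection is allowed to proceed. A stand-alone sketch of that pack/validate pattern; the struct layout and the magic/version values are invented, and htole*/le*toh are the glibc counterparts of the kernel's cpu_to_le*/le*_to_cpu.

#include <stdint.h>
#include <stdio.h>
#include <endian.h>

#define WIRE_MAGIC   0x0be91b91u        /* invented, not the patch's value */
#define WIRE_VERSION 1
#define QUEUE_DEPTH  8

struct wire_connreq {                   /* always little-endian on the wire */
        uint32_t magic;
        uint16_t version;
        uint16_t queue_depth;
        uint64_t nid;
};

static void connreq_pack(struct wire_connreq *w, uint64_t nid)
{
        w->magic       = htole32(WIRE_MAGIC);
        w->version     = htole16(WIRE_VERSION);
        w->queue_depth = htole16(QUEUE_DEPTH);
        w->nid         = htole64(nid);
}

static int connreq_check(const struct wire_connreq *w)
{
        if (le32toh(w->magic) != WIRE_MAGIC)         return -1;  /* not one of ours */
        if (le16toh(w->version) != WIRE_VERSION)     return -2;  /* incompatible peer */
        if (le16toh(w->queue_depth) != QUEUE_DEPTH)  return -3;  /* credits wouldn't line up */
        return 0;
}

int main(void)
{
        struct wire_connreq w;

        connreq_pack(&w, 0x1234ULL);
        printf("check = %d, nid = %llx\n", connreq_check(&w),
               (unsigned long long)le64toh(w.nid));
        return 0;
}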
*/ + if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) + kibnal_close_conn (conn, 0); + conn->ibc_state = IBNAL_CONN_DREP; + iibt_cm_disconnect(conn->ibc_cep, NULL, NULL); + break; + + /* these both guarantee that no more cm callbacks will occur */ + case FCM_DISCONNECTED: /* aka FCM_DISCONNECT_TIMEOUT */ + case FCM_DISCONNECT_REPLY: + CDEBUG(D_NET, "Connection %p -> "LPX64" disconnect done.\n", + conn, conn->ibc_peer->ibp_nid); + + conn->ibc_state = IBNAL_CONN_DISCONNECTED; + kibnal_flush_pending(conn); + kibnal_put_conn(conn); /* Lose CM's ref */ + break; + } + + return; +} + +static int +kibnal_set_cm_flags(IB_HANDLE cep) +{ + FSTATUS frc; + uint32 value = 1; + + frc = iibt_cm_modify_cep(cep, CM_FLAG_TIMEWAIT_CALLBACK, + (char *)&value, sizeof(value), 0); + if (frc != FSUCCESS) { + CERROR("error setting timeout callback: %d\n", frc); + return -1; + } + +#if 0 + frc = iibt_cm_modify_cep(cep, CM_FLAG_ASYNC_ACCEPT, (char *)&value, + sizeof(value), 0); + if (frc != FSUCCESS) { + CERROR("error setting async accept: %d\n", frc); + return -1; + } +#endif + + return 0; +} + +void +kibnal_listen_callback(IB_HANDLE cep, CM_CONN_INFO *info, void *arg) +{ + IB_CA_ATTRIBUTES *ca_attr = &kibnal_data.kib_hca_attrs; + IB_QP_ATTRIBUTES_QUERY *query; + CM_REQUEST_INFO *req; + CM_CONN_INFO *rep = NULL, *rcv = NULL; + kib_wire_connreq_t *wcr; + kib_conn_t *conn = NULL; + uint16_t reason = 0; + FSTATUS frc; + int rc = 0; + + LASSERT(cep); + LASSERT(info); + LASSERT(arg == NULL); /* no conn yet for passive */ + + CDEBUG(D_NET, "status 0x%x\n", info->Status); + + req = &info->Info.Request; + wcr = (kib_wire_connreq_t *)req->PrivateData; + + CDEBUG(D_NET, "%d from "LPX64"\n", info->Status, + le64_to_cpu(wcr->wcr_nid)); + + if (info->Status == FCM_CONNECT_CANCEL) + return; + + LASSERT (info->Status == FCM_CONNECT_REQUEST); + + if (wcr->wcr_magic != cpu_to_le32(IBNAL_MSG_MAGIC)) { + CERROR ("Can't accept: bad magic %08x\n", + le32_to_cpu(wcr->wcr_magic)); + GOTO(out, reason = RC_USER_REJ); + } + + if (wcr->wcr_version != cpu_to_le16(IBNAL_MSG_VERSION)) { + CERROR ("Can't accept: bad version %d\n", + le16_to_cpu(wcr->wcr_magic)); + GOTO(out, reason = RC_USER_REJ); + } + + rc = kibnal_accept(&conn, cep, + le64_to_cpu(wcr->wcr_nid), + le64_to_cpu(wcr->wcr_incarnation), + le16_to_cpu(wcr->wcr_queue_depth)); + if (rc != 0) { + CERROR ("Can't accept "LPX64": %d\n", + le64_to_cpu(wcr->wcr_nid), rc); + GOTO(out, reason = RC_NO_RESOURCES); + } + + frc = kibnal_qp_rts(conn->ibc_qp, req->CEPInfo.QPN, + min_t(__u8, req->CEPInfo.OfferedInitiatorDepth, + ca_attr->MaxQPResponderResources), + &req->PathInfo.Path, + min_t(__u8, req->CEPInfo.OfferedResponderResources, + ca_attr->MaxQPInitiatorDepth), + req->CEPInfo.StartingPSN); + + if (frc != FSUCCESS) { + CERROR ("Can't mark QP RTS/RTR "LPX64": %d\n", + le64_to_cpu(wcr->wcr_nid), frc); + GOTO(out, reason = RC_NO_QP); + } + + frc = iibt_qp_query(conn->ibc_qp, &conn->ibc_qp_attrs, NULL); + if (frc != FSUCCESS) { + CERROR ("Couldn't query qp attributes "LPX64": %d\n", + le64_to_cpu(wcr->wcr_nid), frc); + GOTO(out, reason = RC_NO_QP); + } + query = &conn->ibc_qp_attrs; + + PORTAL_ALLOC(rep, sizeof(*rep)); + PORTAL_ALLOC(rcv, sizeof(*rcv)); + if (rep == NULL || rcv == NULL) { + CERROR ("can't reply and receive buffers\n"); + GOTO(out, reason = RC_INSUFFICIENT_RESP_RES); + } + + /* don't try to deref this into the incoming wcr :) */ + wcr = (kib_wire_connreq_t *)rep->Info.Reply.PrivateData; + + rep->Info.Reply = (CM_REPLY_INFO) { + .QPN = query->QPNumber, + .QKey = query->Qkey, + 
.StartingPSN = query->RecvPSN, + .EndToEndFlowControl = query->FlowControl, + /* XXX Hmm. */ + .ArbInitiatorDepth = query->InitiatorDepth, + .ArbResponderResources = query->ResponderResources, + .TargetAckDelay = 0, + .FailoverAccepted = 0, + .RnRRetryCount = req->CEPInfo.RnrRetryCount, + }; + + *wcr = (kib_wire_connreq_t) { + .wcr_magic = cpu_to_le32(IBNAL_MSG_MAGIC), + .wcr_version = cpu_to_le16(IBNAL_MSG_VERSION), + .wcr_queue_depth = cpu_to_le32(IBNAL_MSG_QUEUE_SIZE), + .wcr_nid = cpu_to_le64(kibnal_data.kib_nid), + .wcr_incarnation = cpu_to_le64(kibnal_data.kib_incarnation), + }; + + frc = iibt_cm_accept(cep, rep, rcv, kibnal_cm_callback, conn, + &conn->ibc_cep); + + PORTAL_FREE(rep, sizeof(*rep)); + PORTAL_FREE(rcv, sizeof(*rcv)); + + if (frc != FCM_CONNECT_ESTABLISHED) { + /* XXX it seems we don't call reject after this point? */ + CERROR("iibt_cm_accept() failed: %d, aborting\n", frc); + rc = -ECONNABORTED; + goto out; + } + + if (kibnal_set_cm_flags(conn->ibc_cep)) { + rc = -ECONNABORTED; + goto out; + } + + CDEBUG(D_WARNING, "Connection %p -> "LPX64" ESTABLISHED.\n", + conn, conn->ibc_peer->ibp_nid); + +out: + if (reason) { + kibnal_reject(cep, reason); + rc = -ECONNABORTED; + } + if (conn != NULL) + kibnal_connreq_done(conn, 0, rc); + + return; +} + +static void +dump_path_records(PATH_RESULTS *results) +{ + IB_PATH_RECORD *path; + int i; + + for(i = 0; i < results->NumPathRecords; i++) { + path = &results->PathRecords[i]; + CDEBUG(D_NET, "%d: sgid "LPX64":"LPX64" dgid " + LPX64":"LPX64" pkey %x\n", + i, + path->SGID.Type.Global.SubnetPrefix, + path->SGID.Type.Global.InterfaceID, + path->DGID.Type.Global.SubnetPrefix, + path->DGID.Type.Global.InterfaceID, + path->P_Key); + } +} + +static void +kibnal_pathreq_callback (void *arg, QUERY *query, + QUERY_RESULT_VALUES *query_res) +{ + IB_CA_ATTRIBUTES *ca_attr = &kibnal_data.kib_hca_attrs; + kib_conn_t *conn = arg; + PATH_RESULTS *path; + FSTATUS frc; + + if (query_res->Status != FSUCCESS || query_res->ResultDataSize == 0) { + CERROR ("status %d data size %d\n", query_res->Status, + query_res->ResultDataSize); + kibnal_connreq_done (conn, 1, -EINVAL); + return; + } + + path = (PATH_RESULTS *)query_res->QueryResult; + + if (path->NumPathRecords < 1) { + CERROR ("expected path records: %d\n", path->NumPathRecords); + kibnal_connreq_done (conn, 1, -EINVAL); + return; + } + + dump_path_records(path); + + /* just using the first. this is probably a horrible idea. 
*/ + conn->ibc_connreq->cr_path = path->PathRecords[0]; + + conn->ibc_cep = iibt_cm_create_cep(CM_RC_TYPE); + if (conn->ibc_cep == NULL) { + CERROR ("Can't create CEP\n"); + kibnal_connreq_done (conn, 1, -EINVAL); + return; + } + + if (kibnal_set_cm_flags(conn->ibc_cep)) { + kibnal_connreq_done (conn, 1, -EINVAL); + return; + } + + conn->ibc_connreq->cr_wcr = (kib_wire_connreq_t) { + .wcr_magic = cpu_to_le32(IBNAL_MSG_MAGIC), + .wcr_version = cpu_to_le16(IBNAL_MSG_VERSION), + .wcr_queue_depth = cpu_to_le16(IBNAL_MSG_QUEUE_SIZE), + .wcr_nid = cpu_to_le64(kibnal_data.kib_nid), + .wcr_incarnation = cpu_to_le64(kibnal_data.kib_incarnation), + }; + + conn->ibc_connreq->cr_cmreq = (CM_REQUEST_INFO) { + .SID = conn->ibc_connreq->cr_service.RID.ServiceID, + .CEPInfo = (CM_CEP_INFO) { + .CaGUID = kibnal_data.kib_hca_guids[0], + .EndToEndFlowControl = FALSE, + .PortGUID = conn->ibc_connreq->cr_path.SGID.Type.Global.InterfaceID, + .RetryCount = IBNAL_RETRY, + .RnrRetryCount = IBNAL_RNR_RETRY, + .AckTimeout = IBNAL_ACK_TIMEOUT, + .StartingPSN = IBNAL_STARTING_PSN, + .QPN = conn->ibc_qp_attrs.QPNumber, + .QKey = conn->ibc_qp_attrs.Qkey, + .OfferedResponderResources = ca_attr->MaxQPResponderResources, + .OfferedInitiatorDepth = ca_attr->MaxQPInitiatorDepth, + }, + .PathInfo = (CM_CEP_PATHINFO) { + .bSubnetLocal = TRUE, + .Path = conn->ibc_connreq->cr_path, + }, + }; + +#if 0 + /* XXX set timeout just like SDP!!!*/ + conn->ibc_connreq->cr_path.packet_life = 13; +#endif + /* Flag I'm getting involved with the CM... */ + conn->ibc_state = IBNAL_CONN_CONNECTING; + + CDEBUG(D_NET, "Connecting to, service id "LPX64", on "LPX64"\n", + conn->ibc_connreq->cr_service.RID.ServiceID, + *kibnal_service_nid_field(&conn->ibc_connreq->cr_service)); + + memset(conn->ibc_connreq->cr_cmreq.PrivateData, 0, + CM_REQUEST_INFO_USER_LEN); + memcpy(conn->ibc_connreq->cr_cmreq.PrivateData, + &conn->ibc_connreq->cr_wcr, sizeof(conn->ibc_connreq->cr_wcr)); + + /* kibnal_cm_callback gets my conn ref */ + frc = iibt_cm_connect(conn->ibc_cep, &conn->ibc_connreq->cr_cmreq, + kibnal_cm_callback, conn); + if (frc != FPENDING && frc != FSUCCESS) { + CERROR ("Connect: %d\n", frc); + /* Back out state change as connect failed */ + conn->ibc_state = IBNAL_CONN_INIT_QP; + kibnal_connreq_done (conn, 1, -EINVAL); + } +} + +static void +dump_service_records(SERVICE_RECORD_RESULTS *results) +{ + IB_SERVICE_RECORD *svc; + int i; + + for(i = 0; i < results->NumServiceRecords; i++) { + svc = &results->ServiceRecords[i]; + CDEBUG(D_NET, "%d: sid "LPX64" gid "LPX64":"LPX64" pkey %x\n", + i, + svc->RID.ServiceID, + svc->RID.ServiceGID.Type.Global.SubnetPrefix, + svc->RID.ServiceGID.Type.Global.InterfaceID, + svc->RID.ServiceP_Key); + } +} + + +static void +kibnal_service_get_callback (void *arg, QUERY *query, + QUERY_RESULT_VALUES *query_res) +{ + kib_conn_t *conn = arg; + SERVICE_RECORD_RESULTS *svc; + COMMAND_CONTROL_PARAMETERS sd_params; + QUERY path_query; + FSTATUS frc; + + if (query_res->Status != FSUCCESS || query_res->ResultDataSize == 0) { + CERROR ("status %d data size %d\n", query_res->Status, + query_res->ResultDataSize); + kibnal_connreq_done (conn, 1, -EINVAL); + return; + } + + svc = (SERVICE_RECORD_RESULTS *)query_res->QueryResult; + + if (svc->NumServiceRecords < 1) { + CERROR ("%d service records\n", svc->NumServiceRecords); + kibnal_connreq_done (conn, 1, -EINVAL); + return; + } + + dump_service_records(svc); + + conn->ibc_connreq->cr_service = svc->ServiceRecords[0]; + + CDEBUG(D_NET, "Got status %d, service id "LPX64", on 
"LPX64"\n", + query_res->Status , conn->ibc_connreq->cr_service.RID.ServiceID, + *kibnal_service_nid_field(&conn->ibc_connreq->cr_service)); + + memset(&path_query, 0, sizeof(path_query)); + path_query.InputType = InputTypePortGuidPair; + path_query.OutputType = OutputTypePathRecord; + path_query.InputValue.PortGuidPair.SourcePortGuid = kibnal_data.kib_port_guid; + path_query.InputValue.PortGuidPair.DestPortGuid = conn->ibc_connreq->cr_service.RID.ServiceGID.Type.Global.InterfaceID; + + memset(&sd_params, 0, sizeof(sd_params)); + sd_params.RetryCount = IBNAL_RETRY; + sd_params.Timeout = 10 * 1000; /* wait 10 seconds */ + + /* kibnal_service_get_callback gets my conn ref */ + + frc = iibt_sd_query_port_fabric_information(kibnal_data.kib_sd, + kibnal_data.kib_port_guid, + &path_query, + kibnal_pathreq_callback, + &sd_params, conn); + if (frc == FPENDING) + return; + + CERROR ("Path record request failed: %d\n", frc); + kibnal_connreq_done (conn, 1, -EINVAL); +} + +static void +kibnal_connect_peer (kib_peer_t *peer) +{ + COMMAND_CONTROL_PARAMETERS sd_params; + QUERY query; + FSTATUS frc; + kib_conn_t *conn = kibnal_create_conn(); + + LASSERT (peer->ibp_connecting != 0); + + if (conn == NULL) { + CERROR ("Can't allocate conn\n"); + kibnal_peer_connect_failed (peer, 1, -ENOMEM); + return; + } + + conn->ibc_peer = peer; + kib_peer_addref(peer); + + PORTAL_ALLOC (conn->ibc_connreq, sizeof (*conn->ibc_connreq)); + if (conn->ibc_connreq == NULL) { + CERROR ("Can't allocate connreq\n"); + kibnal_connreq_done (conn, 1, -ENOMEM); + return; + } + + memset(conn->ibc_connreq, 0, sizeof (*conn->ibc_connreq)); + + kibnal_set_service_keys(&conn->ibc_connreq->cr_service, peer->ibp_nid); + + memset(&query, 0, sizeof(query)); + query.InputType = InputTypeServiceRecord; + query.OutputType = OutputTypeServiceRecord; + query.InputValue.ServiceRecordValue.ServiceRecord = conn->ibc_connreq->cr_service; + query.InputValue.ServiceRecordValue.ComponentMask = KIBNAL_SERVICE_KEY_MASK; + + memset(&sd_params, 0, sizeof(sd_params)); + sd_params.RetryCount = IBNAL_RETRY; + sd_params.Timeout = 10 * 1000; /* wait 10 seconds */ + + /* kibnal_service_get_callback gets my conn ref */ + frc = iibt_sd_query_port_fabric_information(kibnal_data.kib_sd, + kibnal_data.kib_port_guid, + &query, + kibnal_service_get_callback, + &sd_params, conn); + if (frc == FPENDING) + return; + + CERROR ("iibt_sd_query_port_fabric_information(): %d\n", frc); + kibnal_connreq_done (conn, 1, frc); +} + +static int +kibnal_conn_timed_out (kib_conn_t *conn) +{ + kib_tx_t *tx; + struct list_head *ttmp; + unsigned long flags; + + spin_lock_irqsave (&conn->ibc_lock, flags); + + list_for_each (ttmp, &conn->ibc_tx_queue) { + tx = list_entry (ttmp, kib_tx_t, tx_list); + + LASSERT (!tx->tx_passive_rdma_wait); + LASSERT (tx->tx_sending == 0); + + if (time_after_eq (jiffies, tx->tx_deadline)) { + spin_unlock_irqrestore (&conn->ibc_lock, flags); + return 1; + } + } + + list_for_each (ttmp, &conn->ibc_active_txs) { + tx = list_entry (ttmp, kib_tx_t, tx_list); + + LASSERT (tx->tx_passive_rdma || + !tx->tx_passive_rdma_wait); + + LASSERT (tx->tx_passive_rdma_wait || + tx->tx_sending != 0); + + if (time_after_eq (jiffies, tx->tx_deadline)) { + spin_unlock_irqrestore (&conn->ibc_lock, flags); + return 1; + } + } + + spin_unlock_irqrestore (&conn->ibc_lock, flags); + + return 0; +} + +static void +kibnal_check_conns (int idx) +{ + struct list_head *peers = &kibnal_data.kib_peers[idx]; + struct list_head *ptmp; + kib_peer_t *peer; + kib_conn_t *conn; + struct list_head 
*ctmp; + + again: + /* NB. We expect to have a look at all the peers and not find any + * rdmas to time out, so we just use a shared lock while we + * take a look... */ + read_lock (&kibnal_data.kib_global_lock); + + list_for_each (ptmp, peers) { + peer = list_entry (ptmp, kib_peer_t, ibp_list); + + list_for_each (ctmp, &peer->ibp_conns) { + conn = list_entry (ctmp, kib_conn_t, ibc_list); + + KIB_ASSERT_CONN_STATE(conn, IBNAL_CONN_ESTABLISHED); + + /* In case we have enough credits to return via a + * NOOP, but there were no non-blocking tx descs + * free to do it last time... */ + kibnal_check_sends(conn); + + if (!kibnal_conn_timed_out(conn)) + continue; + + CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", + conn, conn->ibc_state, peer->ibp_nid, + atomic_read (&conn->ibc_refcount)); + + atomic_inc (&conn->ibc_refcount); + read_unlock (&kibnal_data.kib_global_lock); + + CERROR("Timed out RDMA with "LPX64"\n", + peer->ibp_nid); + + kibnal_close_conn (conn, -ETIMEDOUT); + kibnal_put_conn (conn); + + /* start again now I've dropped the lock */ + goto again; + } + } + + read_unlock (&kibnal_data.kib_global_lock); +} + +static void +kib_connd_handle_state(kib_conn_t *conn) +{ + FSTATUS frc; + + switch (conn->ibc_state) { + /* all refs have gone, free and be done with it */ + case IBNAL_CONN_DISCONNECTED: + kibnal_destroy_conn (conn); + return; /* avoid put_conn */ + + case IBNAL_CONN_SEND_DREQ: + frc = iibt_cm_disconnect(conn->ibc_cep, NULL, NULL); + if (frc != FSUCCESS) /* XXX do real things */ + CERROR("disconnect failed: %d\n", frc); + conn->ibc_state = IBNAL_CONN_DREQ; + break; + + /* a callback got to the conn before we did */ + case IBNAL_CONN_DREP: + break; + + default: + CERROR ("Bad conn %p state: %d\n", conn, + conn->ibc_state); + LBUG(); + break; + } + + /* drop ref from close_conn */ + kibnal_put_conn(conn); +} + +int +kibnal_connd (void *arg) +{ + wait_queue_t wait; + unsigned long flags; + kib_conn_t *conn; + kib_peer_t *peer; + int timeout; + int i; + int peer_index = 0; + unsigned long deadline = jiffies; + + kportal_daemonize ("kibnal_connd"); + kportal_blockallsigs (); + + init_waitqueue_entry (&wait, current); + + spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags); + + for (;;) { + if (!list_empty (&kibnal_data.kib_connd_conns)) { + conn = list_entry (kibnal_data.kib_connd_conns.next, + kib_conn_t, ibc_list); + list_del (&conn->ibc_list); + + spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags); + kib_connd_handle_state(conn); + + spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags); + continue; + } + + if (!list_empty (&kibnal_data.kib_connd_peers)) { + peer = list_entry (kibnal_data.kib_connd_peers.next, + kib_peer_t, ibp_connd_list); + + list_del_init (&peer->ibp_connd_list); + spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags); + + kibnal_connect_peer (peer); + kib_peer_decref (peer); + + spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags); + } + + /* shut down and nobody left to reap... */ + if (kibnal_data.kib_shutdown && + atomic_read(&kibnal_data.kib_nconns) == 0) + break; + + spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags); + + /* careful with the jiffy wrap... 
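The "careful with the jiffy wrap" note refers to the (int)(deadline - jiffies) test in the connd loop: jiffies is a free-running counter that wraps, so the code subtracts first and looks at the sign rather than comparing raw values, the same idea as time_after()/time_before(). A tiny demonstration with a 32-bit counter and values chosen to straddle the wrap:

#include <stdint.h>
#include <stdio.h>

/* "Has the deadline passed?" for a wrapping 32-bit tick counter.  Subtracting
 * first and testing the sign works across the wrap, provided the two times
 * are less than half the counter range apart. */
static int deadline_passed(uint32_t now, uint32_t deadline)
{
        return (int32_t)(deadline - now) <= 0;
}

int main(void)
{
        uint32_t now      = 0xfffffff0u;            /* just before the wrap */
        uint32_t deadline = now + 0x20;             /* wraps to a small value */

        printf("naive compare: %s\n", deadline <= now ? "expired" : "pending");
        printf("wrap-safe:     %s\n", deadline_passed(now, deadline) ? "expired" : "pending");
        return 0;
}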
*/ + while ((timeout = (int)(deadline - jiffies)) <= 0) { + const int n = 4; + const int p = 1; + int chunk = kibnal_data.kib_peer_hash_size; + + /* Time to check for RDMA timeouts on a few more + * peers: I do checks every 'p' seconds on a + * proportion of the peer table and I need to check + * every connection 'n' times within a timeout + * interval, to ensure I detect a timeout on any + * connection within (n+1)/n times the timeout + * interval. */ + + if (kibnal_tunables.kib_io_timeout > n * p) + chunk = (chunk * n * p) / + kibnal_tunables.kib_io_timeout; + if (chunk == 0) + chunk = 1; + + for (i = 0; i < chunk; i++) { + kibnal_check_conns (peer_index); + peer_index = (peer_index + 1) % + kibnal_data.kib_peer_hash_size; + } + + deadline += p * HZ; + } + + kibnal_data.kib_connd_waketime = jiffies + timeout; + + set_current_state (TASK_INTERRUPTIBLE); + add_wait_queue (&kibnal_data.kib_connd_waitq, &wait); + + if (!kibnal_data.kib_shutdown && + list_empty (&kibnal_data.kib_connd_conns) && + list_empty (&kibnal_data.kib_connd_peers)) + schedule_timeout (timeout); + + set_current_state (TASK_RUNNING); + remove_wait_queue (&kibnal_data.kib_connd_waitq, &wait); + + spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags); + } + + spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags); + + kibnal_thread_fini (); + return (0); +} + +int +kibnal_scheduler(void *arg) +{ + long id = (long)arg; + char name[16]; + kib_rx_t *rx; + kib_tx_t *tx; + unsigned long flags; + int rc; + int counter = 0; + int did_something; + + snprintf(name, sizeof(name), "kibnal_sd_%02ld", id); + kportal_daemonize(name); + kportal_blockallsigs(); + + spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags); + + for (;;) { + did_something = 0; + + while (!list_empty(&kibnal_data.kib_sched_txq)) { + tx = list_entry(kibnal_data.kib_sched_txq.next, + kib_tx_t, tx_list); + list_del(&tx->tx_list); + spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, + flags); + kibnal_tx_done(tx); + + spin_lock_irqsave(&kibnal_data.kib_sched_lock, + flags); + } + + if (!list_empty(&kibnal_data.kib_sched_rxq)) { + rx = list_entry(kibnal_data.kib_sched_rxq.next, + kib_rx_t, rx_list); + list_del(&rx->rx_list); + spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, + flags); + + kibnal_rx(rx); + + did_something = 1; + spin_lock_irqsave(&kibnal_data.kib_sched_lock, + flags); + } + + /* shut down and no receives to complete... 
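The connd loop above spreads timeout checking across the peer hash table: with a pass every p seconds and the requirement that each connection be examined n times per timeout interval, it scans chunk = hash_size * n * p / io_timeout buckets per pass (at least 1), which bounds detection latency to roughly (n+1)/n of the configured timeout. The same arithmetic with made-up numbers:

#include <stdio.h>

/* How many peer-hash buckets to scan on each p-second pass so that every
 * bucket is visited at least n times per io_timeout seconds. */
static int buckets_per_pass(int hash_size, int io_timeout, int n, int p)
{
        int chunk = hash_size;

        if (io_timeout > n * p)
                chunk = (chunk * n * p) / io_timeout;
        if (chunk == 0)
                chunk = 1;
        return chunk;
}

int main(void)
{
        /* e.g. 128 hash buckets, 64 s timeout, 4 checks per timeout, 1 s passes */
        int chunk = buckets_per_pass(128, 64, 4, 1);

        printf("scan %d buckets/pass -> full table every %d s (timeout/n = %d s)\n",
               chunk, 128 / chunk, 64 / 4);
        return 0;
}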
*/ + if (kibnal_data.kib_shutdown && + atomic_read(&kibnal_data.kib_nconns) == 0) + break; + + /* nothing to do or hogging CPU */ + if (!did_something || counter++ == IBNAL_RESCHED) { + spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, + flags); + counter = 0; + + if (!did_something) { + rc = wait_event_interruptible( + kibnal_data.kib_sched_waitq, + !list_empty(&kibnal_data.kib_sched_txq) || + !list_empty(&kibnal_data.kib_sched_rxq) || + (kibnal_data.kib_shutdown && + atomic_read (&kibnal_data.kib_nconns) == 0)); + } else { + our_cond_resched(); + } + + spin_lock_irqsave(&kibnal_data.kib_sched_lock, + flags); + } + } + + spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags); + + kibnal_thread_fini(); + return (0); +} + + +lib_nal_t kibnal_lib = { + libnal_data: &kibnal_data, /* NAL private data */ + libnal_send: kibnal_send, + libnal_send_pages: kibnal_send_pages, + libnal_recv: kibnal_recv, + libnal_recv_pages: kibnal_recv_pages, + libnal_dist: kibnal_dist +}; diff --git a/lnet/klnds/openiblnd/.cvsignore b/lnet/klnds/openiblnd/.cvsignore new file mode 100644 index 0000000..5ed596b --- /dev/null +++ b/lnet/klnds/openiblnd/.cvsignore @@ -0,0 +1,10 @@ +.deps +Makefile +.*.cmd +autoMakefile.in +autoMakefile +*.ko +*.mod.c +.*.flags +.tmp_versions +.depend diff --git a/lnet/klnds/openiblnd/openiblnd.c b/lnet/klnds/openiblnd/openiblnd.c index 6f66143..652eb34 100644 --- a/lnet/klnds/openiblnd/openiblnd.c +++ b/lnet/klnds/openiblnd/openiblnd.c @@ -23,26 +23,25 @@ #include "openibnal.h" -nal_t koibnal_api; -ptl_handle_ni_t koibnal_ni; -koib_data_t koibnal_data; -koib_tunables_t koibnal_tunables; +nal_t kibnal_api; +ptl_handle_ni_t kibnal_ni; +kib_data_t kibnal_data; +kib_tunables_t kibnal_tunables; #ifdef CONFIG_SYSCTL -#define OPENIBNAL_SYSCTL 202 +#define IBNAL_SYSCTL 202 -#define OPENIBNAL_SYSCTL_TIMEOUT 1 -#define OPENIBNAL_SYSCTL_ZERO_COPY 2 +#define IBNAL_SYSCTL_TIMEOUT 1 -static ctl_table koibnal_ctl_table[] = { - {OPENIBNAL_SYSCTL_TIMEOUT, "timeout", - &koibnal_tunables.koib_io_timeout, sizeof (int), +static ctl_table kibnal_ctl_table[] = { + {IBNAL_SYSCTL_TIMEOUT, "timeout", + &kibnal_tunables.kib_io_timeout, sizeof (int), 0644, NULL, &proc_dointvec}, { 0 } }; -static ctl_table koibnal_top_ctl_table[] = { - {OPENIBNAL_SYSCTL, "openibnal", NULL, 0, 0555, koibnal_ctl_table}, +static ctl_table kibnal_top_ctl_table[] = { + {IBNAL_SYSCTL, "openibnal", NULL, 0, 0555, kibnal_ctl_table}, { 0 } }; #endif @@ -66,167 +65,183 @@ print_service(struct ib_common_attrib_service *service, char *tag, int rc) "service id: "LPX64"\n" "name : %s\n" "NID : "LPX64"\n", tag, rc, - service->service_id, name, service->service_data64[0]); + service->service_id, name, + *kibnal_service_nid_field(service)); } void -koibnal_service_setunset_done (tTS_IB_CLIENT_QUERY_TID tid, int status, +kibnal_service_setunset_done (tTS_IB_CLIENT_QUERY_TID tid, int status, struct ib_common_attrib_service *service, void *arg) { *(int *)arg = status; - up (&koibnal_data.koib_nid_signal); + up (&kibnal_data.kib_nid_signal); } +#if IBNAL_CHECK_ADVERT +void +kibnal_check_advert (void) +{ + struct ib_common_attrib_service *svc; + __u64 tid; + int rc; + int rc2; + + PORTAL_ALLOC(svc, sizeof(*svc)); + if (svc == NULL) + return; + + memset (svc, 0, sizeof (*svc)); + kibnal_set_service_keys(svc, kibnal_data.kib_nid); + + rc = ib_service_get (kibnal_data.kib_device, + kibnal_data.kib_port, + svc, + KIBNAL_SERVICE_KEY_MASK, + kibnal_tunables.kib_io_timeout * HZ, + kibnal_service_setunset_done, &rc2, + &tid); + + if (rc != 0) { + CERROR 
("Immediate error %d checking SM service\n", rc); + } else { + down (&kibnal_data.kib_nid_signal); + rc = rc2; + + if (rc != 0) + CERROR ("Error %d checking SM service\n", rc); + } + + PORTAL_FREE(svc, sizeof(*svc)); +} +#endif + int -koibnal_advertise (void) +kibnal_advertise (void) { + struct ib_common_attrib_service *svc; __u64 tid; int rc; int rc2; - LASSERT (koibnal_data.koib_nid != PTL_NID_ANY); + LASSERT (kibnal_data.kib_nid != PTL_NID_ANY); + + PORTAL_ALLOC(svc, sizeof(*svc)); + if (svc == NULL) + return (-ENOMEM); - memset (&koibnal_data.koib_service, 0, - sizeof (koibnal_data.koib_service)); + memset (svc, 0, sizeof (*svc)); - koibnal_data.koib_service.service_id - = koibnal_data.koib_cm_service_id; + svc->service_id = kibnal_data.kib_service_id; - rc = ib_cached_gid_get(koibnal_data.koib_device, - koibnal_data.koib_port, + rc = ib_cached_gid_get(kibnal_data.kib_device, + kibnal_data.kib_port, 0, - koibnal_data.koib_service.service_gid); + svc->service_gid); if (rc != 0) { CERROR ("Can't get port %d GID: %d\n", - koibnal_data.koib_port, rc); - return (rc); + kibnal_data.kib_port, rc); + goto out; } - rc = ib_cached_pkey_get(koibnal_data.koib_device, - koibnal_data.koib_port, + rc = ib_cached_pkey_get(kibnal_data.kib_device, + kibnal_data.kib_port, 0, - &koibnal_data.koib_service.service_pkey); + &svc->service_pkey); if (rc != 0) { CERROR ("Can't get port %d PKEY: %d\n", - koibnal_data.koib_port, rc); - return (rc); + kibnal_data.kib_port, rc); + goto out; } - koibnal_data.koib_service.service_lease = 0xffffffff; + svc->service_lease = 0xffffffff; - koibnal_set_service_keys(&koibnal_data.koib_service, koibnal_data.koib_nid); + kibnal_set_service_keys(svc, kibnal_data.kib_nid); CDEBUG(D_NET, "Advertising service id "LPX64" %s:"LPX64"\n", - koibnal_data.koib_service.service_id, - koibnal_data.koib_service.service_name, - *koibnal_service_nid_field(&koibnal_data.koib_service)); + svc->service_id, + svc->service_name, *kibnal_service_nid_field(svc)); - rc = ib_service_set (koibnal_data.koib_device, - koibnal_data.koib_port, - &koibnal_data.koib_service, + rc = ib_service_set (kibnal_data.kib_device, + kibnal_data.kib_port, + svc, IB_SA_SERVICE_COMP_MASK_ID | IB_SA_SERVICE_COMP_MASK_GID | IB_SA_SERVICE_COMP_MASK_PKEY | IB_SA_SERVICE_COMP_MASK_LEASE | - KOIBNAL_SERVICE_KEY_MASK, - koibnal_tunables.koib_io_timeout * HZ, - koibnal_service_setunset_done, &rc2, &tid); + KIBNAL_SERVICE_KEY_MASK, + kibnal_tunables.kib_io_timeout * HZ, + kibnal_service_setunset_done, &rc2, &tid); - if (rc == 0) { - down (&koibnal_data.koib_nid_signal); - rc = rc2; + if (rc != 0) { + CERROR ("Immediate error %d advertising NID "LPX64"\n", + rc, kibnal_data.kib_nid); + goto out; } - - if (rc != 0) - CERROR ("Error %d advertising SM service\n", rc); + down (&kibnal_data.kib_nid_signal); + + rc = rc2; + if (rc != 0) + CERROR ("Error %d advertising NID "LPX64"\n", + rc, kibnal_data.kib_nid); + out: + PORTAL_FREE(svc, sizeof(*svc)); return (rc); } -int -koibnal_unadvertise (int expect_success) +void +kibnal_unadvertise (int expect_success) { + struct ib_common_attrib_service *svc; __u64 tid; int rc; int rc2; - LASSERT (koibnal_data.koib_nid != PTL_NID_ANY); + LASSERT (kibnal_data.kib_nid != PTL_NID_ANY); - memset (&koibnal_data.koib_service, 0, - sizeof (koibnal_data.koib_service)); + PORTAL_ALLOC(svc, sizeof(*svc)); + if (svc == NULL) + return; - koibnal_set_service_keys(&koibnal_data.koib_service, koibnal_data.koib_nid); + memset (svc, 0, sizeof(*svc)); + + kibnal_set_service_keys(svc, kibnal_data.kib_nid); 
CDEBUG(D_NET, "Unadvertising service %s:"LPX64"\n", - koibnal_data.koib_service.service_name, - *koibnal_service_nid_field(&koibnal_data.koib_service)); - - rc = ib_service_delete (koibnal_data.koib_device, - koibnal_data.koib_port, - &koibnal_data.koib_service, - KOIBNAL_SERVICE_KEY_MASK, - koibnal_tunables.koib_io_timeout * HZ, - koibnal_service_setunset_done, &rc2, &tid); + svc->service_name, *kibnal_service_nid_field(svc)); + + rc = ib_service_delete (kibnal_data.kib_device, + kibnal_data.kib_port, + svc, + KIBNAL_SERVICE_KEY_MASK, + kibnal_tunables.kib_io_timeout * HZ, + kibnal_service_setunset_done, &rc2, &tid); if (rc != 0) { CERROR ("Immediate error %d unadvertising NID "LPX64"\n", - rc, koibnal_data.koib_nid); - return (rc); + rc, kibnal_data.kib_nid); + goto out; } - down (&koibnal_data.koib_nid_signal); + down (&kibnal_data.kib_nid_signal); if ((rc2 == 0) == !!expect_success) - return (0); + goto out; /* success: rc == 0 */ if (expect_success) CERROR("Error %d unadvertising NID "LPX64"\n", - rc, koibnal_data.koib_nid); + rc, kibnal_data.kib_nid); else CWARN("Removed conflicting NID "LPX64"\n", - koibnal_data.koib_nid); - - return (rc); -} - -int -koibnal_check_advert (void) -{ - __u64 tid; - int rc; - int rc2; - - static struct ib_common_attrib_service srv; - - memset (&srv, 0, sizeof (srv)); - - koibnal_set_service_keys(&srv, koibnal_data.koib_nid); - - rc = ib_service_get (koibnal_data.koib_device, - koibnal_data.koib_port, - &srv, - KOIBNAL_SERVICE_KEY_MASK, - koibnal_tunables.koib_io_timeout * HZ, - koibnal_service_setunset_done, &rc2, - &tid); - - if (rc != 0) { - CERROR ("Immediate error %d checking SM service\n", rc); - } else { - down (&koibnal_data.koib_nid_signal); - rc = rc2; - - if (rc != 0) - CERROR ("Error %d checking SM service\n", rc); - } - - return (rc); + kibnal_data.kib_nid); + out: + PORTAL_FREE(svc, sizeof(*svc)); } int -koibnal_set_mynid(ptl_nid_t nid) +kibnal_set_mynid(ptl_nid_t nid) { struct timeval tv; - lib_ni_t *ni = &koibnal_lib.libnal_ni; + lib_ni_t *ni = &kibnal_lib.libnal_ni; int rc; CDEBUG(D_IOCTL, "setting mynid to "LPX64" (old nid="LPX64")\n", @@ -234,75 +249,76 @@ koibnal_set_mynid(ptl_nid_t nid) do_gettimeofday(&tv); - down (&koibnal_data.koib_nid_mutex); + down (&kibnal_data.kib_nid_mutex); - if (nid == koibnal_data.koib_nid) { + if (nid == kibnal_data.kib_nid) { /* no change of NID */ - up (&koibnal_data.koib_nid_mutex); + up (&kibnal_data.kib_nid_mutex); return (0); } CDEBUG(D_NET, "NID "LPX64"("LPX64")\n", - koibnal_data.koib_nid, nid); + kibnal_data.kib_nid, nid); - if (koibnal_data.koib_nid != PTL_NID_ANY) { + if (kibnal_data.kib_nid != PTL_NID_ANY) { - koibnal_unadvertise (1); + kibnal_unadvertise (1); - rc = ib_cm_listen_stop (koibnal_data.koib_listen_handle); + rc = ib_cm_listen_stop (kibnal_data.kib_listen_handle); if (rc != 0) CERROR ("Error %d stopping listener\n", rc); } - koibnal_data.koib_nid = ni->ni_pid.nid = nid; - koibnal_data.koib_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec; + kibnal_data.kib_nid = ni->ni_pid.nid = nid; + kibnal_data.kib_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec; /* Delete all existing peers and their connections after new * NID/incarnation set to ensure no old connections in our brave * new world. 
*/ - koibnal_del_peer (PTL_NID_ANY, 0); - - rc = 0; - if (koibnal_data.koib_nid != PTL_NID_ANY) { - /* New NID installed */ + kibnal_del_peer (PTL_NID_ANY, 0); - /* remove any previous advert (crashed node etc) */ - koibnal_unadvertise(0); + if (kibnal_data.kib_nid == PTL_NID_ANY) { + /* No new NID to install */ + up (&kibnal_data.kib_nid_mutex); + return (0); + } + + /* remove any previous advert (crashed node etc) */ + kibnal_unadvertise(0); - /* Assign new service number */ - koibnal_data.koib_cm_service_id = ib_cm_service_assign(); - CDEBUG(D_NET, "service_id "LPX64"\n", koibnal_data.koib_cm_service_id); + /* Assign new service number */ + kibnal_data.kib_service_id = ib_cm_service_assign(); + CDEBUG(D_NET, "service_id "LPX64"\n", kibnal_data.kib_service_id); - rc = ib_cm_listen(koibnal_data.koib_cm_service_id, - TS_IB_CM_SERVICE_EXACT_MASK, - koibnal_passive_conn_callback, NULL, - &koibnal_data.koib_listen_handle); - if (rc != 0) { - CERROR ("ib_cm_listen error: %d\n", rc); - goto out; + rc = ib_cm_listen(kibnal_data.kib_service_id, + TS_IB_CM_SERVICE_EXACT_MASK, + kibnal_passive_conn_callback, NULL, + &kibnal_data.kib_listen_handle); + if (rc == 0) { + rc = kibnal_advertise(); + if (rc == 0) { +#if IBNAL_CHECK_ADVERT + kibnal_check_advert(); +#endif + up (&kibnal_data.kib_nid_mutex); + return (0); } - rc = koibnal_advertise(); - - koibnal_check_advert(); - } - - out: - if (rc != 0) { - koibnal_data.koib_nid = PTL_NID_ANY; + ib_cm_listen_stop(kibnal_data.kib_listen_handle); /* remove any peers that sprung up while I failed to * advertise myself */ - koibnal_del_peer (PTL_NID_ANY, 0); + kibnal_del_peer (PTL_NID_ANY, 0); } - - up (&koibnal_data.koib_nid_mutex); - return (0); + + kibnal_data.kib_nid = PTL_NID_ANY; + up (&kibnal_data.kib_nid_mutex); + return (rc); } -koib_peer_t * -koibnal_create_peer (ptl_nid_t nid) +kib_peer_t * +kibnal_create_peer (ptl_nid_t nid) { - koib_peer_t *peer; + kib_peer_t *peer; LASSERT (nid != PTL_NID_ANY); @@ -320,20 +336,20 @@ koibnal_create_peer (ptl_nid_t nid) INIT_LIST_HEAD (&peer->ibp_tx_queue); peer->ibp_reconnect_time = jiffies; - peer->ibp_reconnect_interval = OPENIBNAL_MIN_RECONNECT_INTERVAL; + peer->ibp_reconnect_interval = IBNAL_MIN_RECONNECT_INTERVAL; - atomic_inc (&koibnal_data.koib_npeers); + atomic_inc (&kibnal_data.kib_npeers); return (peer); } void -koibnal_destroy_peer (koib_peer_t *peer) +kibnal_destroy_peer (kib_peer_t *peer) { CDEBUG (D_NET, "peer "LPX64" %p deleted\n", peer->ibp_nid, peer); LASSERT (atomic_read (&peer->ibp_refcount) == 0); LASSERT (peer->ibp_persistence == 0); - LASSERT (!koibnal_peer_active(peer)); + LASSERT (!kibnal_peer_active(peer)); LASSERT (peer->ibp_connecting == 0); LASSERT (list_empty (&peer->ibp_conns)); LASSERT (list_empty (&peer->ibp_tx_queue)); @@ -344,11 +360,11 @@ koibnal_destroy_peer (koib_peer_t *peer) * they are destroyed, so we can be assured that _all_ state to do * with this peer has been cleaned up when its refcount drops to * zero. 
*/ - atomic_dec (&koibnal_data.koib_npeers); + atomic_dec (&kibnal_data.kib_npeers); } void -koibnal_put_peer (koib_peer_t *peer) +kibnal_put_peer (kib_peer_t *peer) { CDEBUG (D_OTHER, "putting peer[%p] -> "LPX64" (%d)\n", peer, peer->ibp_nid, @@ -358,19 +374,19 @@ koibnal_put_peer (koib_peer_t *peer) if (!atomic_dec_and_test (&peer->ibp_refcount)) return; - koibnal_destroy_peer (peer); + kibnal_destroy_peer (peer); } -koib_peer_t * -koibnal_find_peer_locked (ptl_nid_t nid) +kib_peer_t * +kibnal_find_peer_locked (ptl_nid_t nid) { - struct list_head *peer_list = koibnal_nid2peerlist (nid); + struct list_head *peer_list = kibnal_nid2peerlist (nid); struct list_head *tmp; - koib_peer_t *peer; + kib_peer_t *peer; list_for_each (tmp, peer_list) { - peer = list_entry (tmp, koib_peer_t, ibp_list); + peer = list_entry (tmp, kib_peer_t, ibp_list); LASSERT (peer->ibp_persistence != 0 || /* persistent peer */ peer->ibp_connecting != 0 || /* creating conns */ @@ -386,46 +402,46 @@ koibnal_find_peer_locked (ptl_nid_t nid) return (NULL); } -koib_peer_t * -koibnal_get_peer (ptl_nid_t nid) +kib_peer_t * +kibnal_get_peer (ptl_nid_t nid) { - koib_peer_t *peer; + kib_peer_t *peer; - read_lock (&koibnal_data.koib_global_lock); - peer = koibnal_find_peer_locked (nid); + read_lock (&kibnal_data.kib_global_lock); + peer = kibnal_find_peer_locked (nid); if (peer != NULL) /* +1 ref for caller? */ atomic_inc (&peer->ibp_refcount); - read_unlock (&koibnal_data.koib_global_lock); + read_unlock (&kibnal_data.kib_global_lock); return (peer); } void -koibnal_unlink_peer_locked (koib_peer_t *peer) +kibnal_unlink_peer_locked (kib_peer_t *peer) { LASSERT (peer->ibp_persistence == 0); LASSERT (list_empty(&peer->ibp_conns)); - LASSERT (koibnal_peer_active(peer)); + LASSERT (kibnal_peer_active(peer)); list_del_init (&peer->ibp_list); /* lose peerlist's ref */ - koibnal_put_peer (peer); + kibnal_put_peer (peer); } int -koibnal_get_peer_info (int index, ptl_nid_t *nidp, int *persistencep) +kibnal_get_peer_info (int index, ptl_nid_t *nidp, int *persistencep) { - koib_peer_t *peer; + kib_peer_t *peer; struct list_head *ptmp; int i; - read_lock (&koibnal_data.koib_global_lock); + read_lock (&kibnal_data.kib_global_lock); - for (i = 0; i < koibnal_data.koib_peer_hash_size; i++) { + for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) { - list_for_each (ptmp, &koibnal_data.koib_peers[i]) { + list_for_each (ptmp, &kibnal_data.kib_peers[i]) { - peer = list_entry (ptmp, koib_peer_t, ibp_list); + peer = list_entry (ptmp, kib_peer_t, ibp_list); LASSERT (peer->ibp_persistence != 0 || peer->ibp_connecting != 0 || !list_empty (&peer->ibp_conns)); @@ -436,53 +452,53 @@ koibnal_get_peer_info (int index, ptl_nid_t *nidp, int *persistencep) *nidp = peer->ibp_nid; *persistencep = peer->ibp_persistence; - read_unlock (&koibnal_data.koib_global_lock); + read_unlock (&kibnal_data.kib_global_lock); return (0); } } - read_unlock (&koibnal_data.koib_global_lock); + read_unlock (&kibnal_data.kib_global_lock); return (-ENOENT); } int -koibnal_add_persistent_peer (ptl_nid_t nid) +kibnal_add_persistent_peer (ptl_nid_t nid) { unsigned long flags; - koib_peer_t *peer; - koib_peer_t *peer2; + kib_peer_t *peer; + kib_peer_t *peer2; if (nid == PTL_NID_ANY) return (-EINVAL); - peer = koibnal_create_peer (nid); + peer = kibnal_create_peer (nid); if (peer == NULL) return (-ENOMEM); - write_lock_irqsave (&koibnal_data.koib_global_lock, flags); + write_lock_irqsave (&kibnal_data.kib_global_lock, flags); - peer2 = koibnal_find_peer_locked (nid); + peer2 = 
kibnal_find_peer_locked (nid); if (peer2 != NULL) { - koibnal_put_peer (peer); + kibnal_put_peer (peer); peer = peer2; } else { /* peer table takes existing ref on peer */ list_add_tail (&peer->ibp_list, - koibnal_nid2peerlist (nid)); + kibnal_nid2peerlist (nid)); } peer->ibp_persistence++; - write_unlock_irqrestore (&koibnal_data.koib_global_lock, flags); + write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); return (0); } void -koibnal_del_peer_locked (koib_peer_t *peer, int single_share) +kibnal_del_peer_locked (kib_peer_t *peer, int single_share) { struct list_head *ctmp; struct list_head *cnxt; - koib_conn_t *conn; + kib_conn_t *conn; if (!single_share) peer->ibp_persistence = 0; @@ -493,38 +509,38 @@ koibnal_del_peer_locked (koib_peer_t *peer, int single_share) return; list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) { - conn = list_entry(ctmp, koib_conn_t, ibc_list); + conn = list_entry(ctmp, kib_conn_t, ibc_list); - koibnal_close_conn_locked (conn, 0); + kibnal_close_conn_locked (conn, 0); } /* NB peer unlinks itself when last conn is closed */ } int -koibnal_del_peer (ptl_nid_t nid, int single_share) +kibnal_del_peer (ptl_nid_t nid, int single_share) { unsigned long flags; struct list_head *ptmp; struct list_head *pnxt; - koib_peer_t *peer; + kib_peer_t *peer; int lo; int hi; int i; int rc = -ENOENT; - write_lock_irqsave (&koibnal_data.koib_global_lock, flags); + write_lock_irqsave (&kibnal_data.kib_global_lock, flags); if (nid != PTL_NID_ANY) - lo = hi = koibnal_nid2peerlist(nid) - koibnal_data.koib_peers; + lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers; else { lo = 0; - hi = koibnal_data.koib_peer_hash_size - 1; + hi = kibnal_data.kib_peer_hash_size - 1; } for (i = lo; i <= hi; i++) { - list_for_each_safe (ptmp, pnxt, &koibnal_data.koib_peers[i]) { - peer = list_entry (ptmp, koib_peer_t, ibp_list); + list_for_each_safe (ptmp, pnxt, &kibnal_data.kib_peers[i]) { + peer = list_entry (ptmp, kib_peer_t, ibp_list); LASSERT (peer->ibp_persistence != 0 || peer->ibp_connecting != 0 || !list_empty (&peer->ibp_conns)); @@ -532,7 +548,7 @@ koibnal_del_peer (ptl_nid_t nid, int single_share) if (!(nid == PTL_NID_ANY || peer->ibp_nid == nid)) continue; - koibnal_del_peer_locked (peer, single_share); + kibnal_del_peer_locked (peer, single_share); rc = 0; /* matched something */ if (single_share) @@ -540,26 +556,26 @@ koibnal_del_peer (ptl_nid_t nid, int single_share) } } out: - write_unlock_irqrestore (&koibnal_data.koib_global_lock, flags); + write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); return (rc); } -koib_conn_t * -koibnal_get_conn_by_idx (int index) +kib_conn_t * +kibnal_get_conn_by_idx (int index) { - koib_peer_t *peer; + kib_peer_t *peer; struct list_head *ptmp; - koib_conn_t *conn; + kib_conn_t *conn; struct list_head *ctmp; int i; - read_lock (&koibnal_data.koib_global_lock); + read_lock (&kibnal_data.kib_global_lock); - for (i = 0; i < koibnal_data.koib_peer_hash_size; i++) { - list_for_each (ptmp, &koibnal_data.koib_peers[i]) { + for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) { + list_for_each (ptmp, &kibnal_data.kib_peers[i]) { - peer = list_entry (ptmp, koib_peer_t, ibp_list); + peer = list_entry (ptmp, kib_peer_t, ibp_list); LASSERT (peer->ibp_persistence > 0 || peer->ibp_connecting != 0 || !list_empty (&peer->ibp_conns)); @@ -568,25 +584,25 @@ koibnal_get_conn_by_idx (int index) if (index-- > 0) continue; - conn = list_entry (ctmp, koib_conn_t, ibc_list); + conn = list_entry (ctmp, kib_conn_t, ibc_list); CDEBUG(D_NET, "++conn[%p] 
state %d -> "LPX64" (%d)\n", conn, conn->ibc_state, conn->ibc_peer->ibp_nid, atomic_read (&conn->ibc_refcount)); atomic_inc (&conn->ibc_refcount); - read_unlock (&koibnal_data.koib_global_lock); + read_unlock (&kibnal_data.kib_global_lock); return (conn); } } } - read_unlock (&koibnal_data.koib_global_lock); + read_unlock (&kibnal_data.kib_global_lock); return (NULL); } -koib_conn_t * -koibnal_create_conn (void) +kib_conn_t * +kibnal_create_conn (void) { - koib_conn_t *conn; + kib_conn_t *conn; int i; __u64 vaddr = 0; __u64 vaddr_base; @@ -608,57 +624,57 @@ koibnal_create_conn (void) memset (conn, 0, sizeof (*conn)); INIT_LIST_HEAD (&conn->ibc_tx_queue); - INIT_LIST_HEAD (&conn->ibc_rdma_queue); + INIT_LIST_HEAD (&conn->ibc_active_txs); spin_lock_init (&conn->ibc_lock); - atomic_inc (&koibnal_data.koib_nconns); + atomic_inc (&kibnal_data.kib_nconns); /* well not really, but I call destroy() on failure, which decrements */ - PORTAL_ALLOC (conn->ibc_rxs, OPENIBNAL_RX_MSGS * sizeof (koib_rx_t)); + PORTAL_ALLOC (conn->ibc_rxs, IBNAL_RX_MSGS * sizeof (kib_rx_t)); if (conn->ibc_rxs == NULL) goto failed; - memset (conn->ibc_rxs, 0, OPENIBNAL_RX_MSGS * sizeof(koib_rx_t)); + memset (conn->ibc_rxs, 0, IBNAL_RX_MSGS * sizeof(kib_rx_t)); - rc = koibnal_alloc_pages(&conn->ibc_rx_pages, - OPENIBNAL_RX_MSG_PAGES, - IB_ACCESS_LOCAL_WRITE); + rc = kibnal_alloc_pages(&conn->ibc_rx_pages, + IBNAL_RX_MSG_PAGES, + IB_ACCESS_LOCAL_WRITE); if (rc != 0) goto failed; - vaddr_base = vaddr = conn->ibc_rx_pages->oibp_vaddr; + vaddr_base = vaddr = conn->ibc_rx_pages->ibp_vaddr; - for (i = ipage = page_offset = 0; i < OPENIBNAL_RX_MSGS; i++) { - struct page *page = conn->ibc_rx_pages->oibp_pages[ipage]; - koib_rx_t *rx = &conn->ibc_rxs[i]; + for (i = ipage = page_offset = 0; i < IBNAL_RX_MSGS; i++) { + struct page *page = conn->ibc_rx_pages->ibp_pages[ipage]; + kib_rx_t *rx = &conn->ibc_rxs[i]; rx->rx_conn = conn; rx->rx_vaddr = vaddr; - rx->rx_msg = (koib_msg_t *)(((char *)page_address(page)) + page_offset); + rx->rx_msg = (kib_msg_t *)(((char *)page_address(page)) + page_offset); - vaddr += OPENIBNAL_MSG_SIZE; - LASSERT (vaddr <= vaddr_base + OPENIBNAL_RX_MSG_BYTES); + vaddr += IBNAL_MSG_SIZE; + LASSERT (vaddr <= vaddr_base + IBNAL_RX_MSG_BYTES); - page_offset += OPENIBNAL_MSG_SIZE; + page_offset += IBNAL_MSG_SIZE; LASSERT (page_offset <= PAGE_SIZE); if (page_offset == PAGE_SIZE) { page_offset = 0; ipage++; - LASSERT (ipage <= OPENIBNAL_RX_MSG_PAGES); + LASSERT (ipage <= IBNAL_RX_MSG_PAGES); } } params.qp_create = (struct ib_qp_create_param) { .limit = { /* Sends have an optional RDMA */ - .max_outstanding_send_request = 2 * OPENIBNAL_MSG_QUEUE_SIZE, - .max_outstanding_receive_request = OPENIBNAL_MSG_QUEUE_SIZE, + .max_outstanding_send_request = 2 * IBNAL_MSG_QUEUE_SIZE, + .max_outstanding_receive_request = IBNAL_MSG_QUEUE_SIZE, .max_send_gather_element = 1, .max_receive_scatter_element = 1, }, - .pd = koibnal_data.koib_pd, - .send_queue = koibnal_data.koib_tx_cq, - .receive_queue = koibnal_data.koib_rx_cq, + .pd = kibnal_data.kib_pd, + .send_queue = kibnal_data.kib_cq, + .receive_queue = kibnal_data.kib_cq, .send_policy = IB_WQ_SIGNAL_SELECTABLE, .receive_policy = IB_WQ_SIGNAL_SELECTABLE, .rd_domain = 0, @@ -673,11 +689,11 @@ koibnal_create_conn (void) } /* Mark QP created */ - conn->ibc_state = OPENIBNAL_CONN_INIT_QP; + conn->ibc_state = IBNAL_CONN_INIT_QP; params.qp_attr = (struct ib_qp_attribute) { .state = IB_QP_STATE_INIT, - .port = koibnal_data.koib_port, + .port = kibnal_data.kib_port, .enable_rdma_read = 1, 
.enable_rdma_write = 1, .valid_fields = (IB_QP_ATTRIBUTE_STATE | @@ -696,12 +712,12 @@ koibnal_create_conn (void) return (conn); failed: - koibnal_destroy_conn (conn); + kibnal_destroy_conn (conn); return (NULL); } void -koibnal_destroy_conn (koib_conn_t *conn) +kibnal_destroy_conn (kib_conn_t *conn) { int rc; @@ -709,21 +725,21 @@ koibnal_destroy_conn (koib_conn_t *conn) LASSERT (atomic_read (&conn->ibc_refcount) == 0); LASSERT (list_empty(&conn->ibc_tx_queue)); - LASSERT (list_empty(&conn->ibc_rdma_queue)); + LASSERT (list_empty(&conn->ibc_active_txs)); LASSERT (conn->ibc_nsends_posted == 0); LASSERT (conn->ibc_connreq == NULL); switch (conn->ibc_state) { - case OPENIBNAL_CONN_ZOMBIE: + case IBNAL_CONN_ZOMBIE: /* called after connection sequence initiated */ - case OPENIBNAL_CONN_INIT_QP: + case IBNAL_CONN_INIT_QP: rc = ib_qp_destroy(conn->ibc_qp); if (rc != 0) CERROR("Can't destroy QP: %d\n", rc); /* fall through */ - case OPENIBNAL_CONN_INIT_NOTHING: + case IBNAL_CONN_INIT_NOTHING: break; default: @@ -731,30 +747,30 @@ koibnal_destroy_conn (koib_conn_t *conn) } if (conn->ibc_rx_pages != NULL) - koibnal_free_pages(conn->ibc_rx_pages); + kibnal_free_pages(conn->ibc_rx_pages); if (conn->ibc_rxs != NULL) PORTAL_FREE(conn->ibc_rxs, - OPENIBNAL_RX_MSGS * sizeof(koib_rx_t)); + IBNAL_RX_MSGS * sizeof(kib_rx_t)); if (conn->ibc_peer != NULL) - koibnal_put_peer(conn->ibc_peer); + kibnal_put_peer(conn->ibc_peer); PORTAL_FREE(conn, sizeof (*conn)); - atomic_dec(&koibnal_data.koib_nconns); + atomic_dec(&kibnal_data.kib_nconns); - if (atomic_read (&koibnal_data.koib_nconns) == 0 && - koibnal_data.koib_shutdown) { + if (atomic_read (&kibnal_data.kib_nconns) == 0 && + kibnal_data.kib_shutdown) { /* I just nuked the last connection on shutdown; wake up * everyone so they can exit. 
*/ - wake_up_all(&koibnal_data.koib_sched_waitq); - wake_up_all(&koibnal_data.koib_connd_waitq); + wake_up_all(&kibnal_data.kib_sched_waitq); + wake_up_all(&kibnal_data.kib_connd_waitq); } } void -koibnal_put_conn (koib_conn_t *conn) +kibnal_put_conn (kib_conn_t *conn) { unsigned long flags; @@ -767,44 +783,44 @@ koibnal_put_conn (koib_conn_t *conn) return; /* last ref only goes on zombies */ - LASSERT (conn->ibc_state == OPENIBNAL_CONN_ZOMBIE); + LASSERT (conn->ibc_state == IBNAL_CONN_ZOMBIE); - spin_lock_irqsave (&koibnal_data.koib_connd_lock, flags); + spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags); - list_add (&conn->ibc_list, &koibnal_data.koib_connd_conns); - wake_up (&koibnal_data.koib_connd_waitq); + list_add (&conn->ibc_list, &kibnal_data.kib_connd_conns); + wake_up (&kibnal_data.kib_connd_waitq); - spin_unlock_irqrestore (&koibnal_data.koib_connd_lock, flags); + spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags); } int -koibnal_close_peer_conns_locked (koib_peer_t *peer, int why) +kibnal_close_peer_conns_locked (kib_peer_t *peer, int why) { - koib_conn_t *conn; + kib_conn_t *conn; struct list_head *ctmp; struct list_head *cnxt; int count = 0; list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) { - conn = list_entry (ctmp, koib_conn_t, ibc_list); + conn = list_entry (ctmp, kib_conn_t, ibc_list); count++; - koibnal_close_conn_locked (conn, why); + kibnal_close_conn_locked (conn, why); } return (count); } int -koibnal_close_stale_conns_locked (koib_peer_t *peer, __u64 incarnation) +kibnal_close_stale_conns_locked (kib_peer_t *peer, __u64 incarnation) { - koib_conn_t *conn; + kib_conn_t *conn; struct list_head *ctmp; struct list_head *cnxt; int count = 0; list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) { - conn = list_entry (ctmp, koib_conn_t, ibc_list); + conn = list_entry (ctmp, kib_conn_t, ibc_list); if (conn->ibc_incarnation == incarnation) continue; @@ -813,17 +829,17 @@ koibnal_close_stale_conns_locked (koib_peer_t *peer, __u64 incarnation) peer->ibp_nid, conn->ibc_incarnation, incarnation); count++; - koibnal_close_conn_locked (conn, -ESTALE); + kibnal_close_conn_locked (conn, -ESTALE); } return (count); } int -koibnal_close_matching_conns (ptl_nid_t nid) +kibnal_close_matching_conns (ptl_nid_t nid) { unsigned long flags; - koib_peer_t *peer; + kib_peer_t *peer; struct list_head *ptmp; struct list_head *pnxt; int lo; @@ -831,19 +847,19 @@ koibnal_close_matching_conns (ptl_nid_t nid) int i; int count = 0; - write_lock_irqsave (&koibnal_data.koib_global_lock, flags); + write_lock_irqsave (&kibnal_data.kib_global_lock, flags); if (nid != PTL_NID_ANY) - lo = hi = koibnal_nid2peerlist(nid) - koibnal_data.koib_peers; + lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers; else { lo = 0; - hi = koibnal_data.koib_peer_hash_size - 1; + hi = kibnal_data.kib_peer_hash_size - 1; } for (i = lo; i <= hi; i++) { - list_for_each_safe (ptmp, pnxt, &koibnal_data.koib_peers[i]) { + list_for_each_safe (ptmp, pnxt, &kibnal_data.kib_peers[i]) { - peer = list_entry (ptmp, koib_peer_t, ibp_list); + peer = list_entry (ptmp, kib_peer_t, ibp_list); LASSERT (peer->ibp_persistence != 0 || peer->ibp_connecting != 0 || !list_empty (&peer->ibp_conns)); @@ -851,11 +867,11 @@ koibnal_close_matching_conns (ptl_nid_t nid) if (!(nid == PTL_NID_ANY || nid == peer->ibp_nid)) continue; - count += koibnal_close_peer_conns_locked (peer, 0); + count += kibnal_close_peer_conns_locked (peer, 0); } } - write_unlock_irqrestore (&koibnal_data.koib_global_lock, flags); + write_unlock_irqrestore 
(&kibnal_data.kib_global_lock, flags); /* wildcards always succeed */ if (nid == PTL_NID_ANY) @@ -865,7 +881,7 @@ koibnal_close_matching_conns (ptl_nid_t nid) } int -koibnal_cmd(struct portals_cfg *pcfg, void * private) +kibnal_cmd(struct portals_cfg *pcfg, void * private) { int rc = -EINVAL; @@ -876,8 +892,8 @@ koibnal_cmd(struct portals_cfg *pcfg, void * private) ptl_nid_t nid = 0; int share_count = 0; - rc = koibnal_get_peer_info(pcfg->pcfg_count, - &nid, &share_count); + rc = kibnal_get_peer_info(pcfg->pcfg_count, + &nid, &share_count); pcfg->pcfg_nid = nid; pcfg->pcfg_size = 0; pcfg->pcfg_id = 0; @@ -887,17 +903,17 @@ koibnal_cmd(struct portals_cfg *pcfg, void * private) break; } case NAL_CMD_ADD_PEER: { - rc = koibnal_add_persistent_peer (pcfg->pcfg_nid); + rc = kibnal_add_persistent_peer (pcfg->pcfg_nid); break; } case NAL_CMD_DEL_PEER: { - rc = koibnal_del_peer (pcfg->pcfg_nid, + rc = kibnal_del_peer (pcfg->pcfg_nid, /* flags == single_share */ pcfg->pcfg_flags != 0); break; } case NAL_CMD_GET_CONN: { - koib_conn_t *conn = koibnal_get_conn_by_idx (pcfg->pcfg_count); + kib_conn_t *conn = kibnal_get_conn_by_idx (pcfg->pcfg_count); if (conn == NULL) rc = -ENOENT; @@ -907,19 +923,19 @@ koibnal_cmd(struct portals_cfg *pcfg, void * private) pcfg->pcfg_id = 0; pcfg->pcfg_misc = 0; pcfg->pcfg_flags = 0; - koibnal_put_conn (conn); + kibnal_put_conn (conn); } break; } case NAL_CMD_CLOSE_CONNECTION: { - rc = koibnal_close_matching_conns (pcfg->pcfg_nid); + rc = kibnal_close_matching_conns (pcfg->pcfg_nid); break; } case NAL_CMD_REGISTER_MYNID: { if (pcfg->pcfg_nid == PTL_NID_ANY) rc = -EINVAL; else - rc = koibnal_set_mynid (pcfg->pcfg_nid); + rc = kibnal_set_mynid (pcfg->pcfg_nid); break; } } @@ -928,47 +944,47 @@ koibnal_cmd(struct portals_cfg *pcfg, void * private) } void -koibnal_free_pages (koib_pages_t *p) +kibnal_free_pages (kib_pages_t *p) { - int npages = p->oibp_npages; + int npages = p->ibp_npages; int rc; int i; - if (p->oibp_mapped) { - rc = ib_memory_deregister(p->oibp_handle); + if (p->ibp_mapped) { + rc = ib_memory_deregister(p->ibp_handle); if (rc != 0) CERROR ("Deregister error: %d\n", rc); } for (i = 0; i < npages; i++) - if (p->oibp_pages[i] != NULL) - __free_page(p->oibp_pages[i]); + if (p->ibp_pages[i] != NULL) + __free_page(p->ibp_pages[i]); - PORTAL_FREE (p, offsetof(koib_pages_t, oibp_pages[npages])); + PORTAL_FREE (p, offsetof(kib_pages_t, ibp_pages[npages])); } int -koibnal_alloc_pages (koib_pages_t **pp, int npages, int access) +kibnal_alloc_pages (kib_pages_t **pp, int npages, int access) { - koib_pages_t *p; + kib_pages_t *p; struct ib_physical_buffer *phys_pages; int i; int rc; - PORTAL_ALLOC(p, offsetof(koib_pages_t, oibp_pages[npages])); + PORTAL_ALLOC(p, offsetof(kib_pages_t, ibp_pages[npages])); if (p == NULL) { CERROR ("Can't allocate buffer %d\n", npages); return (-ENOMEM); } - memset (p, 0, offsetof(koib_pages_t, oibp_pages[npages])); - p->oibp_npages = npages; + memset (p, 0, offsetof(kib_pages_t, ibp_pages[npages])); + p->ibp_npages = npages; for (i = 0; i < npages; i++) { - p->oibp_pages[i] = alloc_page (GFP_KERNEL); - if (p->oibp_pages[i] == NULL) { + p->ibp_pages[i] = alloc_page (GFP_KERNEL); + if (p->ibp_pages[i] == NULL) { CERROR ("Can't allocate page %d of %d\n", i, npages); - koibnal_free_pages(p); + kibnal_free_pages(p); return (-ENOMEM); } } @@ -976,96 +992,96 @@ koibnal_alloc_pages (koib_pages_t **pp, int npages, int access) PORTAL_ALLOC(phys_pages, npages * sizeof(*phys_pages)); if (phys_pages == NULL) { CERROR ("Can't allocate physarray for 
%d pages\n", npages); - koibnal_free_pages(p); + kibnal_free_pages(p); return (-ENOMEM); } for (i = 0; i < npages; i++) { phys_pages[i].size = PAGE_SIZE; phys_pages[i].address = - koibnal_page2phys(p->oibp_pages[i]); + kibnal_page2phys(p->ibp_pages[i]); } - p->oibp_vaddr = 0; - rc = ib_memory_register_physical(koibnal_data.koib_pd, + p->ibp_vaddr = 0; + rc = ib_memory_register_physical(kibnal_data.kib_pd, phys_pages, npages, - &p->oibp_vaddr, + &p->ibp_vaddr, npages * PAGE_SIZE, 0, access, - &p->oibp_handle, - &p->oibp_lkey, - &p->oibp_rkey); + &p->ibp_handle, + &p->ibp_lkey, + &p->ibp_rkey); PORTAL_FREE(phys_pages, npages * sizeof(*phys_pages)); if (rc != 0) { CERROR ("Error %d mapping %d pages\n", rc, npages); - koibnal_free_pages(p); + kibnal_free_pages(p); return (rc); } - p->oibp_mapped = 1; + p->ibp_mapped = 1; *pp = p; return (0); } int -koibnal_setup_tx_descs (void) +kibnal_setup_tx_descs (void) { int ipage = 0; int page_offset = 0; __u64 vaddr; __u64 vaddr_base; struct page *page; - koib_tx_t *tx; + kib_tx_t *tx; int i; int rc; /* pre-mapped messages are not bigger than 1 page */ - LASSERT (OPENIBNAL_MSG_SIZE <= PAGE_SIZE); + LASSERT (IBNAL_MSG_SIZE <= PAGE_SIZE); /* No fancy arithmetic when we do the buffer calculations */ - LASSERT (PAGE_SIZE % OPENIBNAL_MSG_SIZE == 0); + LASSERT (PAGE_SIZE % IBNAL_MSG_SIZE == 0); - rc = koibnal_alloc_pages(&koibnal_data.koib_tx_pages, - OPENIBNAL_TX_MSG_PAGES, - 0); /* local read access only */ + rc = kibnal_alloc_pages(&kibnal_data.kib_tx_pages, + IBNAL_TX_MSG_PAGES, + 0); /* local read access only */ if (rc != 0) return (rc); - vaddr = vaddr_base = koibnal_data.koib_tx_pages->oibp_vaddr; + vaddr = vaddr_base = kibnal_data.kib_tx_pages->ibp_vaddr; - for (i = 0; i < OPENIBNAL_TX_MSGS; i++) { - page = koibnal_data.koib_tx_pages->oibp_pages[ipage]; - tx = &koibnal_data.koib_tx_descs[i]; + for (i = 0; i < IBNAL_TX_MSGS; i++) { + page = kibnal_data.kib_tx_pages->ibp_pages[ipage]; + tx = &kibnal_data.kib_tx_descs[i]; memset (tx, 0, sizeof(*tx)); /* zero flags etc */ - tx->tx_msg = (koib_msg_t *)(((char *)page_address(page)) + page_offset); + tx->tx_msg = (kib_msg_t *)(((char *)page_address(page)) + page_offset); tx->tx_vaddr = vaddr; - tx->tx_isnblk = (i >= OPENIBNAL_NTX); - tx->tx_mapped = KOIB_TX_UNMAPPED; + tx->tx_isnblk = (i >= IBNAL_NTX); + tx->tx_mapped = KIB_TX_UNMAPPED; CDEBUG(D_NET, "Tx[%d] %p->%p - "LPX64"\n", i, tx, tx->tx_msg, tx->tx_vaddr); if (tx->tx_isnblk) list_add (&tx->tx_list, - &koibnal_data.koib_idle_nblk_txs); + &kibnal_data.kib_idle_nblk_txs); else list_add (&tx->tx_list, - &koibnal_data.koib_idle_txs); + &kibnal_data.kib_idle_txs); - vaddr += OPENIBNAL_MSG_SIZE; - LASSERT (vaddr <= vaddr_base + OPENIBNAL_TX_MSG_BYTES); + vaddr += IBNAL_MSG_SIZE; + LASSERT (vaddr <= vaddr_base + IBNAL_TX_MSG_BYTES); - page_offset += OPENIBNAL_MSG_SIZE; + page_offset += IBNAL_MSG_SIZE; LASSERT (page_offset <= PAGE_SIZE); if (page_offset == PAGE_SIZE) { page_offset = 0; ipage++; - LASSERT (ipage <= OPENIBNAL_TX_MSG_PAGES); + LASSERT (ipage <= IBNAL_TX_MSG_PAGES); } } @@ -1073,7 +1089,7 @@ koibnal_setup_tx_descs (void) } void -koibnal_api_shutdown (nal_t *nal) +kibnal_api_shutdown (nal_t *nal) { int i; int rc; @@ -1087,119 +1103,113 @@ koibnal_api_shutdown (nal_t *nal) CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n", atomic_read (&portal_kmemory)); - LASSERT(nal == &koibnal_api); + LASSERT(nal == &kibnal_api); - switch (koibnal_data.koib_init) { + switch (kibnal_data.kib_init) { default: - CERROR ("Unexpected state %d\n", 
koibnal_data.koib_init); + CERROR ("Unexpected state %d\n", kibnal_data.kib_init); LBUG(); - case OPENIBNAL_INIT_ALL: + case IBNAL_INIT_ALL: /* stop calls to nal_cmd */ libcfs_nal_cmd_unregister(OPENIBNAL); /* No new peers */ /* resetting my NID to unadvertises me, removes my * listener and nukes all current peers */ - koibnal_set_mynid (PTL_NID_ANY); + kibnal_set_mynid (PTL_NID_ANY); /* Wait for all peer state to clean up */ i = 2; - while (atomic_read (&koibnal_data.koib_npeers) != 0) { + while (atomic_read (&kibnal_data.kib_npeers) != 0) { i++; CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */ "waiting for %d peers to close down\n", - atomic_read (&koibnal_data.koib_npeers)); + atomic_read (&kibnal_data.kib_npeers)); set_current_state (TASK_INTERRUPTIBLE); schedule_timeout (HZ); } /* fall through */ - case OPENIBNAL_INIT_TX_CQ: - rc = ib_cq_destroy (koibnal_data.koib_tx_cq); - if (rc != 0) - CERROR ("Destroy tx CQ error: %d\n", rc); - /* fall through */ - - case OPENIBNAL_INIT_RX_CQ: - rc = ib_cq_destroy (koibnal_data.koib_rx_cq); + case IBNAL_INIT_CQ: + rc = ib_cq_destroy (kibnal_data.kib_cq); if (rc != 0) - CERROR ("Destroy rx CQ error: %d\n", rc); + CERROR ("Destroy CQ error: %d\n", rc); /* fall through */ - case OPENIBNAL_INIT_TXD: - koibnal_free_pages (koibnal_data.koib_tx_pages); + case IBNAL_INIT_TXD: + kibnal_free_pages (kibnal_data.kib_tx_pages); /* fall through */ -#if OPENIBNAL_FMR - case OPENIBNAL_INIT_FMR: - rc = ib_fmr_pool_destroy (koibnal_data.koib_fmr_pool); +#if IBNAL_FMR + case IBNAL_INIT_FMR: + rc = ib_fmr_pool_destroy (kibnal_data.kib_fmr_pool); if (rc != 0) CERROR ("Destroy FMR pool error: %d\n", rc); /* fall through */ #endif - case OPENIBNAL_INIT_PD: - rc = ib_pd_destroy(koibnal_data.koib_pd); + case IBNAL_INIT_PD: + rc = ib_pd_destroy(kibnal_data.kib_pd); if (rc != 0) CERROR ("Destroy PD error: %d\n", rc); /* fall through */ - case OPENIBNAL_INIT_LIB: - lib_fini(&koibnal_lib); + case IBNAL_INIT_LIB: + lib_fini(&kibnal_lib); /* fall through */ - case OPENIBNAL_INIT_DATA: + case IBNAL_INIT_DATA: /* Module refcount only gets to zero when all peers * have been closed so all lists must be empty */ - LASSERT (atomic_read (&koibnal_data.koib_npeers) == 0); - LASSERT (koibnal_data.koib_peers != NULL); - for (i = 0; i < koibnal_data.koib_peer_hash_size; i++) { - LASSERT (list_empty (&koibnal_data.koib_peers[i])); + LASSERT (atomic_read (&kibnal_data.kib_npeers) == 0); + LASSERT (kibnal_data.kib_peers != NULL); + for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) { + LASSERT (list_empty (&kibnal_data.kib_peers[i])); } - LASSERT (atomic_read (&koibnal_data.koib_nconns) == 0); - LASSERT (list_empty (&koibnal_data.koib_sched_rxq)); - LASSERT (list_empty (&koibnal_data.koib_sched_txq)); - LASSERT (list_empty (&koibnal_data.koib_connd_conns)); - LASSERT (list_empty (&koibnal_data.koib_connd_peers)); + LASSERT (atomic_read (&kibnal_data.kib_nconns) == 0); + LASSERT (list_empty (&kibnal_data.kib_sched_rxq)); + LASSERT (list_empty (&kibnal_data.kib_sched_txq)); + LASSERT (list_empty (&kibnal_data.kib_connd_conns)); + LASSERT (list_empty (&kibnal_data.kib_connd_peers)); /* flag threads to terminate; wake and wait for them to die */ - koibnal_data.koib_shutdown = 1; - wake_up_all (&koibnal_data.koib_sched_waitq); - wake_up_all (&koibnal_data.koib_connd_waitq); + kibnal_data.kib_shutdown = 1; + wake_up_all (&kibnal_data.kib_sched_waitq); + wake_up_all (&kibnal_data.kib_connd_waitq); i = 2; - while (atomic_read (&koibnal_data.koib_nthreads) != 0) { + while 
(atomic_read (&kibnal_data.kib_nthreads) != 0) { i++; CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */ "Waiting for %d threads to terminate\n", - atomic_read (&koibnal_data.koib_nthreads)); + atomic_read (&kibnal_data.kib_nthreads)); set_current_state (TASK_INTERRUPTIBLE); schedule_timeout (HZ); } /* fall through */ - case OPENIBNAL_INIT_NOTHING: + case IBNAL_INIT_NOTHING: break; } - if (koibnal_data.koib_tx_descs != NULL) - PORTAL_FREE (koibnal_data.koib_tx_descs, - OPENIBNAL_TX_MSGS * sizeof(koib_tx_t)); + if (kibnal_data.kib_tx_descs != NULL) + PORTAL_FREE (kibnal_data.kib_tx_descs, + IBNAL_TX_MSGS * sizeof(kib_tx_t)); - if (koibnal_data.koib_peers != NULL) - PORTAL_FREE (koibnal_data.koib_peers, + if (kibnal_data.kib_peers != NULL) + PORTAL_FREE (kibnal_data.kib_peers, sizeof (struct list_head) * - koibnal_data.koib_peer_hash_size); + kibnal_data.kib_peer_hash_size); CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n", atomic_read (&portal_kmemory)); printk(KERN_INFO "Lustre: OpenIB NAL unloaded (final mem %d)\n", atomic_read(&portal_kmemory)); - koibnal_data.koib_init = OPENIBNAL_INIT_NOTHING; + kibnal_data.kib_init = IBNAL_INIT_NOTHING; } int -koibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid, +kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid, ptl_ni_limits_t *requested_limits, ptl_ni_limits_t *actual_limits) { @@ -1208,65 +1218,66 @@ koibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid, int rc; int i; - LASSERT (nal == &koibnal_api); + LASSERT (nal == &kibnal_api); if (nal->nal_refct != 0) { if (actual_limits != NULL) - *actual_limits = koibnal_lib.libnal_ni.ni_actual_limits; + *actual_limits = kibnal_lib.libnal_ni.ni_actual_limits; /* This module got the first ref */ PORTAL_MODULE_USE; return (PTL_OK); } - LASSERT (koibnal_data.koib_init == OPENIBNAL_INIT_NOTHING); + LASSERT (kibnal_data.kib_init == IBNAL_INIT_NOTHING); - memset (&koibnal_data, 0, sizeof (koibnal_data)); /* zero pointers, flags etc */ + memset (&kibnal_data, 0, sizeof (kibnal_data)); /* zero pointers, flags etc */ - init_MUTEX (&koibnal_data.koib_nid_mutex); - init_MUTEX_LOCKED (&koibnal_data.koib_nid_signal); - koibnal_data.koib_nid = PTL_NID_ANY; + init_MUTEX (&kibnal_data.kib_nid_mutex); + init_MUTEX_LOCKED (&kibnal_data.kib_nid_signal); + kibnal_data.kib_nid = PTL_NID_ANY; - rwlock_init(&koibnal_data.koib_global_lock); + rwlock_init(&kibnal_data.kib_global_lock); - koibnal_data.koib_peer_hash_size = OPENIBNAL_PEER_HASH_SIZE; - PORTAL_ALLOC (koibnal_data.koib_peers, - sizeof (struct list_head) * koibnal_data.koib_peer_hash_size); - if (koibnal_data.koib_peers == NULL) { + kibnal_data.kib_peer_hash_size = IBNAL_PEER_HASH_SIZE; + PORTAL_ALLOC (kibnal_data.kib_peers, + sizeof (struct list_head) * kibnal_data.kib_peer_hash_size); + if (kibnal_data.kib_peers == NULL) { goto failed; } - for (i = 0; i < koibnal_data.koib_peer_hash_size; i++) - INIT_LIST_HEAD(&koibnal_data.koib_peers[i]); - - spin_lock_init (&koibnal_data.koib_connd_lock); - INIT_LIST_HEAD (&koibnal_data.koib_connd_peers); - INIT_LIST_HEAD (&koibnal_data.koib_connd_conns); - init_waitqueue_head (&koibnal_data.koib_connd_waitq); - - spin_lock_init (&koibnal_data.koib_sched_lock); - INIT_LIST_HEAD (&koibnal_data.koib_sched_txq); - INIT_LIST_HEAD (&koibnal_data.koib_sched_rxq); - init_waitqueue_head (&koibnal_data.koib_sched_waitq); - - spin_lock_init (&koibnal_data.koib_tx_lock); - INIT_LIST_HEAD (&koibnal_data.koib_idle_txs); - INIT_LIST_HEAD (&koibnal_data.koib_idle_nblk_txs); - 
init_waitqueue_head(&koibnal_data.koib_idle_tx_waitq); - - PORTAL_ALLOC (koibnal_data.koib_tx_descs, - OPENIBNAL_TX_MSGS * sizeof(koib_tx_t)); - if (koibnal_data.koib_tx_descs == NULL) { + for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) + INIT_LIST_HEAD(&kibnal_data.kib_peers[i]); + + spin_lock_init (&kibnal_data.kib_connd_lock); + INIT_LIST_HEAD (&kibnal_data.kib_connd_peers); + INIT_LIST_HEAD (&kibnal_data.kib_connd_conns); + init_waitqueue_head (&kibnal_data.kib_connd_waitq); + + spin_lock_init (&kibnal_data.kib_sched_lock); + INIT_LIST_HEAD (&kibnal_data.kib_sched_txq); + INIT_LIST_HEAD (&kibnal_data.kib_sched_rxq); + init_waitqueue_head (&kibnal_data.kib_sched_waitq); + + spin_lock_init (&kibnal_data.kib_tx_lock); + INIT_LIST_HEAD (&kibnal_data.kib_idle_txs); + INIT_LIST_HEAD (&kibnal_data.kib_idle_nblk_txs); + init_waitqueue_head(&kibnal_data.kib_idle_tx_waitq); + + PORTAL_ALLOC (kibnal_data.kib_tx_descs, + IBNAL_TX_MSGS * sizeof(kib_tx_t)); + if (kibnal_data.kib_tx_descs == NULL) { CERROR ("Can't allocate tx descs\n"); goto failed; } /* lists/ptrs/locks initialised */ - koibnal_data.koib_init = OPENIBNAL_INIT_DATA; + kibnal_data.kib_init = IBNAL_INIT_DATA; /*****************************************************/ + process_id.pid = requested_pid; - process_id.nid = koibnal_data.koib_nid; + process_id.nid = kibnal_data.kib_nid; - rc = lib_init(&koibnal_lib, nal, process_id, + rc = lib_init(&kibnal_lib, nal, process_id, requested_limits, actual_limits); if (rc != PTL_OK) { CERROR("lib_init failed: error %d\n", rc); @@ -1274,11 +1285,11 @@ koibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid, } /* lib interface initialised */ - koibnal_data.koib_init = OPENIBNAL_INIT_LIB; + kibnal_data.kib_init = IBNAL_INIT_LIB; /*****************************************************/ - for (i = 0; i < OPENIBNAL_N_SCHED; i++) { - rc = koibnal_thread_start (koibnal_scheduler, (void *)i); + for (i = 0; i < IBNAL_N_SCHED; i++) { + rc = kibnal_thread_start (kibnal_scheduler, (void *)i); if (rc != 0) { CERROR("Can't spawn openibnal scheduler[%d]: %d\n", i, rc); @@ -1286,56 +1297,56 @@ koibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid, } } - rc = koibnal_thread_start (koibnal_connd, NULL); + rc = kibnal_thread_start (kibnal_connd, NULL); if (rc != 0) { CERROR ("Can't spawn openibnal connd: %d\n", rc); goto failed; } - koibnal_data.koib_device = ib_device_get_by_index(0); - if (koibnal_data.koib_device == NULL) { + kibnal_data.kib_device = ib_device_get_by_index(0); + if (kibnal_data.kib_device == NULL) { CERROR ("Can't open ib device 0\n"); goto failed; } - rc = ib_device_properties_get(koibnal_data.koib_device, - &koibnal_data.koib_device_props); + rc = ib_device_properties_get(kibnal_data.kib_device, + &kibnal_data.kib_device_props); if (rc != 0) { CERROR ("Can't get device props: %d\n", rc); goto failed; } CDEBUG(D_NET, "Max Initiator: %d Max Responder %d\n", - koibnal_data.koib_device_props.max_initiator_per_qp, - koibnal_data.koib_device_props.max_responder_per_qp); + kibnal_data.kib_device_props.max_initiator_per_qp, + kibnal_data.kib_device_props.max_responder_per_qp); - koibnal_data.koib_port = 0; + kibnal_data.kib_port = 0; for (i = 1; i <= 2; i++) { - rc = ib_port_properties_get(koibnal_data.koib_device, i, - &koibnal_data.koib_port_props); + rc = ib_port_properties_get(kibnal_data.kib_device, i, + &kibnal_data.kib_port_props); if (rc == 0) { - koibnal_data.koib_port = i; + kibnal_data.kib_port = i; break; } } - if (koibnal_data.koib_port == 0) { + if (kibnal_data.kib_port == 0) { 
CERROR ("Can't find a port\n"); goto failed; } - rc = ib_pd_create(koibnal_data.koib_device, - NULL, &koibnal_data.koib_pd); + rc = ib_pd_create(kibnal_data.kib_device, + NULL, &kibnal_data.kib_pd); if (rc != 0) { CERROR ("Can't create PD: %d\n", rc); goto failed; } /* flag PD initialised */ - koibnal_data.koib_init = OPENIBNAL_INIT_PD; + kibnal_data.kib_init = IBNAL_INIT_PD; /*****************************************************/ -#if OPENIBNAL_FMR +#if IBNAL_FMR { - const int pool_size = OPENIBNAL_NTX + OPENIBNAL_NTX_NBLK; + const int pool_size = IBNAL_NTX + IBNAL_NTX_NBLK; struct ib_fmr_pool_param params = { .max_pages_per_fmr = PTL_MTU/PAGE_SIZE, .access = (IB_ACCESS_LOCAL_WRITE | @@ -1347,8 +1358,8 @@ koibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid, .flush_arg = NULL, .cache = 1, }; - rc = ib_fmr_pool_create(koibnal_data.koib_pd, ¶ms, - &koibnal_data.koib_fmr_pool); + rc = ib_fmr_pool_create(kibnal_data.kib_pd, ¶ms, + &kibnal_data.kib_fmr_pool); if (rc != 0) { CERROR ("Can't create FMR pool size %d: %d\n", pool_size, rc); @@ -1357,84 +1368,56 @@ koibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid, } /* flag FMR pool initialised */ - koibnal_data.koib_init = OPENIBNAL_INIT_FMR; + kibnal_data.kib_init = IBNAL_INIT_FMR; #endif /*****************************************************/ - rc = koibnal_setup_tx_descs(); + rc = kibnal_setup_tx_descs(); if (rc != 0) { CERROR ("Can't register tx descs: %d\n", rc); goto failed; } /* flag TX descs initialised */ - koibnal_data.koib_init = OPENIBNAL_INIT_TXD; + kibnal_data.kib_init = IBNAL_INIT_TXD; /*****************************************************/ { struct ib_cq_callback callback = { - .context = OPENIBNAL_CALLBACK_CTXT, + .context = IBNAL_CALLBACK_CTXT, .policy = IB_CQ_PROVIDER_REARM, .function = { - .entry = koibnal_rx_callback, + .entry = kibnal_callback, }, .arg = NULL, }; - int nentries = OPENIBNAL_RX_CQ_ENTRIES; + int nentries = IBNAL_CQ_ENTRIES; - rc = ib_cq_create (koibnal_data.koib_device, + rc = ib_cq_create (kibnal_data.kib_device, &nentries, &callback, NULL, - &koibnal_data.koib_rx_cq); + &kibnal_data.kib_cq); if (rc != 0) { - CERROR ("Can't create RX CQ: %d\n", rc); + CERROR ("Can't create CQ: %d\n", rc); goto failed; } /* I only want solicited events */ - rc = ib_cq_request_notification(koibnal_data.koib_rx_cq, 1); + rc = ib_cq_request_notification(kibnal_data.kib_cq, 1); LASSERT (rc == 0); } - /* flag RX CQ initialised */ - koibnal_data.koib_init = OPENIBNAL_INIT_RX_CQ; - /*****************************************************/ - - { - struct ib_cq_callback callback = { - .context = OPENIBNAL_CALLBACK_CTXT, - .policy = IB_CQ_PROVIDER_REARM, - .function = { - .entry = koibnal_tx_callback, - }, - .arg = NULL, - }; - int nentries = OPENIBNAL_TX_CQ_ENTRIES; - - rc = ib_cq_create (koibnal_data.koib_device, - &nentries, &callback, NULL, - &koibnal_data.koib_tx_cq); - if (rc != 0) { - CERROR ("Can't create RX CQ: %d\n", rc); - goto failed; - } - - /* I only want solicited events */ - rc = ib_cq_request_notification(koibnal_data.koib_tx_cq, 1); - LASSERT (rc == 0); - } - - /* flag TX CQ initialised */ - koibnal_data.koib_init = OPENIBNAL_INIT_TX_CQ; + /* flag CQ initialised */ + kibnal_data.kib_init = IBNAL_INIT_CQ; /*****************************************************/ - rc = libcfs_nal_cmd_register(OPENIBNAL, &koibnal_cmd, NULL); + rc = libcfs_nal_cmd_register(OPENIBNAL, &kibnal_cmd, NULL); if (rc != 0) { CERROR ("Can't initialise command interface (rc = %d)\n", rc); goto failed; } /* flag everything initialised */ - 
koibnal_data.koib_init = OPENIBNAL_INIT_ALL; + kibnal_data.kib_init = IBNAL_INIT_ALL; /*****************************************************/ printk(KERN_INFO "Lustre: OpenIB NAL loaded " @@ -1443,44 +1426,44 @@ koibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid, return (PTL_OK); failed: - koibnal_api_shutdown (&koibnal_api); + kibnal_api_shutdown (&kibnal_api); return (PTL_FAIL); } void __exit -koibnal_module_fini (void) +kibnal_module_fini (void) { #ifdef CONFIG_SYSCTL - if (koibnal_tunables.koib_sysctl != NULL) - unregister_sysctl_table (koibnal_tunables.koib_sysctl); + if (kibnal_tunables.kib_sysctl != NULL) + unregister_sysctl_table (kibnal_tunables.kib_sysctl); #endif - PtlNIFini(koibnal_ni); + PtlNIFini(kibnal_ni); ptl_unregister_nal(OPENIBNAL); } int __init -koibnal_module_init (void) +kibnal_module_init (void) { int rc; /* the following must be sizeof(int) for proc_dointvec() */ - LASSERT(sizeof (koibnal_tunables.koib_io_timeout) == sizeof (int)); + LASSERT(sizeof (kibnal_tunables.kib_io_timeout) == sizeof (int)); - koibnal_api.nal_ni_init = koibnal_api_startup; - koibnal_api.nal_ni_fini = koibnal_api_shutdown; + kibnal_api.nal_ni_init = kibnal_api_startup; + kibnal_api.nal_ni_fini = kibnal_api_shutdown; /* Initialise dynamic tunables to defaults once only */ - koibnal_tunables.koib_io_timeout = OPENIBNAL_IO_TIMEOUT; + kibnal_tunables.kib_io_timeout = IBNAL_IO_TIMEOUT; - rc = ptl_register_nal(OPENIBNAL, &koibnal_api); + rc = ptl_register_nal(OPENIBNAL, &kibnal_api); if (rc != PTL_OK) { - CERROR("Can't register OPENIBNAL: %d\n", rc); + CERROR("Can't register IBNAL: %d\n", rc); return (-ENOMEM); /* or something... */ } /* Pure gateways want the NAL started up at module load time... */ - rc = PtlNIInit(OPENIBNAL, LUSTRE_SRV_PTL_PID, NULL, NULL, &koibnal_ni); + rc = PtlNIInit(OPENIBNAL, LUSTRE_SRV_PTL_PID, NULL, NULL, &kibnal_ni); if (rc != PTL_OK && rc != PTL_IFACE_DUP) { ptl_unregister_nal(OPENIBNAL); return (-ENODEV); @@ -1488,8 +1471,8 @@ koibnal_module_init (void) #ifdef CONFIG_SYSCTL /* Press on regardless even if registering sysctl doesn't work */ - koibnal_tunables.koib_sysctl = - register_sysctl_table (koibnal_top_ctl_table, 0); + kibnal_tunables.kib_sysctl = + register_sysctl_table (kibnal_top_ctl_table, 0); #endif return (0); } @@ -1498,6 +1481,6 @@ MODULE_AUTHOR("Cluster File Systems, Inc. "); MODULE_DESCRIPTION("Kernel OpenIB NAL v0.01"); MODULE_LICENSE("GPL"); -module_init(koibnal_module_init); -module_exit(koibnal_module_fini); +module_init(kibnal_module_init); +module_exit(kibnal_module_fini); diff --git a/lnet/klnds/openiblnd/openiblnd.h b/lnet/klnds/openiblnd/openiblnd.h index 301d3ae..f0610f2 100644 --- a/lnet/klnds/openiblnd/openiblnd.h +++ b/lnet/klnds/openiblnd/openiblnd.h @@ -48,7 +48,7 @@ #include #include -#define DEBUG_SUBSYSTEM S_OPENIBNAL +#define DEBUG_SUBSYSTEM S_IBNAL #include #include @@ -59,144 +59,140 @@ #include #include -#define OPENIBNAL_SERVICE_NAME "openibnal" +#define IBNAL_SERVICE_NAME "openibnal" #if CONFIG_SMP -# define OPENIBNAL_N_SCHED num_online_cpus() /* # schedulers */ +# define IBNAL_N_SCHED num_online_cpus() /* # schedulers */ #else -# define OPENIBNAL_N_SCHED 1 /* # schedulers */ +# define IBNAL_N_SCHED 1 /* # schedulers */ #endif -#define OPENIBNAL_MIN_RECONNECT_INTERVAL HZ /* first failed connection retry... */ -#define OPENIBNAL_MAX_RECONNECT_INTERVAL (60*HZ) /* ...exponentially increasing to this */ +#define IBNAL_MIN_RECONNECT_INTERVAL HZ /* first failed connection retry... 
*/ +#define IBNAL_MAX_RECONNECT_INTERVAL (60*HZ) /* ...exponentially increasing to this */ -#define OPENIBNAL_MSG_SIZE (4<<10) /* max size of queued messages (inc hdr) */ +#define IBNAL_MSG_SIZE (4<<10) /* max size of queued messages (inc hdr) */ -#define OPENIBNAL_MSG_QUEUE_SIZE 8 /* # messages in-flight */ -#define OPENIBNAL_CREDIT_HIGHWATER 6 /* when to eagerly return credits */ -#define OPENIBNAL_RETRY 7 /* # times to retry */ -#define OPENIBNAL_RNR_RETRY 7 /* */ -#define OPENIBNAL_CM_RETRY 7 /* # times to retry connection */ -#define OPENIBNAL_FLOW_CONTROL 1 -#define OPENIBNAL_RESPONDER_RESOURCES 8 +#define IBNAL_MSG_QUEUE_SIZE 8 /* # messages/RDMAs in-flight */ +#define IBNAL_CREDIT_HIGHWATER 6 /* when to eagerly return credits */ +#define IBNAL_RETRY 7 /* # times to retry */ +#define IBNAL_RNR_RETRY 7 /* */ +#define IBNAL_CM_RETRY 7 /* # times to retry connection */ +#define IBNAL_FLOW_CONTROL 1 +#define IBNAL_RESPONDER_RESOURCES 8 -#define OPENIBNAL_NTX 64 /* # tx descs */ -#define OPENIBNAL_NTX_NBLK 256 /* # reserved tx descs */ +#define IBNAL_NTX 64 /* # tx descs */ +#define IBNAL_NTX_NBLK 256 /* # reserved tx descs */ -#define OPENIBNAL_PEER_HASH_SIZE 101 /* # peer lists */ +#define IBNAL_PEER_HASH_SIZE 101 /* # peer lists */ -#define OPENIBNAL_RESCHED 100 /* # scheduler loops before reschedule */ +#define IBNAL_RESCHED 100 /* # scheduler loops before reschedule */ -#define OPENIBNAL_CONCURRENT_PEERS 1000 /* # nodes all talking at once to me */ +#define IBNAL_CONCURRENT_PEERS 1000 /* # nodes all talking at once to me */ /* default vals for runtime tunables */ -#define OPENIBNAL_IO_TIMEOUT 50 /* default comms timeout (seconds) */ +#define IBNAL_IO_TIMEOUT 50 /* default comms timeout (seconds) */ /************************/ /* derived constants... 
*/ /* TX messages (shared by all connections) */ -#define OPENIBNAL_TX_MSGS (OPENIBNAL_NTX + OPENIBNAL_NTX_NBLK) -#define OPENIBNAL_TX_MSG_BYTES (OPENIBNAL_TX_MSGS * OPENIBNAL_MSG_SIZE) -#define OPENIBNAL_TX_MSG_PAGES ((OPENIBNAL_TX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE) - -/* we may have up to 2 completions per transmit */ -#define OPENIBNAL_TX_CQ_ENTRIES (2*OPENIBNAL_TX_MSGS) +#define IBNAL_TX_MSGS (IBNAL_NTX + IBNAL_NTX_NBLK) +#define IBNAL_TX_MSG_BYTES (IBNAL_TX_MSGS * IBNAL_MSG_SIZE) +#define IBNAL_TX_MSG_PAGES ((IBNAL_TX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE) /* RX messages (per connection) */ -#define OPENIBNAL_RX_MSGS OPENIBNAL_MSG_QUEUE_SIZE -#define OPENIBNAL_RX_MSG_BYTES (OPENIBNAL_RX_MSGS * OPENIBNAL_MSG_SIZE) -#define OPENIBNAL_RX_MSG_PAGES ((OPENIBNAL_RX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE) +#define IBNAL_RX_MSGS IBNAL_MSG_QUEUE_SIZE +#define IBNAL_RX_MSG_BYTES (IBNAL_RX_MSGS * IBNAL_MSG_SIZE) +#define IBNAL_RX_MSG_PAGES ((IBNAL_RX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE) -/* 1 completion per receive, per connection */ -#define OPENIBNAL_RX_CQ_ENTRIES (OPENIBNAL_RX_MSGS * OPENIBNAL_CONCURRENT_PEERS) +/* we may have up to 2 completions per transmit + + 1 completion per receive, per connection */ +#define IBNAL_CQ_ENTRIES ((2*IBNAL_TX_MSGS) + \ + (IBNAL_RX_MSGS * IBNAL_CONCURRENT_PEERS)) -#define OPENIBNAL_RDMA_BASE 0x0eeb0000 -#define OPENIBNAL_FMR 1 -#define OPENIBNAL_CKSUM 0 -//#define OPENIBNAL_CALLBACK_CTXT IB_CQ_CALLBACK_PROCESS -#define OPENIBNAL_CALLBACK_CTXT IB_CQ_CALLBACK_INTERRUPT +#define IBNAL_RDMA_BASE 0x0eeb0000 +#define IBNAL_FMR 1 +#define IBNAL_CKSUM 0 +//#define IBNAL_CALLBACK_CTXT IB_CQ_CALLBACK_PROCESS +#define IBNAL_CALLBACK_CTXT IB_CQ_CALLBACK_INTERRUPT typedef struct { - int koib_io_timeout; /* comms timeout (seconds) */ - struct ctl_table_header *koib_sysctl; /* sysctl interface */ -} koib_tunables_t; + int kib_io_timeout; /* comms timeout (seconds) */ + struct ctl_table_header *kib_sysctl; /* sysctl interface */ +} kib_tunables_t; typedef struct { - int oibp_npages; /* # pages */ - int oibp_mapped; /* mapped? */ - __u64 oibp_vaddr; /* mapped region vaddr */ - __u32 oibp_lkey; /* mapped region lkey */ - __u32 oibp_rkey; /* mapped region rkey */ - struct ib_mr *oibp_handle; /* mapped region handle */ - struct page *oibp_pages[0]; -} koib_pages_t; + int ibp_npages; /* # pages */ + int ibp_mapped; /* mapped? */ + __u64 ibp_vaddr; /* mapped region vaddr */ + __u32 ibp_lkey; /* mapped region lkey */ + __u32 ibp_rkey; /* mapped region rkey */ + struct ib_mr *ibp_handle; /* mapped region handle */ + struct page *ibp_pages[0]; +} kib_pages_t; typedef struct { - int koib_init; /* initialisation state */ - __u64 koib_incarnation; /* which one am I */ - int koib_shutdown; /* shut down? 
*/ - atomic_t koib_nthreads; /* # live threads */ - - __u64 koib_cm_service_id; /* service number I listen on */ - ptl_nid_t koib_nid; /* my NID */ - struct semaphore koib_nid_mutex; /* serialise NID ops */ - struct semaphore koib_nid_signal; /* signal completion */ - - rwlock_t koib_global_lock; /* stabilize peer/conn ops */ - - struct list_head *koib_peers; /* hash table of all my known peers */ - int koib_peer_hash_size; /* size of koib_peers */ - atomic_t koib_npeers; /* # peers extant */ - atomic_t koib_nconns; /* # connections extant */ - - struct list_head koib_connd_conns; /* connections to progress */ - struct list_head koib_connd_peers; /* peers waiting for a connection */ - wait_queue_head_t koib_connd_waitq; /* connection daemons sleep here */ - unsigned long koib_connd_waketime; /* when connd will wake */ - spinlock_t koib_connd_lock; /* serialise */ - - wait_queue_head_t koib_sched_waitq; /* schedulers sleep here */ - struct list_head koib_sched_txq; /* tx requiring attention */ - struct list_head koib_sched_rxq; /* rx requiring attention */ - spinlock_t koib_sched_lock; /* serialise */ + int kib_init; /* initialisation state */ + __u64 kib_incarnation; /* which one am I */ + int kib_shutdown; /* shut down? */ + atomic_t kib_nthreads; /* # live threads */ + + __u64 kib_service_id; /* service number I listen on */ + ptl_nid_t kib_nid; /* my NID */ + struct semaphore kib_nid_mutex; /* serialise NID ops */ + struct semaphore kib_nid_signal; /* signal completion */ + + rwlock_t kib_global_lock; /* stabilize peer/conn ops */ + + struct list_head *kib_peers; /* hash table of all my known peers */ + int kib_peer_hash_size; /* size of kib_peers */ + atomic_t kib_npeers; /* # peers extant */ + atomic_t kib_nconns; /* # connections extant */ + + struct list_head kib_connd_conns; /* connections to progress */ + struct list_head kib_connd_peers; /* peers waiting for a connection */ + wait_queue_head_t kib_connd_waitq; /* connection daemons sleep here */ + unsigned long kib_connd_waketime; /* when connd will wake */ + spinlock_t kib_connd_lock; /* serialise */ + + wait_queue_head_t kib_sched_waitq; /* schedulers sleep here */ + struct list_head kib_sched_txq; /* tx requiring attention */ + struct list_head kib_sched_rxq; /* rx requiring attention */ + spinlock_t kib_sched_lock; /* serialise */ - struct koib_tx *koib_tx_descs; /* all the tx descriptors */ - koib_pages_t *koib_tx_pages; /* premapped tx msg pages */ - - struct list_head koib_idle_txs; /* idle tx descriptors */ - struct list_head koib_idle_nblk_txs; /* idle reserved tx descriptors */ - wait_queue_head_t koib_idle_tx_waitq; /* block here for tx descriptor */ - __u64 koib_next_tx_cookie; /* RDMA completion cookie */ - spinlock_t koib_tx_lock; /* serialise */ + struct kib_tx *kib_tx_descs; /* all the tx descriptors */ + kib_pages_t *kib_tx_pages; /* premapped tx msg pages */ + + struct list_head kib_idle_txs; /* idle tx descriptors */ + struct list_head kib_idle_nblk_txs; /* idle reserved tx descriptors */ + wait_queue_head_t kib_idle_tx_waitq; /* block here for tx descriptor */ + __u64 kib_next_tx_cookie; /* RDMA completion cookie */ + spinlock_t kib_tx_lock; /* serialise */ - struct ib_device *koib_device; /* "the" device */ - struct ib_device_properties koib_device_props; /* its properties */ - int koib_port; /* port on the device */ - struct ib_port_properties koib_port_props; /* its properties */ - struct ib_pd *koib_pd; /* protection domain */ -#if OPENIBNAL_FMR - struct ib_fmr_pool *koib_fmr_pool; /* fast memory region 
pool */ + struct ib_device *kib_device; /* "the" device */ + struct ib_device_properties kib_device_props; /* its properties */ + int kib_port; /* port on the device */ + struct ib_port_properties kib_port_props; /* its properties */ + struct ib_pd *kib_pd; /* protection domain */ +#if IBNAL_FMR + struct ib_fmr_pool *kib_fmr_pool; /* fast memory region pool */ #endif - struct ib_cq *koib_rx_cq; /* receive completion queue */ - struct ib_cq *koib_tx_cq; /* transmit completion queue */ - void *koib_listen_handle; /* where I listen for connections */ - struct ib_common_attrib_service koib_service; /* SM service */ + struct ib_cq *kib_cq; /* completion queue */ + void *kib_listen_handle; /* where I listen for connections */ -} koib_data_t; - -#define OPENIBNAL_INIT_NOTHING 0 -#define OPENIBNAL_INIT_DATA 1 -#define OPENIBNAL_INIT_LIB 2 -#define OPENIBNAL_INIT_PD 3 -#define OPENIBNAL_INIT_FMR 4 -#define OPENIBNAL_INIT_TXD 5 -#define OPENIBNAL_INIT_RX_CQ 6 -#define OPENIBNAL_INIT_TX_CQ 7 -#define OPENIBNAL_INIT_ALL 8 +} kib_data_t; + +#define IBNAL_INIT_NOTHING 0 +#define IBNAL_INIT_DATA 1 +#define IBNAL_INIT_LIB 2 +#define IBNAL_INIT_PD 3 +#define IBNAL_INIT_FMR 4 +#define IBNAL_INIT_TXD 5 +#define IBNAL_INIT_CQ 6 +#define IBNAL_INIT_ALL 7 /************************************************************************ * Wire message structs. @@ -214,125 +210,125 @@ typedef struct __u32 md_lkey; __u32 md_rkey; __u64 md_addr; -} koib_md_t; +} kib_md_t; typedef struct { __u32 rd_key; /* remote key */ __u32 rd_nob; /* # of bytes */ __u64 rd_addr; /* remote io vaddr */ -} koib_rdma_desc_t; +} kib_rdma_desc_t; typedef struct { - ptl_hdr_t oibim_hdr; /* portals header */ - char oibim_payload[0]; /* piggy-backed payload */ -} koib_immediate_msg_t; + ptl_hdr_t ibim_hdr; /* portals header */ + char ibim_payload[0]; /* piggy-backed payload */ +} kib_immediate_msg_t; typedef struct { - ptl_hdr_t oibrm_hdr; /* portals header */ - __u64 oibrm_cookie; /* opaque completion cookie */ - koib_rdma_desc_t oibrm_desc; /* where to suck/blow */ -} koib_rdma_msg_t; + ptl_hdr_t ibrm_hdr; /* portals header */ + __u64 ibrm_cookie; /* opaque completion cookie */ + kib_rdma_desc_t ibrm_desc; /* where to suck/blow */ +} kib_rdma_msg_t; typedef struct { - __u64 oibcm_cookie; /* opaque completion cookie */ - __u32 oibcm_status; /* completion status */ -} koib_completion_msg_t; + __u64 ibcm_cookie; /* opaque completion cookie */ + __u32 ibcm_status; /* completion status */ +} kib_completion_msg_t; typedef struct { - __u32 oibm_magic; /* I'm an openibnal message */ - __u16 oibm_version; /* this is my version number */ - __u8 oibm_type; /* msg type */ - __u8 oibm_credits; /* returned credits */ -#if OPENIBNAL_CKSUM - __u32 oibm_nob; - __u32 oibm_cksum; + __u32 ibm_magic; /* I'm an openibnal message */ + __u16 ibm_version; /* this is my version number */ + __u8 ibm_type; /* msg type */ + __u8 ibm_credits; /* returned credits */ +#if IBNAL_CKSUM + __u32 ibm_nob; + __u32 ibm_cksum; #endif union { - koib_immediate_msg_t immediate; - koib_rdma_msg_t rdma; - koib_completion_msg_t completion; - } oibm_u; -} koib_msg_t; - -#define OPENIBNAL_MSG_MAGIC 0x0be91b91 /* unique magic */ -#define OPENIBNAL_MSG_VERSION 1 /* current protocol version */ - -#define OPENIBNAL_MSG_NOOP 0xd0 /* nothing (just credits) */ -#define OPENIBNAL_MSG_IMMEDIATE 0xd1 /* portals hdr + payload */ -#define OPENIBNAL_MSG_PUT_RDMA 0xd2 /* portals PUT hdr + source rdma desc */ -#define OPENIBNAL_MSG_PUT_DONE 0xd3 /* signal PUT rdma completion */ -#define 
OPENIBNAL_MSG_GET_RDMA 0xd4 /* portals GET hdr + sink rdma desc */ -#define OPENIBNAL_MSG_GET_DONE 0xd5 /* signal GET rdma completion */ + kib_immediate_msg_t immediate; + kib_rdma_msg_t rdma; + kib_completion_msg_t completion; + } ibm_u; +} kib_msg_t; + +#define IBNAL_MSG_MAGIC 0x0be91b91 /* unique magic */ +#define IBNAL_MSG_VERSION 1 /* current protocol version */ + +#define IBNAL_MSG_NOOP 0xd0 /* nothing (just credits) */ +#define IBNAL_MSG_IMMEDIATE 0xd1 /* portals hdr + payload */ +#define IBNAL_MSG_PUT_RDMA 0xd2 /* portals PUT hdr + source rdma desc */ +#define IBNAL_MSG_PUT_DONE 0xd3 /* signal PUT rdma completion */ +#define IBNAL_MSG_GET_RDMA 0xd4 /* portals GET hdr + sink rdma desc */ +#define IBNAL_MSG_GET_DONE 0xd5 /* signal GET rdma completion */ /***********************************************************************/ -typedef struct koib_rx /* receive message */ +typedef struct kib_rx /* receive message */ { struct list_head rx_list; /* queue for attention */ - struct koib_conn *rx_conn; /* owning conn */ + struct kib_conn *rx_conn; /* owning conn */ int rx_rdma; /* RDMA completion posted? */ int rx_posted; /* posted? */ __u64 rx_vaddr; /* pre-mapped buffer (hca vaddr) */ - koib_msg_t *rx_msg; /* pre-mapped buffer (host vaddr) */ + kib_msg_t *rx_msg; /* pre-mapped buffer (host vaddr) */ struct ib_receive_param rx_sp; /* receive work item */ struct ib_gather_scatter rx_gl; /* and it's memory */ -} koib_rx_t; +} kib_rx_t; -typedef struct koib_tx /* transmit message */ +typedef struct kib_tx /* transmit message */ { struct list_head tx_list; /* queue on idle_txs ibc_tx_queue etc. */ int tx_isnblk; /* I'm reserved for non-blocking sends */ - struct koib_conn *tx_conn; /* owning conn */ + struct kib_conn *tx_conn; /* owning conn */ int tx_mapped; /* mapped for RDMA? */ int tx_sending; /* # tx callbacks outstanding */ int tx_status; /* completion status */ - int tx_passive_rdma; /* waiting for peer to RDMA? */ - int tx_passive_rdma_wait; /* on ibc_rdma_queue */ - unsigned long tx_passive_rdma_deadline; /* completion deadline */ + unsigned long tx_deadline; /* completion deadline */ + int tx_passive_rdma; /* peer sucks/blows */ + int tx_passive_rdma_wait; /* waiting for peer to complete */ __u64 tx_passive_rdma_cookie; /* completion cookie */ lib_msg_t *tx_libmsg[2]; /* lib msgs to finalize on completion */ - koib_md_t tx_md; /* RDMA mapping (active/passive) */ + kib_md_t tx_md; /* RDMA mapping (active/passive) */ __u64 tx_vaddr; /* pre-mapped buffer (hca vaddr) */ - koib_msg_t *tx_msg; /* pre-mapped buffer (host vaddr) */ + kib_msg_t *tx_msg; /* pre-mapped buffer (host vaddr) */ int tx_nsp; /* # send work items */ struct ib_send_param tx_sp[2]; /* send work items... 
*/ struct ib_gather_scatter tx_gl[2]; /* ...and their memory */ -} koib_tx_t; +} kib_tx_t; -#define KOIB_TX_UNMAPPED 0 -#define KOIB_TX_MAPPED 1 -#define KOIB_TX_MAPPED_FMR 2 +#define KIB_TX_UNMAPPED 0 +#define KIB_TX_MAPPED 1 +#define KIB_TX_MAPPED_FMR 2 -typedef struct koib_wire_connreq +typedef struct kib_wire_connreq { __u32 wcr_magic; /* I'm an openibnal connreq */ __u16 wcr_version; /* this is my version number */ __u16 wcr_queue_depth; /* this is my receive queue size */ __u64 wcr_nid; /* peer's NID */ __u64 wcr_incarnation; /* peer's incarnation */ -} koib_wire_connreq_t; +} kib_wire_connreq_t; -typedef struct koib_connreq +typedef struct kib_connreq { /* connection-in-progress */ - struct koib_conn *cr_conn; - koib_wire_connreq_t cr_wcr; + struct kib_conn *cr_conn; + kib_wire_connreq_t cr_wcr; __u64 cr_tid; struct ib_common_attrib_service cr_service; tTS_IB_GID cr_gid; struct ib_path_record cr_path; struct ib_cm_active_param cr_connparam; -} koib_connreq_t; +} kib_connreq_t; -typedef struct koib_conn +typedef struct kib_conn { - struct koib_peer *ibc_peer; /* owning peer */ + struct kib_peer *ibc_peer; /* owning peer */ struct list_head ibc_list; /* stash on peer's conn list */ __u64 ibc_incarnation; /* which instance of the peer */ atomic_t ibc_refcount; /* # users */ @@ -342,27 +338,27 @@ typedef struct koib_conn int ibc_credits; /* # credits I have */ int ibc_outstanding_credits; /* # credits to return */ struct list_head ibc_tx_queue; /* send queue */ - struct list_head ibc_rdma_queue; /* tx awaiting RDMA completion */ + struct list_head ibc_active_txs; /* active tx awaiting completion */ spinlock_t ibc_lock; /* serialise */ - koib_rx_t *ibc_rxs; /* the rx descs */ - koib_pages_t *ibc_rx_pages; /* premapped rx msg pages */ + kib_rx_t *ibc_rxs; /* the rx descs */ + kib_pages_t *ibc_rx_pages; /* premapped rx msg pages */ struct ib_qp *ibc_qp; /* queue pair */ __u32 ibc_qpn; /* queue pair number */ tTS_IB_CM_COMM_ID ibc_comm_id; /* connection ID? 
*/ - koib_connreq_t *ibc_connreq; /* connection request state */ -} koib_conn_t; + kib_connreq_t *ibc_connreq; /* connection request state */ +} kib_conn_t; -#define OPENIBNAL_CONN_INIT_NOTHING 0 /* initial state */ -#define OPENIBNAL_CONN_INIT_QP 1 /* ibc_qp set up */ -#define OPENIBNAL_CONN_CONNECTING 2 /* started to connect */ -#define OPENIBNAL_CONN_ESTABLISHED 3 /* connection established */ -#define OPENIBNAL_CONN_DEATHROW 4 /* waiting to be closed */ -#define OPENIBNAL_CONN_ZOMBIE 5 /* waiting to be freed */ +#define IBNAL_CONN_INIT_NOTHING 0 /* initial state */ +#define IBNAL_CONN_INIT_QP 1 /* ibc_qp set up */ +#define IBNAL_CONN_CONNECTING 2 /* started to connect */ +#define IBNAL_CONN_ESTABLISHED 3 /* connection established */ +#define IBNAL_CONN_DEATHROW 4 /* waiting to be closed */ +#define IBNAL_CONN_ZOMBIE 5 /* waiting to be freed */ -typedef struct koib_peer +typedef struct kib_peer { struct list_head ibp_list; /* stash on global peer list */ - struct list_head ibp_connd_list; /* schedule on koib_connd_peers */ + struct list_head ibp_connd_list; /* schedule on kib_connd_peers */ ptl_nid_t ibp_nid; /* who's on the other end(s) */ atomic_t ibp_refcount; /* # users */ int ibp_persistence; /* "known" peer refs */ @@ -371,30 +367,30 @@ typedef struct koib_peer int ibp_connecting; /* connecting+accepting */ unsigned long ibp_reconnect_time; /* when reconnect may be attempted */ unsigned long ibp_reconnect_interval; /* exponential backoff */ -} koib_peer_t; +} kib_peer_t; -extern lib_nal_t koibnal_lib; -extern koib_data_t koibnal_data; -extern koib_tunables_t koibnal_tunables; +extern lib_nal_t kibnal_lib; +extern kib_data_t kibnal_data; +extern kib_tunables_t kibnal_tunables; static inline struct list_head * -koibnal_nid2peerlist (ptl_nid_t nid) +kibnal_nid2peerlist (ptl_nid_t nid) { - unsigned int hash = ((unsigned int)nid) % koibnal_data.koib_peer_hash_size; + unsigned int hash = ((unsigned int)nid) % kibnal_data.kib_peer_hash_size; - return (&koibnal_data.koib_peers [hash]); + return (&kibnal_data.kib_peers [hash]); } static inline int -koibnal_peer_active(koib_peer_t *peer) +kibnal_peer_active(kib_peer_t *peer) { /* Am I in the peer hash table? 
*/ return (!list_empty(&peer->ibp_list)); } static inline void -koibnal_queue_tx_locked (koib_tx_t *tx, koib_conn_t *conn) +kibnal_queue_tx_locked (kib_tx_t *tx, kib_conn_t *conn) { /* CAVEAT EMPTOR: tx takes caller's ref on conn */ @@ -402,40 +398,41 @@ koibnal_queue_tx_locked (koib_tx_t *tx, koib_conn_t *conn) LASSERT (tx->tx_conn == NULL); /* only set here */ tx->tx_conn = conn; + tx->tx_deadline = jiffies + kibnal_tunables.kib_io_timeout * HZ; list_add_tail(&tx->tx_list, &conn->ibc_tx_queue); } -#define KOIBNAL_SERVICE_KEY_MASK (IB_SA_SERVICE_COMP_MASK_NAME | \ - IB_SA_SERVICE_COMP_MASK_DATA8_1 | \ - IB_SA_SERVICE_COMP_MASK_DATA8_2 | \ - IB_SA_SERVICE_COMP_MASK_DATA8_3 | \ - IB_SA_SERVICE_COMP_MASK_DATA8_4 | \ - IB_SA_SERVICE_COMP_MASK_DATA8_5 | \ - IB_SA_SERVICE_COMP_MASK_DATA8_6 | \ - IB_SA_SERVICE_COMP_MASK_DATA8_7 | \ - IB_SA_SERVICE_COMP_MASK_DATA8_8) +#define KIBNAL_SERVICE_KEY_MASK (IB_SA_SERVICE_COMP_MASK_NAME | \ + IB_SA_SERVICE_COMP_MASK_DATA8_1 | \ + IB_SA_SERVICE_COMP_MASK_DATA8_2 | \ + IB_SA_SERVICE_COMP_MASK_DATA8_3 | \ + IB_SA_SERVICE_COMP_MASK_DATA8_4 | \ + IB_SA_SERVICE_COMP_MASK_DATA8_5 | \ + IB_SA_SERVICE_COMP_MASK_DATA8_6 | \ + IB_SA_SERVICE_COMP_MASK_DATA8_7 | \ + IB_SA_SERVICE_COMP_MASK_DATA8_8) static inline __u64* -koibnal_service_nid_field(struct ib_common_attrib_service *srv) +kibnal_service_nid_field(struct ib_common_attrib_service *srv) { - /* must be consistent with KOIBNAL_SERVICE_KEY_MASK */ + /* must be consistent with KIBNAL_SERVICE_KEY_MASK */ return (__u64 *)srv->service_data8; } static inline void -koibnal_set_service_keys(struct ib_common_attrib_service *srv, ptl_nid_t nid) +kibnal_set_service_keys(struct ib_common_attrib_service *srv, ptl_nid_t nid) { - LASSERT (strlen (OPENIBNAL_SERVICE_NAME) < sizeof(srv->service_name)); + LASSERT (strlen (IBNAL_SERVICE_NAME) < sizeof(srv->service_name)); memset (srv->service_name, 0, sizeof(srv->service_name)); - strcpy (srv->service_name, OPENIBNAL_SERVICE_NAME); + strcpy (srv->service_name, IBNAL_SERVICE_NAME); - *koibnal_service_nid_field(srv) = cpu_to_le64(nid); + *kibnal_service_nid_field(srv) = cpu_to_le64(nid); } #if 0 static inline void -koibnal_show_rdma_attr (koib_conn_t *conn) +kibnal_show_rdma_attr (kib_conn_t *conn) { struct ib_qp_attribute qp_attr; int rc; @@ -457,7 +454,7 @@ koibnal_show_rdma_attr (koib_conn_t *conn) #if CONFIG_X86 static inline __u64 -koibnal_page2phys (struct page *p) +kibnal_page2phys (struct page *p) { __u64 page_number = p - mem_map; @@ -467,42 +464,69 @@ koibnal_page2phys (struct page *p) # error "no page->phys" #endif -extern koib_peer_t *koibnal_create_peer (ptl_nid_t nid); -extern void koibnal_put_peer (koib_peer_t *peer); -extern int koibnal_del_peer (ptl_nid_t nid, int single_share); -extern koib_peer_t *koibnal_find_peer_locked (ptl_nid_t nid); -extern void koibnal_unlink_peer_locked (koib_peer_t *peer); -extern int koibnal_close_stale_conns_locked (koib_peer_t *peer, +/* CAVEAT EMPTOR: + * We rely on tx/rx descriptor alignment to allow us to use the lowest bit + * of the work request id as a flag to determine if the completion is for a + * transmit or a receive. It seems that that the CQ entry's 'op' field + * isn't always set correctly on completions that occur after QP teardown. */ + +static inline __u64 +kibnal_ptr2wreqid (void *ptr, int isrx) +{ + unsigned long lptr = (unsigned long)ptr; + + LASSERT ((lptr & 1) == 0); + return (__u64)(lptr | (isrx ? 
1 : 0)); +} + +static inline void * +kibnal_wreqid2ptr (__u64 wreqid) +{ + return (void *)(((unsigned long)wreqid) & ~1UL); +} + +static inline int +kibnal_wreqid_is_rx (__u64 wreqid) +{ + return (wreqid & 1) != 0; +} + +extern kib_peer_t *kibnal_create_peer (ptl_nid_t nid); +extern void kibnal_put_peer (kib_peer_t *peer); +extern int kibnal_del_peer (ptl_nid_t nid, int single_share); +extern kib_peer_t *kibnal_find_peer_locked (ptl_nid_t nid); +extern void kibnal_unlink_peer_locked (kib_peer_t *peer); +extern int kibnal_close_stale_conns_locked (kib_peer_t *peer, __u64 incarnation); -extern koib_conn_t *koibnal_create_conn (void); -extern void koibnal_put_conn (koib_conn_t *conn); -extern void koibnal_destroy_conn (koib_conn_t *conn); -extern int koibnal_alloc_pages (koib_pages_t **pp, int npages, int access); -extern void koibnal_free_pages (koib_pages_t *p); +extern kib_conn_t *kibnal_create_conn (void); +extern void kibnal_put_conn (kib_conn_t *conn); +extern void kibnal_destroy_conn (kib_conn_t *conn); +extern int kibnal_alloc_pages (kib_pages_t **pp, int npages, int access); +extern void kibnal_free_pages (kib_pages_t *p); -extern void koibnal_check_sends (koib_conn_t *conn); +extern void kibnal_check_sends (kib_conn_t *conn); extern tTS_IB_CM_CALLBACK_RETURN -koibnal_conn_callback (tTS_IB_CM_EVENT event, tTS_IB_CM_COMM_ID cid, +kibnal_conn_callback (tTS_IB_CM_EVENT event, tTS_IB_CM_COMM_ID cid, void *param, void *arg); extern tTS_IB_CM_CALLBACK_RETURN -koibnal_passive_conn_callback (tTS_IB_CM_EVENT event, tTS_IB_CM_COMM_ID cid, +kibnal_passive_conn_callback (tTS_IB_CM_EVENT event, tTS_IB_CM_COMM_ID cid, void *param, void *arg); -extern void koibnal_close_conn_locked (koib_conn_t *conn, int error); -extern void koibnal_destroy_conn (koib_conn_t *conn); -extern int koibnal_thread_start (int (*fn)(void *arg), void *arg); -extern int koibnal_scheduler(void *arg); -extern int koibnal_connd (void *arg); -extern void koibnal_rx_callback (struct ib_cq *cq, struct ib_cq_entry *e, void *arg); -extern void koibnal_tx_callback (struct ib_cq *cq, struct ib_cq_entry *e, void *arg); -extern void koibnal_init_tx_msg (koib_tx_t *tx, int type, int body_nob); -extern int koibnal_close_conn (koib_conn_t *conn, int why); -extern void koibnal_start_active_rdma (int type, int status, - koib_rx_t *rx, lib_msg_t *libmsg, - unsigned int niov, - struct iovec *iov, ptl_kiov_t *kiov, - size_t offset, size_t nob); +extern void kibnal_close_conn_locked (kib_conn_t *conn, int error); +extern void kibnal_destroy_conn (kib_conn_t *conn); +extern int kibnal_thread_start (int (*fn)(void *arg), void *arg); +extern int kibnal_scheduler(void *arg); +extern int kibnal_connd (void *arg); +extern void kibnal_callback (struct ib_cq *cq, struct ib_cq_entry *e, void *arg); +extern void kibnal_init_tx_msg (kib_tx_t *tx, int type, int body_nob); +extern int kibnal_close_conn (kib_conn_t *conn, int why); +extern void kibnal_start_active_rdma (int type, int status, + kib_rx_t *rx, lib_msg_t *libmsg, + unsigned int niov, + struct iovec *iov, ptl_kiov_t *kiov, + size_t offset, size_t nob); + diff --git a/lnet/klnds/openiblnd/openiblnd_cb.c b/lnet/klnds/openiblnd/openiblnd_cb.c index 79bf37a..d774853 100644 --- a/lnet/klnds/openiblnd/openiblnd_cb.c +++ b/lnet/klnds/openiblnd/openiblnd_cb.c @@ -28,20 +28,20 @@ * */ void -koibnal_schedule_tx_done (koib_tx_t *tx) +kibnal_schedule_tx_done (kib_tx_t *tx) { unsigned long flags; - spin_lock_irqsave (&koibnal_data.koib_sched_lock, flags); + spin_lock_irqsave (&kibnal_data.kib_sched_lock, 
flags); - list_add_tail(&tx->tx_list, &koibnal_data.koib_sched_txq); - wake_up (&koibnal_data.koib_sched_waitq); + list_add_tail(&tx->tx_list, &kibnal_data.kib_sched_txq); + wake_up (&kibnal_data.kib_sched_waitq); - spin_unlock_irqrestore(&koibnal_data.koib_sched_lock, flags); + spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags); } void -koibnal_tx_done (koib_tx_t *tx) +kibnal_tx_done (kib_tx_t *tx) { ptl_err_t ptlrc = (tx->tx_status == 0) ? PTL_OK : PTL_FAIL; unsigned long flags; @@ -49,31 +49,31 @@ koibnal_tx_done (koib_tx_t *tx) int rc; LASSERT (tx->tx_sending == 0); /* mustn't be awaiting callback */ - LASSERT (!tx->tx_passive_rdma_wait); /* mustn't be on ibc_rdma_queue */ + LASSERT (!tx->tx_passive_rdma_wait); /* mustn't be awaiting RDMA */ switch (tx->tx_mapped) { default: LBUG(); - case KOIB_TX_UNMAPPED: + case KIB_TX_UNMAPPED: break; - case KOIB_TX_MAPPED: + case KIB_TX_MAPPED: if (in_interrupt()) { /* can't deregister memory in IRQ context... */ - koibnal_schedule_tx_done(tx); + kibnal_schedule_tx_done(tx); return; } rc = ib_memory_deregister(tx->tx_md.md_handle.mr); LASSERT (rc == 0); - tx->tx_mapped = KOIB_TX_UNMAPPED; + tx->tx_mapped = KIB_TX_UNMAPPED; break; -#if OPENIBNAL_FMR - case KOIB_TX_MAPPED_FMR: +#if IBNAL_FMR + case KIB_TX_MAPPED_FMR: if (in_interrupt() && tx->tx_status != 0) { /* can't flush FMRs in IRQ context... */ - koibnal_schedule_tx_done(tx); + kibnal_schedule_tx_done(tx); return; } @@ -81,8 +81,8 @@ koibnal_tx_done (koib_tx_t *tx) LASSERT (rc == 0); if (tx->tx_status != 0) - ib_fmr_pool_force_flush(koibnal_data.koib_fmr_pool); - tx->tx_mapped = KOIB_TX_UNMAPPED; + ib_fmr_pool_force_flush(kibnal_data.kib_fmr_pool); + tx->tx_mapped = KIB_TX_UNMAPPED; break; #endif } @@ -92,12 +92,12 @@ koibnal_tx_done (koib_tx_t *tx) if (tx->tx_libmsg[i] == NULL) continue; - lib_finalize (&koibnal_lib, NULL, tx->tx_libmsg[i], ptlrc); + lib_finalize (&kibnal_lib, NULL, tx->tx_libmsg[i], ptlrc); tx->tx_libmsg[i] = NULL; } if (tx->tx_conn != NULL) { - koibnal_put_conn (tx->tx_conn); + kibnal_put_conn (tx->tx_conn); tx->tx_conn = NULL; } @@ -105,52 +105,52 @@ koibnal_tx_done (koib_tx_t *tx) tx->tx_passive_rdma = 0; tx->tx_status = 0; - spin_lock_irqsave (&koibnal_data.koib_tx_lock, flags); + spin_lock_irqsave (&kibnal_data.kib_tx_lock, flags); if (tx->tx_isnblk) { - list_add_tail (&tx->tx_list, &koibnal_data.koib_idle_nblk_txs); + list_add_tail (&tx->tx_list, &kibnal_data.kib_idle_nblk_txs); } else { - list_add_tail (&tx->tx_list, &koibnal_data.koib_idle_txs); - wake_up (&koibnal_data.koib_idle_tx_waitq); + list_add_tail (&tx->tx_list, &kibnal_data.kib_idle_txs); + wake_up (&kibnal_data.kib_idle_tx_waitq); } - spin_unlock_irqrestore (&koibnal_data.koib_tx_lock, flags); + spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags); } -koib_tx_t * -koibnal_get_idle_tx (int may_block) +kib_tx_t * +kibnal_get_idle_tx (int may_block) { - unsigned long flags; - koib_tx_t *tx = NULL; + unsigned long flags; + kib_tx_t *tx = NULL; for (;;) { - spin_lock_irqsave (&koibnal_data.koib_tx_lock, flags); + spin_lock_irqsave (&kibnal_data.kib_tx_lock, flags); /* "normal" descriptor is free */ - if (!list_empty (&koibnal_data.koib_idle_txs)) { - tx = list_entry (koibnal_data.koib_idle_txs.next, - koib_tx_t, tx_list); + if (!list_empty (&kibnal_data.kib_idle_txs)) { + tx = list_entry (kibnal_data.kib_idle_txs.next, + kib_tx_t, tx_list); break; } if (!may_block) { /* may dip into reserve pool */ - if (list_empty (&koibnal_data.koib_idle_nblk_txs)) { + if (list_empty 
(&kibnal_data.kib_idle_nblk_txs)) { CERROR ("reserved tx desc pool exhausted\n"); break; } - tx = list_entry (koibnal_data.koib_idle_nblk_txs.next, - koib_tx_t, tx_list); + tx = list_entry (kibnal_data.kib_idle_nblk_txs.next, + kib_tx_t, tx_list); break; } /* block for idle tx */ - spin_unlock_irqrestore (&koibnal_data.koib_tx_lock, flags); + spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags); - wait_event (koibnal_data.koib_idle_tx_waitq, - !list_empty (&koibnal_data.koib_idle_txs) || - koibnal_data.koib_shutdown); + wait_event (kibnal_data.kib_idle_tx_waitq, + !list_empty (&kibnal_data.kib_idle_txs) || + kibnal_data.kib_shutdown); } if (tx != NULL) { @@ -159,9 +159,9 @@ koibnal_get_idle_tx (int may_block) /* Allocate a new passive RDMA completion cookie. It might * not be needed, but we've got a lock right now and we're * unlikely to wrap... */ - tx->tx_passive_rdma_cookie = koibnal_data.koib_next_tx_cookie++; + tx->tx_passive_rdma_cookie = kibnal_data.kib_next_tx_cookie++; - LASSERT (tx->tx_mapped == KOIB_TX_UNMAPPED); + LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED); LASSERT (tx->tx_nsp == 0); LASSERT (tx->tx_sending == 0); LASSERT (tx->tx_status == 0); @@ -172,15 +172,15 @@ koibnal_get_idle_tx (int may_block) LASSERT (tx->tx_libmsg[1] == NULL); } - spin_unlock_irqrestore (&koibnal_data.koib_tx_lock, flags); + spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags); return (tx); } int -koibnal_dist(lib_nal_t *nal, ptl_nid_t nid, unsigned long *dist) +kibnal_dist(lib_nal_t *nal, ptl_nid_t nid, unsigned long *dist) { - /* I would guess that if koibnal_get_peer (nid) == NULL, + /* I would guess that if kibnal_get_peer (nid) == NULL, and we're not routing, then 'nid' is very distant :) */ if ( nal->libnal_ni.ni_pid.nid == nid ) { *dist = 0; @@ -192,7 +192,7 @@ koibnal_dist(lib_nal_t *nal, ptl_nid_t nid, unsigned long *dist) } void -koibnal_complete_passive_rdma(koib_conn_t *conn, __u64 cookie, int status) +kibnal_complete_passive_rdma(kib_conn_t *conn, __u64 cookie, int status) { struct list_head *ttmp; unsigned long flags; @@ -200,30 +200,34 @@ koibnal_complete_passive_rdma(koib_conn_t *conn, __u64 cookie, int status) spin_lock_irqsave (&conn->ibc_lock, flags); - list_for_each (ttmp, &conn->ibc_rdma_queue) { - koib_tx_t *tx = list_entry(ttmp, koib_tx_t, tx_list); - - LASSERT (tx->tx_passive_rdma); - LASSERT (tx->tx_passive_rdma_wait); + list_for_each (ttmp, &conn->ibc_active_txs) { + kib_tx_t *tx = list_entry(ttmp, kib_tx_t, tx_list); - if (tx->tx_passive_rdma_cookie != cookie) - continue; + LASSERT (tx->tx_passive_rdma || + !tx->tx_passive_rdma_wait); - CDEBUG(D_NET, "Complete %p "LPD64"\n", tx, cookie); + LASSERT (tx->tx_passive_rdma_wait || + tx->tx_sending != 0); - list_del (&tx->tx_list); + if (!tx->tx_passive_rdma_wait || + tx->tx_passive_rdma_cookie != cookie) + continue; + + CDEBUG(D_NET, "Complete %p "LPD64": %d\n", tx, cookie, status); + tx->tx_status = status; tx->tx_passive_rdma_wait = 0; idle = (tx->tx_sending == 0); - tx->tx_status = status; + if (idle) + list_del (&tx->tx_list); spin_unlock_irqrestore (&conn->ibc_lock, flags); /* I could be racing with tx callbacks. 
It's whoever * _makes_ tx idle that frees it */ if (idle) - koibnal_tx_done (tx); + kibnal_tx_done (tx); return; } @@ -234,32 +238,32 @@ koibnal_complete_passive_rdma(koib_conn_t *conn, __u64 cookie, int status) } void -koibnal_post_rx (koib_rx_t *rx, int do_credits) +kibnal_post_rx (kib_rx_t *rx, int do_credits) { - koib_conn_t *conn = rx->rx_conn; + kib_conn_t *conn = rx->rx_conn; int rc; unsigned long flags; rx->rx_gl = (struct ib_gather_scatter) { .address = rx->rx_vaddr, - .length = OPENIBNAL_MSG_SIZE, - .key = conn->ibc_rx_pages->oibp_lkey, + .length = IBNAL_MSG_SIZE, + .key = conn->ibc_rx_pages->ibp_lkey, }; - + rx->rx_sp = (struct ib_receive_param) { - .work_request_id = (__u64)(unsigned long)rx, + .work_request_id = kibnal_ptr2wreqid(rx, 1), .scatter_list = &rx->rx_gl, .num_scatter_entries = 1, .device_specific = NULL, .signaled = 1, }; - LASSERT (conn->ibc_state >= OPENIBNAL_CONN_ESTABLISHED); + LASSERT (conn->ibc_state >= IBNAL_CONN_ESTABLISHED); LASSERT (!rx->rx_posted); rx->rx_posted = 1; mb(); - if (conn->ibc_state != OPENIBNAL_CONN_ESTABLISHED) + if (conn->ibc_state != IBNAL_CONN_ESTABLISHED) rc = -ECONNABORTED; else rc = ib_receive (conn->ibc_qp, &rx->rx_sp, 1); @@ -270,26 +274,26 @@ koibnal_post_rx (koib_rx_t *rx, int do_credits) conn->ibc_outstanding_credits++; spin_unlock_irqrestore(&conn->ibc_lock, flags); - koibnal_check_sends(conn); + kibnal_check_sends(conn); } return; } - if (conn->ibc_state == OPENIBNAL_CONN_ESTABLISHED) { + if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) { CERROR ("Error posting receive -> "LPX64": %d\n", conn->ibc_peer->ibp_nid, rc); - koibnal_close_conn (rx->rx_conn, rc); + kibnal_close_conn (rx->rx_conn, rc); } else { CDEBUG (D_NET, "Error posting receive -> "LPX64": %d\n", conn->ibc_peer->ibp_nid, rc); } /* Drop rx's ref */ - koibnal_put_conn (conn); + kibnal_put_conn (conn); } -#if OPENIBNAL_CKSUM -__u32 koibnal_cksum (void *ptr, int nob) +#if IBNAL_CKSUM +__u32 kibnal_cksum (void *ptr, int nob) { char *c = ptr; __u32 sum = 0; @@ -302,17 +306,17 @@ __u32 koibnal_cksum (void *ptr, int nob) #endif void -koibnal_rx_callback (struct ib_cq *cq, struct ib_cq_entry *e, void *arg) +kibnal_rx_callback (struct ib_cq_entry *e) { - koib_rx_t *rx = (koib_rx_t *)((unsigned long)e->work_request_id); - koib_msg_t *msg = rx->rx_msg; - koib_conn_t *conn = rx->rx_conn; + kib_rx_t *rx = (kib_rx_t *)kibnal_wreqid2ptr(e->work_request_id); + kib_msg_t *msg = rx->rx_msg; + kib_conn_t *conn = rx->rx_conn; int nob = e->bytes_transferred; - const int base_nob = offsetof(koib_msg_t, oibm_u); + const int base_nob = offsetof(kib_msg_t, ibm_u); int credits; int flipped; unsigned long flags; -#if OPENIBNAL_CKSUM +#if IBNAL_CKSUM __u32 msg_cksum; __u32 computed_cksum; #endif @@ -324,11 +328,11 @@ koibnal_rx_callback (struct ib_cq *cq, struct ib_cq_entry *e, void *arg) /* receives complete with error in any case after we've started * closing the QP */ - if (conn->ibc_state >= OPENIBNAL_CONN_DEATHROW) + if (conn->ibc_state >= IBNAL_CONN_DEATHROW) goto failed; /* We don't post receives until the conn is established */ - LASSERT (conn->ibc_state == OPENIBNAL_CONN_ESTABLISHED); + LASSERT (conn->ibc_state == IBNAL_CONN_ESTABLISHED); if (e->status != IB_COMPLETION_STATUS_SUCCESS) { CERROR("Rx from "LPX64" failed: %d\n", @@ -344,35 +348,35 @@ koibnal_rx_callback (struct ib_cq *cq, struct ib_cq_entry *e, void *arg) /* Receiver does any byte flipping if necessary... 
*/ - if (msg->oibm_magic == OPENIBNAL_MSG_MAGIC) { + if (msg->ibm_magic == IBNAL_MSG_MAGIC) { flipped = 0; } else { - if (msg->oibm_magic != __swab32(OPENIBNAL_MSG_MAGIC)) { + if (msg->ibm_magic != __swab32(IBNAL_MSG_MAGIC)) { CERROR ("Unrecognised magic: %08x from "LPX64"\n", - msg->oibm_magic, conn->ibc_peer->ibp_nid); + msg->ibm_magic, conn->ibc_peer->ibp_nid); goto failed; } flipped = 1; - __swab16s (&msg->oibm_version); - LASSERT (sizeof(msg->oibm_type) == 1); - LASSERT (sizeof(msg->oibm_credits) == 1); + __swab16s (&msg->ibm_version); + LASSERT (sizeof(msg->ibm_type) == 1); + LASSERT (sizeof(msg->ibm_credits) == 1); } - if (msg->oibm_version != OPENIBNAL_MSG_VERSION) { + if (msg->ibm_version != IBNAL_MSG_VERSION) { CERROR ("Incompatible msg version %d (%d expected)\n", - msg->oibm_version, OPENIBNAL_MSG_VERSION); + msg->ibm_version, IBNAL_MSG_VERSION); goto failed; } -#if OPENIBNAL_CKSUM - if (nob != msg->oibm_nob) { - CERROR ("Unexpected # bytes %d (%d expected)\n", nob, msg->oibm_nob); +#if IBNAL_CKSUM + if (nob != msg->ibm_nob) { + CERROR ("Unexpected # bytes %d (%d expected)\n", nob, msg->ibm_nob); goto failed; } - msg_cksum = le32_to_cpu(msg->oibm_cksum); - msg->oibm_cksum = 0; - computed_cksum = koibnal_cksum (msg, nob); + msg_cksum = le32_to_cpu(msg->ibm_cksum); + msg->ibm_cksum = 0; + computed_cksum = kibnal_cksum (msg, nob); if (msg_cksum != computed_cksum) { CERROR ("Checksum failure %d: (%d expected)\n", @@ -383,101 +387,101 @@ koibnal_rx_callback (struct ib_cq *cq, struct ib_cq_entry *e, void *arg) #endif /* Have I received credits that will let me send? */ - credits = msg->oibm_credits; + credits = msg->ibm_credits; if (credits != 0) { spin_lock_irqsave(&conn->ibc_lock, flags); conn->ibc_credits += credits; spin_unlock_irqrestore(&conn->ibc_lock, flags); - koibnal_check_sends(conn); + kibnal_check_sends(conn); } - switch (msg->oibm_type) { - case OPENIBNAL_MSG_NOOP: - koibnal_post_rx (rx, 1); + switch (msg->ibm_type) { + case IBNAL_MSG_NOOP: + kibnal_post_rx (rx, 1); return; - case OPENIBNAL_MSG_IMMEDIATE: - if (nob < base_nob + sizeof (koib_immediate_msg_t)) { + case IBNAL_MSG_IMMEDIATE: + if (nob < base_nob + sizeof (kib_immediate_msg_t)) { CERROR ("Short IMMEDIATE from "LPX64": %d\n", conn->ibc_peer->ibp_nid, nob); goto failed; } break; - case OPENIBNAL_MSG_PUT_RDMA: - case OPENIBNAL_MSG_GET_RDMA: - if (nob < base_nob + sizeof (koib_rdma_msg_t)) { + case IBNAL_MSG_PUT_RDMA: + case IBNAL_MSG_GET_RDMA: + if (nob < base_nob + sizeof (kib_rdma_msg_t)) { CERROR ("Short RDMA msg from "LPX64": %d\n", conn->ibc_peer->ibp_nid, nob); goto failed; } if (flipped) { - __swab32s(&msg->oibm_u.rdma.oibrm_desc.rd_key); - __swab32s(&msg->oibm_u.rdma.oibrm_desc.rd_nob); - __swab64s(&msg->oibm_u.rdma.oibrm_desc.rd_addr); + __swab32s(&msg->ibm_u.rdma.ibrm_desc.rd_key); + __swab32s(&msg->ibm_u.rdma.ibrm_desc.rd_nob); + __swab64s(&msg->ibm_u.rdma.ibrm_desc.rd_addr); } CDEBUG(D_NET, "%d RDMA: cookie "LPX64", key %x, addr "LPX64", nob %d\n", - msg->oibm_type, msg->oibm_u.rdma.oibrm_cookie, - msg->oibm_u.rdma.oibrm_desc.rd_key, - msg->oibm_u.rdma.oibrm_desc.rd_addr, - msg->oibm_u.rdma.oibrm_desc.rd_nob); + msg->ibm_type, msg->ibm_u.rdma.ibrm_cookie, + msg->ibm_u.rdma.ibrm_desc.rd_key, + msg->ibm_u.rdma.ibrm_desc.rd_addr, + msg->ibm_u.rdma.ibrm_desc.rd_nob); break; - case OPENIBNAL_MSG_PUT_DONE: - case OPENIBNAL_MSG_GET_DONE: - if (nob < base_nob + sizeof (koib_completion_msg_t)) { + case IBNAL_MSG_PUT_DONE: + case IBNAL_MSG_GET_DONE: + if (nob < base_nob + sizeof (kib_completion_msg_t)) { 
CERROR ("Short COMPLETION msg from "LPX64": %d\n", conn->ibc_peer->ibp_nid, nob); goto failed; } if (flipped) - __swab32s(&msg->oibm_u.completion.oibcm_status); + __swab32s(&msg->ibm_u.completion.ibcm_status); CDEBUG(D_NET, "%d DONE: cookie "LPX64", status %d\n", - msg->oibm_type, msg->oibm_u.completion.oibcm_cookie, - msg->oibm_u.completion.oibcm_status); + msg->ibm_type, msg->ibm_u.completion.ibcm_cookie, + msg->ibm_u.completion.ibcm_status); - koibnal_complete_passive_rdma (conn, - msg->oibm_u.completion.oibcm_cookie, - msg->oibm_u.completion.oibcm_status); - koibnal_post_rx (rx, 1); + kibnal_complete_passive_rdma (conn, + msg->ibm_u.completion.ibcm_cookie, + msg->ibm_u.completion.ibcm_status); + kibnal_post_rx (rx, 1); return; default: CERROR ("Can't parse type from "LPX64": %d\n", - conn->ibc_peer->ibp_nid, msg->oibm_type); + conn->ibc_peer->ibp_nid, msg->ibm_type); goto failed; } - /* schedule for koibnal_rx() in thread context */ - spin_lock_irqsave(&koibnal_data.koib_sched_lock, flags); + /* schedule for kibnal_rx() in thread context */ + spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags); - list_add_tail (&rx->rx_list, &koibnal_data.koib_sched_rxq); - wake_up (&koibnal_data.koib_sched_waitq); + list_add_tail (&rx->rx_list, &kibnal_data.kib_sched_rxq); + wake_up (&kibnal_data.kib_sched_waitq); - spin_unlock_irqrestore(&koibnal_data.koib_sched_lock, flags); + spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags); return; failed: CDEBUG(D_NET, "rx %p conn %p\n", rx, conn); - koibnal_close_conn(conn, -ECONNABORTED); + kibnal_close_conn(conn, -ECONNABORTED); /* Don't re-post rx & drop its ref on conn */ - koibnal_put_conn(conn); + kibnal_put_conn(conn); } void -koibnal_rx (koib_rx_t *rx) +kibnal_rx (kib_rx_t *rx) { - koib_msg_t *msg = rx->rx_msg; + kib_msg_t *msg = rx->rx_msg; /* Clear flag so I can detect if I've sent an RDMA completion */ rx->rx_rdma = 0; - switch (msg->oibm_type) { - case OPENIBNAL_MSG_GET_RDMA: - lib_parse(&koibnal_lib, &msg->oibm_u.rdma.oibrm_hdr, rx); + switch (msg->ibm_type) { + case IBNAL_MSG_GET_RDMA: + lib_parse(&kibnal_lib, &msg->ibm_u.rdma.ibrm_hdr, rx); /* If the incoming get was matched, I'll have initiated the * RDMA and the completion message... */ if (rx->rx_rdma) @@ -487,12 +491,12 @@ koibnal_rx (koib_rx_t *rx) * the peer's GET blocking for the full timeout. 
*/ CERROR ("Completing unmatched RDMA GET from "LPX64"\n", rx->rx_conn->ibc_peer->ibp_nid); - koibnal_start_active_rdma (OPENIBNAL_MSG_GET_DONE, -EIO, - rx, NULL, 0, NULL, NULL, 0, 0); + kibnal_start_active_rdma (IBNAL_MSG_GET_DONE, -EIO, + rx, NULL, 0, NULL, NULL, 0, 0); break; - case OPENIBNAL_MSG_PUT_RDMA: - lib_parse(&koibnal_lib, &msg->oibm_u.rdma.oibrm_hdr, rx); + case IBNAL_MSG_PUT_RDMA: + lib_parse(&kibnal_lib, &msg->ibm_u.rdma.ibrm_hdr, rx); if (rx->rx_rdma) break; /* This is most unusual, since even if lib_parse() didn't @@ -505,8 +509,8 @@ koibnal_rx (koib_rx_t *rx) rx->rx_conn->ibc_peer->ibp_nid); break; - case OPENIBNAL_MSG_IMMEDIATE: - lib_parse(&koibnal_lib, &msg->oibm_u.immediate.oibim_hdr, rx); + case IBNAL_MSG_IMMEDIATE: + lib_parse(&kibnal_lib, &msg->ibm_u.immediate.ibim_hdr, rx); LASSERT (!rx->rx_rdma); break; @@ -515,12 +519,12 @@ koibnal_rx (koib_rx_t *rx) break; } - koibnal_post_rx (rx, 1); + kibnal_post_rx (rx, 1); } #if 0 int -koibnal_kvaddr_to_phys (unsigned long vaddr, __u64 *physp) +kibnal_kvaddr_to_phys (unsigned long vaddr, __u64 *physp) { struct page *page; @@ -531,7 +535,7 @@ koibnal_kvaddr_to_phys (unsigned long vaddr, __u64 *physp) else if (vaddr >= PKMAP_BASE && vaddr < (PKMAP_BASE + LAST_PKMAP * PAGE_SIZE)) page = vmalloc_to_page ((void *)vaddr); - /* in 2.4 ^ just walks the page tables */ + /* in 2.4 ^ just walks the page tables */ #endif else page = virt_to_page (vaddr); @@ -540,13 +544,13 @@ koibnal_kvaddr_to_phys (unsigned long vaddr, __u64 *physp) !VALID_PAGE (page)) return (-EFAULT); - *physp = koibnal_page2phys(page) + (vaddr & (PAGE_SIZE - 1)); + *physp = kibnal_page2phys(page) + (vaddr & (PAGE_SIZE - 1)); return (0); } #endif int -koibnal_map_iov (koib_tx_t *tx, enum ib_memory_access access, +kibnal_map_iov (kib_tx_t *tx, enum ib_memory_access access, int niov, struct iovec *iov, int offset, int nob) { @@ -555,7 +559,7 @@ koibnal_map_iov (koib_tx_t *tx, enum ib_memory_access access, LASSERT (nob > 0); LASSERT (niov > 0); - LASSERT (tx->tx_mapped == KOIB_TX_UNMAPPED); + LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED); while (offset >= iov->iov_len) { offset -= iov->iov_len; @@ -572,7 +576,7 @@ koibnal_map_iov (koib_tx_t *tx, enum ib_memory_access access, vaddr = (void *)(((unsigned long)iov->iov_base) + offset); tx->tx_md.md_addr = (__u64)((unsigned long)vaddr); - rc = ib_memory_register (koibnal_data.koib_pd, + rc = ib_memory_register (kibnal_data.kib_pd, vaddr, nob, access, &tx->tx_md.md_handle.mr, @@ -584,21 +588,21 @@ koibnal_map_iov (koib_tx_t *tx, enum ib_memory_access access, return (rc); } - tx->tx_mapped = KOIB_TX_MAPPED; + tx->tx_mapped = KIB_TX_MAPPED; return (0); } int -koibnal_map_kiov (koib_tx_t *tx, enum ib_memory_access access, +kibnal_map_kiov (kib_tx_t *tx, enum ib_memory_access access, int nkiov, ptl_kiov_t *kiov, int offset, int nob) { -#if OPENIBNAL_FMR +#if IBNAL_FMR __u64 *phys; - const int mapped = KOIB_TX_MAPPED_FMR; + const int mapped = KIB_TX_MAPPED_FMR; #else struct ib_physical_buffer *phys; - const int mapped = KOIB_TX_MAPPED; + const int mapped = KIB_TX_MAPPED; #endif int page_offset; int nphys; @@ -610,7 +614,7 @@ koibnal_map_kiov (koib_tx_t *tx, enum ib_memory_access access, LASSERT (nob > 0); LASSERT (nkiov > 0); - LASSERT (tx->tx_mapped == KOIB_TX_UNMAPPED); + LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED); while (offset >= kiov->kiov_len) { offset -= kiov->kiov_len; @@ -627,10 +631,10 @@ koibnal_map_kiov (koib_tx_t *tx, enum ib_memory_access access, } page_offset = kiov->kiov_offset + offset; -#if OPENIBNAL_FMR - phys[0] 
= koibnal_page2phys(kiov->kiov_page); +#if IBNAL_FMR + phys[0] = kibnal_page2phys(kiov->kiov_page); #else - phys[0].address = koibnal_page2phys(kiov->kiov_page); + phys[0].address = kibnal_page2phys(kiov->kiov_page); phys[0].size = PAGE_SIZE; #endif nphys = 1; @@ -667,10 +671,10 @@ koibnal_map_kiov (koib_tx_t *tx, enum ib_memory_access access, } LASSERT (nphys * sizeof (*phys) < phys_size); -#if OPENIBNAL_FMR - phys[nphys] = koibnal_page2phys(kiov->kiov_page); +#if IBNAL_FMR + phys[nphys] = kibnal_page2phys(kiov->kiov_page); #else - phys[nphys].address = koibnal_page2phys(kiov->kiov_page); + phys[nphys].address = kibnal_page2phys(kiov->kiov_page); phys[nphys].size = PAGE_SIZE; #endif nphys++; @@ -683,10 +687,10 @@ koibnal_map_kiov (koib_tx_t *tx, enum ib_memory_access access, for (rc = 0; rc < nphys; rc++) CWARN (" [%d] "LPX64" / %d\n", rc, phys[rc].address, phys[rc].size); #endif - tx->tx_md.md_addr = OPENIBNAL_RDMA_BASE; + tx->tx_md.md_addr = IBNAL_RDMA_BASE; -#if OPENIBNAL_FMR - rc = ib_fmr_register_physical (koibnal_data.koib_fmr_pool, +#if IBNAL_FMR + rc = ib_fmr_register_physical (kibnal_data.kib_fmr_pool, phys, nphys, &tx->tx_md.md_addr, page_offset, @@ -694,7 +698,7 @@ koibnal_map_kiov (koib_tx_t *tx, enum ib_memory_access access, &tx->tx_md.md_lkey, &tx->tx_md.md_rkey); #else - rc = ib_memory_register_physical (koibnal_data.koib_pd, + rc = ib_memory_register_physical (kibnal_data.kib_pd, phys, nphys, &tx->tx_md.md_addr, nob, page_offset, @@ -717,24 +721,24 @@ koibnal_map_kiov (koib_tx_t *tx, enum ib_memory_access access, return (rc); } -koib_conn_t * -koibnal_find_conn_locked (koib_peer_t *peer) +kib_conn_t * +kibnal_find_conn_locked (kib_peer_t *peer) { struct list_head *tmp; /* just return the first connection */ list_for_each (tmp, &peer->ibp_conns) { - return (list_entry(tmp, koib_conn_t, ibc_list)); + return (list_entry(tmp, kib_conn_t, ibc_list)); } return (NULL); } void -koibnal_check_sends (koib_conn_t *conn) +kibnal_check_sends (kib_conn_t *conn) { unsigned long flags; - koib_tx_t *tx; + kib_tx_t *tx; int rc; int i; int done; @@ -742,39 +746,39 @@ koibnal_check_sends (koib_conn_t *conn) spin_lock_irqsave (&conn->ibc_lock, flags); + LASSERT (conn->ibc_nsends_posted <= IBNAL_MSG_QUEUE_SIZE); + if (list_empty(&conn->ibc_tx_queue) && - conn->ibc_outstanding_credits >= OPENIBNAL_CREDIT_HIGHWATER) { + conn->ibc_outstanding_credits >= IBNAL_CREDIT_HIGHWATER) { spin_unlock_irqrestore(&conn->ibc_lock, flags); - - tx = koibnal_get_idle_tx(0); /* don't block */ + + tx = kibnal_get_idle_tx(0); /* don't block */ if (tx != NULL) - koibnal_init_tx_msg(tx, OPENIBNAL_MSG_NOOP, 0); + kibnal_init_tx_msg(tx, IBNAL_MSG_NOOP, 0); spin_lock_irqsave(&conn->ibc_lock, flags); - + if (tx != NULL) { atomic_inc(&conn->ibc_refcount); - koibnal_queue_tx_locked(tx, conn); + kibnal_queue_tx_locked(tx, conn); } } - LASSERT (conn->ibc_nsends_posted <= OPENIBNAL_MSG_QUEUE_SIZE); - while (!list_empty (&conn->ibc_tx_queue)) { - tx = list_entry (conn->ibc_tx_queue.next, koib_tx_t, tx_list); + tx = list_entry (conn->ibc_tx_queue.next, kib_tx_t, tx_list); /* We rely on this for QP sizing */ LASSERT (tx->tx_nsp > 0 && tx->tx_nsp <= 2); LASSERT (conn->ibc_outstanding_credits >= 0); - LASSERT (conn->ibc_outstanding_credits <= OPENIBNAL_MSG_QUEUE_SIZE); + LASSERT (conn->ibc_outstanding_credits <= IBNAL_MSG_QUEUE_SIZE); LASSERT (conn->ibc_credits >= 0); - LASSERT (conn->ibc_credits <= OPENIBNAL_MSG_QUEUE_SIZE); + LASSERT (conn->ibc_credits <= IBNAL_MSG_QUEUE_SIZE); /* Not on ibc_rdma_queue */ LASSERT 
(!tx->tx_passive_rdma_wait); - if (conn->ibc_nsends_posted == OPENIBNAL_MSG_QUEUE_SIZE) + if (conn->ibc_nsends_posted == IBNAL_MSG_QUEUE_SIZE) break; if (conn->ibc_credits == 0) /* no credits */ @@ -786,37 +790,29 @@ koibnal_check_sends (koib_conn_t *conn) list_del (&tx->tx_list); - if (tx->tx_msg->oibm_type == OPENIBNAL_MSG_NOOP && + if (tx->tx_msg->ibm_type == IBNAL_MSG_NOOP && (!list_empty(&conn->ibc_tx_queue) || - conn->ibc_outstanding_credits < OPENIBNAL_CREDIT_HIGHWATER)) { - /* Redundant NOOP */ + conn->ibc_outstanding_credits < IBNAL_CREDIT_HIGHWATER)) { + /* redundant NOOP */ spin_unlock_irqrestore(&conn->ibc_lock, flags); - koibnal_tx_done(tx); + kibnal_tx_done(tx); spin_lock_irqsave(&conn->ibc_lock, flags); continue; } - - /* incoming RDMA completion can find this one now */ - if (tx->tx_passive_rdma) { - list_add (&tx->tx_list, &conn->ibc_rdma_queue); - tx->tx_passive_rdma_wait = 1; - tx->tx_passive_rdma_deadline = - jiffies + koibnal_tunables.koib_io_timeout * HZ; - } - tx->tx_msg->oibm_credits = conn->ibc_outstanding_credits; + tx->tx_msg->ibm_credits = conn->ibc_outstanding_credits; conn->ibc_outstanding_credits = 0; - /* use the free memory barrier when we unlock to ensure - * sending set before we can get the tx callback. */ conn->ibc_nsends_posted++; conn->ibc_credits--; - tx->tx_sending = tx->tx_nsp; -#if OPENIBNAL_CKSUM - tx->tx_msg->oibm_cksum = 0; - tx->tx_msg->oibm_cksum = koibnal_cksum(tx->tx_msg, tx->tx_msg->oibm_nob); - CDEBUG(D_NET, "cksum %x, nob %d\n", tx->tx_msg->oibm_cksum, tx->tx_msg->oibm_nob); + tx->tx_sending = tx->tx_nsp; + tx->tx_passive_rdma_wait = tx->tx_passive_rdma; + list_add (&tx->tx_list, &conn->ibc_active_txs); +#if IBNAL_CKSUM + tx->tx_msg->ibm_cksum = 0; + tx->tx_msg->ibm_cksum = kibnal_cksum(tx->tx_msg, tx->tx_msg->ibm_nob); + CDEBUG(D_NET, "cksum %x, nob %d\n", tx->tx_msg->ibm_cksum, tx->tx_msg->ibm_nob); #endif spin_unlock_irqrestore (&conn->ibc_lock, flags); @@ -827,7 +823,7 @@ koibnal_check_sends (koib_conn_t *conn) rc = -ECONNABORTED; nwork = 0; - if (conn->ibc_state == OPENIBNAL_CONN_ESTABLISHED) { + if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) { tx->tx_status = 0; /* Driver only accepts 1 item at a time */ for (i = 0; i < tx->tx_nsp; i++) { @@ -842,31 +838,31 @@ koibnal_check_sends (koib_conn_t *conn) if (rc != 0) { /* NB credits are transferred in the actual * message, which can only be the last work item */ - conn->ibc_outstanding_credits += tx->tx_msg->oibm_credits; + conn->ibc_outstanding_credits += tx->tx_msg->ibm_credits; conn->ibc_credits++; conn->ibc_nsends_posted--; - tx->tx_sending -= tx->tx_nsp - nwork; + tx->tx_status = rc; + tx->tx_passive_rdma_wait = 0; + tx->tx_sending -= tx->tx_nsp - nwork; + done = (tx->tx_sending == 0); - - if (tx->tx_passive_rdma) { - tx->tx_passive_rdma_wait = 0; + if (done) list_del (&tx->tx_list); - } spin_unlock_irqrestore (&conn->ibc_lock, flags); - if (conn->ibc_state == OPENIBNAL_CONN_ESTABLISHED) + if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) CERROR ("Error %d posting transmit to "LPX64"\n", rc, conn->ibc_peer->ibp_nid); else CDEBUG (D_NET, "Error %d posting transmit to " LPX64"\n", rc, conn->ibc_peer->ibp_nid); - koibnal_close_conn (conn, rc); + kibnal_close_conn (conn, rc); if (done) - koibnal_tx_done (tx); + kibnal_tx_done (tx); return; } @@ -876,10 +872,10 @@ koibnal_check_sends (koib_conn_t *conn) } void -koibnal_tx_callback (struct ib_cq *cq, struct ib_cq_entry *e, void *arg) +kibnal_tx_callback (struct ib_cq_entry *e) { - koib_tx_t *tx = (koib_tx_t *)((unsigned 
long)e->work_request_id); - koib_conn_t *conn; + kib_tx_t *tx = (kib_tx_t *)kibnal_wreqid2ptr(e->work_request_id); + kib_conn_t *conn; unsigned long flags; int idle; @@ -901,6 +897,8 @@ koibnal_tx_callback (struct ib_cq *cq, struct ib_cq_entry *e, void *arg) tx->tx_sending--; idle = (tx->tx_sending == 0) && /* This is the final callback */ (!tx->tx_passive_rdma_wait); /* Not waiting for RDMA completion */ + if (idle) + list_del(&tx->tx_list); CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", conn, conn->ibc_state, conn->ibc_peer->ibp_nid, @@ -917,53 +915,62 @@ koibnal_tx_callback (struct ib_cq *cq, struct ib_cq_entry *e, void *arg) spin_unlock_irqrestore(&conn->ibc_lock, flags); if (idle) - koibnal_tx_done (tx); + kibnal_tx_done (tx); if (e->status != IB_COMPLETION_STATUS_SUCCESS) { CERROR ("Tx completion to "LPX64" failed: %d\n", conn->ibc_peer->ibp_nid, e->status); - koibnal_close_conn (conn, -ENETDOWN); + kibnal_close_conn (conn, -ENETDOWN); } else { /* can I shovel some more sends out the door? */ - koibnal_check_sends(conn); + kibnal_check_sends(conn); } - koibnal_put_conn (conn); + kibnal_put_conn (conn); } void -koibnal_init_tx_msg (koib_tx_t *tx, int type, int body_nob) +kibnal_callback (struct ib_cq *cq, struct ib_cq_entry *e, void *arg) +{ + if (kibnal_wreqid_is_rx(e->work_request_id)) + kibnal_rx_callback (e); + else + kibnal_tx_callback (e); +} + +void +kibnal_init_tx_msg (kib_tx_t *tx, int type, int body_nob) { struct ib_gather_scatter *gl = &tx->tx_gl[tx->tx_nsp]; struct ib_send_param *sp = &tx->tx_sp[tx->tx_nsp]; int fence; - int nob = offsetof (koib_msg_t, oibm_u) + body_nob; + int nob = offsetof (kib_msg_t, ibm_u) + body_nob; LASSERT (tx->tx_nsp >= 0 && tx->tx_nsp < sizeof(tx->tx_sp)/sizeof(tx->tx_sp[0])); - LASSERT (nob <= OPENIBNAL_MSG_SIZE); + LASSERT (nob <= IBNAL_MSG_SIZE); - tx->tx_msg->oibm_magic = OPENIBNAL_MSG_MAGIC; - tx->tx_msg->oibm_version = OPENIBNAL_MSG_VERSION; - tx->tx_msg->oibm_type = type; -#if OPENIBNAL_CKSUM - tx->tx_msg->oibm_nob = nob; + tx->tx_msg->ibm_magic = IBNAL_MSG_MAGIC; + tx->tx_msg->ibm_version = IBNAL_MSG_VERSION; + tx->tx_msg->ibm_type = type; +#if IBNAL_CKSUM + tx->tx_msg->ibm_nob = nob; #endif /* Fence the message if it's bundled with an RDMA read */ fence = (tx->tx_nsp > 0) && - (type == OPENIBNAL_MSG_PUT_DONE); + (type == IBNAL_MSG_PUT_DONE); *gl = (struct ib_gather_scatter) { .address = tx->tx_vaddr, .length = nob, - .key = koibnal_data.koib_tx_pages->oibp_lkey, + .key = kibnal_data.kib_tx_pages->ibp_lkey, }; /* NB If this is an RDMA read, the completion message must wait for * the RDMA to complete. Sends wait for previous RDMA writes * anyway... 
*/ *sp = (struct ib_send_param) { - .work_request_id = (__u64)((unsigned long)tx), + .work_request_id = kibnal_ptr2wreqid(tx, 0), .op = IB_OP_SEND, .gather_list = gl, .num_gather_entries = 1, @@ -979,26 +986,26 @@ koibnal_init_tx_msg (koib_tx_t *tx, int type, int body_nob) } void -koibnal_queue_tx (koib_tx_t *tx, koib_conn_t *conn) +kibnal_queue_tx (kib_tx_t *tx, kib_conn_t *conn) { unsigned long flags; spin_lock_irqsave(&conn->ibc_lock, flags); - koibnal_queue_tx_locked (tx, conn); + kibnal_queue_tx_locked (tx, conn); spin_unlock_irqrestore(&conn->ibc_lock, flags); - koibnal_check_sends(conn); + kibnal_check_sends(conn); } void -koibnal_launch_tx (koib_tx_t *tx, ptl_nid_t nid) +kibnal_launch_tx (kib_tx_t *tx, ptl_nid_t nid) { unsigned long flags; - koib_peer_t *peer; - koib_conn_t *conn; - rwlock_t *g_lock = &koibnal_data.koib_global_lock; + kib_peer_t *peer; + kib_conn_t *conn; + rwlock_t *g_lock = &kibnal_data.kib_global_lock; /* If I get here, I've committed to send, so I complete the tx with * failure on any problems */ @@ -1008,15 +1015,15 @@ koibnal_launch_tx (koib_tx_t *tx, ptl_nid_t nid) read_lock (g_lock); - peer = koibnal_find_peer_locked (nid); + peer = kibnal_find_peer_locked (nid); if (peer == NULL) { read_unlock (g_lock); tx->tx_status = -EHOSTUNREACH; - koibnal_tx_done (tx); + kibnal_tx_done (tx); return; } - conn = koibnal_find_conn_locked (peer); + conn = kibnal_find_conn_locked (peer); if (conn != NULL) { CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", conn, conn->ibc_state, conn->ibc_peer->ibp_nid, @@ -1024,7 +1031,7 @@ koibnal_launch_tx (koib_tx_t *tx, ptl_nid_t nid) atomic_inc (&conn->ibc_refcount); /* 1 ref for the tx */ read_unlock (g_lock); - koibnal_queue_tx (tx, conn); + kibnal_queue_tx (tx, conn); return; } @@ -1032,15 +1039,15 @@ koibnal_launch_tx (koib_tx_t *tx, ptl_nid_t nid) read_unlock (g_lock); write_lock_irqsave (g_lock, flags); - peer = koibnal_find_peer_locked (nid); + peer = kibnal_find_peer_locked (nid); if (peer == NULL) { write_unlock_irqrestore (g_lock, flags); tx->tx_status = -EHOSTUNREACH; - koibnal_tx_done (tx); + kibnal_tx_done (tx); return; } - conn = koibnal_find_conn_locked (peer); + conn = kibnal_find_conn_locked (peer); if (conn != NULL) { /* Connection exists; queue message on it */ CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", @@ -1049,7 +1056,7 @@ koibnal_launch_tx (koib_tx_t *tx, ptl_nid_t nid) atomic_inc (&conn->ibc_refcount); /* 1 ref for the tx */ write_unlock_irqrestore (g_lock, flags); - koibnal_queue_tx (tx, conn); + kibnal_queue_tx (tx, conn); return; } @@ -1057,20 +1064,20 @@ koibnal_launch_tx (koib_tx_t *tx, ptl_nid_t nid) if (!time_after_eq(jiffies, peer->ibp_reconnect_time)) { write_unlock_irqrestore (g_lock, flags); tx->tx_status = -EHOSTUNREACH; - koibnal_tx_done (tx); + kibnal_tx_done (tx); return; } peer->ibp_connecting = 1; atomic_inc (&peer->ibp_refcount); /* extra ref for connd */ - spin_lock (&koibnal_data.koib_connd_lock); + spin_lock (&kibnal_data.kib_connd_lock); list_add_tail (&peer->ibp_connd_list, - &koibnal_data.koib_connd_peers); - wake_up (&koibnal_data.koib_connd_waitq); + &kibnal_data.kib_connd_peers); + wake_up (&kibnal_data.kib_connd_waitq); - spin_unlock (&koibnal_data.koib_connd_lock); + spin_unlock (&kibnal_data.kib_connd_lock); } /* A connection is being established; queue the message... 
*/ @@ -1080,49 +1087,49 @@ koibnal_launch_tx (koib_tx_t *tx, ptl_nid_t nid) } ptl_err_t -koibnal_start_passive_rdma (int type, ptl_nid_t nid, +kibnal_start_passive_rdma (int type, ptl_nid_t nid, lib_msg_t *libmsg, ptl_hdr_t *hdr) { int nob = libmsg->md->length; - koib_tx_t *tx; - koib_msg_t *oibmsg; + kib_tx_t *tx; + kib_msg_t *ibmsg; int rc; int access; - LASSERT (type == OPENIBNAL_MSG_PUT_RDMA || - type == OPENIBNAL_MSG_GET_RDMA); + LASSERT (type == IBNAL_MSG_PUT_RDMA || + type == IBNAL_MSG_GET_RDMA); LASSERT (nob > 0); LASSERT (!in_interrupt()); /* Mapping could block */ - if (type == OPENIBNAL_MSG_PUT_RDMA) { + if (type == IBNAL_MSG_PUT_RDMA) { access = IB_ACCESS_REMOTE_READ; } else { access = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE; } - tx = koibnal_get_idle_tx (1); /* May block; caller is an app thread */ + tx = kibnal_get_idle_tx (1); /* May block; caller is an app thread */ LASSERT (tx != NULL); if ((libmsg->md->options & PTL_MD_KIOV) == 0) - rc = koibnal_map_iov (tx, access, - libmsg->md->md_niov, - libmsg->md->md_iov.iov, - 0, nob); + rc = kibnal_map_iov (tx, access, + libmsg->md->md_niov, + libmsg->md->md_iov.iov, + 0, nob); else - rc = koibnal_map_kiov (tx, access, - libmsg->md->md_niov, - libmsg->md->md_iov.kiov, - 0, nob); + rc = kibnal_map_kiov (tx, access, + libmsg->md->md_niov, + libmsg->md->md_iov.kiov, + 0, nob); if (rc != 0) { CERROR ("Can't map RDMA for "LPX64": %d\n", nid, rc); goto failed; } - if (type == OPENIBNAL_MSG_GET_RDMA) { + if (type == IBNAL_MSG_GET_RDMA) { /* reply gets finalized when tx completes */ - tx->tx_libmsg[1] = lib_create_reply_msg(&koibnal_lib, + tx->tx_libmsg[1] = lib_create_reply_msg(&kibnal_lib, nid, libmsg); if (tx->tx_libmsg[1] == NULL) { CERROR ("Can't create reply for GET -> "LPX64"\n", @@ -1134,15 +1141,15 @@ koibnal_start_passive_rdma (int type, ptl_nid_t nid, tx->tx_passive_rdma = 1; - oibmsg = tx->tx_msg; + ibmsg = tx->tx_msg; - oibmsg->oibm_u.rdma.oibrm_hdr = *hdr; - oibmsg->oibm_u.rdma.oibrm_cookie = tx->tx_passive_rdma_cookie; - oibmsg->oibm_u.rdma.oibrm_desc.rd_key = tx->tx_md.md_rkey; - oibmsg->oibm_u.rdma.oibrm_desc.rd_addr = tx->tx_md.md_addr; - oibmsg->oibm_u.rdma.oibrm_desc.rd_nob = nob; + ibmsg->ibm_u.rdma.ibrm_hdr = *hdr; + ibmsg->ibm_u.rdma.ibrm_cookie = tx->tx_passive_rdma_cookie; + ibmsg->ibm_u.rdma.ibrm_desc.rd_key = tx->tx_md.md_rkey; + ibmsg->ibm_u.rdma.ibrm_desc.rd_addr = tx->tx_md.md_addr; + ibmsg->ibm_u.rdma.ibrm_desc.rd_nob = nob; - koibnal_init_tx_msg (tx, type, sizeof (koib_rdma_msg_t)); + kibnal_init_tx_msg (tx, type, sizeof (kib_rdma_msg_t)); CDEBUG(D_NET, "Passive: %p cookie "LPX64", key %x, addr " LPX64", nob %d\n", @@ -1152,25 +1159,25 @@ koibnal_start_passive_rdma (int type, ptl_nid_t nid, /* libmsg gets finalized when tx completes. 
*/ tx->tx_libmsg[0] = libmsg; - koibnal_launch_tx(tx, nid); + kibnal_launch_tx(tx, nid); return (PTL_OK); failed: tx->tx_status = rc; - koibnal_tx_done (tx); + kibnal_tx_done (tx); return (PTL_FAIL); } void -koibnal_start_active_rdma (int type, int status, - koib_rx_t *rx, lib_msg_t *libmsg, +kibnal_start_active_rdma (int type, int status, + kib_rx_t *rx, lib_msg_t *libmsg, unsigned int niov, struct iovec *iov, ptl_kiov_t *kiov, size_t offset, size_t nob) { - koib_msg_t *rxmsg = rx->rx_msg; - koib_msg_t *txmsg; - koib_tx_t *tx; + kib_msg_t *rxmsg = rx->rx_msg; + kib_msg_t *txmsg; + kib_tx_t *tx; int access; int rdma_op; int rc; @@ -1187,8 +1194,8 @@ koibnal_start_active_rdma (int type, int status, /* No data if we're completing with failure */ LASSERT (status == 0 || nob == 0); - LASSERT (type == OPENIBNAL_MSG_GET_DONE || - type == OPENIBNAL_MSG_PUT_DONE); + LASSERT (type == IBNAL_MSG_GET_DONE || + type == IBNAL_MSG_PUT_DONE); /* Flag I'm completing the RDMA. Even if I fail to send the * completion message, I will have tried my best so further @@ -1196,22 +1203,22 @@ koibnal_start_active_rdma (int type, int status, LASSERT (!rx->rx_rdma); rx->rx_rdma = 1; - if (type == OPENIBNAL_MSG_GET_DONE) { + if (type == IBNAL_MSG_GET_DONE) { access = 0; rdma_op = IB_OP_RDMA_WRITE; - LASSERT (rxmsg->oibm_type == OPENIBNAL_MSG_GET_RDMA); + LASSERT (rxmsg->ibm_type == IBNAL_MSG_GET_RDMA); } else { access = IB_ACCESS_LOCAL_WRITE; rdma_op = IB_OP_RDMA_READ; - LASSERT (rxmsg->oibm_type == OPENIBNAL_MSG_PUT_RDMA); + LASSERT (rxmsg->ibm_type == IBNAL_MSG_PUT_RDMA); } - tx = koibnal_get_idle_tx (0); /* Mustn't block */ + tx = kibnal_get_idle_tx (0); /* Mustn't block */ if (tx == NULL) { CERROR ("tx descs exhausted on RDMA from "LPX64 " completing locally with failure\n", - rx->rx_conn->ibc_peer->ibp_nid); - lib_finalize (&koibnal_lib, NULL, libmsg, PTL_NO_SPACE); + rx->rx_conn->ibc_peer->ibp_nid); + lib_finalize (&kibnal_lib, NULL, libmsg, PTL_NO_SPACE); return; } LASSERT (tx->tx_nsp == 0); @@ -1222,11 +1229,11 @@ koibnal_start_active_rdma (int type, int status, * message is matched) */ if (kiov != NULL) - rc = koibnal_map_kiov (tx, access, - niov, kiov, offset, nob); + rc = kibnal_map_kiov (tx, access, + niov, kiov, offset, nob); else - rc = koibnal_map_iov (tx, access, - niov, iov, offset, nob); + rc = kibnal_map_iov (tx, access, + niov, iov, offset, nob); if (rc != 0) { CERROR ("Can't map RDMA -> "LPX64": %d\n", @@ -1242,12 +1249,12 @@ koibnal_start_active_rdma (int type, int status, }; tx->tx_sp[0] = (struct ib_send_param) { - .work_request_id = (__u64)((unsigned long)tx), + .work_request_id = kibnal_ptr2wreqid(tx, 0), .op = rdma_op, .gather_list = &tx->tx_gl[0], .num_gather_entries = 1, - .remote_address = rxmsg->oibm_u.rdma.oibrm_desc.rd_addr, - .rkey = rxmsg->oibm_u.rdma.oibrm_desc.rd_key, + .remote_address = rxmsg->ibm_u.rdma.ibrm_desc.rd_addr, + .rkey = rxmsg->ibm_u.rdma.ibrm_desc.rd_key, .device_specific = NULL, .solicited_event = 0, .signaled = 1, @@ -1262,10 +1269,10 @@ koibnal_start_active_rdma (int type, int status, txmsg = tx->tx_msg; - txmsg->oibm_u.completion.oibcm_cookie = rxmsg->oibm_u.rdma.oibrm_cookie; - txmsg->oibm_u.completion.oibcm_status = status; + txmsg->ibm_u.completion.ibcm_cookie = rxmsg->ibm_u.rdma.ibrm_cookie; + txmsg->ibm_u.completion.ibcm_status = status; - koibnal_init_tx_msg(tx, type, sizeof (koib_completion_msg_t)); + kibnal_init_tx_msg(tx, type, sizeof (kib_completion_msg_t)); if (status == 0 && nob != 0) { LASSERT (tx->tx_nsp > 1); @@ -1277,7 +1284,7 @@ 
koibnal_start_active_rdma (int type, int status, LASSERT (tx->tx_nsp == 1); /* No RDMA: local completion happens now! */ CDEBUG(D_WARNING,"No data: immediate completion\n"); - lib_finalize (&koibnal_lib, NULL, libmsg, + lib_finalize (&kibnal_lib, NULL, libmsg, status == 0 ? PTL_OK : PTL_FAIL); } @@ -1288,11 +1295,11 @@ koibnal_start_active_rdma (int type, int status, atomic_read (&rx->rx_conn->ibc_refcount)); atomic_inc (&rx->rx_conn->ibc_refcount); /* ...and queue it up */ - koibnal_queue_tx(tx, rx->rx_conn); + kibnal_queue_tx(tx, rx->rx_conn); } ptl_err_t -koibnal_sendmsg(lib_nal_t *nal, +kibnal_sendmsg(lib_nal_t *nal, void *private, lib_msg_t *libmsg, ptl_hdr_t *hdr, @@ -1305,8 +1312,8 @@ koibnal_sendmsg(lib_nal_t *nal, size_t payload_offset, size_t payload_nob) { - koib_msg_t *oibmsg; - koib_tx_t *tx; + kib_msg_t *ibmsg; + kib_tx_t *tx; int nob; /* NB 'private' is different depending on what we're sending.... */ @@ -1329,27 +1336,27 @@ koibnal_sendmsg(lib_nal_t *nal, case PTL_MSG_REPLY: { /* reply's 'private' is the incoming receive */ - koib_rx_t *rx = private; + kib_rx_t *rx = private; /* RDMA reply expected? */ - if (rx->rx_msg->oibm_type == OPENIBNAL_MSG_GET_RDMA) { - koibnal_start_active_rdma(OPENIBNAL_MSG_GET_DONE, 0, - rx, libmsg, payload_niov, - payload_iov, payload_kiov, - payload_offset, payload_nob); + if (rx->rx_msg->ibm_type == IBNAL_MSG_GET_RDMA) { + kibnal_start_active_rdma(IBNAL_MSG_GET_DONE, 0, + rx, libmsg, payload_niov, + payload_iov, payload_kiov, + payload_offset, payload_nob); return (PTL_OK); } /* Incoming message consistent with immediate reply? */ - if (rx->rx_msg->oibm_type != OPENIBNAL_MSG_IMMEDIATE) { + if (rx->rx_msg->ibm_type != IBNAL_MSG_IMMEDIATE) { CERROR ("REPLY to "LPX64" bad opbm type %d!!!\n", - nid, rx->rx_msg->oibm_type); + nid, rx->rx_msg->ibm_type); return (PTL_FAIL); } /* Will it fit in a message? */ - nob = offsetof(koib_msg_t, oibm_u.immediate.oibim_payload[payload_nob]); - if (nob >= OPENIBNAL_MSG_SIZE) { + nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]); + if (nob >= IBNAL_MSG_SIZE) { CERROR("REPLY for "LPX64" too big (RDMA not requested): %d\n", nid, payload_nob); return (PTL_FAIL); @@ -1359,10 +1366,10 @@ koibnal_sendmsg(lib_nal_t *nal, case PTL_MSG_GET: /* might the REPLY message be big enough to need RDMA? */ - nob = offsetof(koib_msg_t, oibm_u.immediate.oibim_payload[libmsg->md->length]); - if (nob > OPENIBNAL_MSG_SIZE) - return (koibnal_start_passive_rdma(OPENIBNAL_MSG_GET_RDMA, - nid, libmsg, hdr)); + nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[libmsg->md->length]); + if (nob > IBNAL_MSG_SIZE) + return (kibnal_start_passive_rdma(IBNAL_MSG_GET_RDMA, + nid, libmsg, hdr)); break; case PTL_MSG_ACK: @@ -1371,181 +1378,181 @@ koibnal_sendmsg(lib_nal_t *nal, case PTL_MSG_PUT: /* Is the payload big enough to need RDMA? */ - nob = offsetof(koib_msg_t, oibm_u.immediate.oibim_payload[payload_nob]); - if (nob > OPENIBNAL_MSG_SIZE) - return (koibnal_start_passive_rdma(OPENIBNAL_MSG_PUT_RDMA, - nid, libmsg, hdr)); + nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]); + if (nob > IBNAL_MSG_SIZE) + return (kibnal_start_passive_rdma(IBNAL_MSG_PUT_RDMA, + nid, libmsg, hdr)); break; } - tx = koibnal_get_idle_tx(!(type == PTL_MSG_ACK || - type == PTL_MSG_REPLY || - in_interrupt())); + tx = kibnal_get_idle_tx(!(type == PTL_MSG_ACK || + type == PTL_MSG_REPLY || + in_interrupt())); if (tx == NULL) { CERROR ("Can't send %d to "LPX64": tx descs exhausted%s\n", type, nid, in_interrupt() ? 
" (intr)" : ""); return (PTL_NO_SPACE); } - oibmsg = tx->tx_msg; - oibmsg->oibm_u.immediate.oibim_hdr = *hdr; + ibmsg = tx->tx_msg; + ibmsg->ibm_u.immediate.ibim_hdr = *hdr; if (payload_nob > 0) { if (payload_kiov != NULL) - lib_copy_kiov2buf(oibmsg->oibm_u.immediate.oibim_payload, + lib_copy_kiov2buf(ibmsg->ibm_u.immediate.ibim_payload, payload_niov, payload_kiov, payload_offset, payload_nob); else - lib_copy_iov2buf(oibmsg->oibm_u.immediate.oibim_payload, + lib_copy_iov2buf(ibmsg->ibm_u.immediate.ibim_payload, payload_niov, payload_iov, payload_offset, payload_nob); } - koibnal_init_tx_msg (tx, OPENIBNAL_MSG_IMMEDIATE, - offsetof(koib_immediate_msg_t, - oibim_payload[payload_nob])); + kibnal_init_tx_msg (tx, IBNAL_MSG_IMMEDIATE, + offsetof(kib_immediate_msg_t, + ibim_payload[payload_nob])); /* libmsg gets finalized when tx completes */ tx->tx_libmsg[0] = libmsg; - koibnal_launch_tx(tx, nid); + kibnal_launch_tx(tx, nid); return (PTL_OK); } ptl_err_t -koibnal_send (lib_nal_t *nal, void *private, lib_msg_t *cookie, +kibnal_send (lib_nal_t *nal, void *private, lib_msg_t *cookie, ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, unsigned int payload_niov, struct iovec *payload_iov, size_t payload_offset, size_t payload_len) { - return (koibnal_sendmsg(nal, private, cookie, - hdr, type, nid, pid, - payload_niov, payload_iov, NULL, - payload_offset, payload_len)); + return (kibnal_sendmsg(nal, private, cookie, + hdr, type, nid, pid, + payload_niov, payload_iov, NULL, + payload_offset, payload_len)); } ptl_err_t -koibnal_send_pages (lib_nal_t *nal, void *private, lib_msg_t *cookie, +kibnal_send_pages (lib_nal_t *nal, void *private, lib_msg_t *cookie, ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, unsigned int payload_niov, ptl_kiov_t *payload_kiov, size_t payload_offset, size_t payload_len) { - return (koibnal_sendmsg(nal, private, cookie, - hdr, type, nid, pid, - payload_niov, NULL, payload_kiov, - payload_offset, payload_len)); + return (kibnal_sendmsg(nal, private, cookie, + hdr, type, nid, pid, + payload_niov, NULL, payload_kiov, + payload_offset, payload_len)); } ptl_err_t -koibnal_recvmsg (lib_nal_t *nal, void *private, lib_msg_t *libmsg, +kibnal_recvmsg (lib_nal_t *nal, void *private, lib_msg_t *libmsg, unsigned int niov, struct iovec *iov, ptl_kiov_t *kiov, size_t offset, size_t mlen, size_t rlen) { - koib_rx_t *rx = private; - koib_msg_t *rxmsg = rx->rx_msg; - int msg_nob; + kib_rx_t *rx = private; + kib_msg_t *rxmsg = rx->rx_msg; + int msg_nob; LASSERT (mlen <= rlen); LASSERT (!in_interrupt ()); /* Either all pages or all vaddrs */ LASSERT (!(kiov != NULL && iov != NULL)); - switch (rxmsg->oibm_type) { + switch (rxmsg->ibm_type) { default: LBUG(); return (PTL_FAIL); - case OPENIBNAL_MSG_IMMEDIATE: - msg_nob = offsetof(koib_msg_t, oibm_u.immediate.oibim_payload[rlen]); - if (msg_nob > OPENIBNAL_MSG_SIZE) { + case IBNAL_MSG_IMMEDIATE: + msg_nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[rlen]); + if (msg_nob > IBNAL_MSG_SIZE) { CERROR ("Immediate message from "LPX64" too big: %d\n", - rxmsg->oibm_u.immediate.oibim_hdr.src_nid, rlen); + rxmsg->ibm_u.immediate.ibim_hdr.src_nid, rlen); return (PTL_FAIL); } if (kiov != NULL) lib_copy_buf2kiov(niov, kiov, offset, - rxmsg->oibm_u.immediate.oibim_payload, + rxmsg->ibm_u.immediate.ibim_payload, mlen); else lib_copy_buf2iov(niov, iov, offset, - rxmsg->oibm_u.immediate.oibim_payload, + rxmsg->ibm_u.immediate.ibim_payload, mlen); lib_finalize (nal, NULL, libmsg, PTL_OK); return (PTL_OK); - case OPENIBNAL_MSG_GET_RDMA: + case 
IBNAL_MSG_GET_RDMA: /* We get called here just to discard any junk after the * GET hdr. */ LASSERT (libmsg == NULL); lib_finalize (nal, NULL, libmsg, PTL_OK); return (PTL_OK); - case OPENIBNAL_MSG_PUT_RDMA: - koibnal_start_active_rdma (OPENIBNAL_MSG_PUT_DONE, 0, - rx, libmsg, - niov, iov, kiov, offset, mlen); + case IBNAL_MSG_PUT_RDMA: + kibnal_start_active_rdma (IBNAL_MSG_PUT_DONE, 0, + rx, libmsg, + niov, iov, kiov, offset, mlen); return (PTL_OK); } } ptl_err_t -koibnal_recv (lib_nal_t *nal, void *private, lib_msg_t *msg, +kibnal_recv (lib_nal_t *nal, void *private, lib_msg_t *msg, unsigned int niov, struct iovec *iov, size_t offset, size_t mlen, size_t rlen) { - return (koibnal_recvmsg (nal, private, msg, niov, iov, NULL, - offset, mlen, rlen)); + return (kibnal_recvmsg (nal, private, msg, niov, iov, NULL, + offset, mlen, rlen)); } ptl_err_t -koibnal_recv_pages (lib_nal_t *nal, void *private, lib_msg_t *msg, +kibnal_recv_pages (lib_nal_t *nal, void *private, lib_msg_t *msg, unsigned int niov, ptl_kiov_t *kiov, size_t offset, size_t mlen, size_t rlen) { - return (koibnal_recvmsg (nal, private, msg, niov, NULL, kiov, - offset, mlen, rlen)); + return (kibnal_recvmsg (nal, private, msg, niov, NULL, kiov, + offset, mlen, rlen)); } int -koibnal_thread_start (int (*fn)(void *arg), void *arg) +kibnal_thread_start (int (*fn)(void *arg), void *arg) { long pid = kernel_thread (fn, arg, 0); if (pid < 0) return ((int)pid); - atomic_inc (&koibnal_data.koib_nthreads); + atomic_inc (&kibnal_data.kib_nthreads); return (0); } void -koibnal_thread_fini (void) +kibnal_thread_fini (void) { - atomic_dec (&koibnal_data.koib_nthreads); + atomic_dec (&kibnal_data.kib_nthreads); } void -koibnal_close_conn_locked (koib_conn_t *conn, int error) +kibnal_close_conn_locked (kib_conn_t *conn, int error) { /* This just does the immmediate housekeeping, and schedules the * connection for the connd to finish off. - * Caller holds koib_global_lock exclusively in irq context */ - koib_peer_t *peer = conn->ibc_peer; + * Caller holds kib_global_lock exclusively in irq context */ + kib_peer_t *peer = conn->ibc_peer; CDEBUG (error == 0 ? D_NET : D_ERROR, "closing conn to "LPX64": error %d\n", peer->ibp_nid, error); - LASSERT (conn->ibc_state == OPENIBNAL_CONN_ESTABLISHED || - conn->ibc_state == OPENIBNAL_CONN_CONNECTING); + LASSERT (conn->ibc_state == IBNAL_CONN_ESTABLISHED || + conn->ibc_state == IBNAL_CONN_CONNECTING); - if (conn->ibc_state == OPENIBNAL_CONN_ESTABLISHED) { - /* koib_connd_conns takes ibc_list's ref */ + if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) { + /* kib_connd_conns takes ibc_list's ref */ list_del (&conn->ibc_list); } else { - /* new ref for koib_connd_conns */ + /* new ref for kib_connd_conns */ CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", conn, conn->ibc_state, conn->ibc_peer->ibp_nid, atomic_read (&conn->ibc_refcount)); @@ -1555,57 +1562,57 @@ koibnal_close_conn_locked (koib_conn_t *conn, int error) if (list_empty (&peer->ibp_conns) && peer->ibp_persistence == 0) { /* Non-persistent peer with no more conns... 
*/ - koibnal_unlink_peer_locked (peer); + kibnal_unlink_peer_locked (peer); } - conn->ibc_state = OPENIBNAL_CONN_DEATHROW; + conn->ibc_state = IBNAL_CONN_DEATHROW; /* Schedule conn for closing/destruction */ - spin_lock (&koibnal_data.koib_connd_lock); + spin_lock (&kibnal_data.kib_connd_lock); - list_add_tail (&conn->ibc_list, &koibnal_data.koib_connd_conns); - wake_up (&koibnal_data.koib_connd_waitq); + list_add_tail (&conn->ibc_list, &kibnal_data.kib_connd_conns); + wake_up (&kibnal_data.kib_connd_waitq); - spin_unlock (&koibnal_data.koib_connd_lock); + spin_unlock (&kibnal_data.kib_connd_lock); } int -koibnal_close_conn (koib_conn_t *conn, int why) +kibnal_close_conn (kib_conn_t *conn, int why) { unsigned long flags; int count = 0; - write_lock_irqsave (&koibnal_data.koib_global_lock, flags); + write_lock_irqsave (&kibnal_data.kib_global_lock, flags); - LASSERT (conn->ibc_state >= OPENIBNAL_CONN_CONNECTING); + LASSERT (conn->ibc_state >= IBNAL_CONN_CONNECTING); - if (conn->ibc_state <= OPENIBNAL_CONN_ESTABLISHED) { + if (conn->ibc_state <= IBNAL_CONN_ESTABLISHED) { count = 1; - koibnal_close_conn_locked (conn, why); + kibnal_close_conn_locked (conn, why); } - write_unlock_irqrestore (&koibnal_data.koib_global_lock, flags); + write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); return (count); } void -koibnal_peer_connect_failed (koib_peer_t *peer, int active, int rc) +kibnal_peer_connect_failed (kib_peer_t *peer, int active, int rc) { LIST_HEAD (zombies); - koib_tx_t *tx; + kib_tx_t *tx; unsigned long flags; LASSERT (rc != 0); - LASSERT (peer->ibp_reconnect_interval >= OPENIBNAL_MIN_RECONNECT_INTERVAL); + LASSERT (peer->ibp_reconnect_interval >= IBNAL_MIN_RECONNECT_INTERVAL); - write_lock_irqsave (&koibnal_data.koib_global_lock, flags); + write_lock_irqsave (&kibnal_data.kib_global_lock, flags); LASSERT (peer->ibp_connecting != 0); peer->ibp_connecting--; if (peer->ibp_connecting != 0) { /* another connection attempt under way (loopback?)... 
*/ - write_unlock_irqrestore (&koibnal_data.koib_global_lock, flags); + write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); return; } @@ -1614,50 +1621,50 @@ koibnal_peer_connect_failed (koib_peer_t *peer, int active, int rc) peer->ibp_reconnect_time = jiffies + peer->ibp_reconnect_interval; /* Increase reconnection interval */ peer->ibp_reconnect_interval = MIN (peer->ibp_reconnect_interval * 2, - OPENIBNAL_MAX_RECONNECT_INTERVAL); + IBNAL_MAX_RECONNECT_INTERVAL); /* Take peer's blocked blocked transmits; I'll complete * them with error */ while (!list_empty (&peer->ibp_tx_queue)) { tx = list_entry (peer->ibp_tx_queue.next, - koib_tx_t, tx_list); + kib_tx_t, tx_list); list_del (&tx->tx_list); list_add_tail (&tx->tx_list, &zombies); } - if (koibnal_peer_active(peer) && + if (kibnal_peer_active(peer) && (peer->ibp_persistence == 0)) { /* failed connection attempt on non-persistent peer */ - koibnal_unlink_peer_locked (peer); + kibnal_unlink_peer_locked (peer); } } else { /* Can't have blocked transmits if there are connections */ LASSERT (list_empty(&peer->ibp_tx_queue)); } - write_unlock_irqrestore (&koibnal_data.koib_global_lock, flags); + write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); if (!list_empty (&zombies)) CERROR ("Deleting messages for "LPX64": connection failed\n", peer->ibp_nid); while (!list_empty (&zombies)) { - tx = list_entry (zombies.next, koib_tx_t, tx_list); + tx = list_entry (zombies.next, kib_tx_t, tx_list); list_del (&tx->tx_list); /* complete now */ tx->tx_status = -EHOSTUNREACH; - koibnal_tx_done (tx); + kibnal_tx_done (tx); } } void -koibnal_connreq_done (koib_conn_t *conn, int active, int status) +kibnal_connreq_done (kib_conn_t *conn, int active, int status) { int state = conn->ibc_state; - koib_peer_t *peer = conn->ibc_peer; - koib_tx_t *tx; + kib_peer_t *peer = conn->ibc_peer; + kib_tx_t *tx; unsigned long flags; int rc; int i; @@ -1669,31 +1676,31 @@ koibnal_connreq_done (koib_conn_t *conn, int active, int status) conn->ibc_connreq = NULL; } - if (state == OPENIBNAL_CONN_CONNECTING) { + if (state == IBNAL_CONN_CONNECTING) { /* Install common (active/passive) callback for * disconnect/idle notification if I got as far as getting * a CM comm_id */ rc = tsIbCmCallbackModify(conn->ibc_comm_id, - koibnal_conn_callback, conn); + kibnal_conn_callback, conn); LASSERT (rc == 0); } - write_lock_irqsave (&koibnal_data.koib_global_lock, flags); + write_lock_irqsave (&kibnal_data.kib_global_lock, flags); LASSERT (peer->ibp_connecting != 0); if (status == 0) { /* connection established... 
*/ - LASSERT (state == OPENIBNAL_CONN_CONNECTING); - conn->ibc_state = OPENIBNAL_CONN_ESTABLISHED; + LASSERT (state == IBNAL_CONN_CONNECTING); + conn->ibc_state = IBNAL_CONN_ESTABLISHED; - if (!koibnal_peer_active(peer)) { + if (!kibnal_peer_active(peer)) { /* ...but peer deleted meantime */ status = -ECONNABORTED; } } else { - LASSERT (state == OPENIBNAL_CONN_INIT_QP || - state == OPENIBNAL_CONN_CONNECTING); + LASSERT (state == IBNAL_CONN_INIT_QP || + state == IBNAL_CONN_CONNECTING); } if (status == 0) { @@ -1710,14 +1717,14 @@ koibnal_connreq_done (koib_conn_t *conn, int active, int status) list_add (&conn->ibc_list, &peer->ibp_conns); /* reset reconnect interval for next attempt */ - peer->ibp_reconnect_interval = OPENIBNAL_MIN_RECONNECT_INTERVAL; + peer->ibp_reconnect_interval = IBNAL_MIN_RECONNECT_INTERVAL; /* post blocked sends to the new connection */ spin_lock (&conn->ibc_lock); while (!list_empty (&peer->ibp_tx_queue)) { tx = list_entry (peer->ibp_tx_queue.next, - koib_tx_t, tx_list); + kib_tx_t, tx_list); list_del (&tx->tx_list); @@ -1726,19 +1733,19 @@ koibnal_connreq_done (koib_conn_t *conn, int active, int status) conn, conn->ibc_state, conn->ibc_peer->ibp_nid, atomic_read (&conn->ibc_refcount)); atomic_inc (&conn->ibc_refcount); - koibnal_queue_tx_locked (tx, conn); + kibnal_queue_tx_locked (tx, conn); } spin_unlock (&conn->ibc_lock); /* Nuke any dangling conns from a different peer instance... */ - koibnal_close_stale_conns_locked (conn->ibc_peer, - conn->ibc_incarnation); + kibnal_close_stale_conns_locked (conn->ibc_peer, + conn->ibc_incarnation); - write_unlock_irqrestore (&koibnal_data.koib_global_lock, flags); + write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); /* queue up all the receives */ - for (i = 0; i < OPENIBNAL_RX_MSGS; i++) { + for (i = 0; i < IBNAL_RX_MSGS; i++) { /* +1 ref for rx desc */ CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", conn, conn->ibc_state, conn->ibc_peer->ibp_nid, @@ -1749,71 +1756,71 @@ koibnal_connreq_done (koib_conn_t *conn, int active, int status) i, &conn->ibc_rxs[i], conn->ibc_rxs[i].rx_msg, conn->ibc_rxs[i].rx_vaddr); - koibnal_post_rx (&conn->ibc_rxs[i], 0); + kibnal_post_rx (&conn->ibc_rxs[i], 0); } - koibnal_check_sends (conn); + kibnal_check_sends (conn); return; } /* connection failed */ - if (state == OPENIBNAL_CONN_CONNECTING) { + if (state == IBNAL_CONN_CONNECTING) { /* schedule for connd to close */ - koibnal_close_conn_locked (conn, status); + kibnal_close_conn_locked (conn, status); } else { /* Don't have a CM comm_id; just wait for refs to drain */ - conn->ibc_state = OPENIBNAL_CONN_ZOMBIE; + conn->ibc_state = IBNAL_CONN_ZOMBIE; } - write_unlock_irqrestore (&koibnal_data.koib_global_lock, flags); + write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); - koibnal_peer_connect_failed (conn->ibc_peer, active, status); + kibnal_peer_connect_failed (conn->ibc_peer, active, status); - if (state != OPENIBNAL_CONN_CONNECTING) { + if (state != IBNAL_CONN_CONNECTING) { /* drop caller's ref if we're not waiting for the * IB_CM_IDLE callback */ - koibnal_put_conn (conn); + kibnal_put_conn (conn); } } int -koibnal_accept (koib_conn_t **connp, tTS_IB_CM_COMM_ID cid, +kibnal_accept (kib_conn_t **connp, tTS_IB_CM_COMM_ID cid, ptl_nid_t nid, __u64 incarnation, int queue_depth) { - koib_conn_t *conn = koibnal_create_conn(); - koib_peer_t *peer; - koib_peer_t *peer2; + kib_conn_t *conn = kibnal_create_conn(); + kib_peer_t *peer; + kib_peer_t *peer2; unsigned long flags; if (conn == NULL) return (-ENOMEM); - if 
(queue_depth != OPENIBNAL_MSG_QUEUE_SIZE) { + if (queue_depth != IBNAL_MSG_QUEUE_SIZE) { CERROR("Can't accept "LPX64": bad queue depth %d (%d expected)\n", - nid, queue_depth, OPENIBNAL_MSG_QUEUE_SIZE); + nid, queue_depth, IBNAL_MSG_QUEUE_SIZE); return (-EPROTO); } /* assume 'nid' is a new peer */ - peer = koibnal_create_peer (nid); + peer = kibnal_create_peer (nid); if (peer == NULL) { CDEBUG(D_NET, "--conn[%p] state %d -> "LPX64" (%d)\n", conn, conn->ibc_state, conn->ibc_peer->ibp_nid, atomic_read (&conn->ibc_refcount)); atomic_dec (&conn->ibc_refcount); - koibnal_destroy_conn(conn); + kibnal_destroy_conn(conn); return (-ENOMEM); } - write_lock_irqsave (&koibnal_data.koib_global_lock, flags); + write_lock_irqsave (&kibnal_data.kib_global_lock, flags); - peer2 = koibnal_find_peer_locked(nid); + peer2 = kibnal_find_peer_locked(nid); if (peer2 == NULL) { /* peer table takes my ref on peer */ list_add_tail (&peer->ibp_list, - koibnal_nid2peerlist(nid)); + kibnal_nid2peerlist(nid)); } else { - koibnal_put_peer (peer); + kibnal_put_peer (peer); peer = peer2; } @@ -1821,20 +1828,20 @@ koibnal_accept (koib_conn_t **connp, tTS_IB_CM_COMM_ID cid, atomic_inc (&peer->ibp_refcount); peer->ibp_connecting++; - write_unlock_irqrestore (&koibnal_data.koib_global_lock, flags); + write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); conn->ibc_peer = peer; - conn->ibc_state = OPENIBNAL_CONN_CONNECTING; + conn->ibc_state = IBNAL_CONN_CONNECTING; conn->ibc_comm_id = cid; conn->ibc_incarnation = incarnation; - conn->ibc_credits = OPENIBNAL_MSG_QUEUE_SIZE; + conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE; *connp = conn; return (0); } tTS_IB_CM_CALLBACK_RETURN -koibnal_idle_conn_callback (tTS_IB_CM_EVENT event, +kibnal_idle_conn_callback (tTS_IB_CM_EVENT event, tTS_IB_CM_COMM_ID cid, void *param, void *arg) @@ -1846,13 +1853,19 @@ koibnal_idle_conn_callback (tTS_IB_CM_EVENT event, } tTS_IB_CM_CALLBACK_RETURN -koibnal_conn_callback (tTS_IB_CM_EVENT event, +kibnal_conn_callback (tTS_IB_CM_EVENT event, tTS_IB_CM_COMM_ID cid, void *param, void *arg) { - koib_conn_t *conn = arg; - int rc; + kib_conn_t *conn = arg; + LIST_HEAD (zombies); + struct list_head *tmp; + struct list_head *nxt; + kib_tx_t *tx; + unsigned long flags; + int done; + int rc; /* Established Connection Notifier */ @@ -1860,24 +1873,72 @@ koibnal_conn_callback (tTS_IB_CM_EVENT event, default: CERROR("Connection %p -> "LPX64" ERROR %d\n", conn, conn->ibc_peer->ibp_nid, event); - koibnal_close_conn (conn, -ECONNABORTED); + kibnal_close_conn (conn, -ECONNABORTED); break; case TS_IB_CM_DISCONNECTED: CDEBUG(D_WARNING, "Connection %p -> "LPX64" DISCONNECTED.\n", conn, conn->ibc_peer->ibp_nid); - koibnal_close_conn (conn, 0); + kibnal_close_conn (conn, 0); break; case TS_IB_CM_IDLE: CDEBUG(D_NET, "Connection %p -> "LPX64" IDLE.\n", conn, conn->ibc_peer->ibp_nid); - koibnal_put_conn (conn); /* Lose CM's ref */ + kibnal_put_conn (conn); /* Lose CM's ref */ /* LASSERT (no further callbacks) */ rc = tsIbCmCallbackModify(cid, - koibnal_idle_conn_callback, conn); + kibnal_idle_conn_callback, conn); LASSERT (rc == 0); + + /* NB we wait until the connection has closed before + * completing outstanding passive RDMAs so we can be sure + * the network can't touch the mapped memory any more. 
*/ + + spin_lock_irqsave (&conn->ibc_lock, flags); + + /* grab passive RDMAs not waiting for the tx callback */ + list_for_each_safe (tmp, nxt, &conn->ibc_active_txs) { + tx = list_entry (tmp, kib_tx_t, tx_list); + + LASSERT (tx->tx_passive_rdma || + !tx->tx_passive_rdma_wait); + + LASSERT (tx->tx_passive_rdma_wait || + tx->tx_sending != 0); + + /* still waiting for tx callback? */ + if (!tx->tx_passive_rdma_wait) + continue; + + tx->tx_status = -ECONNABORTED; + tx->tx_passive_rdma_wait = 0; + done = (tx->tx_sending == 0); + + if (!done) + continue; + + list_del (&tx->tx_list); + list_add (&tx->tx_list, &zombies); + } + + /* grab all blocked transmits */ + list_for_each_safe (tmp, nxt, &conn->ibc_tx_queue) { + tx = list_entry (tmp, kib_tx_t, tx_list); + + list_del (&tx->tx_list); + list_add (&tx->tx_list, &zombies); + } + + spin_unlock_irqrestore (&conn->ibc_lock, flags); + + while (!list_empty(&zombies)) { + tx = list_entry (zombies.next, kib_tx_t, tx_list); + + list_del(&tx->tx_list); + kibnal_tx_done (tx); + } break; } @@ -1885,12 +1946,12 @@ koibnal_conn_callback (tTS_IB_CM_EVENT event, } tTS_IB_CM_CALLBACK_RETURN -koibnal_passive_conn_callback (tTS_IB_CM_EVENT event, +kibnal_passive_conn_callback (tTS_IB_CM_EVENT event, tTS_IB_CM_COMM_ID cid, void *param, void *arg) { - koib_conn_t *conn = arg; + kib_conn_t *conn = arg; int rc; switch (event) { @@ -1903,12 +1964,12 @@ koibnal_passive_conn_callback (tTS_IB_CM_EVENT event, CERROR ("Unexpected event %p -> "LPX64": %d\n", conn, conn->ibc_peer->ibp_nid, event); - koibnal_connreq_done (conn, 0, -ECONNABORTED); + kibnal_connreq_done (conn, 0, -ECONNABORTED); break; case TS_IB_CM_REQ_RECEIVED: { struct ib_cm_req_received_param *req = param; - koib_wire_connreq_t *wcr = req->remote_private_data; + kib_wire_connreq_t *wcr = req->remote_private_data; LASSERT (conn == NULL); @@ -1920,23 +1981,23 @@ koibnal_passive_conn_callback (tTS_IB_CM_EVENT event, return TS_IB_CM_CALLBACK_ABORT; } - if (wcr->wcr_magic != cpu_to_le32(OPENIBNAL_MSG_MAGIC)) { + if (wcr->wcr_magic != cpu_to_le32(IBNAL_MSG_MAGIC)) { CERROR ("Can't accept LID %04x: bad magic %08x\n", req->dlid, le32_to_cpu(wcr->wcr_magic)); return TS_IB_CM_CALLBACK_ABORT; } - if (wcr->wcr_version != cpu_to_le16(OPENIBNAL_MSG_VERSION)) { + if (wcr->wcr_version != cpu_to_le16(IBNAL_MSG_VERSION)) { CERROR ("Can't accept LID %04x: bad version %d\n", req->dlid, le16_to_cpu(wcr->wcr_magic)); return TS_IB_CM_CALLBACK_ABORT; } - rc = koibnal_accept(&conn, - cid, - le64_to_cpu(wcr->wcr_nid), - le64_to_cpu(wcr->wcr_incarnation), - le16_to_cpu(wcr->wcr_queue_depth)); + rc = kibnal_accept(&conn, + cid, + le64_to_cpu(wcr->wcr_nid), + le64_to_cpu(wcr->wcr_incarnation), + le16_to_cpu(wcr->wcr_queue_depth)); if (rc != 0) { CERROR ("Can't accept "LPX64": %d\n", le64_to_cpu(wcr->wcr_nid), rc); @@ -1945,23 +2006,23 @@ koibnal_passive_conn_callback (tTS_IB_CM_EVENT event, /* update 'arg' for next callback */ rc = tsIbCmCallbackModify(cid, - koibnal_passive_conn_callback, conn); + kibnal_passive_conn_callback, conn); LASSERT (rc == 0); req->accept_param.qp = conn->ibc_qp; - *((koib_wire_connreq_t *)req->accept_param.reply_private_data) - = (koib_wire_connreq_t) { - .wcr_magic = cpu_to_le32(OPENIBNAL_MSG_MAGIC), - .wcr_version = cpu_to_le16(OPENIBNAL_MSG_VERSION), - .wcr_queue_depth = cpu_to_le32(OPENIBNAL_MSG_QUEUE_SIZE), - .wcr_nid = cpu_to_le64(koibnal_data.koib_nid), - .wcr_incarnation = cpu_to_le64(koibnal_data.koib_incarnation), + *((kib_wire_connreq_t *)req->accept_param.reply_private_data) + = 
(kib_wire_connreq_t) { + .wcr_magic = cpu_to_le32(IBNAL_MSG_MAGIC), + .wcr_version = cpu_to_le16(IBNAL_MSG_VERSION), + .wcr_queue_depth = cpu_to_le32(IBNAL_MSG_QUEUE_SIZE), + .wcr_nid = cpu_to_le64(kibnal_data.kib_nid), + .wcr_incarnation = cpu_to_le64(kibnal_data.kib_incarnation), }; - req->accept_param.reply_private_data_len = sizeof(koib_wire_connreq_t); - req->accept_param.responder_resources = OPENIBNAL_RESPONDER_RESOURCES; - req->accept_param.initiator_depth = OPENIBNAL_RESPONDER_RESOURCES; - req->accept_param.rnr_retry_count = OPENIBNAL_RNR_RETRY; - req->accept_param.flow_control = OPENIBNAL_FLOW_CONTROL; + req->accept_param.reply_private_data_len = sizeof(kib_wire_connreq_t); + req->accept_param.responder_resources = IBNAL_RESPONDER_RESOURCES; + req->accept_param.initiator_depth = IBNAL_RESPONDER_RESOURCES; + req->accept_param.rnr_retry_count = IBNAL_RNR_RETRY; + req->accept_param.flow_control = IBNAL_FLOW_CONTROL; CDEBUG(D_NET, "Proceeding\n"); break; @@ -1972,60 +2033,60 @@ koibnal_passive_conn_callback (tTS_IB_CM_EVENT event, CDEBUG(D_WARNING, "Connection %p -> "LPX64" ESTABLISHED.\n", conn, conn->ibc_peer->ibp_nid); - koibnal_connreq_done (conn, 0, 0); + kibnal_connreq_done (conn, 0, 0); break; } - /* NB if the connreq is done, we switch to koibnal_conn_callback */ + /* NB if the connreq is done, we switch to kibnal_conn_callback */ return TS_IB_CM_CALLBACK_PROCEED; } tTS_IB_CM_CALLBACK_RETURN -koibnal_active_conn_callback (tTS_IB_CM_EVENT event, +kibnal_active_conn_callback (tTS_IB_CM_EVENT event, tTS_IB_CM_COMM_ID cid, void *param, void *arg) { - koib_conn_t *conn = arg; + kib_conn_t *conn = arg; switch (event) { case TS_IB_CM_REP_RECEIVED: { struct ib_cm_rep_received_param *rep = param; - koib_wire_connreq_t *wcr = rep->remote_private_data; + kib_wire_connreq_t *wcr = rep->remote_private_data; if (rep->remote_private_data_len < sizeof (*wcr)) { CERROR ("Short reply from "LPX64": %d\n", conn->ibc_peer->ibp_nid, rep->remote_private_data_len); - koibnal_connreq_done (conn, 1, -EPROTO); + kibnal_connreq_done (conn, 1, -EPROTO); break; } - if (wcr->wcr_magic != cpu_to_le32(OPENIBNAL_MSG_MAGIC)) { + if (wcr->wcr_magic != cpu_to_le32(IBNAL_MSG_MAGIC)) { CERROR ("Can't connect "LPX64": bad magic %08x\n", conn->ibc_peer->ibp_nid, le32_to_cpu(wcr->wcr_magic)); - koibnal_connreq_done (conn, 1, -EPROTO); + kibnal_connreq_done (conn, 1, -EPROTO); break; } - if (wcr->wcr_version != cpu_to_le16(OPENIBNAL_MSG_VERSION)) { + if (wcr->wcr_version != cpu_to_le16(IBNAL_MSG_VERSION)) { CERROR ("Can't connect "LPX64": bad version %d\n", conn->ibc_peer->ibp_nid, le16_to_cpu(wcr->wcr_magic)); - koibnal_connreq_done (conn, 1, -EPROTO); + kibnal_connreq_done (conn, 1, -EPROTO); break; } - if (wcr->wcr_queue_depth != cpu_to_le16(OPENIBNAL_MSG_QUEUE_SIZE)) { + if (wcr->wcr_queue_depth != cpu_to_le16(IBNAL_MSG_QUEUE_SIZE)) { CERROR ("Can't connect "LPX64": bad queue depth %d\n", conn->ibc_peer->ibp_nid, le16_to_cpu(wcr->wcr_queue_depth)); - koibnal_connreq_done (conn, 1, -EPROTO); + kibnal_connreq_done (conn, 1, -EPROTO); break; } if (le64_to_cpu(wcr->wcr_nid) != conn->ibc_peer->ibp_nid) { CERROR ("Unexpected NID "LPX64" from "LPX64"\n", le64_to_cpu(wcr->wcr_nid), conn->ibc_peer->ibp_nid); - koibnal_connreq_done (conn, 1, -EPROTO); + kibnal_connreq_done (conn, 1, -EPROTO); break; } @@ -2033,7 +2094,7 @@ koibnal_active_conn_callback (tTS_IB_CM_EVENT event, conn, conn->ibc_peer->ibp_nid); conn->ibc_incarnation = le64_to_cpu(wcr->wcr_incarnation); - conn->ibc_credits = OPENIBNAL_MSG_QUEUE_SIZE; + 
conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE; break; } @@ -2041,86 +2102,86 @@ koibnal_active_conn_callback (tTS_IB_CM_EVENT event, CDEBUG(D_WARNING, "Connection %p -> "LPX64" Established\n", conn, conn->ibc_peer->ibp_nid); - koibnal_connreq_done (conn, 1, 0); + kibnal_connreq_done (conn, 1, 0); break; case TS_IB_CM_IDLE: CERROR("Connection %p -> "LPX64" IDLE\n", conn, conn->ibc_peer->ibp_nid); /* Back out state change: I'm disengaged from CM */ - conn->ibc_state = OPENIBNAL_CONN_INIT_QP; + conn->ibc_state = IBNAL_CONN_INIT_QP; - koibnal_connreq_done (conn, 1, -ECONNABORTED); + kibnal_connreq_done (conn, 1, -ECONNABORTED); break; default: CERROR("Connection %p -> "LPX64" ERROR %d\n", conn, conn->ibc_peer->ibp_nid, event); - koibnal_connreq_done (conn, 1, -ECONNABORTED); + kibnal_connreq_done (conn, 1, -ECONNABORTED); break; } - /* NB if the connreq is done, we switch to koibnal_conn_callback */ + /* NB if the connreq is done, we switch to kibnal_conn_callback */ return TS_IB_CM_CALLBACK_PROCEED; } int -koibnal_pathreq_callback (tTS_IB_CLIENT_QUERY_TID tid, int status, +kibnal_pathreq_callback (tTS_IB_CLIENT_QUERY_TID tid, int status, struct ib_path_record *resp, int remaining, void *arg) { - koib_conn_t *conn = arg; + kib_conn_t *conn = arg; if (status != 0) { CERROR ("status %d\n", status); - koibnal_connreq_done (conn, 1, status); + kibnal_connreq_done (conn, 1, status); goto out; } conn->ibc_connreq->cr_path = *resp; - conn->ibc_connreq->cr_wcr = (koib_wire_connreq_t) { - .wcr_magic = cpu_to_le32(OPENIBNAL_MSG_MAGIC), - .wcr_version = cpu_to_le16(OPENIBNAL_MSG_VERSION), - .wcr_queue_depth = cpu_to_le16(OPENIBNAL_MSG_QUEUE_SIZE), - .wcr_nid = cpu_to_le64(koibnal_data.koib_nid), - .wcr_incarnation = cpu_to_le64(koibnal_data.koib_incarnation), + conn->ibc_connreq->cr_wcr = (kib_wire_connreq_t) { + .wcr_magic = cpu_to_le32(IBNAL_MSG_MAGIC), + .wcr_version = cpu_to_le16(IBNAL_MSG_VERSION), + .wcr_queue_depth = cpu_to_le16(IBNAL_MSG_QUEUE_SIZE), + .wcr_nid = cpu_to_le64(kibnal_data.kib_nid), + .wcr_incarnation = cpu_to_le64(kibnal_data.kib_incarnation), }; conn->ibc_connreq->cr_connparam = (struct ib_cm_active_param) { .qp = conn->ibc_qp, .req_private_data = &conn->ibc_connreq->cr_wcr, .req_private_data_len = sizeof(conn->ibc_connreq->cr_wcr), - .responder_resources = OPENIBNAL_RESPONDER_RESOURCES, - .initiator_depth = OPENIBNAL_RESPONDER_RESOURCES, - .retry_count = OPENIBNAL_RETRY, - .rnr_retry_count = OPENIBNAL_RNR_RETRY, - .cm_response_timeout = koibnal_tunables.koib_io_timeout, - .max_cm_retries = OPENIBNAL_CM_RETRY, - .flow_control = OPENIBNAL_FLOW_CONTROL, + .responder_resources = IBNAL_RESPONDER_RESOURCES, + .initiator_depth = IBNAL_RESPONDER_RESOURCES, + .retry_count = IBNAL_RETRY, + .rnr_retry_count = IBNAL_RNR_RETRY, + .cm_response_timeout = kibnal_tunables.kib_io_timeout, + .max_cm_retries = IBNAL_CM_RETRY, + .flow_control = IBNAL_FLOW_CONTROL, }; /* XXX set timeout just like SDP!!!*/ conn->ibc_connreq->cr_path.packet_life = 13; /* Flag I'm getting involved with the CM... 
*/ - conn->ibc_state = OPENIBNAL_CONN_CONNECTING; + conn->ibc_state = IBNAL_CONN_CONNECTING; CDEBUG(D_NET, "Connecting to, service id "LPX64", on "LPX64"\n", conn->ibc_connreq->cr_service.service_id, - *koibnal_service_nid_field(&conn->ibc_connreq->cr_service)); + *kibnal_service_nid_field(&conn->ibc_connreq->cr_service)); - /* koibnal_connect_callback gets my conn ref */ + /* kibnal_connect_callback gets my conn ref */ status = ib_cm_connect (&conn->ibc_connreq->cr_connparam, &conn->ibc_connreq->cr_path, NULL, conn->ibc_connreq->cr_service.service_id, 0, - koibnal_active_conn_callback, conn, + kibnal_active_conn_callback, conn, &conn->ibc_comm_id); if (status != 0) { CERROR ("Connect: %d\n", status); /* Back out state change: I've not got a CM comm_id yet... */ - conn->ibc_state = OPENIBNAL_CONN_INIT_QP; - koibnal_connreq_done (conn, 1, status); + conn->ibc_state = IBNAL_CONN_INIT_QP; + kibnal_connreq_done (conn, 1, status); } out: @@ -2129,58 +2190,58 @@ koibnal_pathreq_callback (tTS_IB_CLIENT_QUERY_TID tid, int status, } void -koibnal_service_get_callback (tTS_IB_CLIENT_QUERY_TID tid, int status, - struct ib_common_attrib_service *resp, void *arg) +kibnal_service_get_callback (tTS_IB_CLIENT_QUERY_TID tid, int status, + struct ib_common_attrib_service *resp, void *arg) { - koib_conn_t *conn = arg; + kib_conn_t *conn = arg; if (status != 0) { CERROR ("status %d\n", status); - koibnal_connreq_done (conn, 1, status); + kibnal_connreq_done (conn, 1, status); return; } CDEBUG(D_NET, "Got status %d, service id "LPX64", on "LPX64"\n", status, resp->service_id, - *koibnal_service_nid_field(resp)); + *kibnal_service_nid_field(resp)); conn->ibc_connreq->cr_service = *resp; - status = ib_cached_gid_get(koibnal_data.koib_device, - koibnal_data.koib_port, 0, + status = ib_cached_gid_get(kibnal_data.kib_device, + kibnal_data.kib_port, 0, conn->ibc_connreq->cr_gid); LASSERT (status == 0); - /* koibnal_pathreq_callback gets my conn ref */ - status = tsIbPathRecordRequest (koibnal_data.koib_device, - koibnal_data.koib_port, + /* kibnal_pathreq_callback gets my conn ref */ + status = tsIbPathRecordRequest (kibnal_data.kib_device, + kibnal_data.kib_port, conn->ibc_connreq->cr_gid, conn->ibc_connreq->cr_service.service_gid, conn->ibc_connreq->cr_service.service_pkey, 0, - koibnal_tunables.koib_io_timeout * HZ, + kibnal_tunables.kib_io_timeout * HZ, 0, - koibnal_pathreq_callback, conn, + kibnal_pathreq_callback, conn, &conn->ibc_connreq->cr_tid); if (status == 0) return; CERROR ("Path record request: %d\n", status); - koibnal_connreq_done (conn, 1, status); + kibnal_connreq_done (conn, 1, status); } void -koibnal_connect_peer (koib_peer_t *peer) +kibnal_connect_peer (kib_peer_t *peer) { - koib_conn_t *conn = koibnal_create_conn(); + kib_conn_t *conn = kibnal_create_conn(); int rc; LASSERT (peer->ibp_connecting != 0); if (conn == NULL) { CERROR ("Can't allocate conn\n"); - koibnal_peer_connect_failed (peer, 1, -ENOMEM); + kibnal_peer_connect_failed (peer, 1, -ENOMEM); return; } @@ -2190,85 +2251,101 @@ koibnal_connect_peer (koib_peer_t *peer) PORTAL_ALLOC (conn->ibc_connreq, sizeof (*conn->ibc_connreq)); if (conn->ibc_connreq == NULL) { CERROR ("Can't allocate connreq\n"); - koibnal_connreq_done (conn, 1, -ENOMEM); + kibnal_connreq_done (conn, 1, -ENOMEM); return; } memset(conn->ibc_connreq, 0, sizeof (*conn->ibc_connreq)); - koibnal_set_service_keys(&conn->ibc_connreq->cr_service, peer->ibp_nid); + kibnal_set_service_keys(&conn->ibc_connreq->cr_service, peer->ibp_nid); - /* koibnal_service_get_callback gets 
my conn ref */ - rc = ib_service_get (koibnal_data.koib_device, - koibnal_data.koib_port, + /* kibnal_service_get_callback gets my conn ref */ + rc = ib_service_get (kibnal_data.kib_device, + kibnal_data.kib_port, &conn->ibc_connreq->cr_service, - KOIBNAL_SERVICE_KEY_MASK, - koibnal_tunables.koib_io_timeout * HZ, - koibnal_service_get_callback, conn, + KIBNAL_SERVICE_KEY_MASK, + kibnal_tunables.kib_io_timeout * HZ, + kibnal_service_get_callback, conn, &conn->ibc_connreq->cr_tid); if (rc == 0) return; CERROR ("ib_service_get: %d\n", rc); - koibnal_connreq_done (conn, 1, rc); + kibnal_connreq_done (conn, 1, rc); } int -koibnal_conn_timed_out (koib_conn_t *conn) +kibnal_conn_timed_out (kib_conn_t *conn) { - koib_tx_t *tx; + kib_tx_t *tx; struct list_head *ttmp; unsigned long flags; - int rc = 0; spin_lock_irqsave (&conn->ibc_lock, flags); - list_for_each (ttmp, &conn->ibc_rdma_queue) { - tx = list_entry (ttmp, koib_tx_t, tx_list); + list_for_each (ttmp, &conn->ibc_tx_queue) { + tx = list_entry (ttmp, kib_tx_t, tx_list); - LASSERT (tx->tx_passive_rdma); - LASSERT (tx->tx_passive_rdma_wait); + LASSERT (!tx->tx_passive_rdma_wait); + LASSERT (tx->tx_sending == 0); - if (time_after_eq (jiffies, tx->tx_passive_rdma_deadline)) { - rc = 1; - break; + if (time_after_eq (jiffies, tx->tx_deadline)) { + spin_unlock_irqrestore (&conn->ibc_lock, flags); + return 1; } } + + list_for_each (ttmp, &conn->ibc_active_txs) { + tx = list_entry (ttmp, kib_tx_t, tx_list); + + LASSERT (tx->tx_passive_rdma || + !tx->tx_passive_rdma_wait); + + LASSERT (tx->tx_passive_rdma_wait || + tx->tx_sending != 0); + + if (time_after_eq (jiffies, tx->tx_deadline)) { + spin_unlock_irqrestore (&conn->ibc_lock, flags); + return 1; + } + } + spin_unlock_irqrestore (&conn->ibc_lock, flags); - return rc; + return 0; } void -koibnal_check_conns (int idx) +kibnal_check_conns (int idx) { - struct list_head *peers = &koibnal_data.koib_peers[idx]; + struct list_head *peers = &kibnal_data.kib_peers[idx]; struct list_head *ptmp; - koib_peer_t *peer; - koib_conn_t *conn; + kib_peer_t *peer; + kib_conn_t *conn; struct list_head *ctmp; again: /* NB. We expect to have a look at all the peers and not find any * rdmas to time out, so we just use a shared lock while we * take a look... */ - read_lock (&koibnal_data.koib_global_lock); + read_lock (&kibnal_data.kib_global_lock); list_for_each (ptmp, peers) { - peer = list_entry (ptmp, koib_peer_t, ibp_list); + peer = list_entry (ptmp, kib_peer_t, ibp_list); list_for_each (ctmp, &peer->ibp_conns) { - conn = list_entry (ctmp, koib_conn_t, ibc_list); + conn = list_entry (ctmp, kib_conn_t, ibc_list); + + LASSERT (conn->ibc_state == IBNAL_CONN_ESTABLISHED); - LASSERT (conn->ibc_state == OPENIBNAL_CONN_ESTABLISHED); /* In case we have enough credits to return via a * NOOP, but there were no non-blocking tx descs * free to do it last time... 
*/ - koibnal_check_sends(conn); + kibnal_check_sends(conn); - if (!koibnal_conn_timed_out(conn)) + if (!kibnal_conn_timed_out(conn)) continue; CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", @@ -2276,108 +2353,76 @@ koibnal_check_conns (int idx) atomic_read (&conn->ibc_refcount)); atomic_inc (&conn->ibc_refcount); - read_unlock (&koibnal_data.koib_global_lock); + read_unlock (&kibnal_data.kib_global_lock); CERROR("Timed out RDMA with "LPX64"\n", peer->ibp_nid); - koibnal_close_conn (conn, -ETIMEDOUT); - koibnal_put_conn (conn); + kibnal_close_conn (conn, -ETIMEDOUT); + kibnal_put_conn (conn); /* start again now I've dropped the lock */ goto again; } } - read_unlock (&koibnal_data.koib_global_lock); + read_unlock (&kibnal_data.kib_global_lock); } void -koibnal_terminate_conn (koib_conn_t *conn) +kibnal_terminate_conn (kib_conn_t *conn) { - unsigned long flags; int rc; - int done; CDEBUG(D_NET, "conn %p\n", conn); - LASSERT (conn->ibc_state == OPENIBNAL_CONN_DEATHROW); - conn->ibc_state = OPENIBNAL_CONN_ZOMBIE; + LASSERT (conn->ibc_state == IBNAL_CONN_DEATHROW); + conn->ibc_state = IBNAL_CONN_ZOMBIE; rc = ib_cm_disconnect (conn->ibc_comm_id); if (rc != 0) CERROR ("Error %d disconnecting conn %p -> "LPX64"\n", rc, conn, conn->ibc_peer->ibp_nid); - - /* complete blocked passive RDMAs */ - spin_lock_irqsave (&conn->ibc_lock, flags); - - while (!list_empty (&conn->ibc_rdma_queue)) { - koib_tx_t *tx = list_entry (conn->ibc_rdma_queue.next, - koib_tx_t, tx_list); - - LASSERT (tx->tx_passive_rdma); - LASSERT (tx->tx_passive_rdma_wait); - - list_del (&tx->tx_list); - - tx->tx_passive_rdma_wait = 0; - done = (tx->tx_sending == 0); - - tx->tx_status = -ECONNABORTED; - - spin_unlock_irqrestore (&conn->ibc_lock, flags); - - if (done) - koibnal_tx_done (tx); - - spin_lock_irqsave (&conn->ibc_lock, flags); - } - - spin_unlock_irqrestore (&conn->ibc_lock, flags); - - /* Complete all blocked transmits */ - koibnal_check_sends(conn); } int -koibnal_connd (void *arg) +kibnal_connd (void *arg) { wait_queue_t wait; unsigned long flags; - koib_conn_t *conn; - koib_peer_t *peer; + kib_conn_t *conn; + kib_peer_t *peer; int timeout; int i; int peer_index = 0; unsigned long deadline = jiffies; - kportal_daemonize ("koibnal_connd"); + kportal_daemonize ("kibnal_connd"); kportal_blockallsigs (); init_waitqueue_entry (&wait, current); - spin_lock_irqsave (&koibnal_data.koib_connd_lock, flags); + spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags); for (;;) { - if (!list_empty (&koibnal_data.koib_connd_conns)) { - conn = list_entry (koibnal_data.koib_connd_conns.next, - koib_conn_t, ibc_list); + if (!list_empty (&kibnal_data.kib_connd_conns)) { + conn = list_entry (kibnal_data.kib_connd_conns.next, + kib_conn_t, ibc_list); list_del (&conn->ibc_list); - spin_unlock_irqrestore (&koibnal_data.koib_connd_lock, flags); + spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags); switch (conn->ibc_state) { - case OPENIBNAL_CONN_DEATHROW: + case IBNAL_CONN_DEATHROW: LASSERT (conn->ibc_comm_id != TS_IB_CM_COMM_ID_INVALID); /* Disconnect: conn becomes a zombie in the * callback and last ref reschedules it * here... 
*/ - koibnal_terminate_conn(conn); - koibnal_put_conn (conn); + kibnal_terminate_conn(conn); + kibnal_put_conn (conn); break; - case OPENIBNAL_CONN_ZOMBIE: - koibnal_destroy_conn (conn); + case IBNAL_CONN_ZOMBIE: + kibnal_destroy_conn (conn); break; default: @@ -2386,35 +2431,35 @@ koibnal_connd (void *arg) LBUG(); } - spin_lock_irqsave (&koibnal_data.koib_connd_lock, flags); + spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags); continue; } - if (!list_empty (&koibnal_data.koib_connd_peers)) { - peer = list_entry (koibnal_data.koib_connd_peers.next, - koib_peer_t, ibp_connd_list); + if (!list_empty (&kibnal_data.kib_connd_peers)) { + peer = list_entry (kibnal_data.kib_connd_peers.next, + kib_peer_t, ibp_connd_list); list_del_init (&peer->ibp_connd_list); - spin_unlock_irqrestore (&koibnal_data.koib_connd_lock, flags); + spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags); - koibnal_connect_peer (peer); - koibnal_put_peer (peer); + kibnal_connect_peer (peer); + kibnal_put_peer (peer); - spin_lock_irqsave (&koibnal_data.koib_connd_lock, flags); + spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags); } /* shut down and nobody left to reap... */ - if (koibnal_data.koib_shutdown && - atomic_read(&koibnal_data.koib_nconns) == 0) + if (kibnal_data.kib_shutdown && + atomic_read(&kibnal_data.kib_nconns) == 0) break; - spin_unlock_irqrestore (&koibnal_data.koib_connd_lock, flags); + spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags); /* careful with the jiffy wrap... */ while ((timeout = (int)(deadline - jiffies)) <= 0) { const int n = 4; const int p = 1; - int chunk = koibnal_data.koib_peer_hash_size; + int chunk = kibnal_data.kib_peer_hash_size; /* Time to check for RDMA timeouts on a few more * peers: I do checks every 'p' seconds on a @@ -2424,129 +2469,129 @@ koibnal_connd (void *arg) * connection within (n+1)/n times the timeout * interval. 
*/ - if (koibnal_tunables.koib_io_timeout > n * p) + if (kibnal_tunables.kib_io_timeout > n * p) chunk = (chunk * n * p) / - koibnal_tunables.koib_io_timeout; + kibnal_tunables.kib_io_timeout; if (chunk == 0) chunk = 1; for (i = 0; i < chunk; i++) { - koibnal_check_conns (peer_index); + kibnal_check_conns (peer_index); peer_index = (peer_index + 1) % - koibnal_data.koib_peer_hash_size; + kibnal_data.kib_peer_hash_size; } deadline += p * HZ; } - koibnal_data.koib_connd_waketime = jiffies + timeout; + kibnal_data.kib_connd_waketime = jiffies + timeout; set_current_state (TASK_INTERRUPTIBLE); - add_wait_queue (&koibnal_data.koib_connd_waitq, &wait); + add_wait_queue (&kibnal_data.kib_connd_waitq, &wait); - if (!koibnal_data.koib_shutdown && - list_empty (&koibnal_data.koib_connd_conns) && - list_empty (&koibnal_data.koib_connd_peers)) + if (!kibnal_data.kib_shutdown && + list_empty (&kibnal_data.kib_connd_conns) && + list_empty (&kibnal_data.kib_connd_peers)) schedule_timeout (timeout); set_current_state (TASK_RUNNING); - remove_wait_queue (&koibnal_data.koib_connd_waitq, &wait); + remove_wait_queue (&kibnal_data.kib_connd_waitq, &wait); - spin_lock_irqsave (&koibnal_data.koib_connd_lock, flags); + spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags); } - spin_unlock_irqrestore (&koibnal_data.koib_connd_lock, flags); + spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags); - koibnal_thread_fini (); + kibnal_thread_fini (); return (0); } int -koibnal_scheduler(void *arg) +kibnal_scheduler(void *arg) { long id = (long)arg; char name[16]; - koib_rx_t *rx; - koib_tx_t *tx; + kib_rx_t *rx; + kib_tx_t *tx; unsigned long flags; int rc; int counter = 0; int did_something; - snprintf(name, sizeof(name), "koibnal_sd_%02ld", id); + snprintf(name, sizeof(name), "kibnal_sd_%02ld", id); kportal_daemonize(name); kportal_blockallsigs(); - spin_lock_irqsave(&koibnal_data.koib_sched_lock, flags); + spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags); for (;;) { did_something = 0; - while (!list_empty(&koibnal_data.koib_sched_txq)) { - tx = list_entry(koibnal_data.koib_sched_txq.next, - koib_tx_t, tx_list); + while (!list_empty(&kibnal_data.kib_sched_txq)) { + tx = list_entry(kibnal_data.kib_sched_txq.next, + kib_tx_t, tx_list); list_del(&tx->tx_list); - spin_unlock_irqrestore(&koibnal_data.koib_sched_lock, + spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags); - koibnal_tx_done(tx); + kibnal_tx_done(tx); - spin_lock_irqsave(&koibnal_data.koib_sched_lock, + spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags); } - if (!list_empty(&koibnal_data.koib_sched_rxq)) { - rx = list_entry(koibnal_data.koib_sched_rxq.next, - koib_rx_t, rx_list); + if (!list_empty(&kibnal_data.kib_sched_rxq)) { + rx = list_entry(kibnal_data.kib_sched_rxq.next, + kib_rx_t, rx_list); list_del(&rx->rx_list); - spin_unlock_irqrestore(&koibnal_data.koib_sched_lock, + spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags); - koibnal_rx(rx); + kibnal_rx(rx); did_something = 1; - spin_lock_irqsave(&koibnal_data.koib_sched_lock, + spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags); } /* shut down and no receives to complete... 
*/ - if (koibnal_data.koib_shutdown && - atomic_read(&koibnal_data.koib_nconns) == 0) + if (kibnal_data.kib_shutdown && + atomic_read(&kibnal_data.kib_nconns) == 0) break; /* nothing to do or hogging CPU */ - if (!did_something || counter++ == OPENIBNAL_RESCHED) { - spin_unlock_irqrestore(&koibnal_data.koib_sched_lock, + if (!did_something || counter++ == IBNAL_RESCHED) { + spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags); counter = 0; if (!did_something) { rc = wait_event_interruptible( - koibnal_data.koib_sched_waitq, - !list_empty(&koibnal_data.koib_sched_txq) || - !list_empty(&koibnal_data.koib_sched_rxq) || - (koibnal_data.koib_shutdown && - atomic_read (&koibnal_data.koib_nconns) == 0)); + kibnal_data.kib_sched_waitq, + !list_empty(&kibnal_data.kib_sched_txq) || + !list_empty(&kibnal_data.kib_sched_rxq) || + (kibnal_data.kib_shutdown && + atomic_read (&kibnal_data.kib_nconns) == 0)); } else { our_cond_resched(); } - spin_lock_irqsave(&koibnal_data.koib_sched_lock, + spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags); } } - spin_unlock_irqrestore(&koibnal_data.koib_sched_lock, flags); + spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags); - koibnal_thread_fini(); + kibnal_thread_fini(); return (0); } -lib_nal_t koibnal_lib = { - libnal_data: &koibnal_data, /* NAL private data */ - libnal_send: koibnal_send, - libnal_send_pages: koibnal_send_pages, - libnal_recv: koibnal_recv, - libnal_recv_pages: koibnal_recv_pages, - libnal_dist: koibnal_dist +lib_nal_t kibnal_lib = { + libnal_data: &kibnal_data, /* NAL private data */ + libnal_send: kibnal_send, + libnal_send_pages: kibnal_send_pages, + libnal_recv: kibnal_recv, + libnal_recv_pages: kibnal_recv_pages, + libnal_dist: kibnal_dist }; diff --git a/lnet/klnds/qswlnd/qswlnd.c b/lnet/klnds/qswlnd/qswlnd.c index 16123c2..5aff4e9 100644 --- a/lnet/klnds/qswlnd/qswlnd.c +++ b/lnet/klnds/qswlnd/qswlnd.c @@ -40,10 +40,10 @@ kpr_nal_interface_t kqswnal_router_interface = { #define QSWNAL_SYSCTL 201 #define QSWNAL_SYSCTL_OPTIMIZED_GETS 1 -#define QSWNAL_SYSCTL_COPY_SMALL_FWD 2 +#define QSWNAL_SYSCTL_OPTIMIZED_PUTS 2 static ctl_table kqswnal_ctl_table[] = { - {QSWNAL_SYSCTL_OPTIMIZED_GETS, "optimized_puts", + {QSWNAL_SYSCTL_OPTIMIZED_PUTS, "optimized_puts", &kqswnal_tunables.kqn_optimized_puts, sizeof (int), 0644, NULL, &proc_dointvec}, {QSWNAL_SYSCTL_OPTIMIZED_GETS, "optimized_gets", @@ -121,6 +121,8 @@ static void kqswnal_shutdown(nal_t *nal) { unsigned long flags; + kqswnal_tx_t *ktx; + kqswnal_rx_t *krx; int do_lib_fini = 0; /* NB The first ref was this module! */ @@ -267,37 +269,25 @@ kqswnal_shutdown(nal_t *nal) * ep_dvma_release() get fixed (and releases any mappings in the * region), we can delete all the code from here --------> */ - if (kqswnal_data.kqn_txds != NULL) { - int i; + for (ktx = kqswnal_data.kqn_txds; ktx != NULL; ktx = ktx->ktx_alloclist) { + /* If ktx has a buffer, it got mapped; unmap now. NB only + * the pre-mapped stuff is still mapped since all tx descs + * must be idle */ - for (i = 0; i < KQSW_NTXMSGS + KQSW_NNBLK_TXMSGS; i++) { - kqswnal_tx_t *ktx = &kqswnal_data.kqn_txds[i]; - - /* If ktx has a buffer, it got mapped; unmap now. 
- * NB only the pre-mapped stuff is still mapped - * since all tx descs must be idle */ - - if (ktx->ktx_buffer != NULL) - ep_dvma_unload(kqswnal_data.kqn_ep, - kqswnal_data.kqn_ep_tx_nmh, - &ktx->ktx_ebuffer); - } + if (ktx->ktx_buffer != NULL) + ep_dvma_unload(kqswnal_data.kqn_ep, + kqswnal_data.kqn_ep_tx_nmh, + &ktx->ktx_ebuffer); } - if (kqswnal_data.kqn_rxds != NULL) { - int i; - - for (i = 0; i < KQSW_NRXMSGS_SMALL + KQSW_NRXMSGS_LARGE; i++) { - kqswnal_rx_t *krx = &kqswnal_data.kqn_rxds[i]; - - /* If krx_kiov[0].kiov_page got allocated, it got mapped. - * NB subsequent pages get merged */ + for (krx = kqswnal_data.kqn_rxds; krx != NULL; krx = krx->krx_alloclist) { + /* If krx_kiov[0].kiov_page got allocated, it got mapped. + * NB subsequent pages get merged */ - if (krx->krx_kiov[0].kiov_page != NULL) - ep_dvma_unload(kqswnal_data.kqn_ep, - kqswnal_data.kqn_ep_rx_nmh, - &krx->krx_elanbuffer); - } + if (krx->krx_kiov[0].kiov_page != NULL) + ep_dvma_unload(kqswnal_data.kqn_ep, + kqswnal_data.kqn_ep_rx_nmh, + &krx->krx_elanbuffer); } /* <----------- to here */ @@ -330,41 +320,26 @@ kqswnal_shutdown(nal_t *nal) } #endif - if (kqswnal_data.kqn_txds != NULL) - { - int i; + while (kqswnal_data.kqn_txds != NULL) { + ktx = kqswnal_data.kqn_txds; - for (i = 0; i < KQSW_NTXMSGS + KQSW_NNBLK_TXMSGS; i++) - { - kqswnal_tx_t *ktx = &kqswnal_data.kqn_txds[i]; - - if (ktx->ktx_buffer != NULL) - PORTAL_FREE(ktx->ktx_buffer, - KQSW_TX_BUFFER_SIZE); - } + if (ktx->ktx_buffer != NULL) + PORTAL_FREE(ktx->ktx_buffer, KQSW_TX_BUFFER_SIZE); - PORTAL_FREE(kqswnal_data.kqn_txds, - sizeof (kqswnal_tx_t) * (KQSW_NTXMSGS + - KQSW_NNBLK_TXMSGS)); + kqswnal_data.kqn_txds = ktx->ktx_alloclist; + PORTAL_FREE(ktx, sizeof(*ktx)); } - if (kqswnal_data.kqn_rxds != NULL) - { - int i; - int j; + while (kqswnal_data.kqn_rxds != NULL) { + int i; - for (i = 0; i < KQSW_NRXMSGS_SMALL + KQSW_NRXMSGS_LARGE; i++) - { - kqswnal_rx_t *krx = &kqswnal_data.kqn_rxds[i]; + krx = kqswnal_data.kqn_rxds; + for (i = 0; i < krx->krx_npages; i++) + if (krx->krx_kiov[i].kiov_page != NULL) + __free_page (krx->krx_kiov[i].kiov_page); - for (j = 0; j < krx->krx_npages; j++) - if (krx->krx_kiov[j].kiov_page != NULL) - __free_page (krx->krx_kiov[j].kiov_page); - } - - PORTAL_FREE(kqswnal_data.kqn_rxds, - sizeof(kqswnal_rx_t) * (KQSW_NRXMSGS_SMALL + - KQSW_NRXMSGS_LARGE)); + kqswnal_data.kqn_rxds = krx->krx_alloclist; + PORTAL_FREE(krx, sizeof (*krx)); } /* resets flags, pointers to NULL etc */ @@ -388,6 +363,8 @@ kqswnal_startup (nal_t *nal, ptl_pid_t requested_pid, #endif int rc; int i; + kqswnal_rx_t *krx; + kqswnal_tx_t *ktx; int elan_page_idx; ptl_process_id_t my_process_id; int pkmem = atomic_read(&portal_kmemory); @@ -560,23 +537,22 @@ kqswnal_startup (nal_t *nal, ptl_pid_t requested_pid, /**********************************************************************/ /* Allocate/Initialise transmit descriptors */ - PORTAL_ALLOC(kqswnal_data.kqn_txds, - sizeof(kqswnal_tx_t) * (KQSW_NTXMSGS + KQSW_NNBLK_TXMSGS)); - if (kqswnal_data.kqn_txds == NULL) - { - kqswnal_shutdown (nal); - return (PTL_NO_SPACE); - } - - /* clear flags, null pointers etc */ - memset(kqswnal_data.kqn_txds, 0, - sizeof(kqswnal_tx_t) * (KQSW_NTXMSGS + KQSW_NNBLK_TXMSGS)); + kqswnal_data.kqn_txds = NULL; for (i = 0; i < (KQSW_NTXMSGS + KQSW_NNBLK_TXMSGS); i++) { int premapped_pages; - kqswnal_tx_t *ktx = &kqswnal_data.kqn_txds[i]; int basepage = i * KQSW_NTXMSGPAGES; + PORTAL_ALLOC (ktx, sizeof(*ktx)); + if (ktx == NULL) { + kqswnal_shutdown (nal); + return (PTL_NO_SPACE); + } + 
+ memset(ktx, 0, sizeof(*ktx)); /* NULL pointers; zero flags */ + ktx->ktx_alloclist = kqswnal_data.kqn_txds; + kqswnal_data.kqn_txds = ktx; + PORTAL_ALLOC (ktx->ktx_buffer, KQSW_TX_BUFFER_SIZE); if (ktx->ktx_buffer == NULL) { @@ -615,18 +591,7 @@ kqswnal_startup (nal_t *nal, ptl_pid_t requested_pid, /**********************************************************************/ /* Allocate/Initialise receive descriptors */ - - PORTAL_ALLOC (kqswnal_data.kqn_rxds, - sizeof (kqswnal_rx_t) * (KQSW_NRXMSGS_SMALL + KQSW_NRXMSGS_LARGE)); - if (kqswnal_data.kqn_rxds == NULL) - { - kqswnal_shutdown (nal); - return (PTL_NO_SPACE); - } - - memset(kqswnal_data.kqn_rxds, 0, /* clear flags, null pointers etc */ - sizeof(kqswnal_rx_t) * (KQSW_NRXMSGS_SMALL+KQSW_NRXMSGS_LARGE)); - + kqswnal_data.kqn_rxds = NULL; elan_page_idx = 0; for (i = 0; i < KQSW_NRXMSGS_SMALL + KQSW_NRXMSGS_LARGE; i++) { @@ -636,7 +601,16 @@ kqswnal_startup (nal_t *nal, ptl_pid_t requested_pid, E3_Addr elanbuffer; #endif int j; - kqswnal_rx_t *krx = &kqswnal_data.kqn_rxds[i]; + + PORTAL_ALLOC(krx, sizeof(*krx)); + if (krx == NULL) { + kqswnal_shutdown(nal); + return (PTL_NO_SPACE); + } + + memset(krx, 0, sizeof(*krx)); /* clear flags, null pointers etc */ + krx->krx_alloclist = kqswnal_data.kqn_rxds; + kqswnal_data.kqn_rxds = krx; if (i < KQSW_NRXMSGS_SMALL) { @@ -717,10 +691,7 @@ kqswnal_startup (nal_t *nal, ptl_pid_t requested_pid, /**********************************************************************/ /* Queue receives, now that it's OK to run their completion callbacks */ - for (i = 0; i < KQSW_NRXMSGS_SMALL + KQSW_NRXMSGS_LARGE; i++) - { - kqswnal_rx_t *krx = &kqswnal_data.kqn_rxds[i]; - + for (krx = kqswnal_data.kqn_rxds; krx != NULL; krx = krx->krx_alloclist) { /* NB this enqueue can allocate/sleep (attr == 0) */ krx->krx_state = KRX_POSTED; #if MULTIRAIL_EKC diff --git a/lnet/klnds/qswlnd/qswlnd.h b/lnet/klnds/qswlnd/qswlnd.h index 438edc6..b08d710 100644 --- a/lnet/klnds/qswlnd/qswlnd.h +++ b/lnet/klnds/qswlnd/qswlnd.h @@ -99,10 +99,10 @@ typedef unsigned long kqsw_csum_t; #define KQSW_TX_MAXCONTIG (1<<10) /* largest payload that gets made contiguous on transmit */ #define KQSW_NTXMSGS 8 /* # normal transmit messages */ -#define KQSW_NNBLK_TXMSGS 256 /* # reserved transmit messages if can't block */ +#define KQSW_NNBLK_TXMSGS 512 /* # reserved transmit messages if can't block */ #define KQSW_NRXMSGS_LARGE 64 /* # large receive buffers */ -#define KQSW_EP_ENVELOPES_LARGE 128 /* # large ep envelopes */ +#define KQSW_EP_ENVELOPES_LARGE 256 /* # large ep envelopes */ #define KQSW_NRXMSGS_SMALL 256 /* # small receive buffers */ #define KQSW_EP_ENVELOPES_SMALL 2048 /* # small ep envelopes */ @@ -144,9 +144,10 @@ typedef struct #endif } kqswnal_remotemd_t; -typedef struct +typedef struct kqswnal_rx { struct list_head krx_list; /* enqueue -> thread */ + struct kqswnal_rx *krx_alloclist; /* stack in kqn_rxds */ EP_RCVR *krx_eprx; /* port to post receives to */ EP_RXD *krx_rxd; /* receive descriptor (for repost) */ #if MULTIRAIL_EKC @@ -169,10 +170,11 @@ typedef struct #define KRX_COMPLETING 3 /* waiting to be completed */ -typedef struct +typedef struct kqswnal_tx { struct list_head ktx_list; /* enqueue idle/active */ struct list_head ktx_delayed_list; /* enqueue delayedtxds */ + struct kqswnal_tx *ktx_alloclist; /* stack in kqn_txds */ unsigned int ktx_isnblk:1; /* reserved descriptor? */ unsigned int ktx_state:7; /* What I'm doing */ unsigned int ktx_firsttmpfrag:1; /* ktx_frags[0] is in my ebuffer ? 
0 : 1 */ @@ -222,8 +224,8 @@ typedef struct char kqn_shuttingdown; /* I'm trying to shut down */ atomic_t kqn_nthreads; /* # threads running */ - kqswnal_rx_t *kqn_rxds; /* all the receive descriptors */ - kqswnal_tx_t *kqn_txds; /* all the transmit descriptors */ + kqswnal_rx_t *kqn_rxds; /* stack of all the receive descriptors */ + kqswnal_tx_t *kqn_txds; /* stack of all the transmit descriptors */ struct list_head kqn_idletxds; /* transmit descriptors free to use */ struct list_head kqn_nblk_idletxds; /* reserved free transmit descriptors */ diff --git a/lnet/klnds/scimaclnd/scimacnal.c b/lnet/klnds/scimaclnd/scimacnal.c index 75188e9..e77bd8e 100644 --- a/lnet/klnds/scimaclnd/scimacnal.c +++ b/lnet/klnds/scimaclnd/scimacnal.c @@ -205,7 +205,7 @@ static int kscimacnal_startup(nal_t *nal, ptl_pid_t requested_pid, } kscimacnal_data.ksci_nid = (ptl_nid_t)(ntohl(mac_physaddr)); - process_id.pid = requested_pid; + process_id.pid = 0; process_id.nid = kscimacnal_data.ksci_nid; CDEBUG(D_NET, "calling lib_init with nid "LPX64"\n", diff --git a/lnet/klnds/socklnd/socklnd.c b/lnet/klnds/socklnd/socklnd.c index 2a0ef11..7642770 100644 --- a/lnet/klnds/socklnd/socklnd.c +++ b/lnet/klnds/socklnd/socklnd.c @@ -1226,9 +1226,9 @@ ksocknal_create_conn (ksock_route_t *route, struct socket *sock, int type) conn2->ksnc_type != conn->ksnc_type || conn2->ksnc_incarnation != incarnation) continue; - + CWARN("Not creating duplicate connection to " - "%u.%u.%u.%u type %d\n", + "%u.%u.%u.%u type %d\n", HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_type); rc = -EALREADY; goto failed_2; @@ -1260,6 +1260,9 @@ ksocknal_create_conn (ksock_route_t *route, struct socket *sock, int type) break; } + /* Give conn a ref on sock->file since we're going to return success */ + get_file(sock->file); + conn->ksnc_peer = peer; /* conn takes my ref on peer */ conn->ksnc_incarnation = incarnation; peer->ksnp_last_alive = jiffies; @@ -1311,9 +1314,9 @@ ksocknal_create_conn (ksock_route_t *route, struct socket *sock, int type) ksocknal_putconnsock(conn); } - CWARN("New conn nid:"LPX64" [type:%d] %u.%u.%u.%u -> %u.%u.%u.%u/%d" + CWARN("New conn nid:"LPX64" %u.%u.%u.%u -> %u.%u.%u.%u/%d" " incarnation:"LPX64" sched[%d]/%d\n", - nid, conn->ksnc_type, HIPQUAD(conn->ksnc_myipaddr), + nid, HIPQUAD(conn->ksnc_myipaddr), HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port, incarnation, (int)(conn->ksnc_scheduler - ksocknal_data.ksnd_schedulers), irq); @@ -2054,8 +2057,7 @@ ksocknal_cmd(struct portals_cfg *pcfg, void * private) rc = -EINVAL; break; } - if (rc != 0) - fput (sock->file); + fput (sock->file); break; } case NAL_CMD_CLOSE_CONNECTION: { diff --git a/lnet/klnds/socklnd/socklnd.h b/lnet/klnds/socklnd/socklnd.h index 0a5266a..b8bbefd 100644 --- a/lnet/klnds/socklnd/socklnd.h +++ b/lnet/klnds/socklnd/socklnd.h @@ -66,9 +66,7 @@ #include #include #include -#include -#include #define SOCKNAL_N_AUTOCONNECTD 4 /* # socknal autoconnect daemons */ #define SOCKNAL_MIN_RECONNECT_INTERVAL HZ /* first failed connection retry... 
*/ diff --git a/lnet/klnds/socklnd/socklnd_cb.c b/lnet/klnds/socklnd/socklnd_cb.c index b22d501..762133e 100644 --- a/lnet/klnds/socklnd/socklnd_cb.c +++ b/lnet/klnds/socklnd/socklnd_cb.c @@ -2324,17 +2324,34 @@ ksocknal_setup_sock (struct socket *sock) return (0); } -int -ksocknal_connect_peer (ksock_route_t *route, int type) +static int +ksocknal_connect_sock(struct socket **sockp, int *may_retry, + ksock_route_t *route, int local_port) { - struct sockaddr_in ipaddr; - mm_segment_t oldmm = get_fs(); - struct timeval tv; - int fd; + struct sockaddr_in locaddr; + struct sockaddr_in srvaddr; struct socket *sock; int rc; - + int option; + mm_segment_t oldmm = get_fs(); + struct timeval tv; + + memset(&locaddr, 0, sizeof(locaddr)); + locaddr.sin_family = AF_INET; + locaddr.sin_port = htons(local_port); + locaddr.sin_addr.s_addr = + (route->ksnr_myipaddr != 0) ? htonl(route->ksnr_myipaddr) + : INADDR_ANY; + + memset (&srvaddr, 0, sizeof (srvaddr)); + srvaddr.sin_family = AF_INET; + srvaddr.sin_port = htons (route->ksnr_port); + srvaddr.sin_addr.s_addr = htonl (route->ksnr_ipaddr); + + *may_retry = 0; + rc = sock_create (PF_INET, SOCK_STREAM, 0, &sock); + *sockp = sock; if (rc != 0) { CERROR ("Can't create autoconnect socket: %d\n", rc); return (rc); @@ -2344,17 +2361,23 @@ ksocknal_connect_peer (ksock_route_t *route, int type) * from userspace. And we actually need the sock->file refcounting * that this gives you :) */ - fd = sock_map_fd (sock); - if (fd < 0) { + rc = sock_map_fd (sock); + if (rc < 0) { sock_release (sock); - CERROR ("sock_map_fd error %d\n", fd); - return (fd); + CERROR ("sock_map_fd error %d\n", rc); + return (rc); } - /* NB the fd now owns the ref on sock->file */ + /* NB the file descriptor (rc) now owns the ref on sock->file */ LASSERT (sock->file != NULL); LASSERT (file_count(sock->file) == 1); + get_file(sock->file); /* extra ref makes sock->file */ + sys_close(rc); /* survive this close */ + + /* Still got a single ref on sock->file */ + LASSERT (file_count(sock->file) == 1); + /* Set the socket timeouts, so our connection attempt completes in * finite time */ tv.tv_sec = ksocknal_tunables.ksnd_io_timeout; @@ -2367,7 +2390,7 @@ ksocknal_connect_peer (ksock_route_t *route, int type) if (rc != 0) { CERROR ("Can't set send timeout %d: %d\n", ksocknal_tunables.ksnd_io_timeout, rc); - goto out; + goto failed; } set_fs (KERNEL_DS); @@ -2377,53 +2400,83 @@ ksocknal_connect_peer (ksock_route_t *route, int type) if (rc != 0) { CERROR ("Can't set receive timeout %d: %d\n", ksocknal_tunables.ksnd_io_timeout, rc); - goto out; + goto failed; } - if (route->ksnr_myipaddr != 0) { - /* Bind to the local IP address */ - memset (&ipaddr, 0, sizeof (ipaddr)); - ipaddr.sin_family = AF_INET; - ipaddr.sin_port = htons (0); /* ANY */ - ipaddr.sin_addr.s_addr = htonl(route->ksnr_myipaddr); + set_fs (KERNEL_DS); + option = 1; + rc = sock_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, + (char *)&option, sizeof (option)); + set_fs (oldmm); + if (rc != 0) { + CERROR("Can't set SO_REUSEADDR for socket: %d\n", rc); + goto failed; + } - rc = sock->ops->bind (sock, (struct sockaddr *)&ipaddr, - sizeof (ipaddr)); - if (rc != 0) { - CERROR ("Can't bind to local IP %u.%u.%u.%u: %d\n", - HIPQUAD(route->ksnr_myipaddr), rc); - goto out; - } + rc = sock->ops->bind(sock, + (struct sockaddr *)&locaddr, sizeof(locaddr)); + if (rc == -EADDRINUSE) { + CDEBUG(D_NET, "Port %d already in use\n", local_port); + *may_retry = 1; + goto failed; } - - memset (&ipaddr, 0, sizeof (ipaddr)); - ipaddr.sin_family = AF_INET; - 
ipaddr.sin_port = htons (route->ksnr_port); - ipaddr.sin_addr.s_addr = htonl (route->ksnr_ipaddr); - - rc = sock->ops->connect (sock, (struct sockaddr *)&ipaddr, - sizeof (ipaddr), sock->file->f_flags); if (rc != 0) { - CERROR ("Can't connect to nid "LPX64 - " local IP: %u.%u.%u.%u," - " remote IP: %u.%u.%u.%u/%d: %d\n", - route->ksnr_peer->ksnp_nid, - HIPQUAD(route->ksnr_myipaddr), - HIPQUAD(route->ksnr_ipaddr), - route->ksnr_port, rc); - goto out; + CERROR("Error trying to bind to reserved port %d: %d\n", + local_port, rc); + goto failed; } - rc = ksocknal_create_conn (route, sock, type); - if (rc == 0) { - /* Take an extra ref on sock->file to compensate for the - * upcoming close which will lose fd's ref on it. */ - get_file (sock->file); + rc = sock->ops->connect(sock, + (struct sockaddr *)&srvaddr, sizeof(srvaddr), + sock->file->f_flags); + if (rc == 0) + return 0; + + /* EADDRNOTAVAIL probably means we're already connected to the same + * peer/port on the same local port on a differently typed + * connection. Let our caller retry with a different local + * port... */ + *may_retry = (rc == -EADDRNOTAVAIL); + + CDEBUG(*may_retry ? D_NET : D_ERROR, + "Error %d connecting %u.%u.%u.%u/%d -> %u.%u.%u.%u/%d\n", rc, + HIPQUAD(route->ksnr_myipaddr), local_port, + HIPQUAD(route->ksnr_ipaddr), route->ksnr_port); + + failed: + fput(sock->file); + return rc; +} + +int +ksocknal_connect_peer (ksock_route_t *route, int type) +{ + struct socket *sock; + int rc; + int port; + int may_retry; + + /* Iterate through reserved ports. When typed connections are + * used, we will need to bind to multiple ports, but we only know + * this at connect time. But, by that time we've already called + * bind() so we need a new socket. */ + + for (port = 1023; port > 512; --port) { + + rc = ksocknal_connect_sock(&sock, &may_retry, route, port); + + if (rc == 0) { + rc = ksocknal_create_conn(route, sock, type); + fput(sock->file); + return rc; + } + + if (!may_retry) + return rc; } - out: - sys_close (fd); - return (rc); + CERROR("Out of ports trying to bind to a reserved port\n"); + return (-EADDRINUSE); } void @@ -2443,7 +2496,6 @@ ksocknal_autoconnect (ksock_route_t *route) LASSERT (type < SOCKNAL_CONN_NTYPES); rc = ksocknal_connect_peer (route, type); - if (rc != 0) break; diff --git a/lnet/libcfs/debug.c b/lnet/libcfs/debug.c index c56f76f..f571958 100644 --- a/lnet/libcfs/debug.c +++ b/lnet/libcfs/debug.c @@ -60,7 +60,7 @@ #endif unsigned int portal_subsystem_debug = ~0 - (S_PORTALS | S_QSWNAL | S_SOCKNAL | - S_GMNAL | S_OPENIBNAL); + S_GMNAL | S_IBNAL); EXPORT_SYMBOL(portal_subsystem_debug); unsigned int portal_debug = (D_WARNING | D_DLMTRACE | D_ERROR | D_EMERG | D_HA | @@ -97,6 +97,7 @@ int portals_do_debug_dumplog(void *arg) snprintf(debug_file_name, sizeof(debug_file_path) - 1, "%s.%ld.%ld", debug_file_path, CURRENT_SECONDS, (long)arg); + printk(KERN_ALERT "LustreError: dumping log to %s\n", debug_file_name); tracefile_dump_all_pages(debug_file_name); current->journal_info = journal_info; @@ -180,7 +181,7 @@ int portals_debug_clear_buffer(void) int portals_debug_mark_buffer(char *text) { CDEBUG(D_TRACE,"***************************************************\n"); - CWARN("DEBUG MARKER: %s\n", text); + CDEBUG(D_WARNING, "DEBUG MARKER: %s\n", text); CDEBUG(D_TRACE,"***************************************************\n"); return 0; @@ -251,62 +252,46 @@ void portals_run_lbug_upcall(char *file, const char *fn, const int line) char *portals_nid2str(int nal, ptl_nid_t nid, char *str) { if (nid == PTL_NID_ANY) { - 
snprintf(str, PTL_NALFMT_SIZE - 1, "%s", - "PTL_NID_ANY"); + snprintf(str, PTL_NALFMT_SIZE, "%s", "PTL_NID_ANY"); return str; } switch(nal){ /* XXX this could be a nal method of some sort, 'cept it's config * dependent whether (say) socknal NIDs are actually IP addresses... */ -#ifndef CRAY_PORTALS +#if !CRAY_PORTALS case TCPNAL: /* userspace NAL */ + case IIBNAL: case OPENIBNAL: case SOCKNAL: - snprintf(str, PTL_NALFMT_SIZE - 1, "%u:%u.%u.%u.%u", + snprintf(str, PTL_NALFMT_SIZE, "%u:%u.%u.%u.%u", (__u32)(nid >> 32), HIPQUAD(nid)); break; case QSWNAL: case GMNAL: - snprintf(str, PTL_NALFMT_SIZE - 1, "%u:%u", + snprintf(str, PTL_NALFMT_SIZE, "%u:%u", (__u32)(nid >> 32), (__u32)nid); break; #endif default: - snprintf(str, PTL_NALFMT_SIZE - 1, "?%d? %llx", + snprintf(str, PTL_NALFMT_SIZE, "?%x? %llx", nal, (long long)nid); break; } return str; } -/* bug #4615 */ + char *portals_id2str(int nal, ptl_process_id_t id, char *str) { - switch(nal){ -#ifndef CRAY_PORTALS - case TCPNAL: - /* userspace NAL */ - case OPENIBNAL: - case SOCKNAL: - snprintf(str, PTL_NALFMT_SIZE - 1, "%u:%u.%u.%u.%u,%u", - (__u32)(id.nid >> 32), HIPQUAD((id.nid)) , id.pid); - break; - case QSWNAL: - case GMNAL: - snprintf(str, PTL_NALFMT_SIZE - 1, "%u:%u,%u", - (__u32)(id.nid >> 32), (__u32)id.nid, id.pid); - break; -#endif - default: - snprintf(str, PTL_NALFMT_SIZE - 1, "?%d? %llx,%lx", - nal, (long long)id.nid, (long)id.pid ); - break; - } + int len; + + portals_nid2str(nal, id.nid, str); + len = strlen(str); + snprintf(str + len, PTL_NALFMT_SIZE, "-%u", id.pid); return str; } - #ifdef __KERNEL__ char stack_backtrace[LUSTRE_TRACE_SIZE]; spinlock_t stack_backtrace_lock = SPIN_LOCK_UNLOCKED; diff --git a/lnet/libcfs/module.c b/lnet/libcfs/module.c index 3703013..a2422e3 100644 --- a/lnet/libcfs/module.c +++ b/lnet/libcfs/module.c @@ -327,6 +327,8 @@ libcfs_nal_cmd(struct portals_cfg *pcfg) CDEBUG(D_IOCTL, "calling handler nal: %d, cmd: %d\n", nal, pcfg->pcfg_command); rc = cmd->nch_handler(pcfg, cmd->nch_private); + } else { + CERROR("invalid nal: %d, cmd: %d\n", nal, pcfg->pcfg_command); } up(&nal_cmd_sem); @@ -413,15 +415,15 @@ static int libcfs_ioctl(struct inode *inode, struct file *file, portals_debug_mark_buffer(data->ioc_inlbuf1); RETURN(0); #if LWT_SUPPORT - case IOC_PORTAL_LWT_CONTROL: + case IOC_PORTAL_LWT_CONTROL: err = lwt_control (data->ioc_flags, data->ioc_misc); break; - + case IOC_PORTAL_LWT_SNAPSHOT: { cycles_t now; int ncpu; int total_size; - + err = lwt_snapshot (&now, &ncpu, &total_size, data->ioc_pbuf1, data->ioc_plen1); data->ioc_nid = now; @@ -429,15 +431,15 @@ static int libcfs_ioctl(struct inode *inode, struct file *file, data->ioc_misc = total_size; /* Hedge against broken user/kernel typedefs (e.g. 
cycles_t) */ - data->ioc_nid = sizeof(lwt_event_t); - data->ioc_nid2 = offsetof(lwt_event_t, lwte_where); + data->ioc_nid2 = sizeof(lwt_event_t); + data->ioc_nid3 = offsetof(lwt_event_t, lwte_where); if (err == 0 && copy_to_user((char *)arg, data, sizeof (*data))) err = -EFAULT; break; } - + case IOC_PORTAL_LWT_LOOKUP_STRING: err = lwt_lookup_string (&data->ioc_count, data->ioc_pbuf1, data->ioc_pbuf2, data->ioc_plen2); @@ -456,7 +458,7 @@ static int libcfs_ioctl(struct inode *inode, struct file *file, break; } - if (copy_from_user(&pcfg, (void *)data->ioc_pbuf1, + if (copy_from_user(&pcfg, (void *)data->ioc_pbuf1, sizeof(pcfg))) { err = -EFAULT; break; @@ -467,7 +469,7 @@ static int libcfs_ioctl(struct inode *inode, struct file *file, err = libcfs_nal_cmd(&pcfg); if (err == 0 && - copy_to_user((char *)data->ioc_pbuf1, &pcfg, + copy_to_user((char *)data->ioc_pbuf1, &pcfg, sizeof (pcfg))) err = -EFAULT; break; diff --git a/lnet/libcfs/tracefile.c b/lnet/libcfs/tracefile.c index 562abcf..5759316 100644 --- a/lnet/libcfs/tracefile.c +++ b/lnet/libcfs/tracefile.c @@ -38,7 +38,6 @@ #include #include -#include #include #define TCD_MAX_PAGES 1280 @@ -190,7 +189,7 @@ static void print_to_console(struct ptldebug_header *hdr, int mask, char *buf, prefix = "Lustre"; ptype = KERN_INFO; } - + printk("%s%s: %d:%d:(%s:%d:%s()) %.*s", ptype, prefix, hdr->ph_pid, hdr->ph_extern_pid, file, hdr->ph_line_num, fn, len, buf); } @@ -455,7 +454,7 @@ int tracefile_dump_all_pages(char *filename) if (IS_ERR(filp)) { rc = PTR_ERR(filp); printk(KERN_ERR "LustreError: can't open %s for dump: rc %d\n", - filename, rc); + filename, rc); goto out; } @@ -773,6 +772,7 @@ int trace_write_debug_size(struct file *file, const char *buffer, "(%lu).\n", max * smp_num_cpus, num_physpages / 5 * 4); return count; } + for (i = 0; i < NR_CPUS; i++) { struct trace_cpu_data *tcd; tcd = &trace_data[i].tcd; diff --git a/lnet/lnet/lib-move.c b/lnet/lnet/lib-move.c index 13451d9..d584f1c 100644 --- a/lnet/lnet/lib-move.c +++ b/lnet/lnet/lib-move.c @@ -83,7 +83,8 @@ lib_match_md(lib_nal_t *nal, int index, int op_mask, me->match_id.nid != src_nid) continue; - CDEBUG(D_NET,"match_id.pid [%x], src_pid [%x]\n", me->match_id.pid, src_pid); + CDEBUG(D_NET, "match_id.pid [%x], src_pid [%x]\n", + me->match_id.pid, src_pid); if (me->match_id.pid != PTL_PID_ANY && me->match_id.pid != src_pid) diff --git a/lnet/lnet/module.c b/lnet/lnet/module.c index eb41dfd..61ef372 100644 --- a/lnet/lnet/module.c +++ b/lnet/lnet/module.c @@ -83,7 +83,8 @@ static int kportal_ioctl(struct portal_ioctl_data *data, CDEBUG (D_IOCTL, "Getting nid for nal [%d]\n", data->ioc_nal); - err = PtlNIInit(data->ioc_nal, LUSTRE_SRV_PTL_PID, NULL, NULL, &nih); + err = PtlNIInit(data->ioc_nal, LUSTRE_SRV_PTL_PID, NULL, + NULL, &nih); if (!(err == PTL_OK || err == PTL_IFACE_DUP)) RETURN (-EINVAL); @@ -104,7 +105,8 @@ static int kportal_ioctl(struct portal_ioctl_data *data, CDEBUG (D_IOCTL, "fail nid: [%d] "LPU64" count %d\n", data->ioc_nal, data->ioc_nid, data->ioc_count); - err = PtlNIInit(data->ioc_nal, LUSTRE_SRV_PTL_PID, NULL, NULL, &nih); + err = PtlNIInit(data->ioc_nal, LUSTRE_SRV_PTL_PID, NULL, + NULL, &nih); if (!(err == PTL_OK || err == PTL_IFACE_DUP)) return (-EINVAL); diff --git a/lnet/router/proc.c b/lnet/router/proc.c index 0fe3b90..a1397d2 100644 --- a/lnet/router/proc.c +++ b/lnet/router/proc.c @@ -132,7 +132,7 @@ static int kpr_proc_routes_read(char *page, char **start, off_t off, *start = page + prd->skip; user_len = -prd->skip; - for (; prd->curr != &kpr_routes; 
prd->curr = prd->curr->next) { + while ((prd->curr != NULL) && (prd->curr != &kpr_routes)) { re = list_entry(prd->curr, kpr_route_entry_t, kpre_list); ge = re->kpre_gateway; @@ -144,11 +144,20 @@ static int kpr_proc_routes_read(char *page, char **start, off_t off, chunk_len += line_len; user_len += line_len; - /* The route table will exceed one page */ - if ((chunk_len > (PAGE_SIZE - 80)) || (user_len > count)) { - prd->curr = prd->curr->next; - break; + /* Abort the route list changed */ + if (prd->curr->next == NULL) { + prd->curr = NULL; + read_unlock(&kpr_rwlock); + return sprintf(page, "\nError: Routes Changed\n"); } + + prd->curr = prd->curr->next; + + /* The route table will exceed one page, break the while loop + * so the function can be re-called with a new page. + */ + if ((chunk_len > (PAGE_SIZE - 80)) || (user_len > count)) + break; } *eof = 0; diff --git a/lnet/ulnds/connection.c b/lnet/ulnds/connection.c index ed8dc08..b399fcf 100644 --- a/lnet/ulnds/connection.c +++ b/lnet/ulnds/connection.c @@ -331,10 +331,17 @@ connection force_tcp_connection(manager m, { connection conn; struct sockaddr_in addr; + struct sockaddr_in locaddr; unsigned int id[2]; struct timeval tv; __u64 incarnation; + int fd; + int option; + int rc; + int rport; + ptl_nid_t peernid = PTL_NID_ANY; + port = tcpnal_acceptor_port; id[0] = ip; @@ -343,49 +350,82 @@ connection force_tcp_connection(manager m, pthread_mutex_lock(&m->conn_lock); conn = hash_table_find(m->connections, id); - if (!conn) { - int fd; - int option; - ptl_nid_t peernid = PTL_NID_ANY; - - bzero((char *) &addr, sizeof(addr)); - addr.sin_family = AF_INET; - addr.sin_addr.s_addr = htonl(ip); - addr.sin_port = htons(port); - - if ((fd = socket(AF_INET, SOCK_STREAM, 0)) < 0) { - perror("tcpnal socket failed"); - exit(-1); - } - if (connect(fd, (struct sockaddr *)&addr, - sizeof(struct sockaddr_in))) { - perror("tcpnal connect"); - return(0); - } + if (conn) + goto out; + memset(&addr, 0, sizeof(addr)); + addr.sin_family = AF_INET; + addr.sin_addr.s_addr = htonl(ip); + addr.sin_port = htons(port); + + memset(&locaddr, 0, sizeof(locaddr)); + locaddr.sin_family = AF_INET; + locaddr.sin_addr.s_addr = INADDR_ANY; + + for (rport = IPPORT_RESERVED - 1; rport > IPPORT_RESERVED / 2; --rport) { + fd = socket(AF_INET, SOCK_STREAM, 0); + if (fd < 0) { + perror("tcpnal socket failed"); + goto out; + } + + option = 1; + rc = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, + &option, sizeof(option)); + if (rc != 0) { + perror ("Can't set SO_REUSEADDR for socket"); + close(fd); + goto out; + } + + locaddr.sin_port = htons(rport); + rc = bind(fd, (struct sockaddr *)&locaddr, sizeof(locaddr)); + if (rc == 0 || errno == EACCES) { + rc = connect(fd, (struct sockaddr *)&addr, + sizeof(struct sockaddr_in)); + if (rc == 0) { + break; + } else if (errno != EADDRINUSE && errno != EADDRNOTAVAIL) { + perror("Error connecting to remote host"); + close(fd); + goto out; + } + } else if (errno != EADDRINUSE) { + perror("Error binding to privileged port"); + close(fd); + goto out; + } + close(fd); + } + + if (rport == IPPORT_RESERVED / 2) { + fprintf(stderr, "Out of ports trying to bind to a reserved port\n"); + goto out; + } + #if 1 - option = 1; - setsockopt(fd, SOL_TCP, TCP_NODELAY, &option, sizeof(option)); - option = 1<<20; - setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &option, sizeof(option)); - option = 1<<20; - setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &option, sizeof(option)); + option = 1; + setsockopt(fd, SOL_TCP, TCP_NODELAY, &option, sizeof(option)); + option = 1<<20; + 
setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &option, sizeof(option)); + option = 1<<20; + setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &option, sizeof(option)); #endif - gettimeofday(&tv, NULL); - incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec; + gettimeofday(&tv, NULL); + incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec; - /* say hello */ - if (tcpnal_hello(fd, &peernid, SOCKNAL_CONN_ANY, incarnation)) + /* say hello */ + if (tcpnal_hello(fd, &peernid, SOCKNAL_CONN_ANY, incarnation)) exit(-1); + + conn = allocate_connection(m, ip, port, fd); + + /* let nal thread know this event right away */ + if (conn) + procbridge_wakeup_nal(pb); - conn = allocate_connection(m, ip, port, fd); - - /* let nal thread know this event right away */ - if (conn) - procbridge_wakeup_nal(pb); - } - +out: pthread_mutex_unlock(&m->conn_lock); return (conn); } diff --git a/lnet/ulnds/dispatch.h b/lnet/ulnds/dispatch.h index 34dd070..a8f916d9 100644 --- a/lnet/ulnds/dispatch.h +++ b/lnet/ulnds/dispatch.h @@ -37,3 +37,10 @@ void remove_io_handler (io_handler i); void init_unix_timer(void); void select_timer_block(when until); when now(void); + +/* + * hacking for CFS internal MPI testing + */ +#if !CRAY_PORTALS +#define ENABLE_SELECT_DISPATCH +#endif diff --git a/lnet/ulnds/procapi.c b/lnet/ulnds/procapi.c index f3843d7..6b471c0 100644 --- a/lnet/ulnds/procapi.c +++ b/lnet/ulnds/procapi.c @@ -107,6 +107,10 @@ nal_t procapi_nal = { ptl_nid_t tcpnal_mynid; +#ifdef ENABLE_SELECT_DISPATCH +procbridge __global_procbridge = NULL; +#endif + /* Function: procbridge_startup * * Arguments: pid: requested process id (port offset) @@ -163,6 +167,10 @@ int procbridge_startup (nal_t *nal, ptl_pid_t requested_pid, return PTL_FAIL; } +#ifdef ENABLE_SELECT_DISPATCH + __global_procbridge = p; +#endif + /* create nal thread */ if (pthread_create(&p->t, NULL, nal_thread, &args)) { perror("nal_init: pthread_create"); diff --git a/lnet/ulnds/select.c b/lnet/ulnds/select.c index c4ccae1..09e1542 100644 --- a/lnet/ulnds/select.c +++ b/lnet/ulnds/select.c @@ -34,8 +34,12 @@ #include #include #include +#include +#include +#include #include #include +#include static struct timeval beginning_of_epoch; @@ -95,40 +99,22 @@ void remove_io_handler (io_handler i) i->disabled=1; } -static void set_flag(io_handler n,fd_set *fds) +static void set_flag(io_handler n,fd_set *r, fd_set *w, fd_set *e) { - if (n->type & READ_HANDLER) FD_SET(n->fd, &fds[0]); - if (n->type & WRITE_HANDLER) FD_SET(n->fd,&fds[1]); - if (n->type & EXCEPTION_HANDLER) FD_SET(n->fd, &fds[2]); + if (n->type & READ_HANDLER) FD_SET(n->fd, r); + if (n->type & WRITE_HANDLER) FD_SET(n->fd, w); + if (n->type & EXCEPTION_HANDLER) FD_SET(n->fd, e); } - -/* Function: select_timer_block - * Arguments: until: an absolute time when the select should return - * - * This function dispatches the various file descriptors' handler - * functions, if the kernel indicates there is io available. 
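The reserved-port loop added to force_tcp_connection() above walks the privileged range downwards, binding before connecting and moving on when a port is busy. A minimal, self-contained userspace sketch of that pattern follows; the loopback target, port 988 and the reduced error handling are assumptions for illustration, not part of the patch.

#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <unistd.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <sys/socket.h>

/* Walk the privileged port range downwards: bind, then connect.
 * EADDRINUSE (or EADDRNOTAVAIL from connect) means "try the next port";
 * EACCES from bind means we are not root, so just let the kernel pick. */
static int connect_from_reserved_port(in_addr_t dst_ip, int dst_port)
{
        struct sockaddr_in locaddr, srvaddr;
        int fd, rc, rport, option = 1;

        memset(&srvaddr, 0, sizeof(srvaddr));
        srvaddr.sin_family = AF_INET;
        srvaddr.sin_port = htons(dst_port);
        srvaddr.sin_addr.s_addr = dst_ip;

        memset(&locaddr, 0, sizeof(locaddr));
        locaddr.sin_family = AF_INET;
        locaddr.sin_addr.s_addr = INADDR_ANY;

        for (rport = IPPORT_RESERVED - 1; rport > IPPORT_RESERVED / 2; --rport) {
                fd = socket(AF_INET, SOCK_STREAM, 0);
                if (fd < 0)
                        return -1;

                setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &option, sizeof(option));

                locaddr.sin_port = htons(rport);
                rc = bind(fd, (struct sockaddr *)&locaddr, sizeof(locaddr));
                if (rc == 0 || errno == EACCES) {
                        rc = connect(fd, (struct sockaddr *)&srvaddr,
                                     sizeof(srvaddr));
                        if (rc == 0)
                                return fd;              /* connected */
                        if (errno != EADDRINUSE && errno != EADDRNOTAVAIL) {
                                close(fd);
                                return -1;              /* hard failure */
                        }
                }
                close(fd);                              /* port busy, try the next */
        }
        return -1;                                      /* reserved range exhausted */
}

int main(void)
{
        int fd = connect_from_reserved_port(inet_addr("127.0.0.1"), 988);

        if (fd < 0) {
                perror("connect_from_reserved_port");
                return 1;
        }
        printf("connected from a reserved port (fd %d)\n", fd);
        close(fd);
        return 0;
}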
- */ -void select_timer_block(when until) +static int prepare_fd_sets(fd_set *r, fd_set *w, fd_set *e) { - fd_set fds[3]; - struct timeval timeout; - struct timeval *timeout_pointer; - int result; io_handler j; io_handler *k; + int max = 0; - /* TODO: loop until the entire interval is expired*/ - if (until){ - when interval=until-now(); - timeout.tv_sec=(interval>>32); - timeout.tv_usec=((interval<<32)/1000000)>>32; - timeout_pointer=&timeout; - } else timeout_pointer=0; - - FD_ZERO(&fds[0]); - FD_ZERO(&fds[1]); - FD_ZERO(&fds[2]); + FD_ZERO(r); + FD_ZERO(w); + FD_ZERO(e); for (k=&io_handlers;*k;){ if ((*k)->disabled){ j=*k; @@ -136,24 +122,291 @@ void select_timer_block(when until) free(j); } if (*k) { - set_flag(*k,fds); + set_flag(*k,r,w,e); + if ((*k)->fd > max) + max = (*k)->fd; k=&(*k)->next; } } + return max + 1; +} + +static int execute_callbacks(fd_set *r, fd_set *w, fd_set *e) +{ + io_handler j; + int n = 0, t; + + for (j = io_handlers; j; j = j->next) { + if (j->disabled) + continue; + + t = 0; + if (FD_ISSET(j->fd, r) && (j->type & READ_HANDLER)) { + FD_CLR(j->fd, r); + t++; + } + if (FD_ISSET(j->fd, w) && (j->type & WRITE_HANDLER)) { + FD_CLR(j->fd, w); + t++; + } + if (FD_ISSET(j->fd, e) && (j->type & EXCEPTION_HANDLER)) { + FD_CLR(j->fd, e); + t++; + } + if (t == 0) + continue; + + if (!(*j->function)(j->argument)) + j->disabled = 1; + + n += t; + } + + return n; +} - result=select(FD_SETSIZE, &fds[0], &fds[1], &fds[2], timeout_pointer); +#ifdef ENABLE_SELECT_DISPATCH - if (result > 0) - for (j=io_handlers;j;j=j->next){ - if (!(j->disabled) && - ((FD_ISSET(j->fd, &fds[0]) && (j->type & READ_HANDLER)) || - (FD_ISSET(j->fd, &fds[1]) && (j->type & WRITE_HANDLER)) || - (FD_ISSET(j->fd, &fds[2]) && (j->type & EXCEPTION_HANDLER)))){ - if (!(*j->function)(j->argument)) - j->disabled=1; +static struct { + pthread_mutex_t mutex; + pthread_cond_t cond; + int submitted; + int nready; + int maxfd; + fd_set *rset; + fd_set *wset; + fd_set *eset; + struct timeval *timeout; + struct timeval submit_time; +} fd_extra = { + PTHREAD_MUTEX_INITIALIZER, + PTHREAD_COND_INITIALIZER, + 0, 0, 0, + NULL, NULL, NULL, NULL, +}; + +extern int liblustre_wait_event(int timeout); +extern procbridge __global_procbridge; + +/* + * this will intercept syscall select() of user apps + * such as MPI libs. 
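The select() defined below does not poll anything itself: it publishes its fd sets through fd_extra, wakes the NAL thread, and sleeps on the condition variable until select_timer_block() has stored the results and cleared the submitted flag. A stripped-down sketch of that submit/wait/signal handshake follows, with a generic integer request standing in for the fd sets; all names here are illustrative, not the ones used by the NAL, and the polling worker replaces the procbridge wakeup (build with -lpthread).

#include <stdio.h>
#include <pthread.h>
#include <unistd.h>

static struct {
        pthread_mutex_t mutex;
        pthread_cond_t  cond;
        int             submitted;      /* request outstanding? */
        int             request;
        int             result;
} xchg = { PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, 0, 0, 0 };

/* caller side: what the intercepted select() does with its fd sets */
static int submit_and_wait(int request)
{
        int result;

        pthread_mutex_lock(&xchg.mutex);
        xchg.request = request;
        xchg.submitted = 1;
        while (xchg.submitted)                  /* sleep until the worker answers */
                pthread_cond_wait(&xchg.cond, &xchg.mutex);
        result = xchg.result;
        pthread_mutex_unlock(&xchg.mutex);
        return result;
}

/* worker side: what select_timer_block() does each time around its loop
 * (here it polls; the real code is woken explicitly instead) */
static void *worker(void *arg)
{
        (void)arg;
        for (;;) {
                pthread_mutex_lock(&xchg.mutex);
                if (xchg.submitted) {
                        xchg.result = xchg.request * 2; /* "poll" the descriptors */
                        xchg.submitted = 0;
                        pthread_cond_signal(&xchg.cond);
                }
                pthread_mutex_unlock(&xchg.mutex);
                usleep(1000);
        }
        return NULL;
}

int main(void)
{
        pthread_t t;

        pthread_create(&t, NULL, worker, NULL);
        printf("worker answered %d\n", submit_and_wait(21));
        return 0;
}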
+ */ +int select(int n, fd_set *rset, fd_set *wset, fd_set *eset, + struct timeval *timeout) +{ + LASSERT(fd_extra.submitted == 0); + + fd_extra.nready = 0; + fd_extra.maxfd = n; + fd_extra.rset = rset; + fd_extra.wset = wset; + fd_extra.eset = eset; + fd_extra.timeout = timeout; + + liblustre_wait_event(0); + pthread_mutex_lock(&fd_extra.mutex); + gettimeofday(&fd_extra.submit_time, NULL); + fd_extra.submitted = 1; + LASSERT(__global_procbridge); + procbridge_wakeup_nal(__global_procbridge); + +again: + if (fd_extra.submitted) + pthread_cond_wait(&fd_extra.cond, &fd_extra.mutex); + pthread_mutex_unlock(&fd_extra.mutex); + + liblustre_wait_event(0); + + pthread_mutex_lock(&fd_extra.mutex); + if (fd_extra.submitted) + goto again; + pthread_mutex_unlock(&fd_extra.mutex); + + LASSERT(fd_extra.nready >= 0); + LASSERT(fd_extra.submitted == 0); + return fd_extra.nready; +} + +static int merge_fds(int max, fd_set *rset, fd_set *wset, fd_set *eset) +{ + int i; + + LASSERT(rset); + LASSERT(wset); + LASSERT(eset); + + for (i = 0; i < __FD_SETSIZE/__NFDBITS; i++) { + LASSERT(!fd_extra.rset || + !(__FDS_BITS(rset)[i] & __FDS_BITS(fd_extra.rset)[i])); + LASSERT(!fd_extra.wset || + !(__FDS_BITS(wset)[i] & __FDS_BITS(fd_extra.wset)[i])); + LASSERT(!fd_extra.eset || + !(__FDS_BITS(eset)[i] & __FDS_BITS(fd_extra.eset)[i])); + + if (fd_extra.rset && __FDS_BITS(fd_extra.rset)[i]) + __FDS_BITS(rset)[i] |= __FDS_BITS(fd_extra.rset)[i]; + if (fd_extra.wset && __FDS_BITS(fd_extra.wset)[i]) + __FDS_BITS(wset)[i] |= __FDS_BITS(fd_extra.wset)[i]; + if (fd_extra.eset && __FDS_BITS(fd_extra.eset)[i]) + __FDS_BITS(eset)[i] |= __FDS_BITS(fd_extra.eset)[i]; + } + + return (fd_extra.maxfd > max ? fd_extra.maxfd : max); +} + +static inline +int timeval_ge(struct timeval *tv1, struct timeval *tv2) +{ + LASSERT(tv1 && tv2); + return ((tv1->tv_sec - tv2->tv_sec) * 1000000 + + (tv1->tv_usec - tv2->tv_usec) >= 0); +} + +/* + * choose the most recent timeout value + */ +static struct timeval *choose_timeout(struct timeval *tv1, + struct timeval *tv2) +{ + if (!tv1) + return tv2; + else if (!tv2) + return tv1; + + if (timeval_ge(tv1, tv2)) + return tv2; + else + return tv1; +} + +/* Function: select_timer_block + * Arguments: until: an absolute time when the select should return + * + * This function dispatches the various file descriptors' handler + * functions, if the kernel indicates there is io available. 
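merge_fds() above ORs the application's descriptors into the dispatcher's own sets and choose_timeout() keeps whichever deadline expires first. A portable sketch of the same two helpers follows, using only the standard FD_* macros instead of poking __FDS_BITS; the demo descriptors and timeout values are made up.

#include <stdio.h>
#include <sys/select.h>
#include <sys/time.h>

/* OR the caller's descriptors into our own set and track the larger maxfd */
static int merge_fds(int maxfd, fd_set *mine, fd_set *theirs, int their_maxfd)
{
        int fd;

        for (fd = 0; fd < their_maxfd && fd < FD_SETSIZE; fd++)
                if (FD_ISSET(fd, theirs))
                        FD_SET(fd, mine);

        return their_maxfd > maxfd ? their_maxfd : maxfd;
}

static int timeval_ge(const struct timeval *a, const struct timeval *b)
{
        return (a->tv_sec - b->tv_sec) * 1000000 +
               (a->tv_usec - b->tv_usec) >= 0;
}

/* earlier of two timeouts; NULL means "wait forever" */
static struct timeval *choose_timeout(struct timeval *a, struct timeval *b)
{
        if (a == NULL)
                return b;
        if (b == NULL)
                return a;
        return timeval_ge(a, b) ? b : a;
}

int main(void)
{
        fd_set mine, theirs;
        struct timeval slow = { 5, 0 }, fast = { 1, 500000 };
        int maxfd;

        FD_ZERO(&mine);   FD_SET(0, &mine);     /* our own descriptor: stdin */
        FD_ZERO(&theirs); FD_SET(4, &theirs);   /* pretend the app watches fd 4 */

        maxfd = merge_fds(1, &mine, &theirs, 5);
        printf("merged maxfd = %d, fd 4 now set: %s\n",
               maxfd, FD_ISSET(4, &mine) ? "yes" : "no");
        printf("earlier timeout: %ld.%06ld s\n",
               (long)choose_timeout(&slow, &fast)->tv_sec,
               (long)choose_timeout(&slow, &fast)->tv_usec);
        return 0;
}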
+ */ +void select_timer_block(when until) +{ + fd_set fds[3]; + struct timeval timeout; + struct timeval *timeout_pointer, *select_timeout; + int max, nready, nexec; + int fd_handling; + +again: + if (until) { + when interval; + + interval = until - now(); + timeout.tv_sec = (interval >> 32); + timeout.tv_usec = ((interval << 32) / 1000000) >> 32; + timeout_pointer = &timeout; + } else + timeout_pointer = NULL; + + fd_handling = 0; + max = prepare_fd_sets(&fds[0], &fds[1], &fds[2]); + select_timeout = timeout_pointer; + + pthread_mutex_lock(&fd_extra.mutex); + fd_handling = fd_extra.submitted; + pthread_mutex_unlock(&fd_extra.mutex); + if (fd_handling) { + max = merge_fds(max, &fds[0], &fds[1], &fds[2]); + select_timeout = choose_timeout(timeout_pointer, fd_extra.timeout); + } + + /* XXX only compile for linux */ +#if __WORDSIZE == 64 + nready = syscall(SYS_select, max, &fds[0], &fds[1], &fds[2], + select_timeout); +#else + nready = syscall(SYS__newselect, max, &fds[0], &fds[1], &fds[2], + select_timeout); +#endif + if (nready < 0) { + CERROR("select return err %d, errno %d\n", nready, errno); + return; + } + + if (nready) { + nexec = execute_callbacks(&fds[0], &fds[1], &fds[2]); + nready -= nexec; + } else + nexec = 0; + + /* even both nready & nexec are 0, we still need try to wakeup + * upper thread since it may have timed out + */ + if (fd_handling) { + LASSERT(nready >= 0); + + pthread_mutex_lock(&fd_extra.mutex); + if (nready) { + if (fd_extra.rset) + *fd_extra.rset = fds[0]; + if (fd_extra.wset) + *fd_extra.wset = fds[1]; + if (fd_extra.eset) + *fd_extra.eset = fds[2]; + fd_extra.nready = nready; + fd_extra.submitted = 0; + } else { + struct timeval t; + + fd_extra.nready = 0; + if (fd_extra.timeout) { + gettimeofday(&t, NULL); + if (timeval_ge(&t, &fd_extra.submit_time)) + fd_extra.submitted = 0; } } + + pthread_cond_signal(&fd_extra.cond); + pthread_mutex_unlock(&fd_extra.mutex); + } + + /* haven't found portals event, go back to loop if time + * is not expired */ + if (!nexec) { + if (timeout_pointer == NULL || now() >= until) + goto again; + } +} + +#else /* !ENABLE_SELECT_DISPATCH */ + +/* Function: select_timer_block + * Arguments: until: an absolute time when the select should return + * + * This function dispatches the various file descriptors' handler + * functions, if the kernel indicates there is io available. 
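Because select() itself is interposed in this file, select_timer_block() has to reach the real system call through syscall(2): 64-bit Linux exposes it as SYS_select, while 32-bit x86 only has the five-argument variant as SYS__newselect. A standalone illustration of that Linux/glibc-only trick follows; the stdin descriptor and 100ms timeout are arbitrary.

#include <stdio.h>
#include <sys/select.h>
#include <sys/time.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Call the kernel's select() directly, bypassing any interposed select()
 * symbol in the process -- the same trick the dispatcher uses above. */
static int raw_select(int n, fd_set *r, fd_set *w, fd_set *e,
                      struct timeval *tv)
{
#if __WORDSIZE == 64
        return syscall(SYS_select, n, r, w, e, tv);
#else
        return syscall(SYS__newselect, n, r, w, e, tv);
#endif
}

int main(void)
{
        fd_set rfds;
        struct timeval tv = { 0, 100000 };      /* 100ms */
        int rc;

        FD_ZERO(&rfds);
        FD_SET(0, &rfds);                       /* stdin */

        rc = raw_select(1, &rfds, NULL, NULL, &tv);
        printf("raw select returned %d\n", rc);
        return 0;
}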
+ */ +void select_timer_block(when until) +{ + fd_set fds[3]; + struct timeval timeout; + struct timeval *timeout_pointer; + int max, nready; + +again: + if (until) { + when interval; + interval = until - now(); + timeout.tv_sec = (interval >> 32); + timeout.tv_usec = ((interval << 32) / 1000000) >> 32; + timeout_pointer = &timeout; + } else + timeout_pointer = NULL; + + max = prepare_fd_sets(&fds[0], &fds[1], &fds[2]); + + nready = select(max, &fds[0], &fds[1], &fds[2], timeout_pointer); + if (nready > 0) + execute_callbacks(&fds[0], &fds[1], &fds[2]); } +#endif /* ENABLE_SELECT_DISPATCH */ /* Function: init_unix_timer() * is called to initialize the library diff --git a/lnet/ulnds/socklnd/connection.c b/lnet/ulnds/socklnd/connection.c index ed8dc08..b399fcf 100644 --- a/lnet/ulnds/socklnd/connection.c +++ b/lnet/ulnds/socklnd/connection.c @@ -331,10 +331,17 @@ connection force_tcp_connection(manager m, { connection conn; struct sockaddr_in addr; + struct sockaddr_in locaddr; unsigned int id[2]; struct timeval tv; __u64 incarnation; + int fd; + int option; + int rc; + int rport; + ptl_nid_t peernid = PTL_NID_ANY; + port = tcpnal_acceptor_port; id[0] = ip; @@ -343,49 +350,82 @@ connection force_tcp_connection(manager m, pthread_mutex_lock(&m->conn_lock); conn = hash_table_find(m->connections, id); - if (!conn) { - int fd; - int option; - ptl_nid_t peernid = PTL_NID_ANY; - - bzero((char *) &addr, sizeof(addr)); - addr.sin_family = AF_INET; - addr.sin_addr.s_addr = htonl(ip); - addr.sin_port = htons(port); - - if ((fd = socket(AF_INET, SOCK_STREAM, 0)) < 0) { - perror("tcpnal socket failed"); - exit(-1); - } - if (connect(fd, (struct sockaddr *)&addr, - sizeof(struct sockaddr_in))) { - perror("tcpnal connect"); - return(0); - } + if (conn) + goto out; + memset(&addr, 0, sizeof(addr)); + addr.sin_family = AF_INET; + addr.sin_addr.s_addr = htonl(ip); + addr.sin_port = htons(port); + + memset(&locaddr, 0, sizeof(locaddr)); + locaddr.sin_family = AF_INET; + locaddr.sin_addr.s_addr = INADDR_ANY; + + for (rport = IPPORT_RESERVED - 1; rport > IPPORT_RESERVED / 2; --rport) { + fd = socket(AF_INET, SOCK_STREAM, 0); + if (fd < 0) { + perror("tcpnal socket failed"); + goto out; + } + + option = 1; + rc = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, + &option, sizeof(option)); + if (rc != 0) { + perror ("Can't set SO_REUSEADDR for socket"); + close(fd); + goto out; + } + + locaddr.sin_port = htons(rport); + rc = bind(fd, (struct sockaddr *)&locaddr, sizeof(locaddr)); + if (rc == 0 || errno == EACCES) { + rc = connect(fd, (struct sockaddr *)&addr, + sizeof(struct sockaddr_in)); + if (rc == 0) { + break; + } else if (errno != EADDRINUSE && errno != EADDRNOTAVAIL) { + perror("Error connecting to remote host"); + close(fd); + goto out; + } + } else if (errno != EADDRINUSE) { + perror("Error binding to privileged port"); + close(fd); + goto out; + } + close(fd); + } + + if (rport == IPPORT_RESERVED / 2) { + fprintf(stderr, "Out of ports trying to bind to a reserved port\n"); + goto out; + } + #if 1 - option = 1; - setsockopt(fd, SOL_TCP, TCP_NODELAY, &option, sizeof(option)); - option = 1<<20; - setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &option, sizeof(option)); - option = 1<<20; - setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &option, sizeof(option)); + option = 1; + setsockopt(fd, SOL_TCP, TCP_NODELAY, &option, sizeof(option)); + option = 1<<20; + setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &option, sizeof(option)); + option = 1<<20; + setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &option, sizeof(option)); #endif - gettimeofday(&tv, 
NULL); - incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec; + gettimeofday(&tv, NULL); + incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec; - /* say hello */ - if (tcpnal_hello(fd, &peernid, SOCKNAL_CONN_ANY, incarnation)) + /* say hello */ + if (tcpnal_hello(fd, &peernid, SOCKNAL_CONN_ANY, incarnation)) exit(-1); + + conn = allocate_connection(m, ip, port, fd); + + /* let nal thread know this event right away */ + if (conn) + procbridge_wakeup_nal(pb); - conn = allocate_connection(m, ip, port, fd); - - /* let nal thread know this event right away */ - if (conn) - procbridge_wakeup_nal(pb); - } - +out: pthread_mutex_unlock(&m->conn_lock); return (conn); } diff --git a/lnet/ulnds/socklnd/dispatch.h b/lnet/ulnds/socklnd/dispatch.h index 34dd070..a8f916d9 100644 --- a/lnet/ulnds/socklnd/dispatch.h +++ b/lnet/ulnds/socklnd/dispatch.h @@ -37,3 +37,10 @@ void remove_io_handler (io_handler i); void init_unix_timer(void); void select_timer_block(when until); when now(void); + +/* + * hacking for CFS internal MPI testing + */ +#if !CRAY_PORTALS +#define ENABLE_SELECT_DISPATCH +#endif diff --git a/lnet/ulnds/socklnd/procapi.c b/lnet/ulnds/socklnd/procapi.c index f3843d7..6b471c0 100644 --- a/lnet/ulnds/socklnd/procapi.c +++ b/lnet/ulnds/socklnd/procapi.c @@ -107,6 +107,10 @@ nal_t procapi_nal = { ptl_nid_t tcpnal_mynid; +#ifdef ENABLE_SELECT_DISPATCH +procbridge __global_procbridge = NULL; +#endif + /* Function: procbridge_startup * * Arguments: pid: requested process id (port offset) @@ -163,6 +167,10 @@ int procbridge_startup (nal_t *nal, ptl_pid_t requested_pid, return PTL_FAIL; } +#ifdef ENABLE_SELECT_DISPATCH + __global_procbridge = p; +#endif + /* create nal thread */ if (pthread_create(&p->t, NULL, nal_thread, &args)) { perror("nal_init: pthread_create"); diff --git a/lnet/ulnds/socklnd/select.c b/lnet/ulnds/socklnd/select.c index c4ccae1..09e1542 100644 --- a/lnet/ulnds/socklnd/select.c +++ b/lnet/ulnds/socklnd/select.c @@ -34,8 +34,12 @@ #include #include #include +#include +#include +#include #include #include +#include static struct timeval beginning_of_epoch; @@ -95,40 +99,22 @@ void remove_io_handler (io_handler i) i->disabled=1; } -static void set_flag(io_handler n,fd_set *fds) +static void set_flag(io_handler n,fd_set *r, fd_set *w, fd_set *e) { - if (n->type & READ_HANDLER) FD_SET(n->fd, &fds[0]); - if (n->type & WRITE_HANDLER) FD_SET(n->fd,&fds[1]); - if (n->type & EXCEPTION_HANDLER) FD_SET(n->fd, &fds[2]); + if (n->type & READ_HANDLER) FD_SET(n->fd, r); + if (n->type & WRITE_HANDLER) FD_SET(n->fd, w); + if (n->type & EXCEPTION_HANDLER) FD_SET(n->fd, e); } - -/* Function: select_timer_block - * Arguments: until: an absolute time when the select should return - * - * This function dispatches the various file descriptors' handler - * functions, if the kernel indicates there is io available. 
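Both the loop deleted here and the prepare_fd_sets()/execute_callbacks() pair that replaces it walk the io_handler list: each handler carries an fd and a type mask, set_flag() populates the fd_sets, and a callback returning zero disables its handler. A miniature, runnable version of that dispatch pattern follows; the pipe, the echo_line() callback and the single round of dispatch are invented for the demo.

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/select.h>

#define READ_HANDLER  1
#define WRITE_HANDLER 2

typedef struct io_handler {
        struct io_handler *next;
        int   fd;
        int   type;
        int (*function)(void *arg);     /* return 0 to disable the handler */
        void *argument;
        int   disabled;
} io_handler;

static io_handler *io_handlers;

static void register_io_handler(int fd, int type, int (*fn)(void *), void *arg)
{
        io_handler *h = calloc(1, sizeof(*h));

        h->fd = fd; h->type = type; h->function = fn; h->argument = arg;
        h->next = io_handlers;
        io_handlers = h;
}

static int prepare_fd_sets(fd_set *r, fd_set *w)
{
        io_handler *h;
        int max = 0;

        FD_ZERO(r); FD_ZERO(w);
        for (h = io_handlers; h != NULL; h = h->next) {
                if (h->disabled)
                        continue;
                if (h->type & READ_HANDLER)  FD_SET(h->fd, r);
                if (h->type & WRITE_HANDLER) FD_SET(h->fd, w);
                if (h->fd > max) max = h->fd;
        }
        return max + 1;
}

static void execute_callbacks(fd_set *r, fd_set *w)
{
        io_handler *h;

        for (h = io_handlers; h != NULL; h = h->next) {
                if (h->disabled)
                        continue;
                if ((FD_ISSET(h->fd, r) && (h->type & READ_HANDLER)) ||
                    (FD_ISSET(h->fd, w) && (h->type & WRITE_HANDLER)))
                        if (!h->function(h->argument))
                                h->disabled = 1;
        }
}

static int echo_line(void *arg)
{
        char buf[64];
        int  n = read(*(int *)arg, buf, sizeof(buf) - 1);

        if (n <= 0)
                return 0;               /* EOF or error: disable this handler */
        buf[n] = '\0';
        printf("handler read: %s", buf);
        return 1;
}

int main(void)
{
        int pipefd[2];
        fd_set rset, wset;

        pipe(pipefd);
        register_io_handler(pipefd[0], READ_HANDLER, echo_line, &pipefd[0]);
        write(pipefd[1], "hello\n", 6);

        if (select(prepare_fd_sets(&rset, &wset), &rset, &wset, NULL, NULL) > 0)
                execute_callbacks(&rset, &wset);
        return 0;
}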
- */ -void select_timer_block(when until) +static int prepare_fd_sets(fd_set *r, fd_set *w, fd_set *e) { - fd_set fds[3]; - struct timeval timeout; - struct timeval *timeout_pointer; - int result; io_handler j; io_handler *k; + int max = 0; - /* TODO: loop until the entire interval is expired*/ - if (until){ - when interval=until-now(); - timeout.tv_sec=(interval>>32); - timeout.tv_usec=((interval<<32)/1000000)>>32; - timeout_pointer=&timeout; - } else timeout_pointer=0; - - FD_ZERO(&fds[0]); - FD_ZERO(&fds[1]); - FD_ZERO(&fds[2]); + FD_ZERO(r); + FD_ZERO(w); + FD_ZERO(e); for (k=&io_handlers;*k;){ if ((*k)->disabled){ j=*k; @@ -136,24 +122,291 @@ void select_timer_block(when until) free(j); } if (*k) { - set_flag(*k,fds); + set_flag(*k,r,w,e); + if ((*k)->fd > max) + max = (*k)->fd; k=&(*k)->next; } } + return max + 1; +} + +static int execute_callbacks(fd_set *r, fd_set *w, fd_set *e) +{ + io_handler j; + int n = 0, t; + + for (j = io_handlers; j; j = j->next) { + if (j->disabled) + continue; + + t = 0; + if (FD_ISSET(j->fd, r) && (j->type & READ_HANDLER)) { + FD_CLR(j->fd, r); + t++; + } + if (FD_ISSET(j->fd, w) && (j->type & WRITE_HANDLER)) { + FD_CLR(j->fd, w); + t++; + } + if (FD_ISSET(j->fd, e) && (j->type & EXCEPTION_HANDLER)) { + FD_CLR(j->fd, e); + t++; + } + if (t == 0) + continue; + + if (!(*j->function)(j->argument)) + j->disabled = 1; + + n += t; + } + + return n; +} - result=select(FD_SETSIZE, &fds[0], &fds[1], &fds[2], timeout_pointer); +#ifdef ENABLE_SELECT_DISPATCH - if (result > 0) - for (j=io_handlers;j;j=j->next){ - if (!(j->disabled) && - ((FD_ISSET(j->fd, &fds[0]) && (j->type & READ_HANDLER)) || - (FD_ISSET(j->fd, &fds[1]) && (j->type & WRITE_HANDLER)) || - (FD_ISSET(j->fd, &fds[2]) && (j->type & EXCEPTION_HANDLER)))){ - if (!(*j->function)(j->argument)) - j->disabled=1; +static struct { + pthread_mutex_t mutex; + pthread_cond_t cond; + int submitted; + int nready; + int maxfd; + fd_set *rset; + fd_set *wset; + fd_set *eset; + struct timeval *timeout; + struct timeval submit_time; +} fd_extra = { + PTHREAD_MUTEX_INITIALIZER, + PTHREAD_COND_INITIALIZER, + 0, 0, 0, + NULL, NULL, NULL, NULL, +}; + +extern int liblustre_wait_event(int timeout); +extern procbridge __global_procbridge; + +/* + * this will intercept syscall select() of user apps + * such as MPI libs. 
+ */ +int select(int n, fd_set *rset, fd_set *wset, fd_set *eset, + struct timeval *timeout) +{ + LASSERT(fd_extra.submitted == 0); + + fd_extra.nready = 0; + fd_extra.maxfd = n; + fd_extra.rset = rset; + fd_extra.wset = wset; + fd_extra.eset = eset; + fd_extra.timeout = timeout; + + liblustre_wait_event(0); + pthread_mutex_lock(&fd_extra.mutex); + gettimeofday(&fd_extra.submit_time, NULL); + fd_extra.submitted = 1; + LASSERT(__global_procbridge); + procbridge_wakeup_nal(__global_procbridge); + +again: + if (fd_extra.submitted) + pthread_cond_wait(&fd_extra.cond, &fd_extra.mutex); + pthread_mutex_unlock(&fd_extra.mutex); + + liblustre_wait_event(0); + + pthread_mutex_lock(&fd_extra.mutex); + if (fd_extra.submitted) + goto again; + pthread_mutex_unlock(&fd_extra.mutex); + + LASSERT(fd_extra.nready >= 0); + LASSERT(fd_extra.submitted == 0); + return fd_extra.nready; +} + +static int merge_fds(int max, fd_set *rset, fd_set *wset, fd_set *eset) +{ + int i; + + LASSERT(rset); + LASSERT(wset); + LASSERT(eset); + + for (i = 0; i < __FD_SETSIZE/__NFDBITS; i++) { + LASSERT(!fd_extra.rset || + !(__FDS_BITS(rset)[i] & __FDS_BITS(fd_extra.rset)[i])); + LASSERT(!fd_extra.wset || + !(__FDS_BITS(wset)[i] & __FDS_BITS(fd_extra.wset)[i])); + LASSERT(!fd_extra.eset || + !(__FDS_BITS(eset)[i] & __FDS_BITS(fd_extra.eset)[i])); + + if (fd_extra.rset && __FDS_BITS(fd_extra.rset)[i]) + __FDS_BITS(rset)[i] |= __FDS_BITS(fd_extra.rset)[i]; + if (fd_extra.wset && __FDS_BITS(fd_extra.wset)[i]) + __FDS_BITS(wset)[i] |= __FDS_BITS(fd_extra.wset)[i]; + if (fd_extra.eset && __FDS_BITS(fd_extra.eset)[i]) + __FDS_BITS(eset)[i] |= __FDS_BITS(fd_extra.eset)[i]; + } + + return (fd_extra.maxfd > max ? fd_extra.maxfd : max); +} + +static inline +int timeval_ge(struct timeval *tv1, struct timeval *tv2) +{ + LASSERT(tv1 && tv2); + return ((tv1->tv_sec - tv2->tv_sec) * 1000000 + + (tv1->tv_usec - tv2->tv_usec) >= 0); +} + +/* + * choose the most recent timeout value + */ +static struct timeval *choose_timeout(struct timeval *tv1, + struct timeval *tv2) +{ + if (!tv1) + return tv2; + else if (!tv2) + return tv1; + + if (timeval_ge(tv1, tv2)) + return tv2; + else + return tv1; +} + +/* Function: select_timer_block + * Arguments: until: an absolute time when the select should return + * + * This function dispatches the various file descriptors' handler + * functions, if the kernel indicates there is io available. 
+ */ +void select_timer_block(when until) +{ + fd_set fds[3]; + struct timeval timeout; + struct timeval *timeout_pointer, *select_timeout; + int max, nready, nexec; + int fd_handling; + +again: + if (until) { + when interval; + + interval = until - now(); + timeout.tv_sec = (interval >> 32); + timeout.tv_usec = ((interval << 32) / 1000000) >> 32; + timeout_pointer = &timeout; + } else + timeout_pointer = NULL; + + fd_handling = 0; + max = prepare_fd_sets(&fds[0], &fds[1], &fds[2]); + select_timeout = timeout_pointer; + + pthread_mutex_lock(&fd_extra.mutex); + fd_handling = fd_extra.submitted; + pthread_mutex_unlock(&fd_extra.mutex); + if (fd_handling) { + max = merge_fds(max, &fds[0], &fds[1], &fds[2]); + select_timeout = choose_timeout(timeout_pointer, fd_extra.timeout); + } + + /* XXX only compile for linux */ +#if __WORDSIZE == 64 + nready = syscall(SYS_select, max, &fds[0], &fds[1], &fds[2], + select_timeout); +#else + nready = syscall(SYS__newselect, max, &fds[0], &fds[1], &fds[2], + select_timeout); +#endif + if (nready < 0) { + CERROR("select return err %d, errno %d\n", nready, errno); + return; + } + + if (nready) { + nexec = execute_callbacks(&fds[0], &fds[1], &fds[2]); + nready -= nexec; + } else + nexec = 0; + + /* even both nready & nexec are 0, we still need try to wakeup + * upper thread since it may have timed out + */ + if (fd_handling) { + LASSERT(nready >= 0); + + pthread_mutex_lock(&fd_extra.mutex); + if (nready) { + if (fd_extra.rset) + *fd_extra.rset = fds[0]; + if (fd_extra.wset) + *fd_extra.wset = fds[1]; + if (fd_extra.eset) + *fd_extra.eset = fds[2]; + fd_extra.nready = nready; + fd_extra.submitted = 0; + } else { + struct timeval t; + + fd_extra.nready = 0; + if (fd_extra.timeout) { + gettimeofday(&t, NULL); + if (timeval_ge(&t, &fd_extra.submit_time)) + fd_extra.submitted = 0; } } + + pthread_cond_signal(&fd_extra.cond); + pthread_mutex_unlock(&fd_extra.mutex); + } + + /* haven't found portals event, go back to loop if time + * is not expired */ + if (!nexec) { + if (timeout_pointer == NULL || now() >= until) + goto again; + } +} + +#else /* !ENABLE_SELECT_DISPATCH */ + +/* Function: select_timer_block + * Arguments: until: an absolute time when the select should return + * + * This function dispatches the various file descriptors' handler + * functions, if the kernel indicates there is io available. 
+ */ +void select_timer_block(when until) +{ + fd_set fds[3]; + struct timeval timeout; + struct timeval *timeout_pointer; + int max, nready; + +again: + if (until) { + when interval; + interval = until - now(); + timeout.tv_sec = (interval >> 32); + timeout.tv_usec = ((interval << 32) / 1000000) >> 32; + timeout_pointer = &timeout; + } else + timeout_pointer = NULL; + + max = prepare_fd_sets(&fds[0], &fds[1], &fds[2]); + + nready = select(max, &fds[0], &fds[1], &fds[2], timeout_pointer); + if (nready > 0) + execute_callbacks(&fds[0], &fds[1], &fds[2]); } +#endif /* ENABLE_SELECT_DISPATCH */ /* Function: init_unix_timer() * is called to initialize the library diff --git a/lnet/ulnds/socklnd/tcplnd.c b/lnet/ulnds/socklnd/tcplnd.c index 6e9cca9..abb6d01 100644 --- a/lnet/ulnds/socklnd/tcplnd.c +++ b/lnet/ulnds/socklnd/tcplnd.c @@ -251,8 +251,6 @@ int tcpnal_init(bridge b) newly created junk */ return(PTL_NAL_FAILED); } - /* XXX cfs hack */ -// b->lib_nal->libnal_ni.ni_pid.pid=0; b->lower=m; return(PTL_OK); } diff --git a/lnet/ulnds/tcplnd.c b/lnet/ulnds/tcplnd.c index 6e9cca9..abb6d01 100644 --- a/lnet/ulnds/tcplnd.c +++ b/lnet/ulnds/tcplnd.c @@ -251,8 +251,6 @@ int tcpnal_init(bridge b) newly created junk */ return(PTL_NAL_FAILED); } - /* XXX cfs hack */ -// b->lib_nal->libnal_ni.ni_pid.pid=0; b->lower=m; return(PTL_OK); } diff --git a/lnet/utils/acceptor.c b/lnet/utils/acceptor.c index 8aea457..524d128 100644 --- a/lnet/utils/acceptor.c +++ b/lnet/utils/acceptor.c @@ -89,7 +89,11 @@ show_connection (int fd, __u32 net_ip) void usage (char *myname) { - fprintf (stderr, "Usage: %s [-N nal_id] port\n", myname); + fprintf (stderr, + "Usage: %s [-N nal_id] [-p] [-l] port\n\n" + " -l\tKeep stdin/stdout open\n" + " -p\tAllow connections from non-privileged ports\n", + myname); exit (1); } @@ -100,24 +104,27 @@ int main(int argc, char **argv) int c; int noclose = 0; int nal = SOCKNAL; + int rport; + int require_privports = 1; - while ((c = getopt (argc, argv, "N:l")) != -1) - switch (c) - { - case 'l': - noclose = 1; - break; - + while ((c = getopt (argc, argv, "N:lp")) != -1) { + switch (c) { case 'N': if (sscanf(optarg, "%d", &nal) != 1 || nal < 0 || nal > NAL_MAX_NR) usage(argv[0]); break; - + case 'l': + noclose = 1; + break; + case 'p': + require_privports = 0; + break; default: usage (argv[0]); break; } + } if (optind >= argc) usage (argv[0]); @@ -162,7 +169,7 @@ int main(int argc, char **argv) exit(1); } - rc = daemon(1, noclose); + rc = daemon(0, noclose); if (rc < 0) { perror("daemon(): "); exit(1); @@ -180,8 +187,8 @@ int main(int argc, char **argv) struct portals_cfg pcfg; #ifdef HAVE_LIBWRAP struct request_info request; - char addrstr[INET_ADDRSTRLEN]; #endif + char addrstr[INET_ADDRSTRLEN]; cfd = accept(fd, (struct sockaddr *)&clntaddr, &len); if ( cfd < 0 ) { @@ -203,6 +210,18 @@ int main(int argc, char **argv) continue; } #endif + + if (require_privports && ntohs(clntaddr.sin_port) >= IPPORT_RESERVED) { + inet_ntop(AF_INET, &clntaddr.sin_addr, + addrstr, INET_ADDRSTRLEN); + syslog(LOG_ERR, "Closing non-privileged connection from %s:%d\n", + addrstr, ntohs(clntaddr.sin_port)); + rc = close(cfd); + if (rc) + perror ("close un-privileged client failed"); + continue; + } + show_connection (cfd, clntaddr.sin_addr.s_addr); PCFG_INIT(pcfg, NAL_CMD_REGISTER_PEER_FD); diff --git a/lnet/utils/debug.c b/lnet/utils/debug.c index 36d8a04..5b65f24 100644 --- a/lnet/utils/debug.c +++ b/lnet/utils/debug.c @@ -29,9 +29,12 @@ #include #include +#ifdef HAVE_NETDB_H #include +#endif #include #include 
+#include "ioctl.h" #include #include #include @@ -45,12 +48,15 @@ #include #include +#ifdef HAVE_LINUX_VERSION_H #include #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) #define BUG() /* workaround for module.h includes */ #include #endif +#endif /* !HAVE_LINUX_VERSION_H */ + #include #include @@ -62,7 +68,7 @@ static char rawbuf[8192]; static char *buf = rawbuf; static int max = 8192; -//static int g_pfd = -1; +/*static int g_pfd = -1;*/ static int subsystem_mask = ~0; static int debug_mask = ~0; @@ -72,7 +78,7 @@ static const char *portal_debug_subsystems[] = {"undefined", "mdc", "mds", "osc", "ost", "class", "log", "llite", "rpc", "mgmt", "portals", "libcfs", "socknal", "qswnal", "pinger", "filter", "ptlbd", "echo", "ldlm", "lov", "gmnal", "router", "cobd", - "openibnal", "lmv", "smfs", "cmobd", NULL}; + "ibnal", NULL}; static const char *portal_debug_masks[] = {"trace", "inode", "super", "ext2", "malloc", "cache", "info", "ioctl", "blocks", "net", "warning", "buffs", "other", "dentry", "portals", @@ -371,15 +377,24 @@ int jt_dbg_debug_kernel(int argc, char **argv) fprintf(stderr, "usage: %s [file] [raw]\n", argv[0]); return 0; } - sprintf(filename, "%s.%lu.%u", argc > 1 ? argv[1] : "/tmp/lustre-log", - time(NULL), getpid()); - if (argc > 2) + if (argc > 2) { raw = atoi(argv[2]); + } else if (argc > 1 && (argv[1][0] == '0' || argv[1][0] == '1')) { + raw = atoi(argv[1]); + argc--; + } else { + sprintf(filename, "%s.%lu.%u", argc > 1 ? argv[1] : + "/tmp/lustre-log", time(NULL), getpid()); + } + unlink(filename); fd = open("/proc/sys/portals/dump_kernel", O_WRONLY); if (fd < 0) { + if (errno == ENOENT) /* no dump file created */ + return 0; + fprintf(stderr, "open(dump_kernel) failed: %s\n", strerror(errno)); return 1; @@ -477,25 +492,25 @@ const char debug_daemon_usage[]="usage: debug_daemon {start file [MB]|stop}\n"; int jt_dbg_debug_daemon(int argc, char **argv) { int rc, fd; - + if (argc <= 1) { fprintf(stderr, debug_daemon_usage); return 0; } - + fd = open("/proc/sys/portals/daemon_file", O_WRONLY); if (fd < 0) { fprintf(stderr, "open(daemon_file) failed: %s\n", strerror(errno)); return 1; } - + if (strcasecmp(argv[1], "start") == 0) { if (argc != 3) { fprintf(stderr, debug_daemon_usage); return 1; } - + rc = write(fd, argv[2], strlen(argv[2])); if (rc != strlen(argv[2])) { fprintf(stderr, "write(%s) failed: %s\n", argv[2], @@ -515,7 +530,7 @@ int jt_dbg_debug_daemon(int argc, char **argv) fprintf(stderr, debug_daemon_usage); return 1; } - + close(fd); return 0; } @@ -611,7 +626,6 @@ static struct mod_paths { {"obdfilter", "lustre/obdfilter"}, {"extN", "lustre/extN"}, {"lov", "lustre/lov"}, - {"lmv", "lustre/lmv"}, {"fsfilt_ext3", "lustre/lvfs"}, {"fsfilt_extN", "lustre/lvfs"}, {"fsfilt_reiserfs", "lustre/lvfs"}, @@ -623,13 +637,13 @@ static struct mod_paths { {"ptlbd", "lustre/ptlbd"}, {"mgmt_svc", "lustre/mgmt"}, {"mgmt_cli", "lustre/mgmt"}, - {"cobd", "lustre/cobd"}, - {"cmobd", "lustre/cmobd"}, + {"conf_obd", "lustre/obdclass"}, {NULL, NULL} }; static int jt_dbg_modules_2_4(int argc, char **argv) { +#ifdef HAVE_LINUX_VERSION_H #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) struct mod_paths *mp; char *path = ".."; @@ -665,9 +679,9 @@ static int jt_dbg_modules_2_4(int argc, char **argv) } return 0; -#else /* Headers are 2.6-only */ +#endif /* Headers are 2.6-only */ +#endif /* !HAVE_LINUX_VERSION_H */ return -EINVAL; -#endif } static int jt_dbg_modules_2_5(int argc, char **argv) diff --git a/lnet/utils/portals.c b/lnet/utils/portals.c index 1bde59f..d5d29dc 100644 --- 
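ptl_parse_ipaddr() above now tries the dotted-quad form first and only consults the resolver when the string looks like a host name, with the whole gethostbyname() path compiled out on resolver-less builds via HAVE_GETHOSTBYNAME. A small sketch of that fallback follows; the unconditional gethostbyname() call and the sample addresses are simplifications for illustration.

#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include <netdb.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>

/* Dotted quad first; only ask the resolver when the string starts with a
 * letter.  The real tool compiles the resolver path out entirely when
 * gethostbyname() is not available. */
static int parse_ipaddr(const char *str, unsigned int *ipaddr)
{
        struct in_addr addr;
        struct hostent *he;

        if (inet_aton(str, &addr)) {
                *ipaddr = ntohl(addr.s_addr);   /* keep it in host byte order */
                return 0;
        }

        if (!isalpha((unsigned char)str[0]))
                return -1;

        he = gethostbyname(str);
        if (he == NULL || he->h_addrtype != AF_INET)
                return -1;

        memcpy(&addr, he->h_addr_list[0], sizeof(addr));
        *ipaddr = ntohl(addr.s_addr);
        return 0;
}

int main(void)
{
        const char *names[] = { "192.168.0.10", "localhost" };
        unsigned int ip;
        int i;

        for (i = 0; i < 2; i++) {
                if (parse_ipaddr(names[i], &ip) == 0)
                        printf("%-14s -> %u.%u.%u.%u\n", names[i],
                               (ip >> 24) & 0xff, (ip >> 16) & 0xff,
                               (ip >> 8) & 0xff, ip & 0xff);
                else
                        printf("%-14s -> unresolved\n", names[i]);
        }
        return 0;
}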
a/lnet/utils/portals.c +++ b/lnet/utils/portals.c @@ -22,13 +22,17 @@ #include #include +#ifdef HAVE_NETDB_H #include +#endif #include +#ifdef HAVE_NETINET_TCP_H #include -#include +#endif #include #include #include +#include "ioctl.h" #include #include #include @@ -54,10 +58,6 @@ unsigned int portal_printk; static unsigned int g_nal = 0; -static int g_socket_txmem = 0; -static int g_socket_rxmem = 0; -static int g_socket_nonagle = 1; - typedef struct { char *name; @@ -70,6 +70,7 @@ static name2num_t nalnames[] = { {"elan", QSWNAL}, {"gm", GMNAL}, {"openib", OPENIBNAL}, + {"iib", IIBNAL}, {NULL, -1} }; @@ -209,6 +210,7 @@ nal2name (int nal) return ((e == NULL) ? "???" : e->name); } +#ifdef HAVE_GETHOSTBYNAME static struct hostent * ptl_gethostbyname(char * hname) { struct hostent *he; @@ -229,6 +231,7 @@ ptl_gethostbyname(char * hname) { } return he; } +#endif int ptl_parse_port (int *port, char *str) @@ -295,7 +298,9 @@ ptl_parse_ipquad (__u32 *ipaddrp, char *str) int ptl_parse_ipaddr (__u32 *ipaddrp, char *str) { +#ifdef HAVE_GETHOSTBYNAME struct hostent *he; +#endif if (!strcmp (str, "_all_")) { @@ -305,7 +310,8 @@ ptl_parse_ipaddr (__u32 *ipaddrp, char *str) if (ptl_parse_ipquad(ipaddrp, str) == 0) return (0); - + +#if HAVE_GETHOSTBYNAME if ((('a' <= str[0] && str[0] <= 'z') || ('A' <= str[0] && str[0] <= 'Z')) && (he = ptl_gethostbyname (str)) != NULL) @@ -315,6 +321,7 @@ ptl_parse_ipaddr (__u32 *ipaddrp, char *str) *ipaddrp = ntohl(addr); /* HOST byte order */ return (0); } +#endif return (-1); } @@ -322,6 +329,7 @@ ptl_parse_ipaddr (__u32 *ipaddrp, char *str) char * ptl_ipaddr_2_str (__u32 ipaddr, char *str, int lookup) { +#ifdef HAVE_GETHOSTBYNAME __u32 net_ip; struct hostent *he; @@ -333,7 +341,8 @@ ptl_ipaddr_2_str (__u32 ipaddr, char *str, int lookup) return (str); } } - +#endif + sprintf (str, "%d.%d.%d.%d", (ipaddr >> 24) & 0xff, (ipaddr >> 16) & 0xff, (ipaddr >> 8) & 0xff, ipaddr & 0xff); @@ -386,6 +395,7 @@ char * ptl_nid2str (char *buffer, ptl_nid_t nid) { __u64 nid64 = ptl_nid2u64(nid); +#ifdef HAVE_GETHOSTBYNAME struct hostent *he = 0; /* Don't try to resolve NIDs that are e.g. Elan host IDs. 
Assume @@ -400,6 +410,7 @@ ptl_nid2str (char *buffer, ptl_nid_t nid) if (he != NULL) sprintf(buffer, "%#x:%s", (int)(nid64 >> 32), he->h_name); else +#endif /* HAVE_GETHOSTBYNAME */ sprintf(buffer, LPX64, nid64); return (buffer); @@ -524,7 +535,6 @@ int jt_ptl_network(int argc, char **argv) return (-1); } - int jt_ptl_print_interfaces (int argc, char **argv) { @@ -563,6 +573,9 @@ jt_ptl_add_interface (int argc, char **argv) __u32 ipaddr; int rc; __u32 netmask = 0xffffff00; + int i; + int count; + char *end; if (argc < 2 || argc > 3) { fprintf (stderr, "usage: %s ipaddr [netmask]\n", argv[0]); @@ -576,13 +589,19 @@ jt_ptl_add_interface (int argc, char **argv) fprintf (stderr, "Can't parse ip: %s\n", argv[1]); return -1; } - - if (argc > 2 && - ptl_parse_ipquad(&netmask, argv[2]) != 0) { - fprintf (stderr, "Can't parse netmask: %s\n", argv[2]); - return -1; + + if (argc > 2 ) { + count = strtol(argv[2], &end, 0); + if (count > 0 && count < 32 && *end == 0) { + netmask = 0; + for (i = count; i > 0; i--) + netmask = netmask|(1<<(32-i)); + } else if (ptl_parse_ipquad(&netmask, argv[2]) != 0) { + fprintf (stderr, "Can't parse netmask: %s\n", argv[2]); + return -1; + } } - + PCFG_INIT(pcfg, NAL_CMD_ADD_INTERFACE); pcfg.pcfg_id = ipaddr; pcfg.pcfg_misc = netmask; @@ -593,7 +612,7 @@ jt_ptl_add_interface (int argc, char **argv) strerror (errno)); return -1; } - + return 0; } @@ -627,11 +646,11 @@ jt_ptl_del_interface (int argc, char **argv) strerror (errno)); return -1; } - + return 0; } -int +int jt_ptl_print_peers (int argc, char **argv) { struct portals_cfg pcfg; @@ -639,7 +658,7 @@ jt_ptl_print_peers (int argc, char **argv) int index; int rc; - if (!g_nal_is_compatible (argv[0], SOCKNAL, OPENIBNAL, 0)) + if (!g_nal_is_compatible (argv[0], SOCKNAL, OPENIBNAL, IIBNAL, 0)) return -1; for (index = 0;;index++) { @@ -675,7 +694,7 @@ jt_ptl_add_peer (int argc, char **argv) int port = 0; int rc; - if (!g_nal_is_compatible (argv[0], SOCKNAL, OPENIBNAL, 0)) + if (!g_nal_is_compatible (argv[0], SOCKNAL, OPENIBNAL, IIBNAL, 0)) return -1; if (g_nal_is_compatible(NULL, SOCKNAL, 0)) { @@ -685,7 +704,7 @@ jt_ptl_add_peer (int argc, char **argv) return 0; } } else if (argc != 2) { - fprintf (stderr, "usage(openib): %s nid\n", argv[0]); + fprintf (stderr, "usage(openib,iib): %s nid\n", argv[0]); return 0; } @@ -732,7 +751,7 @@ jt_ptl_del_peer (int argc, char **argv) int argidx; int rc; - if (!g_nal_is_compatible (argv[0], SOCKNAL, OPENIBNAL, 0)) + if (!g_nal_is_compatible (argv[0], SOCKNAL, OPENIBNAL, IIBNAL, 0)) return -1; if (g_nal_is_compatible(NULL, SOCKNAL, 0)) { @@ -764,7 +783,7 @@ jt_ptl_del_peer (int argc, char **argv) } if (argc > argidx) { - if (!strcmp (argv[3], "single_share")) { + if (!strcmp (argv[argidx], "single_share")) { single_share = 1; } else { fprintf (stderr, "Unrecognised arg %s'\n", argv[3]); @@ -795,7 +814,7 @@ jt_ptl_print_connections (int argc, char **argv) int index; int rc; - if (!g_nal_is_compatible (argv[0], SOCKNAL, OPENIBNAL, 0)) + if (!g_nal_is_compatible (argv[0], SOCKNAL, OPENIBNAL, IIBNAL, 0)) return -1; for (index = 0;;index++) { @@ -832,13 +851,19 @@ jt_ptl_print_connections (int argc, char **argv) int jt_ptl_connect(int argc, char **argv) { +#ifndef HAVE_CONNECT + /* no connect() support */ + return -1; +#else /* HAVE_CONNECT */ struct portals_cfg pcfg; struct sockaddr_in srvaddr; + struct sockaddr_in locaddr; __u32 ipaddr; char *flag; int fd, rc; int type = SOCKNAL_CONN_ANY; - int port; + int port, rport; + int o; if (argc < 3) { fprintf(stderr, "usage: %s ip port 
[type]\n", argv[0]); @@ -893,20 +918,48 @@ int jt_ptl_connect(int argc, char **argv) return (-1); } + memset(&locaddr, 0, sizeof(locaddr)); + locaddr.sin_family = AF_INET; + locaddr.sin_addr.s_addr = INADDR_ANY; + memset(&srvaddr, 0, sizeof(srvaddr)); srvaddr.sin_family = AF_INET; srvaddr.sin_port = htons(port); srvaddr.sin_addr.s_addr = htonl(ipaddr); - fd = socket(PF_INET, SOCK_STREAM, 0); - if ( fd < 0 ) { - fprintf(stderr, "socket() failed: %s\n", strerror(errno)); - return -1; + + for (rport = IPPORT_RESERVED - 1; rport > IPPORT_RESERVED / 2; --rport) { + fd = socket(PF_INET, SOCK_STREAM, 0); + if ( fd < 0 ) { + fprintf(stderr, "socket() failed: %s\n", strerror(errno)); + return -1; + } + + o = 1; + rc = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, + &o, sizeof(o)); + + locaddr.sin_port = htons(rport); + rc = bind(fd, (struct sockaddr *)&locaddr, sizeof(locaddr)); + if (rc == 0 || errno == EACCES) { + rc = connect(fd, (struct sockaddr *)&srvaddr, sizeof(srvaddr)); + if (rc == 0) { + break; + } else if (errno != EADDRINUSE) { + fprintf(stderr, "Error connecting to host: %s\n", strerror(errno)); + close(fd); + return -1; + } + } else if (errno != EADDRINUSE) { + fprintf(stderr, "Error binding to port %d: %d: %s\n", port, errno, strerror(errno)); + close(fd); + return -1; + } } - rc = connect(fd, (struct sockaddr *)&srvaddr, sizeof(srvaddr)); - if ( rc == -1 ) { - fprintf(stderr, "connect() failed: %s\n", strerror(errno)); + if (rport == IPPORT_RESERVED / 2) { + fprintf(stderr, + "Warning: all privileged ports are in use.\n"); return -1; } @@ -937,6 +990,7 @@ int jt_ptl_connect(int argc, char **argv) fprintf(stderr, "close failed: %d\n", rc); return 0; +#endif /* HAVE_CONNECT */ } int jt_ptl_disconnect(int argc, char **argv) @@ -951,7 +1005,7 @@ int jt_ptl_disconnect(int argc, char **argv) return 0; } - if (!g_nal_is_compatible (NULL, SOCKNAL, OPENIBNAL, 0)) + if (!g_nal_is_compatible (NULL, SOCKNAL, OPENIBNAL, IIBNAL, 0)) return 0; if (argc >= 2 && @@ -1491,11 +1545,11 @@ lwt_snapshot(cycles_t *now, int *ncpu, int *totalsize, } /* crappy overloads */ - if (data.ioc_nid != sizeof(lwt_event_t) || - data.ioc_nid2 != offsetof(lwt_event_t, lwte_where)) { + if (data.ioc_nid2 != sizeof(lwt_event_t) || + data.ioc_nid3 != offsetof(lwt_event_t, lwte_where)) { fprintf(stderr,"kernel/user LWT event mismatch %d(%d),%d(%d)\n", - (int)data.ioc_nid, sizeof(lwt_event_t), - (int)data.ioc_nid2, + (int)data.ioc_nid2, sizeof(lwt_event_t), + (int)data.ioc_nid3, (int)offsetof(lwt_event_t, lwte_where)); return (-1); } @@ -1573,12 +1627,21 @@ lwt_put_string(char *ustr) static int lwt_print(FILE *f, cycles_t t0, cycles_t tlast, double mhz, int cpu, lwt_event_t *e) { +#ifndef __WORDSIZE +# error "__WORDSIZE not defined" +#elif __WORDSIZE == 32 +# define XFMT "%#010lx" +#elif __WORDSIZE== 64 +# define XFMT "%#018lx" +#else +# error "Unexpected __WORDSIZE" +#endif char *where = lwt_get_string(e->lwte_where); if (where == NULL) return (-1); - fprintf(f, "%#010lx %#010lx %#010lx %#010lx: %#010lx %1d %10.6f %10.2f %s\n", + fprintf(f, XFMT" "XFMT" "XFMT" "XFMT": "XFMT" %2d %10.6f %10.2f %s\n", e->lwte_p1, e->lwte_p2, e->lwte_p3, e->lwte_p4, (long)e->lwte_task, cpu, (e->lwte_when - t0) / (mhz * 1000000.0), (t0 == e->lwte_when) ? 
0.0 : (e->lwte_when - tlast) / mhz, @@ -1587,6 +1650,7 @@ lwt_print(FILE *f, cycles_t t0, cycles_t tlast, double mhz, int cpu, lwt_event_t lwt_put_string(where); return (0); +#undef XFMT } double diff --git a/lustre/configure.in b/lustre/configure.in index e2fca96..fe97687 100644 --- a/lustre/configure.in +++ b/lustre/configure.in @@ -218,6 +218,8 @@ portals/knals/Makefile portals/knals/autoMakefile portals/knals/gmnal/Makefile portals/knals/gmnal/autoMakefile +portals/knals/iibnal/Makefile +portals/knals/iibnal/autoMakefile portals/knals/openibnal/Makefile portals/knals/openibnal/autoMakefile portals/knals/qswnal/Makefile diff --git a/lustre/kernel_patches/patches/kksymoops-2.4.24.vanilla.patch b/lustre/kernel_patches/patches/kksymoops-2.4.24.vanilla.patch index c47d1ac..5ea2c92 100644 --- a/lustre/kernel_patches/patches/kksymoops-2.4.24.vanilla.patch +++ b/lustre/kernel_patches/patches/kksymoops-2.4.24.vanilla.patch @@ -170,7 +170,7 @@ Index: linux-2.4.24-b1_4/include/linux/kallsyms.h + Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ + -+#ident "$Id: kksymoops-2.4.24.vanilla.patch,v 1.9 2004/10/24 17:00:18 yury Exp $" ++#ident "$Id: kksymoops-2.4.24.vanilla.patch,v 1.10 2004/10/29 15:04:35 eeb Exp $" + +#ifndef MODUTILS_KALLSYMS_H +#define MODUTILS_KALLSYMS_H 1 diff --git a/lustre/portals/archdep.m4 b/lustre/portals/archdep.m4 index d2bd1a1..021fa68 100644 --- a/lustre/portals/archdep.m4 +++ b/lustre/portals/archdep.m4 @@ -14,26 +14,107 @@ AC_MSG_RESULT([$enable_inkernel]) AM_CONDITIONAL(INKERNEL, test x$enable_inkernel = xyes) # -------- are we building against an external portals? ------- -AC_MSG_CHECKING([if Cray portals should be used]) +AC_MSG_CHECKING([for Cray portals]) AC_ARG_WITH([cray-portals], AC_HELP_STRING([--with-cray-portals=path], [path to cray portals]), [ if test "$with_cray_portals" != no; then - if test -r $with_cray_portals/include/portals/api.h ; then - CRAY_PORTALS_PATH=$with_cray_portals - CRAY_PORTALS_INCLUDE="-I$with_cray_portals/include" - AC_DEFINE(CRAY_PORTALS, 1, [Building with Cray Portals]) - else - AC_MSG_ERROR([--with-cray-portals specified badly]) - fi - fi + CRAY_PORTALS_PATH=$with_cray_portals + CRAY_PORTALS_INCLUDES="$with_cray_portals/include" + CRAY_PORTALS_LIBS="$with_cray_portals" + fi ],[with_cray_portals=no]) AC_SUBST(CRAY_PORTALS_PATH) -AC_MSG_RESULT([$with_cray_portals]) +AC_MSG_RESULT([$CRAY_PORTALS_PATH]) + +AC_MSG_CHECKING([for Cray portals includes]) +AC_ARG_WITH([cray-portals-includes], + AC_HELP_STRING([--with-cray-portals-includes=path], + [path to cray portals includes]), + [ + if test "$with_cray_portals_includes" != no; then + CRAY_PORTALS_INCLUDES="$with_cray_portals_includes" + fi + ]) +AC_SUBST(CRAY_PORTALS_INCLUDES) +AC_MSG_RESULT([$CRAY_PORTALS_INCLUDES]) + +AC_MSG_CHECKING([for Cray portals libs]) +AC_ARG_WITH([cray-portals-libs], + AC_HELP_STRING([--with-cray-portals-libs=path], + [path to cray portals libs]), + [ + if test "$with_cray_portals_libs" != no; then + CRAY_PORTALS_LIBS="$with_cray_portals_libs" + fi + ]) +AC_SUBST(CRAY_PORTALS_LIBS) +AC_MSG_RESULT([$CRAY_PORTALS_LIBS]) + +if test x$CRAY_PORTALS_INCLUDES != x ; then + if test ! -r $CRAY_PORTALS_INCLUDES/portals/api.h ; then + AC_MSG_ERROR([Cray portals headers were not found in $CRAY_PORTALS_INCLUDES. Please check the paths passed to --with-cray-portals or --with-cray-portals-includes.]) + fi +fi +if test x$CRAY_PORTALS_LIBS != x ; then + if test ! 
-r $CRAY_PORTALS_LIBS/libportals.a ; then + AC_MSG_ERROR([Cray portals libraries were not found in $CRAY_PORTALS_LIBS. Please check the paths passed to --with-cray-portals or --with-cray-portals-libs.]) + fi +fi +AC_MSG_CHECKING([whether to use Cray portals]) +if test x$CRAY_PORTALS_INCLUDES != x -a x$CRAY_PORTALS_LIBS != x ; then + with_cray_portals=yes + AC_DEFINE(CRAY_PORTALS, 1, [Building with Cray Portals]) + CRAY_PORTALS_INCLUDES="-I$CRAY_PORTALS_INCLUDES" +else + with_cray_portals=no +fi +AC_MSG_RESULT([$with_cray_portals]) AM_CONDITIONAL(CRAY_PORTALS, test x$with_cray_portals != xno) +# ---------------------------------------- +# some tests for catamount-like systems +# ---------------------------------------- +AC_ARG_ENABLE([sysio_init], + AC_HELP_STRING([--disable-sysio-init], + [call sysio init functions when initializing liblustre]), + [],[enable_sysio_init=yes]) +AC_MSG_CHECKING([whether to initialize libsysio]) +AC_MSG_RESULT([$enable_sysio_init]) +if test x$enable_sysio_init != xno ; then + AC_DEFINE([INIT_SYSIO], 1, [call sysio init functions]) +fi + +AC_ARG_ENABLE([urandom], + AC_HELP_STRING([--disable-urandom], + [disable use of /dev/urandom for liblustre]), + [],[enable_urandom=yes]) +AC_MSG_CHECKING([whether to use /dev/urandom for liblustre]) +AC_MSG_RESULT([$enable_urandom]) +if test x$enable_urandom != xno ; then + AC_DEFINE([LIBLUSTRE_USE_URANDOM], 1, [use /dev/urandom for random data]) +fi + +# -------- check for -lcap and -lpthread ---- +if test x$enable_liblustre = xyes ; then + AC_CHECK_LIB([cap], [cap_get_proc], + [ + CAP_LIBS="-lcap" + AC_DEFINE([HAVE_LIBCAP], 1, [use libcap]) + ], + [CAP_LIBS=""]) + AC_SUBST(CAP_LIBS) + AC_CHECK_LIB([pthread], [pthread_create], + [ + PTHREAD_LIBS="-lpthread" + AC_DEFINE([HAVE_LIBPTHREAD], 1, [use libpthread]) + ], + [PTHREAD_LIBS=""]) + AC_SUBST(PTHREAD_LIBS) +fi + # -------- enable tests and utils? ------- if test x$enable_tests = xno ; then AC_MSG_NOTICE([disabling tests]) @@ -128,7 +209,7 @@ AM_CONDITIONAL(USE_QUILT, test x$QUILT != xno) # ------- Makeflags ------------------ -CPPFLAGS="$CRAY_PORTALS_INCLUDE $CRAY_PORTALS_COMMANDLINE -I\$(top_srcdir)/include -I\$(top_srcdir)/portals/include" +CPPFLAGS="$CPPFLAGS $CRAY_PORTALS_INCLUDES -I\$(top_srcdir)/include -I\$(top_srcdir)/portals/include" # liblustre are all the same LLCPPFLAGS="-D__arch_lib__ -D_LARGEFILE64_SOURCE=1" @@ -146,7 +227,7 @@ if test x$enable_ldiskfs = xyes ; then AC_DEFINE(CONFIG_LDISKFS_FS_SECURITY, 1, [enable fs security]) fi -EXTRA_KCFLAGS="-g $CRAY_PORTALS_INCLUDE $CRAY_PORTALS_COMMANDLINE -I$PWD/portals/include -I$PWD/include" +EXTRA_KCFLAGS="-g $CRAY_PORTALS_INCLUDES -I$PWD/portals/include -I$PWD/include" # these are like AC_TRY_COMPILE, but try to build modules against the # kernel, inside the kernel-tests directory @@ -408,6 +489,35 @@ if test x$enable_modules != xno ; then AC_SUBST(OPENIBCPPFLAGS) AC_SUBST(OPENIBNAL) + #### Infinicon IB + AC_MSG_CHECKING([if Infinicon IB kernel headers are present]) + # for how the only infinicon ib build has headers in /usr/include/iba + IIBCPPFLAGS="-I/usr/include -DIN_TREE_BUILD" + EXTRA_KCFLAGS_save="$EXTRA_KCFLAGS" + EXTRA_KCFLAGS="$EXTRA_KCFLAGS $IIBCPPFLAGS" + LUSTRE_MODULE_TRY_COMPILE( + [ + #include + ],[ + IBT_INTERFACE_UNION interfaces; + FSTATUS rc; + + rc = IbtGetInterfaceByVersion(IBT_INTERFACE_VERSION_2, + &interfaces); + + return rc == FSUCCESS ? 
0 : 1; + ],[ + AC_MSG_RESULT([yes]) + IIBNAL="iibnal" + ],[ + AC_MSG_RESULT([no]) + IIBNAL="" + IIBCPPFLAGS="" + ]) + EXTRA_KCFLAGS="$EXTRA_KCFLAGS_save" + AC_SUBST(IIBCPPFLAGS) + AC_SUBST(IIBNAL) + # ---------- Red Hat 2.4.18 has iobuf->dovary -------------- # But other kernels don't @@ -667,15 +777,34 @@ fi AM_CONDITIONAL(BUILD_QSWNAL, test x$QSWNAL = "xqswnal") AM_CONDITIONAL(BUILD_GMNAL, test x$GMNAL = "xgmnal") AM_CONDITIONAL(BUILD_OPENIBNAL, test x$OPENIBNAL = "xopenibnal") +AM_CONDITIONAL(BUILD_IIBNAL, test x$IIBNAL = "xiibnal") + +# portals/utils/portals.c +AC_CHECK_HEADERS([netdb.h netinet/tcp.h asm/types.h]) +AC_CHECK_FUNCS([gethostbyname socket connect]) + +# portals/utils/debug.c +AC_CHECK_HEADERS([linux/version.h]) + +# include/liblustre.h +AC_CHECK_HEADERS([asm/page.h sys/user.h stdint.h]) + +# liblustre/llite_lib.h +AC_CHECK_HEADERS([xtio.h file.h]) + +# liblustre/dir.c +AC_CHECK_HEADERS([linux/types.h sys/types.h linux/unistd.h unistd.h]) + +# liblustre/lutil.c +AC_CHECK_HEADERS([netinet/in.h arpa/inet.h catamount/data.h]) +AC_CHECK_FUNCS([inet_ntoa]) CPPFLAGS="-include \$(top_builddir)/include/config.h $CPPFLAGS" EXTRA_KCFLAGS="-include $PWD/include/config.h $EXTRA_KCFLAGS" AC_SUBST(EXTRA_KCFLAGS) -#echo "KCPPFLAGS: $KCPPFLAGS" -#echo "KCFLAGS: $KCFLAGS" -#echo "LLCPPFLAGS: $LLCPPFLAGS" -#echo "LLCFLAGS: $LLCFLAGS" -#echo "MOD_LINK: $MOD_LINK" -#echo "CFLAGS: $CFLAGS" -#echo "CPPFLAGS: $CPPFLAGS" +echo "CPPFLAGS: $CPPFLAGS" +echo "LLCPPFLAGS: $LLCPPFLAGS" +echo "CFLAGS: $CFLAGS" +echo "EXTRA_KCFLAGS: $EXTRA_KCFLAGS" +echo "LLCFLAGS: $LLCFLAGS" diff --git a/lustre/portals/build.m4 b/lustre/portals/build.m4 index 861bb4a..f158396 100644 --- a/lustre/portals/build.m4 +++ b/lustre/portals/build.m4 @@ -61,6 +61,13 @@ case "$CC_VERSION" in "gcc version 2.96 20000731 (Mandrake Linux 8.1 2.96-0.62mdk)") bad_cc ;; + # unpatched 'gcc' on rh9. miscompiles a + # struct = (type) { .member = value, }; + # asignment in the iibnal where the struct is a mix + # of u64 and u32 bit-fields. + "gcc version 3.2.2 20030222 (Red Hat Linux 3.2.2-5)") + bad_cc + ;; *) AC_MSG_RESULT([no known problems]) ;; @@ -116,3 +123,5 @@ else LIBWRAP="" fi AC_SUBST(LIBWRAP) + +AC_SUBST(LIBS) diff --git a/lustre/portals/include/linux/.cvsignore b/lustre/portals/include/linux/.cvsignore new file mode 100644 index 0000000..282522d --- /dev/null +++ b/lustre/portals/include/linux/.cvsignore @@ -0,0 +1,2 @@ +Makefile +Makefile.in diff --git a/lustre/portals/include/linux/kp30.h b/lustre/portals/include/linux/kp30.h index db63a08..4e24c71d 100644 --- a/lustre/portals/include/linux/kp30.h +++ b/lustre/portals/include/linux/kp30.h @@ -294,7 +294,6 @@ extern void kportal_blockallsigs (void); # include # include # include -# include # ifndef DEBUG_SUBSYSTEM # define DEBUG_SUBSYSTEM S_UNDEFINED # endif @@ -320,6 +319,11 @@ void portals_debug_dumplog(void); printf("%02x:%06x (@%lu %s:%s,l. %d %d %lu): " format, \ (subsys), (mask), (long)time(0), file, fn, line, \ getpid() , stack, ## a); + +#undef CWARN +#undef CERROR +#define CWARN(format, a...) CDEBUG(D_WARNING, format, ## a) +#define CERROR(format, a...) 
CDEBUG(D_ERROR, format, ## a) #endif /* support decl needed both by kernel and liblustre */ @@ -338,6 +342,16 @@ char *portals_id2str(int nal, ptl_process_id_t nid, char *str); #define LWT_MEMORY (16<<20) #if !KLWT_SUPPORT +# if defined(__KERNEL__) +# if !defined(BITS_PER_LONG) +# error "BITS_PER_LONG not defined" +# endif +# elif !defined(__WORDSIZE) +# error "__WORDSIZE not defined" +# else +# define BITS_PER_LONG __WORDSIZE +# endif + /* kernel hasn't defined this? */ typedef struct { long long lwte_when; @@ -572,49 +586,42 @@ static inline int portal_ioctl_getdata(char *buf, char *end, void *arg) data = (struct portal_ioctl_data *)buf; err = copy_from_user(buf, (void *)arg, sizeof(*hdr)); - if ( err ) { - EXIT; - return err; - } + if (err) + RETURN(err); if (hdr->ioc_version != PORTAL_IOCTL_VERSION) { - CERROR ("PORTALS: version mismatch kernel vs application\n"); - return -EINVAL; + CERROR("PORTALS: version mismatch kernel vs application\n"); + RETURN(-EINVAL); } if (hdr->ioc_len + buf >= end) { - CERROR ("PORTALS: user buffer exceeds kernel buffer\n"); - return -EINVAL; + CERROR("PORTALS: user buffer exceeds kernel buffer\n"); + RETURN(-EINVAL); } if (hdr->ioc_len < sizeof(struct portal_ioctl_data)) { - CERROR ("PORTALS: user buffer too small for ioctl\n"); - return -EINVAL; + CERROR("PORTALS: user buffer too small for ioctl\n"); + RETURN(-EINVAL); } err = copy_from_user(buf, (void *)arg, hdr->ioc_len); - if ( err ) { - EXIT; - return err; - } + if (err) + RETURN(err); if (portal_ioctl_is_invalid(data)) { - CERROR ("PORTALS: ioctl not correctly formatted\n"); - return -EINVAL; + CERROR("PORTALS: ioctl not correctly formatted\n"); + RETURN(-EINVAL); } - if (data->ioc_inllen1) { + if (data->ioc_inllen1) data->ioc_inlbuf1 = &data->ioc_bulk[0]; - } - if (data->ioc_inllen2) { + if (data->ioc_inllen2) data->ioc_inlbuf2 = &data->ioc_bulk[0] + size_round(data->ioc_inllen1); - } - EXIT; - return 0; + RETURN(0); } #endif @@ -645,10 +652,11 @@ enum { TCPNAL = 5, ROUTER = 6, OPENIBNAL = 7, + IIBNAL = 8, NAL_ENUM_END_MARKER }; -#define PTL_NALFMT_SIZE 30 /* %u:%u.%u.%u.%u,%u (10+4+4+4+3+4+1) */ +#define PTL_NALFMT_SIZE 32 /* %u:%u.%u.%u.%u,%u (10+4+4+4+3+5+1) */ #define NAL_MAX_NR (NAL_ENUM_END_MARKER - 1) diff --git a/lustre/portals/include/linux/libcfs.h b/lustre/portals/include/linux/libcfs.h index d1a5c44..8317f14 100644 --- a/lustre/portals/include/linux/libcfs.h +++ b/lustre/portals/include/linux/libcfs.h @@ -4,7 +4,11 @@ #ifndef _LIBCFS_H #define _LIBCFS_H +#ifdef HAVE_ASM_TYPES_H #include +#else +#include "types.h" +#endif #ifdef __KERNEL__ # include @@ -62,7 +66,6 @@ extern unsigned int portal_stack; extern unsigned int portal_debug; extern unsigned int portal_printk; -#include struct ptldebug_header { __u32 ph_len; __u32 ph_flags; @@ -102,7 +105,7 @@ struct ptldebug_header { #define S_GMNAL 0x00080000 #define S_PTLROUTER 0x00100000 #define S_COBD 0x00200000 -#define S_OPENIBNAL 0x00400000 +#define S_IBNAL 0x00400000 /* All IB NALs */ #define S_SM 0x00800000 #define S_ASOBD 0x01000000 #define S_LMV 0x02000000 @@ -185,8 +188,40 @@ do { \ CDEBUG_STACK, format, ## a); \ } while (0) -#define CWARN(format, a...) CDEBUG(D_WARNING, format, ## a) -#define CERROR(format, a...) CDEBUG(D_ERROR, format, ## a) +#define CDEBUG_MAX_LIMIT 600 +#define CDEBUG_LIMIT(cdebug_mask, cdebug_format, a...) 
\ +do { \ + static unsigned long cdebug_next; \ + static int cdebug_count, cdebug_delay = 1; \ + \ + CHECK_STACK(CDEBUG_STACK); \ + if (time_after(jiffies, cdebug_next)) { \ + portals_debug_msg(DEBUG_SUBSYSTEM, cdebug_mask, __FILE__, \ + __FUNCTION__, __LINE__, CDEBUG_STACK, \ + cdebug_format, ## a); \ + if (cdebug_count) { \ + portals_debug_msg(DEBUG_SUBSYSTEM, cdebug_mask, \ + __FILE__, __FUNCTION__, __LINE__, \ + CDEBUG_STACK, cdebug_format, ## a); \ + cdebug_count = 0; \ + } \ + if (time_after(jiffies, cdebug_next+(CDEBUG_MAX_LIMIT+10)*HZ))\ + cdebug_delay = cdebug_delay > 8 ? cdebug_delay/8 : 1; \ + else \ + cdebug_delay = cdebug_delay*2 >= CDEBUG_MAX_LIMIT*HZ ?\ + CDEBUG_MAX_LIMIT*HZ : cdebug_delay*2; \ + cdebug_next = jiffies + cdebug_delay; \ + } else { \ + portals_debug_msg(DEBUG_SUBSYSTEM, \ + portal_debug & ~(D_EMERG|D_ERROR|D_WARNING),\ + __FILE__, __FUNCTION__, __LINE__, \ + CDEBUG_STACK, cdebug_format, ## a); \ + cdebug_count++; \ + } \ +} while (0) + +#define CWARN(format, a...) CDEBUG_LIMIT(D_WARNING, format, ## a) +#define CERROR(format, a...) CDEBUG_LIMIT(D_ERROR, format, ## a) #define CEMERG(format, a...) CDEBUG(D_EMERG, format, ## a) #define GOTO(label, rc) \ @@ -229,14 +264,13 @@ do { \ /* initial pid */ # if CRAY_PORTALS /* + * 1) ptl_pid_t in cray portals is only 16 bits, not 32 bits, therefore this + * is too big. * - * 1) ptl_pid_t in cray portals is only 16 bits, not 32 bits, therefore this is too - * big. - * - * 2) the implementation of ernal in cray portals further restricts the pid space - * that may be used to 0 <= pid <= 255 (an 8 bit value). Returns an error at nal - * init time for any pid outside this range. Other nals in cray portals don't have - * this restriction. + * 2) the implementation of ernal in cray portals further restricts the pid + * space that may be used to 0 <= pid <= 255 (an 8 bit value). Returns + * an error at nal init time for any pid outside this range. Other nals + * in cray portals don't have this restriction. 
* */ #define LUSTRE_PTL_PID 9 # else diff --git a/lustre/portals/include/linux/portals_compat25.h b/lustre/portals/include/linux/portals_compat25.h index 7fe6dfc..5a43a45 100644 --- a/lustre/portals/include/linux/portals_compat25.h +++ b/lustre/portals/include/linux/portals_compat25.h @@ -28,6 +28,8 @@ call_usermodehelper(path, argv, envp, 1) # define RECALC_SIGPENDING recalc_sigpending() # define CURRENT_SECONDS get_seconds() +# define smp_num_cpus NR_CPUS + #elif defined(CONFIG_RH_2_4_20) /* RH 2.4.x */ diff --git a/lustre/portals/include/portals/.cvsignore b/lustre/portals/include/portals/.cvsignore new file mode 100644 index 0000000..282522d --- /dev/null +++ b/lustre/portals/include/portals/.cvsignore @@ -0,0 +1,2 @@ +Makefile +Makefile.in diff --git a/lustre/portals/include/portals/build_check.h b/lustre/portals/include/portals/build_check.h index 5db1352..c219d2a 100644 --- a/lustre/portals/include/portals/build_check.h +++ b/lustre/portals/include/portals/build_check.h @@ -1,7 +1,7 @@ #ifndef _BUILD_CHECK_H #define _BUILD_CHECK_H -#ifdef CRAY_PORTALS +#if CRAY_PORTALS #error "an application got to me instead of cray's includes" #endif diff --git a/lustre/portals/include/portals/ptlctl.h b/lustre/portals/include/portals/ptlctl.h index a81a371..cfddde2 100644 --- a/lustre/portals/include/portals/ptlctl.h +++ b/lustre/portals/include/portals/ptlctl.h @@ -31,8 +31,6 @@ #define PORTALS_DEV_PATH "/dev/portals" #define OBD_DEV_ID 1 #define OBD_DEV_PATH "/dev/obd" -#define SMFS_DEV_ID 2 -#define SMFS_DEV_PATH "/dev/snapdev" int ptl_name2nal(char *str); int ptl_parse_ipaddr (__u32 *ipaddrp, char *str); @@ -41,9 +39,6 @@ char * ptl_nid2str (char *buffer, ptl_nid_t nid); int ptl_initialize(int argc, char **argv); int jt_ptl_network(int argc, char **argv); -int jt_ptl_print_autoconnects (int argc, char **argv); -int jt_ptl_add_autoconnect (int argc, char **argv); -int jt_ptl_del_autoconnect (int argc, char **argv); int jt_ptl_print_interfaces(int argc, char **argv); int jt_ptl_add_interface(int argc, char **argv); int jt_ptl_del_interface(int argc, char **argv); @@ -62,9 +57,6 @@ int jt_ptl_add_uuid(int argc, char **argv); int jt_ptl_add_uuid_old(int argc, char **argv); /* backwards compatibility */ int jt_ptl_close_uuid(int argc, char **argv); int jt_ptl_del_uuid(int argc, char **argv); -int jt_ptl_rxmem (int argc, char **argv); -int jt_ptl_txmem (int argc, char **argv); -int jt_ptl_nagle (int argc, char **argv); int jt_ptl_add_route (int argc, char **argv); int jt_ptl_del_route (int argc, char **argv); int jt_ptl_notify_router (int argc, char **argv); diff --git a/lustre/portals/knals/Makefile.in b/lustre/portals/knals/Makefile.in index 2a01119..9763d14 100644 --- a/lustre/portals/knals/Makefile.in +++ b/lustre/portals/knals/Makefile.in @@ -1,5 +1,6 @@ @BUILD_GMNAL_TRUE@subdir-m += gmnal @BUILD_OPENIBNAL_TRUE@subdir-m += openibnal +@BUILD_IIBNAL_TRUE@subdir-m += iibnal @BUILD_QSWNAL_TRUE@subdir-m += qswnal subdir-m += socknal diff --git a/lustre/portals/knals/autoMakefile.am b/lustre/portals/knals/autoMakefile.am index 002c169..0090364 100644 --- a/lustre/portals/knals/autoMakefile.am +++ b/lustre/portals/knals/autoMakefile.am @@ -3,4 +3,4 @@ # This code is issued under the GNU General Public License. 
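/*
 * Note (illustrative, not part of the patch): in the libcfs.h hunk above,
 * CWARN() and CERROR() are now routed through CDEBUG_LIMIT(), which prints
 * at most one console message per backoff interval, demotes the rest to the
 * trace buffer, and doubles the interval (up to CDEBUG_MAX_LIMIT seconds)
 * while the noise continues.  A minimal user-space sketch of that backoff,
 * using wall-clock seconds in place of jiffies; all names below are
 * illustrative only:
 */
#include <stdio.h>
#include <time.h>

#define MAX_LIMIT 600                   /* mirrors CDEBUG_MAX_LIMIT (seconds) */

static void limited_log(const char *msg)
{
        static time_t next;             /* next time a message may print */
        static int    suppressed;       /* messages swallowed meanwhile */
        static int    delay = 1;        /* current backoff in seconds */
        time_t        now = time(NULL);

        if (now < next) {
                suppressed++;           /* the kernel macro logs these at low priority */
                return;
        }

        printf("%s (%d suppressed)\n", msg, suppressed);
        suppressed = 0;

        if (now > next + MAX_LIMIT + 10)
                delay = delay > 8 ? delay / 8 : 1;              /* traffic died down: relax */
        else
                delay = delay * 2 >= MAX_LIMIT ? MAX_LIMIT : delay * 2;  /* still noisy: grow */
        next = now + delay;
}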
# See the file COPYING in this distribution -SUBDIRS = gmnal openibnal qswnal socknal +SUBDIRS = gmnal iibnal openibnal qswnal socknal diff --git a/lustre/portals/knals/iibnal/.cvsignore b/lustre/portals/knals/iibnal/.cvsignore new file mode 100644 index 0000000..5ed596b --- /dev/null +++ b/lustre/portals/knals/iibnal/.cvsignore @@ -0,0 +1,10 @@ +.deps +Makefile +.*.cmd +autoMakefile.in +autoMakefile +*.ko +*.mod.c +.*.flags +.tmp_versions +.depend diff --git a/lustre/portals/knals/iibnal/Makefile.in b/lustre/portals/knals/iibnal/Makefile.in new file mode 100644 index 0000000..e7934e2 --- /dev/null +++ b/lustre/portals/knals/iibnal/Makefile.in @@ -0,0 +1,6 @@ +MODULES := kiibnal +kiibnal-objs := iibnal.o iibnal_cb.o + +EXTRA_POST_CFLAGS := @IIBCPPFLAGS@ + +@INCLUDE_RULES@ diff --git a/lustre/portals/knals/iibnal/Makefile.mk b/lustre/portals/knals/iibnal/Makefile.mk new file mode 100644 index 0000000..0459a20 --- /dev/null +++ b/lustre/portals/knals/iibnal/Makefile.mk @@ -0,0 +1,10 @@ +# Copyright (C) 2001 Cluster File Systems, Inc. +# +# This code is issued under the GNU General Public License. +# See the file COPYING in this distribution + +include $(src)/../../Kernelenv + +obj-y += kiibnal.o +kiibnal-objs := iibnal.o iibnal_cb.o + diff --git a/lustre/portals/knals/iibnal/autoMakefile.am b/lustre/portals/knals/iibnal/autoMakefile.am new file mode 100644 index 0000000..251df66 --- /dev/null +++ b/lustre/portals/knals/iibnal/autoMakefile.am @@ -0,0 +1,15 @@ +# Copyright (C) 2001 Cluster File Systems, Inc. +# +# This code is issued under the GNU General Public License. +# See the file COPYING in this distribution + +if MODULES +if !CRAY_PORTALS +if BUILD_IIBNAL +modulenet_DATA = kiibnal$(KMODEXT) +endif +endif +endif + +MOSTLYCLEANFILES = *.o *.ko *.mod.c +DIST_SOURCES = $(kiibnal-objs:%.o=%.c) iibnal.h diff --git a/lustre/portals/knals/iibnal/iibnal.c b/lustre/portals/knals/iibnal/iibnal.c new file mode 100644 index 0000000..09908c9 --- /dev/null +++ b/lustre/portals/knals/iibnal/iibnal.c @@ -0,0 +1,1713 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2004 Cluster File Systems, Inc. + * Author: Eric Barton + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ * + */ + +#include "iibnal.h" + +nal_t kibnal_api; +ptl_handle_ni_t kibnal_ni; +kib_tunables_t kibnal_tunables; + +kib_data_t kibnal_data = { + .kib_service_id = IBNAL_SERVICE_NUMBER, +}; + +#ifdef CONFIG_SYSCTL +#define IBNAL_SYSCTL 202 + +#define IBNAL_SYSCTL_TIMEOUT 1 + +static ctl_table kibnal_ctl_table[] = { + {IBNAL_SYSCTL_TIMEOUT, "timeout", + &kibnal_tunables.kib_io_timeout, sizeof (int), + 0644, NULL, &proc_dointvec}, + { 0 } +}; + +static ctl_table kibnal_top_ctl_table[] = { + {IBNAL_SYSCTL, "iibnal", NULL, 0, 0555, kibnal_ctl_table}, + { 0 } +}; +#endif + +#ifdef unused +void +print_service(IB_SERVICE_RECORD *service, char *tag, int rc) +{ + char name[32]; + + if (service == NULL) + { + CWARN("tag : %s\n" + "status : %d (NULL)\n", tag, rc); + return; + } + strncpy (name, service->ServiceName, sizeof(name)-1); + name[sizeof(name)-1] = 0; + + CWARN("tag : %s\n" + "status : %d\n" + "service id: "LPX64"\n" + "name : %s\n" + "NID : "LPX64"\n", tag, rc, + service->RID.ServiceID, name, + *kibnal_service_nid_field(service)); +} +#endif + +static void +kibnal_service_setunset_done (void *arg, FABRIC_OPERATION_DATA *fod, + FSTATUS frc, uint32 madrc) +{ + *(FSTATUS *)arg = frc; + up (&kibnal_data.kib_nid_signal); +} + +#if IBNAL_CHECK_ADVERT +static void +kibnal_service_query_done (void *arg, QUERY *qry, + QUERY_RESULT_VALUES *qry_result) +{ + FSTATUS frc = qry_result->Status; + + if (frc != FSUCCESS && + qry_result->ResultDataSize == 0) + frc = FERROR; + + *(FSTATUS *)arg = frc; + up (&kibnal_data.kib_nid_signal); +} + +static void +kibnal_check_advert (void) +{ + QUERY *qry; + IB_SERVICE_RECORD *svc; + FSTATUS frc; + FSTATUS frc2; + + PORTAL_ALLOC(qry, sizeof(*qry)); + if (qry == NULL) + return; + + memset (qry, 0, sizeof(*qry)); + qry->InputType = InputTypeServiceRecord; + qry->OutputType = OutputTypeServiceRecord; + qry->InputValue.ServiceRecordValue.ComponentMask = KIBNAL_SERVICE_KEY_MASK; + svc = &qry->InputValue.ServiceRecordValue.ServiceRecord; + kibnal_set_service_keys(svc, kibnal_data.kib_nid); + + frc = iibt_sd_query_port_fabric_information(kibnal_data.kib_sd, + kibnal_data.kib_port_guid, + qry, + kibnal_service_query_done, + NULL, &frc2); + if (frc != FSUCCESS && frc != FPENDING) { + CERROR ("Immediate error %d checking SM service\n", frc); + } else { + down (&kibnal_data.kib_nid_signal); + frc = frc2; + + if (frc != 0) + CERROR ("Error %d checking SM service\n", rc); + } + + return (rc); +} +#endif + +static void fill_fod(FABRIC_OPERATION_DATA *fod, FABRIC_OPERATION_TYPE type) +{ + IB_SERVICE_RECORD *svc; + + memset (fod, 0, sizeof(*fod)); + fod->Type = type; + + svc = &fod->Value.ServiceRecordValue.ServiceRecord; + svc->RID.ServiceID = kibnal_data.kib_service_id; + svc->RID.ServiceGID.Type.Global.InterfaceID = kibnal_data.kib_port_guid; + svc->RID.ServiceGID.Type.Global.SubnetPrefix = DEFAULT_SUBNET_PREFIX; + svc->RID.ServiceP_Key = kibnal_data.kib_port_pkey; + svc->ServiceLease = 0xffffffff; + + kibnal_set_service_keys(svc, kibnal_data.kib_nid); +} + +static int +kibnal_advertise (void) +{ + FABRIC_OPERATION_DATA *fod; + IB_SERVICE_RECORD *svc; + FSTATUS frc; + FSTATUS frc2; + + LASSERT (kibnal_data.kib_nid != PTL_NID_ANY); + + PORTAL_ALLOC(fod, sizeof(*fod)); + if (fod == NULL) + return (-ENOMEM); + + fill_fod(fod, FabOpSetServiceRecord); + svc = &fod->Value.ServiceRecordValue.ServiceRecord; + + CDEBUG(D_NET, "Advertising service id "LPX64" %s:"LPX64"\n", + svc->RID.ServiceID, + svc->ServiceName, *kibnal_service_nid_field(svc)); + + frc = 
iibt_sd_port_fabric_operation(kibnal_data.kib_sd, + kibnal_data.kib_port_guid, + fod, kibnal_service_setunset_done, + NULL, &frc2); + + if (frc != FSUCCESS && frc != FPENDING) { + CERROR ("Immediate error %d advertising NID "LPX64"\n", + frc, kibnal_data.kib_nid); + goto out; + } + + down (&kibnal_data.kib_nid_signal); + + frc = frc2; + if (frc != FSUCCESS) + CERROR ("Error %d advertising BUD "LPX64"\n", + frc, kibnal_data.kib_nid); +out: + PORTAL_FREE(fod, sizeof(*fod)); + return (frc == FSUCCESS) ? 0 : -EINVAL; +} + +static void +kibnal_unadvertise (int expect_success) +{ + FABRIC_OPERATION_DATA *fod; + IB_SERVICE_RECORD *svc; + FSTATUS frc; + FSTATUS frc2; + + LASSERT (kibnal_data.kib_nid != PTL_NID_ANY); + + PORTAL_ALLOC(fod, sizeof(*fod)); + if (fod == NULL) + return; + + fill_fod(fod, FabOpDeleteServiceRecord); + svc = &fod->Value.ServiceRecordValue.ServiceRecord; + + CDEBUG(D_NET, "Unadvertising service %s:"LPX64"\n", + svc->ServiceName, *kibnal_service_nid_field(svc)); + + frc = iibt_sd_port_fabric_operation(kibnal_data.kib_sd, + kibnal_data.kib_port_guid, + fod, kibnal_service_setunset_done, + NULL, &frc2); + + if (frc != FSUCCESS && frc != FPENDING) { + CERROR ("Immediate error %d unadvertising NID "LPX64"\n", + frc, kibnal_data.kib_nid); + goto out; + } + + down (&kibnal_data.kib_nid_signal); + + if ((frc2 == FSUCCESS) == !!expect_success) + goto out; + + if (expect_success) + CERROR("Error %d unadvertising NID "LPX64"\n", + frc2, kibnal_data.kib_nid); + else + CWARN("Removed conflicting NID "LPX64"\n", + kibnal_data.kib_nid); + out: + PORTAL_FREE(fod, sizeof(*fod)); +} + +static int +kibnal_set_mynid(ptl_nid_t nid) +{ + struct timeval tv; + lib_ni_t *ni = &kibnal_lib.libnal_ni; + int rc; + FSTATUS frc; + + CDEBUG(D_IOCTL, "setting mynid to "LPX64" (old nid="LPX64")\n", + nid, ni->ni_pid.nid); + + do_gettimeofday(&tv); + + down (&kibnal_data.kib_nid_mutex); + + if (nid == kibnal_data.kib_nid) { + /* no change of NID */ + up (&kibnal_data.kib_nid_mutex); + return (0); + } + + CDEBUG(D_NET, "NID "LPX64"("LPX64")\n", + kibnal_data.kib_nid, nid); + + if (kibnal_data.kib_nid != PTL_NID_ANY) { + + kibnal_unadvertise (1); + + frc = iibt_cm_cancel(kibnal_data.kib_cep); + if (frc != FSUCCESS && frc != FPENDING) + CERROR ("Error %d stopping listener\n", frc); + + frc = iibt_cm_destroy_cep(kibnal_data.kib_cep); + if (frc != FSUCCESS) + CERROR ("Error %d destroying CEP\n", frc); + + kibnal_data.kib_cep = NULL; + } + + kibnal_data.kib_nid = ni->ni_pid.nid = nid; + kibnal_data.kib_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec; + + /* Delete all existing peers and their connections after new + * NID/incarnation set to ensure no old connections in our brave + * new world. 
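/*
 * Note (illustrative, not part of the patch): the incarnation stamped just
 * above is microseconds since the epoch.  kibnal_close_stale_conns_locked()
 * later closes any connection whose ibc_incarnation no longer matches
 * (presumably this value), so connections negotiated under a previous NID
 * registration are torn down rather than silently reused.  A user-space
 * equivalent of the stamp:
 */
#include <sys/time.h>

static unsigned long long current_incarnation(void)
{
        struct timeval tv;

        gettimeofday(&tv, NULL);        /* do_gettimeofday() in the kernel */
        return (unsigned long long)tv.tv_sec * 1000000ULL +
               (unsigned long long)tv.tv_usec;
}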
*/ + kibnal_del_peer (PTL_NID_ANY, 0); + + if (kibnal_data.kib_nid == PTL_NID_ANY) { + /* No new NID to install */ + up (&kibnal_data.kib_nid_mutex); + return (0); + } + + /* remove any previous advert (crashed node etc) */ + kibnal_unadvertise(0); + + kibnal_data.kib_cep = iibt_cm_create_cep(CM_RC_TYPE); + if (kibnal_data.kib_cep == NULL) { + CERROR ("Can't create CEP\n"); + rc = -ENOMEM; + } else { + CM_LISTEN_INFO info; + memset (&info, 0, sizeof(info)); + info.ListenAddr.EndPt.SID = kibnal_data.kib_service_id; + + frc = iibt_cm_listen(kibnal_data.kib_cep, &info, + kibnal_listen_callback, NULL); + if (frc != FSUCCESS && frc != FPENDING) { + CERROR ("iibt_cm_listen error: %d\n", frc); + rc = -EINVAL; + } else { + rc = 0; + } + } + + if (rc == 0) { + rc = kibnal_advertise(); + if (rc == 0) { +#if IBNAL_CHECK_ADVERT + kibnal_check_advert(); +#endif + up (&kibnal_data.kib_nid_mutex); + return (0); + } + + iibt_cm_cancel (kibnal_data.kib_cep); + iibt_cm_destroy_cep (kibnal_data.kib_cep); + /* remove any peers that sprung up while I failed to + * advertise myself */ + kibnal_del_peer (PTL_NID_ANY, 0); + } + + kibnal_data.kib_nid = PTL_NID_ANY; + up (&kibnal_data.kib_nid_mutex); + return (rc); +} + +kib_peer_t * +kibnal_create_peer (ptl_nid_t nid) +{ + kib_peer_t *peer; + + LASSERT (nid != PTL_NID_ANY); + + PORTAL_ALLOC (peer, sizeof (*peer)); + if (peer == NULL) + return (NULL); + + memset(peer, 0, sizeof(*peer)); /* zero flags etc */ + + peer->ibp_nid = nid; + atomic_set (&peer->ibp_refcount, 1); /* 1 ref for caller */ + + INIT_LIST_HEAD (&peer->ibp_list); /* not in the peer table yet */ + INIT_LIST_HEAD (&peer->ibp_conns); + INIT_LIST_HEAD (&peer->ibp_tx_queue); + + peer->ibp_reconnect_time = jiffies; + peer->ibp_reconnect_interval = IBNAL_MIN_RECONNECT_INTERVAL; + + atomic_inc (&kibnal_data.kib_npeers); + return (peer); +} + +void +kibnal_destroy_peer (kib_peer_t *peer) +{ + + LASSERT (atomic_read (&peer->ibp_refcount) == 0); + LASSERT (peer->ibp_persistence == 0); + LASSERT (!kibnal_peer_active(peer)); + LASSERT (peer->ibp_connecting == 0); + LASSERT (list_empty (&peer->ibp_conns)); + LASSERT (list_empty (&peer->ibp_tx_queue)); + + PORTAL_FREE (peer, sizeof (*peer)); + + /* NB a peer's connections keep a reference on their peer until + * they are destroyed, so we can be assured that _all_ state to do + * with this peer has been cleaned up when its refcount drops to + * zero. */ + atomic_dec (&kibnal_data.kib_npeers); +} + +/* the caller is responsible for accounting for the additional reference + * that this creates */ +kib_peer_t * +kibnal_find_peer_locked (ptl_nid_t nid) +{ + struct list_head *peer_list = kibnal_nid2peerlist (nid); + struct list_head *tmp; + kib_peer_t *peer; + + list_for_each (tmp, peer_list) { + + peer = list_entry (tmp, kib_peer_t, ibp_list); + + LASSERT (peer->ibp_persistence != 0 || /* persistent peer */ + peer->ibp_connecting != 0 || /* creating conns */ + !list_empty (&peer->ibp_conns)); /* active conn */ + + if (peer->ibp_nid != nid) + continue; + + CDEBUG(D_NET, "got peer [%p] -> "LPX64" (%d)\n", + peer, nid, atomic_read (&peer->ibp_refcount)); + return (peer); + } + return (NULL); +} + +kib_peer_t * +kibnal_get_peer (ptl_nid_t nid) +{ + kib_peer_t *peer; + + read_lock (&kibnal_data.kib_global_lock); + peer = kibnal_find_peer_locked (nid); + if (peer != NULL) /* +1 ref for caller? 
*/ + kib_peer_addref(peer); + read_unlock (&kibnal_data.kib_global_lock); + + return (peer); +} + +void +kibnal_unlink_peer_locked (kib_peer_t *peer) +{ + LASSERT (peer->ibp_persistence == 0); + LASSERT (list_empty(&peer->ibp_conns)); + + LASSERT (kibnal_peer_active(peer)); + list_del_init (&peer->ibp_list); + /* lose peerlist's ref */ + kib_peer_decref(peer); +} + +static int +kibnal_get_peer_info (int index, ptl_nid_t *nidp, int *persistencep) +{ + kib_peer_t *peer; + struct list_head *ptmp; + int i; + + read_lock (&kibnal_data.kib_global_lock); + + for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) { + + list_for_each (ptmp, &kibnal_data.kib_peers[i]) { + + peer = list_entry (ptmp, kib_peer_t, ibp_list); + LASSERT (peer->ibp_persistence != 0 || + peer->ibp_connecting != 0 || + !list_empty (&peer->ibp_conns)); + + if (index-- > 0) + continue; + + *nidp = peer->ibp_nid; + *persistencep = peer->ibp_persistence; + + read_unlock (&kibnal_data.kib_global_lock); + return (0); + } + } + + read_unlock (&kibnal_data.kib_global_lock); + return (-ENOENT); +} + +static int +kibnal_add_persistent_peer (ptl_nid_t nid) +{ + unsigned long flags; + kib_peer_t *peer; + kib_peer_t *peer2; + + if (nid == PTL_NID_ANY) + return (-EINVAL); + + peer = kibnal_create_peer (nid); + if (peer == NULL) + return (-ENOMEM); + + write_lock_irqsave (&kibnal_data.kib_global_lock, flags); + + peer2 = kibnal_find_peer_locked (nid); + if (peer2 != NULL) { + kib_peer_decref (peer); + peer = peer2; + } else { + /* peer table takes existing ref on peer */ + list_add_tail (&peer->ibp_list, + kibnal_nid2peerlist (nid)); + } + + peer->ibp_persistence++; + + write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); + return (0); +} + +static void +kibnal_del_peer_locked (kib_peer_t *peer, int single_share) +{ + struct list_head *ctmp; + struct list_head *cnxt; + kib_conn_t *conn; + + if (!single_share) + peer->ibp_persistence = 0; + else if (peer->ibp_persistence > 0) + peer->ibp_persistence--; + + if (peer->ibp_persistence != 0) + return; + + list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) { + conn = list_entry(ctmp, kib_conn_t, ibc_list); + + kibnal_close_conn_locked (conn, 0); + } + + /* NB peer unlinks itself when last conn is closed */ +} + +int +kibnal_del_peer (ptl_nid_t nid, int single_share) +{ + unsigned long flags; + struct list_head *ptmp; + struct list_head *pnxt; + kib_peer_t *peer; + int lo; + int hi; + int i; + int rc = -ENOENT; + + write_lock_irqsave (&kibnal_data.kib_global_lock, flags); + + if (nid != PTL_NID_ANY) + lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers; + else { + lo = 0; + hi = kibnal_data.kib_peer_hash_size - 1; + } + + for (i = lo; i <= hi; i++) { + list_for_each_safe (ptmp, pnxt, &kibnal_data.kib_peers[i]) { + peer = list_entry (ptmp, kib_peer_t, ibp_list); + LASSERT (peer->ibp_persistence != 0 || + peer->ibp_connecting != 0 || + !list_empty (&peer->ibp_conns)); + + if (!(nid == PTL_NID_ANY || peer->ibp_nid == nid)) + continue; + + kibnal_del_peer_locked (peer, single_share); + rc = 0; /* matched something */ + + if (single_share) + goto out; + } + } + out: + write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); + + return (rc); +} + +static kib_conn_t * +kibnal_get_conn_by_idx (int index) +{ + kib_peer_t *peer; + struct list_head *ptmp; + kib_conn_t *conn; + struct list_head *ctmp; + int i; + + read_lock (&kibnal_data.kib_global_lock); + + for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) { + list_for_each (ptmp, &kibnal_data.kib_peers[i]) { + + peer = list_entry 
(ptmp, kib_peer_t, ibp_list); + LASSERT (peer->ibp_persistence > 0 || + peer->ibp_connecting != 0 || + !list_empty (&peer->ibp_conns)); + + list_for_each (ctmp, &peer->ibp_conns) { + if (index-- > 0) + continue; + + conn = list_entry (ctmp, kib_conn_t, ibc_list); + CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", + conn, conn->ibc_state, conn->ibc_peer->ibp_nid, + atomic_read (&conn->ibc_refcount)); + atomic_inc (&conn->ibc_refcount); + read_unlock (&kibnal_data.kib_global_lock); + return (conn); + } + } + } + + read_unlock (&kibnal_data.kib_global_lock); + return (NULL); +} + +kib_conn_t * +kibnal_create_conn (void) +{ + kib_conn_t *conn; + int i; + __u64 vaddr = 0; + __u64 vaddr_base; + int page_offset; + int ipage; + int rc; + FSTATUS frc; + union { + IB_QP_ATTRIBUTES_CREATE qp_create; + IB_QP_ATTRIBUTES_MODIFY qp_attr; + } params; + + PORTAL_ALLOC (conn, sizeof (*conn)); + if (conn == NULL) { + CERROR ("Can't allocate connection\n"); + return (NULL); + } + + /* zero flags, NULL pointers etc... */ + memset (conn, 0, sizeof (*conn)); + + INIT_LIST_HEAD (&conn->ibc_tx_queue); + INIT_LIST_HEAD (&conn->ibc_active_txs); + spin_lock_init (&conn->ibc_lock); + + atomic_inc (&kibnal_data.kib_nconns); + /* well not really, but I call destroy() on failure, which decrements */ + + PORTAL_ALLOC (conn->ibc_rxs, IBNAL_RX_MSGS * sizeof (kib_rx_t)); + if (conn->ibc_rxs == NULL) + goto failed; + memset (conn->ibc_rxs, 0, IBNAL_RX_MSGS * sizeof(kib_rx_t)); + + rc = kibnal_alloc_pages(&conn->ibc_rx_pages, IBNAL_RX_MSG_PAGES, 1); + if (rc != 0) + goto failed; + + vaddr_base = vaddr = conn->ibc_rx_pages->ibp_vaddr; + + for (i = ipage = page_offset = 0; i < IBNAL_RX_MSGS; i++) { + struct page *page = conn->ibc_rx_pages->ibp_pages[ipage]; + kib_rx_t *rx = &conn->ibc_rxs[i]; + + rx->rx_conn = conn; + rx->rx_msg = (kib_msg_t *)(((char *)page_address(page)) + + page_offset); + + if (kibnal_whole_mem()) + rx->rx_vaddr = kibnal_page2phys(page) + + page_offset + + kibnal_data.kib_md.md_addr; + else + rx->rx_vaddr = vaddr; + + vaddr += IBNAL_MSG_SIZE; + LASSERT (vaddr <= vaddr_base + IBNAL_RX_MSG_BYTES); + + page_offset += IBNAL_MSG_SIZE; + LASSERT (page_offset <= PAGE_SIZE); + + if (page_offset == PAGE_SIZE) { + page_offset = 0; + ipage++; + LASSERT (ipage <= IBNAL_RX_MSG_PAGES); + } + } + + params.qp_create = (IB_QP_ATTRIBUTES_CREATE) { + .Type = QPTypeReliableConnected, + .SendQDepth = IBNAL_TX_MAX_SG * + IBNAL_MSG_QUEUE_SIZE, + .RecvQDepth = IBNAL_MSG_QUEUE_SIZE, + .SendDSListDepth = 1, + .RecvDSListDepth = 1, + .SendCQHandle = kibnal_data.kib_cq, + .RecvCQHandle = kibnal_data.kib_cq, + .PDHandle = kibnal_data.kib_pd, + .SendSignaledCompletions = TRUE, + }; + frc = iibt_qp_create(kibnal_data.kib_hca, ¶ms.qp_create, NULL, + &conn->ibc_qp, &conn->ibc_qp_attrs); + if (rc != 0) { + CERROR ("Failed to create queue pair: %d\n", rc); + goto failed; + } + + /* Mark QP created */ + conn->ibc_state = IBNAL_CONN_INIT_QP; + + params.qp_attr = (IB_QP_ATTRIBUTES_MODIFY) { + .RequestState = QPStateInit, + .Attrs = (IB_QP_ATTR_PORTGUID | + IB_QP_ATTR_PKEYINDEX | + IB_QP_ATTR_ACCESSCONTROL), + .PortGUID = kibnal_data.kib_port_guid, + .PkeyIndex = 0, + .AccessControl = { + .s = { + .RdmaWrite = 1, + .RdmaRead = 1, + }, + }, + }; + rc = iibt_qp_modify(conn->ibc_qp, ¶ms.qp_attr, NULL); + if (rc != 0) { + CERROR ("Failed to modify queue pair: %d\n", rc); + goto failed; + } + + /* 1 ref for caller */ + atomic_set (&conn->ibc_refcount, 1); + return (conn); + + failed: + kibnal_destroy_conn (conn); + return (NULL); +} + +void 
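/*
 * Note (illustrative, not part of the patch): the peer functions above keep
 * peers in kibnal_data.kib_peers[], an array of IBNAL_PEER_HASH_SIZE (101)
 * list heads, with kibnal_nid2peerlist() (not shown in this hunk) choosing
 * the bucket from the NID.  A minimal version of such a mapping; the real
 * hash may differ:
 */
static inline struct list_head *
nid2peerlist_sketch(unsigned long long nid)
{
        /* reduce the NID to one of the 101 hash buckets */
        return &kibnal_data.kib_peers[(unsigned int)(nid % IBNAL_PEER_HASH_SIZE)];
}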
+kibnal_destroy_conn (kib_conn_t *conn) +{ + int rc; + FSTATUS frc; + + CDEBUG (D_NET, "connection %p\n", conn); + + LASSERT (atomic_read (&conn->ibc_refcount) == 0); + LASSERT (list_empty(&conn->ibc_tx_queue)); + LASSERT (list_empty(&conn->ibc_active_txs)); + LASSERT (conn->ibc_nsends_posted == 0); + LASSERT (conn->ibc_connreq == NULL); + + switch (conn->ibc_state) { + case IBNAL_CONN_DISCONNECTED: + /* called after connection sequence initiated */ + /* fall through */ + + case IBNAL_CONN_INIT_QP: + /* _destroy includes an implicit Reset of the QP which + * discards posted work */ + rc = iibt_qp_destroy(conn->ibc_qp); + if (rc != 0) + CERROR("Can't destroy QP: %d\n", rc); + /* fall through */ + + case IBNAL_CONN_INIT_NOTHING: + break; + + default: + LASSERT (0); + } + + if (conn->ibc_cep != NULL) { + frc = iibt_cm_destroy_cep(conn->ibc_cep); + if (frc != 0) + CERROR("Can't destroy CEP %p: %d\n", conn->ibc_cep, + frc); + } + + if (conn->ibc_rx_pages != NULL) + kibnal_free_pages(conn->ibc_rx_pages); + + if (conn->ibc_rxs != NULL) + PORTAL_FREE(conn->ibc_rxs, + IBNAL_RX_MSGS * sizeof(kib_rx_t)); + + if (conn->ibc_peer != NULL) + kib_peer_decref(conn->ibc_peer); + + PORTAL_FREE(conn, sizeof (*conn)); + + atomic_dec(&kibnal_data.kib_nconns); + + if (atomic_read (&kibnal_data.kib_nconns) == 0 && + kibnal_data.kib_shutdown) { + /* I just nuked the last connection on shutdown; wake up + * everyone so they can exit. */ + wake_up_all(&kibnal_data.kib_sched_waitq); + wake_up_all(&kibnal_data.kib_connd_waitq); + } +} + +void +kibnal_put_conn (kib_conn_t *conn) +{ + unsigned long flags; + + CDEBUG (D_NET, "putting conn[%p] state %d -> "LPX64" (%d)\n", + conn, conn->ibc_state, conn->ibc_peer->ibp_nid, + atomic_read (&conn->ibc_refcount)); + + LASSERT (atomic_read (&conn->ibc_refcount) > 0); + if (!atomic_dec_and_test (&conn->ibc_refcount)) + return; + + /* must disconnect before dropping the final ref */ + LASSERT (conn->ibc_state == IBNAL_CONN_DISCONNECTED); + + spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags); + + list_add (&conn->ibc_list, &kibnal_data.kib_connd_conns); + wake_up (&kibnal_data.kib_connd_waitq); + + spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags); +} + +static int +kibnal_close_peer_conns_locked (kib_peer_t *peer, int why) +{ + kib_conn_t *conn; + struct list_head *ctmp; + struct list_head *cnxt; + int count = 0; + + list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) { + conn = list_entry (ctmp, kib_conn_t, ibc_list); + + count++; + kibnal_close_conn_locked (conn, why); + } + + return (count); +} + +int +kibnal_close_stale_conns_locked (kib_peer_t *peer, __u64 incarnation) +{ + kib_conn_t *conn; + struct list_head *ctmp; + struct list_head *cnxt; + int count = 0; + + list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) { + conn = list_entry (ctmp, kib_conn_t, ibc_list); + + if (conn->ibc_incarnation == incarnation) + continue; + + CDEBUG(D_NET, "Closing stale conn nid:"LPX64" incarnation:"LPX64"("LPX64")\n", + peer->ibp_nid, conn->ibc_incarnation, incarnation); + + count++; + kibnal_close_conn_locked (conn, -ESTALE); + } + + return (count); +} + +static int +kibnal_close_matching_conns (ptl_nid_t nid) +{ + unsigned long flags; + kib_peer_t *peer; + struct list_head *ptmp; + struct list_head *pnxt; + int lo; + int hi; + int i; + int count = 0; + + write_lock_irqsave (&kibnal_data.kib_global_lock, flags); + + if (nid != PTL_NID_ANY) + lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers; + else { + lo = 0; + hi = kibnal_data.kib_peer_hash_size - 1; + } + + for (i = 
lo; i <= hi; i++) { + list_for_each_safe (ptmp, pnxt, &kibnal_data.kib_peers[i]) { + + peer = list_entry (ptmp, kib_peer_t, ibp_list); + LASSERT (peer->ibp_persistence != 0 || + peer->ibp_connecting != 0 || + !list_empty (&peer->ibp_conns)); + + if (!(nid == PTL_NID_ANY || nid == peer->ibp_nid)) + continue; + + count += kibnal_close_peer_conns_locked (peer, 0); + } + } + + write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); + + /* wildcards always succeed */ + if (nid == PTL_NID_ANY) + return (0); + + return (count == 0 ? -ENOENT : 0); +} + +static int +kibnal_cmd(struct portals_cfg *pcfg, void * private) +{ + int rc = -EINVAL; + ENTRY; + + LASSERT (pcfg != NULL); + + switch(pcfg->pcfg_command) { + case NAL_CMD_GET_PEER: { + ptl_nid_t nid = 0; + int share_count = 0; + + rc = kibnal_get_peer_info(pcfg->pcfg_count, + &nid, &share_count); + pcfg->pcfg_nid = nid; + pcfg->pcfg_size = 0; + pcfg->pcfg_id = 0; + pcfg->pcfg_misc = 0; + pcfg->pcfg_count = 0; + pcfg->pcfg_wait = share_count; + break; + } + case NAL_CMD_ADD_PEER: { + rc = kibnal_add_persistent_peer (pcfg->pcfg_nid); + break; + } + case NAL_CMD_DEL_PEER: { + rc = kibnal_del_peer (pcfg->pcfg_nid, + /* flags == single_share */ + pcfg->pcfg_flags != 0); + break; + } + case NAL_CMD_GET_CONN: { + kib_conn_t *conn = kibnal_get_conn_by_idx (pcfg->pcfg_count); + + if (conn == NULL) + rc = -ENOENT; + else { + rc = 0; + pcfg->pcfg_nid = conn->ibc_peer->ibp_nid; + pcfg->pcfg_id = 0; + pcfg->pcfg_misc = 0; + pcfg->pcfg_flags = 0; + kibnal_put_conn (conn); + } + break; + } + case NAL_CMD_CLOSE_CONNECTION: { + rc = kibnal_close_matching_conns (pcfg->pcfg_nid); + break; + } + case NAL_CMD_REGISTER_MYNID: { + if (pcfg->pcfg_nid == PTL_NID_ANY) + rc = -EINVAL; + else + rc = kibnal_set_mynid (pcfg->pcfg_nid); + break; + } + } + + RETURN(rc); +} + +void +kibnal_free_pages (kib_pages_t *p) +{ + int npages = p->ibp_npages; + int rc; + int i; + + if (p->ibp_mapped) { + rc = iibt_deregister_memory(p->ibp_handle); + if (rc != 0) + CERROR ("Deregister error: %d\n", rc); + } + + for (i = 0; i < npages; i++) + if (p->ibp_pages[i] != NULL) + __free_page(p->ibp_pages[i]); + + PORTAL_FREE (p, offsetof(kib_pages_t, ibp_pages[npages])); +} + +int +kibnal_alloc_pages (kib_pages_t **pp, int npages, int allow_write) +{ + kib_pages_t *p; + __u64 *phys_pages; + int i; + FSTATUS frc; + IB_ACCESS_CONTROL access; + + memset(&access, 0, sizeof(access)); + access.s.MWBindable = 1; + access.s.LocalWrite = 1; + access.s.RdmaRead = 1; + access.s.RdmaWrite = 1; + + PORTAL_ALLOC(p, offsetof(kib_pages_t, ibp_pages[npages])); + if (p == NULL) { + CERROR ("Can't allocate buffer %d\n", npages); + return (-ENOMEM); + } + + memset (p, 0, offsetof(kib_pages_t, ibp_pages[npages])); + p->ibp_npages = npages; + + for (i = 0; i < npages; i++) { + p->ibp_pages[i] = alloc_page (GFP_KERNEL); + if (p->ibp_pages[i] == NULL) { + CERROR ("Can't allocate page %d of %d\n", i, npages); + kibnal_free_pages(p); + return (-ENOMEM); + } + } + + if (kibnal_whole_mem()) + goto out; + + PORTAL_ALLOC(phys_pages, npages * sizeof(*phys_pages)); + if (phys_pages == NULL) { + CERROR ("Can't allocate physarray for %d pages\n", npages); + /* XXX free ibp_pages? 
*/ + kibnal_free_pages(p); + return (-ENOMEM); + } + + /* if we were using the _contig_ registration variant we would have + * an array of PhysAddr/Length pairs, but the discontiguous variant + * just takes the PhysAddr */ + for (i = 0; i < npages; i++) + phys_pages[i] = kibnal_page2phys(p->ibp_pages[i]); + + frc = iibt_register_physical_memory(kibnal_data.kib_hca, + 0, /* requested vaddr */ + phys_pages, npages, + 0, /* offset */ + kibnal_data.kib_pd, + access, + &p->ibp_handle, &p->ibp_vaddr, + &p->ibp_lkey, &p->ibp_rkey); + + PORTAL_FREE(phys_pages, npages * sizeof(*phys_pages)); + + if (frc != FSUCCESS) { + CERROR ("Error %d mapping %d pages\n", frc, npages); + kibnal_free_pages(p); + return (-ENOMEM); + } + + CDEBUG(D_NET, "registered %d pages; handle: %p vaddr "LPX64" " + "lkey %x rkey %x\n", npages, p->ibp_handle, + p->ibp_vaddr, p->ibp_lkey, p->ibp_rkey); + + p->ibp_mapped = 1; +out: + *pp = p; + return (0); +} + +static int +kibnal_setup_tx_descs (void) +{ + int ipage = 0; + int page_offset = 0; + __u64 vaddr; + __u64 vaddr_base; + struct page *page; + kib_tx_t *tx; + int i; + int rc; + + /* pre-mapped messages are not bigger than 1 page */ + LASSERT (IBNAL_MSG_SIZE <= PAGE_SIZE); + + /* No fancy arithmetic when we do the buffer calculations */ + LASSERT (PAGE_SIZE % IBNAL_MSG_SIZE == 0); + + rc = kibnal_alloc_pages(&kibnal_data.kib_tx_pages, IBNAL_TX_MSG_PAGES, + 0); + if (rc != 0) + return (rc); + + /* ignored for the whole_mem case */ + vaddr = vaddr_base = kibnal_data.kib_tx_pages->ibp_vaddr; + + for (i = 0; i < IBNAL_TX_MSGS; i++) { + page = kibnal_data.kib_tx_pages->ibp_pages[ipage]; + tx = &kibnal_data.kib_tx_descs[i]; + + memset (tx, 0, sizeof(*tx)); /* zero flags etc */ + + tx->tx_msg = (kib_msg_t *)(((char *)page_address(page)) + + page_offset); + + if (kibnal_whole_mem()) + tx->tx_vaddr = kibnal_page2phys(page) + + page_offset + + kibnal_data.kib_md.md_addr; + else + tx->tx_vaddr = vaddr; + + tx->tx_isnblk = (i >= IBNAL_NTX); + tx->tx_mapped = KIB_TX_UNMAPPED; + + CDEBUG(D_NET, "Tx[%d] %p->%p - "LPX64"\n", + i, tx, tx->tx_msg, tx->tx_vaddr); + + if (tx->tx_isnblk) + list_add (&tx->tx_list, + &kibnal_data.kib_idle_nblk_txs); + else + list_add (&tx->tx_list, + &kibnal_data.kib_idle_txs); + + vaddr += IBNAL_MSG_SIZE; + LASSERT (vaddr <= vaddr_base + IBNAL_TX_MSG_BYTES); + + page_offset += IBNAL_MSG_SIZE; + LASSERT (page_offset <= PAGE_SIZE); + + if (page_offset == PAGE_SIZE) { + page_offset = 0; + ipage++; + LASSERT (ipage <= IBNAL_TX_MSG_PAGES); + } + } + + return (0); +} + +static void +kibnal_api_shutdown (nal_t *nal) +{ + int i; + int rc; + + if (nal->nal_refct != 0) { + /* This module got the first ref */ + PORTAL_MODULE_UNUSE; + return; + } + + CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n", + atomic_read (&portal_kmemory)); + + LASSERT(nal == &kibnal_api); + + switch (kibnal_data.kib_init) { + default: + CERROR ("Unexpected state %d\n", kibnal_data.kib_init); + LBUG(); + + case IBNAL_INIT_ALL: + /* stop calls to nal_cmd */ + libcfs_nal_cmd_unregister(IIBNAL); + /* No new peers */ + + /* resetting my NID to unadvertises me, removes my + * listener and nukes all current peers */ + kibnal_set_mynid (PTL_NID_ANY); + + /* Wait for all peer state to clean up (crazy) */ + i = 2; + while (atomic_read (&kibnal_data.kib_npeers) != 0) { + i++; + CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? 
*/ + "waiting for %d peers to disconnect (can take a few seconds)\n", + atomic_read (&kibnal_data.kib_npeers)); + set_current_state (TASK_UNINTERRUPTIBLE); + schedule_timeout (HZ); + } + /* fall through */ + + case IBNAL_INIT_CQ: + rc = iibt_cq_destroy(kibnal_data.kib_cq); + if (rc != 0) + CERROR ("Destroy CQ error: %d\n", rc); + /* fall through */ + + case IBNAL_INIT_TXD: + kibnal_free_pages (kibnal_data.kib_tx_pages); + /* fall through */ + + case IBNAL_INIT_MR: + if (kibnal_data.kib_md.md_handle != NULL) { + rc = iibt_deregister_memory(kibnal_data.kib_md.md_handle); + if (rc != FSUCCESS) + CERROR ("Deregister memory: %d\n", rc); + } + /* fall through */ + +#if IBNAL_FMR + case IBNAL_INIT_FMR: + rc = ib_fmr_pool_destroy (kibnal_data.kib_fmr_pool); + if (rc != 0) + CERROR ("Destroy FMR pool error: %d\n", rc); + /* fall through */ +#endif + case IBNAL_INIT_PD: + rc = iibt_pd_free(kibnal_data.kib_pd); + if (rc != 0) + CERROR ("Destroy PD error: %d\n", rc); + /* fall through */ + + case IBNAL_INIT_SD: + rc = iibt_sd_deregister(kibnal_data.kib_sd); + if (rc != 0) + CERROR ("Deregister SD error: %d\n", rc); + /* fall through */ + + case IBNAL_INIT_PORT: + /* XXX ??? */ + /* fall through */ + + case IBNAL_INIT_PORTATTRS: + PORTAL_FREE(kibnal_data.kib_hca_attrs.PortAttributesList, + kibnal_data.kib_hca_attrs.PortAttributesListSize); + /* fall through */ + + case IBNAL_INIT_HCA: + rc = iibt_close_hca(kibnal_data.kib_hca); + if (rc != 0) + CERROR ("Close HCA error: %d\n", rc); + /* fall through */ + + case IBNAL_INIT_LIB: + lib_fini(&kibnal_lib); + /* fall through */ + + case IBNAL_INIT_DATA: + /* Module refcount only gets to zero when all peers + * have been closed so all lists must be empty */ + LASSERT (atomic_read (&kibnal_data.kib_npeers) == 0); + LASSERT (kibnal_data.kib_peers != NULL); + for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) { + LASSERT (list_empty (&kibnal_data.kib_peers[i])); + } + LASSERT (atomic_read (&kibnal_data.kib_nconns) == 0); + LASSERT (list_empty (&kibnal_data.kib_sched_rxq)); + LASSERT (list_empty (&kibnal_data.kib_sched_txq)); + LASSERT (list_empty (&kibnal_data.kib_connd_conns)); + LASSERT (list_empty (&kibnal_data.kib_connd_peers)); + + /* flag threads to terminate; wake and wait for them to die */ + kibnal_data.kib_shutdown = 1; + wake_up_all (&kibnal_data.kib_sched_waitq); + wake_up_all (&kibnal_data.kib_connd_waitq); + + i = 2; + while (atomic_read (&kibnal_data.kib_nthreads) != 0) { + i++; + CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? 
*/ + "Waiting for %d threads to terminate\n", + atomic_read (&kibnal_data.kib_nthreads)); + set_current_state (TASK_INTERRUPTIBLE); + schedule_timeout (HZ); + } + /* fall through */ + + case IBNAL_INIT_NOTHING: + break; + } + + if (kibnal_data.kib_tx_descs != NULL) + PORTAL_FREE (kibnal_data.kib_tx_descs, + IBNAL_TX_MSGS * sizeof(kib_tx_t)); + + if (kibnal_data.kib_peers != NULL) + PORTAL_FREE (kibnal_data.kib_peers, + sizeof (struct list_head) * + kibnal_data.kib_peer_hash_size); + + CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n", + atomic_read (&portal_kmemory)); + printk(KERN_INFO "Lustre: Infinicon IB NAL unloaded (final mem %d)\n", + atomic_read(&portal_kmemory)); + + kibnal_data.kib_init = IBNAL_INIT_NOTHING; +} + +#define roundup_power(val, power) \ + ( (val + (__u64)(power - 1)) & ~((__u64)(power - 1)) ) + +/* this isn't very portable or sturdy in the face of funny mem/bus configs */ +static __u64 max_phys_mem(IB_CA_ATTRIBUTES *ca_attr) +{ + struct sysinfo si; + __u64 ret; + + /* XXX we don't bother with first-gen cards */ + if (ca_attr->VendorId == 0xd0b7 && ca_attr->DeviceId == 0x3101) + return 0ULL; + + si_meminfo(&si); + ret = (__u64)max(si.totalram, max_mapnr) * si.mem_unit; + return roundup_power(ret, 128 * 1024 * 1024); +} +#undef roundup_power + +static int +kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid, + ptl_ni_limits_t *requested_limits, + ptl_ni_limits_t *actual_limits) +{ + ptl_process_id_t process_id; + int pkmem = atomic_read(&portal_kmemory); + IB_PORT_ATTRIBUTES *pattr; + FSTATUS frc; + int rc; + int n; + int i; + + LASSERT (nal == &kibnal_api); + + if (nal->nal_refct != 0) { + if (actual_limits != NULL) + *actual_limits = kibnal_lib.libnal_ni.ni_actual_limits; + /* This module got the first ref */ + PORTAL_MODULE_USE; + return (PTL_OK); + } + + LASSERT (kibnal_data.kib_init == IBNAL_INIT_NOTHING); + + frc = IbtGetInterfaceByVersion(IBT_INTERFACE_VERSION_2, + &kibnal_data.kib_interfaces); + if (frc != FSUCCESS) { + CERROR("IbtGetInterfaceByVersion(IBT_INTERFACE_VERSION_2) = %d\n", + frc); + return -ENOSYS; + } + + init_MUTEX (&kibnal_data.kib_nid_mutex); + init_MUTEX_LOCKED (&kibnal_data.kib_nid_signal); + kibnal_data.kib_nid = PTL_NID_ANY; + + rwlock_init(&kibnal_data.kib_global_lock); + + kibnal_data.kib_peer_hash_size = IBNAL_PEER_HASH_SIZE; + PORTAL_ALLOC (kibnal_data.kib_peers, + sizeof (struct list_head) * kibnal_data.kib_peer_hash_size); + if (kibnal_data.kib_peers == NULL) { + goto failed; + } + for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) + INIT_LIST_HEAD(&kibnal_data.kib_peers[i]); + + spin_lock_init (&kibnal_data.kib_connd_lock); + INIT_LIST_HEAD (&kibnal_data.kib_connd_peers); + INIT_LIST_HEAD (&kibnal_data.kib_connd_conns); + init_waitqueue_head (&kibnal_data.kib_connd_waitq); + + spin_lock_init (&kibnal_data.kib_sched_lock); + INIT_LIST_HEAD (&kibnal_data.kib_sched_txq); + INIT_LIST_HEAD (&kibnal_data.kib_sched_rxq); + init_waitqueue_head (&kibnal_data.kib_sched_waitq); + + spin_lock_init (&kibnal_data.kib_tx_lock); + INIT_LIST_HEAD (&kibnal_data.kib_idle_txs); + INIT_LIST_HEAD (&kibnal_data.kib_idle_nblk_txs); + init_waitqueue_head(&kibnal_data.kib_idle_tx_waitq); + + PORTAL_ALLOC (kibnal_data.kib_tx_descs, + IBNAL_TX_MSGS * sizeof(kib_tx_t)); + if (kibnal_data.kib_tx_descs == NULL) { + CERROR ("Can't allocate tx descs\n"); + goto failed; + } + + /* lists/ptrs/locks initialised */ + kibnal_data.kib_init = IBNAL_INIT_DATA; + /*****************************************************/ + + process_id.pid = 0; + process_id.nid = 
kibnal_data.kib_nid; + + rc = lib_init(&kibnal_lib, nal, process_id, + requested_limits, actual_limits); + if (rc != PTL_OK) { + CERROR("lib_init failed: error %d\n", rc); + goto failed; + } + + /* lib interface initialised */ + kibnal_data.kib_init = IBNAL_INIT_LIB; + /*****************************************************/ + + for (i = 0; i < IBNAL_N_SCHED; i++) { + rc = kibnal_thread_start (kibnal_scheduler, (void *)i); + if (rc != 0) { + CERROR("Can't spawn iibnal scheduler[%d]: %d\n", + i, rc); + goto failed; + } + } + + rc = kibnal_thread_start (kibnal_connd, NULL); + if (rc != 0) { + CERROR ("Can't spawn iibnal connd: %d\n", rc); + goto failed; + } + + n = sizeof(kibnal_data.kib_hca_guids) / + sizeof(kibnal_data.kib_hca_guids[0]); + frc = iibt_get_hca_guids(&n, kibnal_data.kib_hca_guids); + if (frc != FSUCCESS) { + CERROR ("Can't get channel adapter guids: %d\n", frc); + goto failed; + } + if (n == 0) { + CERROR ("No channel adapters found\n"); + goto failed; + } + + /* Infinicon has per-HCA rather than per CQ completion handlers */ + frc = iibt_open_hca(kibnal_data.kib_hca_guids[0], + kibnal_ca_callback, + kibnal_ca_async_callback, + &kibnal_data.kib_hca, + &kibnal_data.kib_hca); + if (frc != FSUCCESS) { + CERROR ("Can't open CA[0]: %d\n", frc); + goto failed; + } + + /* Channel Adapter opened */ + kibnal_data.kib_init = IBNAL_INIT_HCA; + /*****************************************************/ + + kibnal_data.kib_hca_attrs.PortAttributesList = NULL; + kibnal_data.kib_hca_attrs.PortAttributesListSize = 0; + frc = iibt_query_hca(kibnal_data.kib_hca, + &kibnal_data.kib_hca_attrs, NULL); + if (frc != FSUCCESS) { + CERROR ("Can't size port attrs: %d\n", frc); + goto failed; + } + + PORTAL_ALLOC(kibnal_data.kib_hca_attrs.PortAttributesList, + kibnal_data.kib_hca_attrs.PortAttributesListSize); + if (kibnal_data.kib_hca_attrs.PortAttributesList == NULL) + goto failed; + + /* Port attrs allocated */ + kibnal_data.kib_init = IBNAL_INIT_PORTATTRS; + /*****************************************************/ + + frc = iibt_query_hca(kibnal_data.kib_hca, &kibnal_data.kib_hca_attrs, + NULL); + if (frc != FSUCCESS) { + CERROR ("Can't get port attrs for CA 0: %d\n", frc); + goto failed; + } + + for (i = 0, pattr = kibnal_data.kib_hca_attrs.PortAttributesList; + pattr != NULL; + i++, pattr = pattr->Next) { + switch (pattr->PortState) { + default: + CERROR("Unexpected port[%d] state %d\n", + i, pattr->PortState); + continue; + case PortStateDown: + CDEBUG(D_NET, "port[%d] Down\n", i); + continue; + case PortStateInit: + CDEBUG(D_NET, "port[%d] Init\n", i); + continue; + case PortStateArmed: + CDEBUG(D_NET, "port[%d] Armed\n", i); + continue; + + case PortStateActive: + CDEBUG(D_NET, "port[%d] Active\n", i); + kibnal_data.kib_port = i; + kibnal_data.kib_port_guid = pattr->GUID; + kibnal_data.kib_port_pkey = pattr->PkeyTable[0]; + break; + } + break; + } + + if (pattr == NULL) { + CERROR ("Can't find an active port\n"); + goto failed; + } + + CDEBUG(D_NET, "got guid "LPX64"\n", kibnal_data.kib_port_guid); + + /* Active port found */ + kibnal_data.kib_init = IBNAL_INIT_PORT; + /*****************************************************/ + + frc = iibt_sd_register(&kibnal_data.kib_sd, NULL); + if (frc != FSUCCESS) { + CERROR ("Can't register with SD: %d\n", frc); + goto failed; + } + + /* Registered with SD OK */ + kibnal_data.kib_init = IBNAL_INIT_SD; + /*****************************************************/ + + frc = iibt_pd_allocate(kibnal_data.kib_hca, 0, &kibnal_data.kib_pd); + if (frc != FSUCCESS) { + 
CERROR ("Can't create PD: %d\n", rc); + goto failed; + } + + /* flag PD initialised */ + kibnal_data.kib_init = IBNAL_INIT_PD; + /*****************************************************/ + +#if IBNAL_FMR + { + const int pool_size = IBNAL_NTX + IBNAL_NTX_NBLK; + struct ib_fmr_pool_param params = { + .max_pages_per_fmr = PTL_MTU/PAGE_SIZE, + .access = (IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_WRITE | + IB_ACCESS_REMOTE_READ), + .pool_size = pool_size, + .dirty_watermark = (pool_size * 3)/4, + .flush_function = NULL, + .flush_arg = NULL, + .cache = 1, + }; + rc = ib_fmr_pool_create(kibnal_data.kib_pd, ¶ms, + &kibnal_data.kib_fmr_pool); + if (rc != 0) { + CERROR ("Can't create FMR pool size %d: %d\n", + pool_size, rc); + goto failed; + } + } + + /* flag FMR pool initialised */ + kibnal_data.kib_init = IBNAL_INIT_FMR; +#endif + /*****************************************************/ + if (IBNAL_WHOLE_MEM) { + IB_MR_PHYS_BUFFER phys; + IB_ACCESS_CONTROL access; + kib_md_t *md = &kibnal_data.kib_md; + + memset(&access, 0, sizeof(access)); + access.s.MWBindable = 1; + access.s.LocalWrite = 1; + access.s.RdmaRead = 1; + access.s.RdmaWrite = 1; + + phys.PhysAddr = 0; + phys.Length = max_phys_mem(&kibnal_data.kib_hca_attrs); + if (phys.Length == 0) { + CERROR ("couldn't determine the end of phys mem\n"); + goto failed; + } + + rc = iibt_register_contig_physical_memory(kibnal_data.kib_hca, + 0, + &phys, 1, + 0, + kibnal_data.kib_pd, + access, + &md->md_handle, + &md->md_addr, + &md->md_lkey, + &md->md_rkey); + if (rc != FSUCCESS) { + CERROR("registering physical memory failed: %d\n", + rc); + CERROR("falling back to registration per-rdma\n"); + md->md_handle = NULL; + } else { + CDEBUG(D_NET, "registered "LPU64" bytes of mem\n", + phys.Length); + kibnal_data.kib_init = IBNAL_INIT_MR; + } + } + + /*****************************************************/ + + rc = kibnal_setup_tx_descs(); + if (rc != 0) { + CERROR ("Can't register tx descs: %d\n", rc); + goto failed; + } + + /* flag TX descs initialised */ + kibnal_data.kib_init = IBNAL_INIT_TXD; + /*****************************************************/ + + { + uint32 nentries; + + frc = iibt_cq_create(kibnal_data.kib_hca, IBNAL_CQ_ENTRIES, + &kibnal_data.kib_cq, &kibnal_data.kib_cq, + &nentries); + if (frc != FSUCCESS) { + CERROR ("Can't create RX CQ: %d\n", frc); + goto failed; + } + + /* flag CQ initialised */ + kibnal_data.kib_init = IBNAL_INIT_CQ; + + if (nentries < IBNAL_CQ_ENTRIES) { + CERROR ("CQ only has %d entries, need %d\n", + nentries, IBNAL_CQ_ENTRIES); + goto failed; + } + + rc = iibt_cq_rearm(kibnal_data.kib_cq, CQEventSelNextWC); + if (rc != 0) { + CERROR ("Failed to re-arm completion queue: %d\n", rc); + goto failed; + } + } + + /*****************************************************/ + + rc = libcfs_nal_cmd_register(IIBNAL, &kibnal_cmd, NULL); + if (rc != 0) { + CERROR ("Can't initialise command interface (rc = %d)\n", rc); + goto failed; + } + + /* flag everything initialised */ + kibnal_data.kib_init = IBNAL_INIT_ALL; + /*****************************************************/ + + printk(KERN_INFO "Lustre: Infinicon IB NAL loaded " + "(initial mem %d)\n", pkmem); + + return (PTL_OK); + + failed: + kibnal_api_shutdown (&kibnal_api); + return (PTL_FAIL); +} + +void __exit +kibnal_module_fini (void) +{ +#ifdef CONFIG_SYSCTL + if (kibnal_tunables.kib_sysctl != NULL) + unregister_sysctl_table (kibnal_tunables.kib_sysctl); +#endif + PtlNIFini(kibnal_ni); + + ptl_unregister_nal(IIBNAL); +} + +int __init +kibnal_module_init (void) +{ + int rc; + 
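/*
 * Note (illustrative, not part of the patch): kibnal_api_startup() above
 * advances kibnal_data.kib_init through the IBNAL_INIT_* levels as each
 * resource comes up, and kibnal_api_shutdown() unwinds with a switch that
 * falls through from the current level downwards, so any failure path can
 * simply call shutdown.  Reduced to its shape:
 *
 *      static int level;
 *
 *      static void teardown(void)
 *      {
 *              switch (level) {
 *              case 2: release_b();    // fall through
 *              case 1: release_a();    // fall through
 *              case 0: break;
 *              }
 *              level = 0;
 *      }
 *
 *      static int startup(void)
 *      {
 *              if (acquire_a() != 0) goto failed;
 *              level = 1;
 *              if (acquire_b() != 0) goto failed;
 *              level = 2;
 *              return 0;
 *      failed:
 *              teardown();
 *              return -1;
 *      }
 *
 * acquire_a/release_a and friends are hypothetical stand-ins for the HCA,
 * PD, CQ, TX-descriptor and thread setup done by the real functions.
 */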
+ if (sizeof(kib_wire_connreq_t) > CM_REQUEST_INFO_USER_LEN) { + CERROR("sizeof(kib_wire_connreq_t) > CM_REQUEST_INFO_USER_LEN\n"); + return -EINVAL; + } + + /* the following must be sizeof(int) for proc_dointvec() */ + if (sizeof (kibnal_tunables.kib_io_timeout) != sizeof (int)) { + CERROR("sizeof (kibnal_tunables.kib_io_timeout) != sizeof (int)\n"); + return -EINVAL; + } + + kibnal_api.nal_ni_init = kibnal_api_startup; + kibnal_api.nal_ni_fini = kibnal_api_shutdown; + + /* Initialise dynamic tunables to defaults once only */ + kibnal_tunables.kib_io_timeout = IBNAL_IO_TIMEOUT; + + rc = ptl_register_nal(IIBNAL, &kibnal_api); + if (rc != PTL_OK) { + CERROR("Can't register IBNAL: %d\n", rc); + return (-ENOMEM); /* or something... */ + } + + /* Pure gateways want the NAL started up at module load time... */ + rc = PtlNIInit(IIBNAL, 0, NULL, NULL, &kibnal_ni); + if (rc != PTL_OK && rc != PTL_IFACE_DUP) { + ptl_unregister_nal(IIBNAL); + return (-ENODEV); + } + +#ifdef CONFIG_SYSCTL + /* Press on regardless even if registering sysctl doesn't work */ + kibnal_tunables.kib_sysctl = + register_sysctl_table (kibnal_top_ctl_table, 0); +#endif + return (0); +} + +MODULE_AUTHOR("Cluster File Systems, Inc. "); +MODULE_DESCRIPTION("Kernel Infinicon IB NAL v0.01"); +MODULE_LICENSE("GPL"); + +module_init(kibnal_module_init); +module_exit(kibnal_module_fini); + diff --git a/lustre/portals/knals/iibnal/iibnal.h b/lustre/portals/knals/iibnal/iibnal.h new file mode 100644 index 0000000..0a25a9a --- /dev/null +++ b/lustre/portals/knals/iibnal/iibnal.h @@ -0,0 +1,892 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2004 Cluster File Systems, Inc. + * Author: Eric Barton + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +#ifndef EXPORT_SYMTAB +# define EXPORT_SYMTAB +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#define DEBUG_SUBSYSTEM S_IBNAL + +#include +#include +#include +#include + +#include + +#define GCC_VERSION (__GNUC__ * 10000 \ + + __GNUC_MINOR__ * 100 \ + + __GNUC_PATCHLEVEL__) + +/* Test for GCC > 3.2.2 */ +#if GCC_VERSION <= 30202 +/* GCC 3.2.2, and presumably several versions before it, will + * miscompile this driver. See + * http://gcc.gnu.org/bugzilla/show_bug.cgi?id=9853. */ +#error Invalid GCC version. Must use GCC >= 3.2.3 +#endif + +#define IBNAL_SERVICE_NAME "iibnal" +#define IBNAL_SERVICE_NUMBER 0x11b9a1 + +#if CONFIG_SMP +# define IBNAL_N_SCHED num_online_cpus() /* # schedulers */ +#else +# define IBNAL_N_SCHED 1 /* # schedulers */ +#endif + +#define IBNAL_MIN_RECONNECT_INTERVAL HZ /* first failed connection retry... 
*/ +#define IBNAL_MAX_RECONNECT_INTERVAL (60*HZ) /* ...exponentially increasing to this */ + +#define IBNAL_MSG_SIZE (4<<10) /* max size of queued messages (inc hdr) */ + +#define IBNAL_MSG_QUEUE_SIZE 8 /* # messages/RDMAs in-flight */ +#define IBNAL_CREDIT_HIGHWATER 7 /* when to eagerly return credits */ +/* 7 indicates infinite retry attempts, Infinicon recommended 5 */ +#define IBNAL_RETRY 5 /* # times to retry */ +#define IBNAL_RNR_RETRY 5 /* */ +#define IBNAL_CM_RETRY 5 /* # times to retry connection */ +#define IBNAL_FLOW_CONTROL 1 +#define IBNAL_ACK_TIMEOUT 20 /* supposedly 4 secs */ + +#define IBNAL_NTX 64 /* # tx descs */ +/* this had to be dropped down so that we only register < 255 pages per + * region. this will change if we register all memory. */ +#define IBNAL_NTX_NBLK 128 /* # reserved tx descs */ + +#define IBNAL_PEER_HASH_SIZE 101 /* # peer lists */ + +#define IBNAL_RESCHED 100 /* # scheduler loops before reschedule */ + +#define IBNAL_CONCURRENT_PEERS 1000 /* # nodes all talking at once to me */ + +/* default vals for runtime tunables */ +#define IBNAL_IO_TIMEOUT 50 /* default comms timeout (seconds) */ + +/************************/ +/* derived constants... */ + +/* TX messages (shared by all connections) */ +#define IBNAL_TX_MSGS (IBNAL_NTX + IBNAL_NTX_NBLK) +#define IBNAL_TX_MSG_BYTES (IBNAL_TX_MSGS * IBNAL_MSG_SIZE) +#define IBNAL_TX_MSG_PAGES ((IBNAL_TX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE) + +#define IBNAL_TX_MAX_SG (PTL_MD_MAX_IOV + 1) + +/* RX messages (per connection) */ +#define IBNAL_RX_MSGS IBNAL_MSG_QUEUE_SIZE +#define IBNAL_RX_MSG_BYTES (IBNAL_RX_MSGS * IBNAL_MSG_SIZE) +#define IBNAL_RX_MSG_PAGES ((IBNAL_RX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE) + + +/* we may have up to 2 completions per transmit + + 1 completion per receive, per connection */ +#define IBNAL_CQ_ENTRIES ((2*IBNAL_TX_MSGS) + \ + (IBNAL_RX_MSGS * IBNAL_CONCURRENT_PEERS)) + +#define IBNAL_RDMA_BASE 0x0eeb0000 +#define IBNAL_FMR 0 +#define IBNAL_WHOLE_MEM 1 +#define IBNAL_CKSUM 0 +//#define IBNAL_CALLBACK_CTXT IB_CQ_CALLBACK_PROCESS +#define IBNAL_CALLBACK_CTXT IB_CQ_CALLBACK_INTERRUPT + +/* XXX I have no idea. */ +#define IBNAL_STARTING_PSN 1 + +typedef struct +{ + int kib_io_timeout; /* comms timeout (seconds) */ + struct ctl_table_header *kib_sysctl; /* sysctl interface */ +} kib_tunables_t; + +/* some of these have specific types in the stack that just map back + * to the uFOO types, like IB_{L,R}_KEY. */ +typedef struct +{ + int ibp_npages; /* # pages */ + int ibp_mapped; /* mapped? */ + __u64 ibp_vaddr; /* mapped region vaddr */ + __u32 ibp_lkey; /* mapped region lkey */ + __u32 ibp_rkey; /* mapped region rkey */ + IB_HANDLE ibp_handle; /* mapped region handle */ + struct page *ibp_pages[0]; +} kib_pages_t; + +typedef struct +{ + IB_HANDLE md_handle; + __u32 md_lkey; + __u32 md_rkey; + __u64 md_addr; +} kib_md_t __attribute__((packed)); + +typedef struct +{ + int kib_init; /* initialisation state */ + __u64 kib_incarnation; /* which one am I */ + int kib_shutdown; /* shut down? 
*/ + atomic_t kib_nthreads; /* # live threads */ + + __u64 kib_service_id; /* service number I listen on */ + __u64 kib_port_guid; /* my GUID (lo 64 of GID)*/ + __u16 kib_port_pkey; /* my pkey, whatever that is */ + ptl_nid_t kib_nid; /* my NID */ + struct semaphore kib_nid_mutex; /* serialise NID ops */ + struct semaphore kib_nid_signal; /* signal completion */ + IB_HANDLE kib_cep; /* connection end point */ + + rwlock_t kib_global_lock; /* stabilize peer/conn ops */ + + struct list_head *kib_peers; /* hash table of all my known peers */ + int kib_peer_hash_size; /* size of kib_peers */ + atomic_t kib_npeers; /* # peers extant */ + atomic_t kib_nconns; /* # connections extant */ + + struct list_head kib_connd_conns; /* connections to progress */ + struct list_head kib_connd_peers; /* peers waiting for a connection */ + wait_queue_head_t kib_connd_waitq; /* connection daemons sleep here */ + unsigned long kib_connd_waketime; /* when connd will wake */ + spinlock_t kib_connd_lock; /* serialise */ + + wait_queue_head_t kib_sched_waitq; /* schedulers sleep here */ + struct list_head kib_sched_txq; /* tx requiring attention */ + struct list_head kib_sched_rxq; /* rx requiring attention */ + spinlock_t kib_sched_lock; /* serialise */ + + struct kib_tx *kib_tx_descs; /* all the tx descriptors */ + kib_pages_t *kib_tx_pages; /* premapped tx msg pages */ + + struct list_head kib_idle_txs; /* idle tx descriptors */ + struct list_head kib_idle_nblk_txs; /* idle reserved tx descriptors */ + wait_queue_head_t kib_idle_tx_waitq; /* block here for tx descriptor */ + __u64 kib_next_tx_cookie; /* RDMA completion cookie */ + spinlock_t kib_tx_lock; /* serialise */ + + IB_HANDLE kib_hca; /* The HCA */ + int kib_port; /* port on the device */ + IB_HANDLE kib_pd; /* protection domain */ + IB_HANDLE kib_sd; /* SD handle */ + IB_HANDLE kib_cq; /* completion queue */ + kib_md_t kib_md; /* full-mem registration */ + + void *kib_listen_handle; /* where I listen for connections */ + + IBT_INTERFACE_UNION kib_interfaces; /* The Infinicon IBT interface */ + + uint64 kib_hca_guids[8]; /* all the HCA guids */ + IB_CA_ATTRIBUTES kib_hca_attrs; /* where to get HCA attrs */ + FABRIC_OPERATION_DATA kib_fabopdata; /* (un)advertise service record */ +} kib_data_t; + +#define IBNAL_INIT_NOTHING 0 +#define IBNAL_INIT_DATA 1 +#define IBNAL_INIT_LIB 2 +#define IBNAL_INIT_HCA 3 +#define IBNAL_INIT_PORTATTRS 4 +#define IBNAL_INIT_PORT 5 +#define IBNAL_INIT_SD 6 +#define IBNAL_INIT_PD 7 +#define IBNAL_INIT_FMR 8 +#define IBNAL_INIT_MR 9 +#define IBNAL_INIT_TXD 10 +#define IBNAL_INIT_CQ 11 +#define IBNAL_INIT_ALL 12 + +/************************************************************************ + * Wire message structs. + * These are sent in sender's byte order (i.e. receiver flips). + * CAVEAT EMPTOR: other structs communicated between nodes (e.g. MAD + * private data and SM service info), is LE on the wire. + */ + +/* also kib_md_t above */ + +typedef struct +{ + __u32 rd_key; /* remote key */ + __u32 rd_nob; /* # of bytes */ + __u64 rd_addr; /* remote io vaddr */ +} kib_rdma_desc_t __attribute__((packed)); + +typedef struct +{ + ptl_hdr_t ibim_hdr; /* portals header */ + char ibim_payload[0]; /* piggy-backed payload */ +} kib_immediate_msg_t __attribute__((packed)); + +/* these arrays serve two purposes during rdma. they are built on the passive + * side and sent to the active side as remote arguments. On the active side + * the descs are used as a data structure on the way to local gather items. 
+ * the different roles result in split local/remote meaning of desc->rd_key */ +typedef struct +{ + ptl_hdr_t ibrm_hdr; /* portals header */ + __u64 ibrm_cookie; /* opaque completion cookie */ + __u32 ibrm_num_descs; /* how many descs */ + kib_rdma_desc_t ibrm_desc[0]; /* where to suck/blow */ +} kib_rdma_msg_t __attribute__((packed)); + +#define kib_rdma_msg_len(num_descs) \ + offsetof(kib_msg_t, ibm_u.rdma.ibrm_desc[num_descs]) + +typedef struct +{ + __u64 ibcm_cookie; /* opaque completion cookie */ + __u32 ibcm_status; /* completion status */ +} kib_completion_msg_t __attribute__((packed)); + +typedef struct +{ + __u32 ibm_magic; /* I'm an openibnal message */ + __u16 ibm_version; /* this is my version number */ + __u8 ibm_type; /* msg type */ + __u8 ibm_credits; /* returned credits */ +#if IBNAL_CKSUM + __u32 ibm_nob; + __u32 ibm_cksum; +#endif + union { + kib_immediate_msg_t immediate; + kib_rdma_msg_t rdma; + kib_completion_msg_t completion; + } ibm_u __attribute__((packed)); +} kib_msg_t __attribute__((packed)); + +#define IBNAL_MSG_MAGIC 0x0be91b91 /* unique magic */ +#define IBNAL_MSG_VERSION 1 /* current protocol version */ + +#define IBNAL_MSG_NOOP 0xd0 /* nothing (just credits) */ +#define IBNAL_MSG_IMMEDIATE 0xd1 /* portals hdr + payload */ +#define IBNAL_MSG_PUT_RDMA 0xd2 /* portals PUT hdr + source rdma desc */ +#define IBNAL_MSG_PUT_DONE 0xd3 /* signal PUT rdma completion */ +#define IBNAL_MSG_GET_RDMA 0xd4 /* portals GET hdr + sink rdma desc */ +#define IBNAL_MSG_GET_DONE 0xd5 /* signal GET rdma completion */ + +/***********************************************************************/ + +typedef struct kib_rx /* receive message */ +{ + struct list_head rx_list; /* queue for attention */ + struct kib_conn *rx_conn; /* owning conn */ + int rx_rdma; /* RDMA completion posted? */ + int rx_posted; /* posted? */ + __u64 rx_vaddr; /* pre-mapped buffer (hca vaddr) */ + kib_msg_t *rx_msg; /* pre-mapped buffer (host vaddr) */ + IB_WORK_REQ rx_wrq; + IB_LOCAL_DATASEGMENT rx_gl; /* and it's memory */ +} kib_rx_t; + +typedef struct kib_tx /* transmit message */ +{ + struct list_head tx_list; /* queue on idle_txs ibc_tx_queue etc. */ + int tx_isnblk; /* I'm reserved for non-blocking sends */ + struct kib_conn *tx_conn; /* owning conn */ + int tx_mapped; /* mapped for RDMA? */ + int tx_sending; /* # tx callbacks outstanding */ + int tx_status; /* completion status */ + unsigned long tx_deadline; /* completion deadline */ + int tx_passive_rdma; /* peer sucks/blows */ + int tx_passive_rdma_wait; /* waiting for peer to complete */ + __u64 tx_passive_rdma_cookie; /* completion cookie */ + lib_msg_t *tx_libmsg[2]; /* lib msgs to finalize on completion */ + kib_md_t tx_md; /* RDMA mapping (active/passive) */ + __u64 tx_vaddr; /* pre-mapped buffer (hca vaddr) */ + kib_msg_t *tx_msg; /* pre-mapped buffer (host vaddr) */ + int tx_nsp; /* # send work items */ + IB_WORK_REQ tx_wrq[IBNAL_TX_MAX_SG]; /* send work items... 
*/ + IB_LOCAL_DATASEGMENT tx_gl[IBNAL_TX_MAX_SG]; /* ...and their memory */ +} kib_tx_t; + +#define KIB_TX_UNMAPPED 0 +#define KIB_TX_MAPPED 1 +#define KIB_TX_MAPPED_FMR 2 + +typedef struct kib_wire_connreq +{ + __u32 wcr_magic; /* I'm an openibnal connreq */ + __u16 wcr_version; /* this is my version number */ + __u16 wcr_queue_depth; /* this is my receive queue size */ + __u64 wcr_nid; /* peer's NID */ + __u64 wcr_incarnation; /* peer's incarnation */ +} kib_wire_connreq_t; + +typedef struct kib_gid +{ + __u64 hi, lo; +} kib_gid_t; + +typedef struct kib_connreq +{ + /* connection-in-progress */ + struct kib_conn *cr_conn; + kib_wire_connreq_t cr_wcr; + __u64 cr_tid; + IB_SERVICE_RECORD cr_service; + kib_gid_t cr_gid; + IB_PATH_RECORD cr_path; + CM_REQUEST_INFO cr_cmreq; + CM_CONN_INFO cr_discarded; + CM_REJECT_INFO cr_rej_info; +} kib_connreq_t; + +typedef struct kib_conn +{ + struct kib_peer *ibc_peer; /* owning peer */ + struct list_head ibc_list; /* stash on peer's conn list */ + __u64 ibc_incarnation; /* which instance of the peer */ + atomic_t ibc_refcount; /* # users */ + int ibc_state; /* what's happening */ + atomic_t ibc_nob; /* # bytes buffered */ + int ibc_nsends_posted; /* # uncompleted sends */ + int ibc_credits; /* # credits I have */ + int ibc_outstanding_credits; /* # credits to return */ + int ibc_rcvd_disconnect;/* received discon request */ + int ibc_sent_disconnect;/* sent discon request */ + struct list_head ibc_tx_queue; /* send queue */ + struct list_head ibc_active_txs; /* active tx awaiting completion */ + spinlock_t ibc_lock; /* serialise */ + kib_rx_t *ibc_rxs; /* the rx descs */ + kib_pages_t *ibc_rx_pages; /* premapped rx msg pages */ + IB_HANDLE ibc_qp; /* queue pair */ + IB_HANDLE ibc_cep; /* connection ID? */ + IB_QP_ATTRIBUTES_QUERY ibc_qp_attrs; /* QP attrs */ + kib_connreq_t *ibc_connreq; /* connection request state */ +} kib_conn_t; + +#define IBNAL_CONN_INIT_NOTHING 0 /* initial state */ +#define IBNAL_CONN_INIT_QP 1 /* ibc_qp set up */ +#define IBNAL_CONN_CONNECTING 2 /* started to connect */ +#define IBNAL_CONN_ESTABLISHED 3 /* connection established */ +#define IBNAL_CONN_SEND_DREQ 4 /* to send disconnect req */ +#define IBNAL_CONN_DREQ 5 /* sent disconnect req */ +#define IBNAL_CONN_DREP 6 /* sent disconnect rep */ +#define IBNAL_CONN_DISCONNECTED 7 /* no more QP or CM traffic */ + +#define KIB_ASSERT_CONN_STATE(conn, state) do { \ + LASSERTF((conn)->ibc_state == state, "%d\n", conn->ibc_state); \ +} while (0) + +#define KIB_ASSERT_CONN_STATE_RANGE(conn, low, high) do { \ + LASSERTF(low <= high, "%d %d\n", low, high); \ + LASSERTF((conn)->ibc_state >= low && (conn)->ibc_state <= high, \ + "%d\n", conn->ibc_state); \ +} while (0) + +typedef struct kib_peer +{ + struct list_head ibp_list; /* stash on global peer list */ + struct list_head ibp_connd_list; /* schedule on kib_connd_peers */ + ptl_nid_t ibp_nid; /* who's on the other end(s) */ + atomic_t ibp_refcount; /* # users */ + int ibp_persistence; /* "known" peer refs */ + struct list_head ibp_conns; /* all active connections */ + struct list_head ibp_tx_queue; /* msgs waiting for a conn */ + int ibp_connecting; /* connecting+accepting */ + unsigned long ibp_reconnect_time; /* when reconnect may be attempted */ + unsigned long ibp_reconnect_interval; /* exponential backoff */ +} kib_peer_t; + + +extern lib_nal_t kibnal_lib; +extern kib_data_t kibnal_data; +extern kib_tunables_t kibnal_tunables; + +/******************************************************************************/ +/* Infinicon IBT 
interface wrappers */ +#define IIBT_IF (kibnal_data.kib_interfaces.ver2) + +static inline FSTATUS +iibt_get_hca_guids(uint32 *hca_count, EUI64 *hca_guid_list) +{ + return IIBT_IF.GetCaGuids(hca_count, hca_guid_list); +} + +static inline FSTATUS +iibt_open_hca(EUI64 hca_guid, + IB_COMPLETION_CALLBACK completion_callback, + IB_ASYNC_EVENT_CALLBACK async_event_callback, + void *arg, + IB_HANDLE *handle) +{ + return IIBT_IF.Vpi.OpenCA(hca_guid, completion_callback, + async_event_callback, arg, handle); +} + +static inline FSTATUS +iibt_query_hca(IB_HANDLE hca_handle, IB_CA_ATTRIBUTES *attrs, void **argp) +{ + return IIBT_IF.Vpi.QueryCA(hca_handle, attrs, argp); +} + +static inline FSTATUS +iibt_close_hca(IB_HANDLE hca_handle) +{ + return IIBT_IF.Vpi.CloseCA(hca_handle); +} + +static inline FSTATUS +iibt_pd_allocate(IB_HANDLE hca_handle, __u32 max_avs, IB_HANDLE *pd_handle) +{ + return IIBT_IF.Vpi.AllocatePD(hca_handle, max_avs, pd_handle); +} + +static inline FSTATUS +iibt_pd_free(IB_HANDLE pd_handle) +{ + return IIBT_IF.Vpi.FreePD(pd_handle); +} + +static inline FSTATUS +iibt_register_physical_memory(IB_HANDLE hca_handle, + IB_VIRT_ADDR requested_io_va, + void *phys_buffers, uint64 nphys_buffers, + uint32 io_va_offset, IB_HANDLE pd_handle, + IB_ACCESS_CONTROL access, + IB_HANDLE *mem_handle, + IB_VIRT_ADDR *actual_io_va, + IB_L_KEY *lkey, IB_R_KEY *rkey) +{ + return IIBT_IF.Vpi.RegisterPhysMemRegion(hca_handle, requested_io_va, + phys_buffers, nphys_buffers, + io_va_offset, pd_handle, + access, + mem_handle, actual_io_va, + lkey, rkey); +} + +static inline FSTATUS +iibt_register_contig_physical_memory(IB_HANDLE hca_handle, + IB_VIRT_ADDR requested_io_va, + IB_MR_PHYS_BUFFER *phys_buffers, + uint64 nphys_buffers, + uint32 io_va_offset, IB_HANDLE pd_handle, + IB_ACCESS_CONTROL access, + IB_HANDLE *mem_handle, + IB_VIRT_ADDR *actual_io_va, + IB_L_KEY *lkey, IB_R_KEY *rkey) +{ + return IIBT_IF.Vpi.RegisterContigPhysMemRegion(hca_handle, + requested_io_va, + phys_buffers, + nphys_buffers, + io_va_offset, pd_handle, + access, + mem_handle, actual_io_va, + lkey, rkey); +} + +static inline FSTATUS +iibt_register_memory(IB_HANDLE hca_handle, + void *virt_addr, unsigned int length, + IB_HANDLE pd_handle, + IB_ACCESS_CONTROL access, + IB_HANDLE *mem_handle, + IB_L_KEY *lkey, IB_R_KEY *rkey) +{ + return IIBT_IF.Vpi.RegisterMemRegion(hca_handle, + virt_addr, length, + pd_handle, + access, + mem_handle, + lkey, rkey); +} + +static inline FSTATUS +iibt_deregister_memory(IB_HANDLE mem_handle) +{ + return IIBT_IF.Vpi.DeregisterMemRegion(mem_handle); +} + +static inline FSTATUS +iibt_cq_create(IB_HANDLE hca_handle, uint32 requested_size, + void *arg, IB_HANDLE *cq_handle, uint32 *actual_size) +{ + return IIBT_IF.Vpi.CreateCQ(hca_handle, requested_size, + arg, cq_handle, actual_size); +} + +static inline FSTATUS +iibt_cq_poll(IB_HANDLE cq_handle, IB_WORK_COMPLETION *wc) +{ + return IIBT_IF.Vpi.PollCQ(cq_handle, wc); +} + +static inline FSTATUS +iibt_cq_rearm(IB_HANDLE cq_handle, IB_CQ_EVENT_SELECT select) +{ + return IIBT_IF.Vpi.RearmCQ(cq_handle, select); +} + +static inline FSTATUS +iibt_cq_destroy(IB_HANDLE cq_handle) +{ + return IIBT_IF.Vpi.DestroyCQ(cq_handle); +} + +static inline FSTATUS +iibt_qp_create(IB_HANDLE hca_handle, IB_QP_ATTRIBUTES_CREATE *create_attr, + void *arg, IB_HANDLE *cq_handle, + IB_QP_ATTRIBUTES_QUERY *query_attr) +{ + return IIBT_IF.Vpi.CreateQP(hca_handle, create_attr, arg, cq_handle, + query_attr); +} + +static inline FSTATUS +iibt_qp_query(IB_HANDLE qp_handle, 
IB_QP_ATTRIBUTES_QUERY *query_attr, + void **arg_ptr) +{ + return IIBT_IF.Vpi.QueryQP(qp_handle, query_attr, arg_ptr); +} + +static inline FSTATUS +iibt_qp_modify(IB_HANDLE qp_handle, IB_QP_ATTRIBUTES_MODIFY *modify_attr, + IB_QP_ATTRIBUTES_QUERY *query_attr) +{ + return IIBT_IF.Vpi.ModifyQP(qp_handle, modify_attr, query_attr); +} + +static inline FSTATUS +iibt_qp_destroy(IB_HANDLE qp_handle) +{ + return IIBT_IF.Vpi.DestroyQP(qp_handle); +} + +static inline FSTATUS +iibt_postrecv(IB_HANDLE qp_handle, IB_WORK_REQ *work_req) +{ + return IIBT_IF.Vpi.PostRecv(qp_handle, work_req); +} + +static inline FSTATUS +iibt_postsend(IB_HANDLE qp_handle, IB_WORK_REQ *work_req) +{ + return IIBT_IF.Vpi.PostSend(qp_handle, work_req); +} + +static inline FSTATUS +iibt_sd_register(IB_HANDLE *sd_handle, CLIENT_CONTROL_PARAMETERS *p) +{ + return IIBT_IF.Sdi.Register(sd_handle, p); +} + +static inline FSTATUS +iibt_sd_deregister(IB_HANDLE sd_handle) +{ + return IIBT_IF.Sdi.Deregister(sd_handle); +} + +static inline FSTATUS +iibt_sd_port_fabric_operation(IB_HANDLE sd_handle, EUI64 port_guid, + FABRIC_OPERATION_DATA *fod, + PFABRIC_OPERATION_CALLBACK callback, + COMMAND_CONTROL_PARAMETERS *p, void *arg) +{ + return IIBT_IF.Sdi.PortFabricOperation(sd_handle, port_guid, + fod, callback, p, arg); +} + +static inline FSTATUS +iibt_sd_query_port_fabric_information(IB_HANDLE sd_handle, EUI64 port_guid, + QUERY *qry, + PQUERY_CALLBACK callback, + COMMAND_CONTROL_PARAMETERS *p, void *arg) +{ + return IIBT_IF.Sdi.QueryPortFabricInformation(sd_handle, port_guid, + qry, callback, p, arg); +} + +static inline IB_HANDLE +iibt_cm_create_cep(CM_CEP_TYPE type) +{ + return IIBT_IF.Cmi.CmCreateCEP(type); +} + +static inline FSTATUS +iibt_cm_modify_cep(IB_HANDLE cep, uint32 attr, char* value, uint32 len, + uint32 offset) +{ + return IIBT_IF.Cmi.CmModifyCEP(cep, attr, value, len, offset); +} + +static inline FSTATUS +iibt_cm_destroy_cep(IB_HANDLE cep_handle) +{ + return IIBT_IF.Cmi.CmDestroyCEP(cep_handle); +} + +static inline FSTATUS +iibt_cm_listen(IB_HANDLE cep, CM_LISTEN_INFO *info, + PFN_CM_CALLBACK callback, void *arg) +{ + return IIBT_IF.Cmi.CmListen(cep, info, callback, arg); +} + +static inline FSTATUS +iibt_cm_cancel(IB_HANDLE cep) +{ + return IIBT_IF.Cmi.CmCancel(cep); +} + +static inline FSTATUS +iibt_cm_accept(IB_HANDLE cep, + CM_CONN_INFO *send_info, CM_CONN_INFO *recv_info, + PFN_CM_CALLBACK callback, void *arg, + IB_HANDLE *new_cep) +{ + return IIBT_IF.Cmi.CmAccept(cep, + send_info, recv_info, + callback, arg, new_cep); +} + +static inline FSTATUS +iibt_cm_reject(IB_HANDLE cep, CM_REJECT_INFO *rej) +{ + return IIBT_IF.Cmi.CmReject(cep, rej); +} + +static inline FSTATUS +iibt_cm_disconnect(IB_HANDLE cep, CM_DREQUEST_INFO *req, + CM_DREPLY_INFO *reply) +{ + return IIBT_IF.Cmi.CmDisconnect(cep, req, reply); +} + +static inline FSTATUS +iibt_cm_connect (IB_HANDLE cep, CM_REQUEST_INFO *req, + PFN_CM_CALLBACK callback, void *arg) +{ + return IIBT_IF.Cmi.CmConnect (cep, req, callback, arg); +} + +static inline int wrq_signals_completion(IB_WORK_REQ *wrq) +{ + return wrq->Req.SendRC.Options.s.SignaledCompletion == 1; +} + + +/******************************************************************************/ + +/* these are purposely avoiding using local vars so they don't increase + * stack consumption. 
*/ + +#define kib_peer_addref(peer) do { \ + LASSERTF(atomic_read(&peer->ibp_refcount) > 0, "%d\n", \ + atomic_read(&peer->ibp_refcount)); \ + CDEBUG(D_NET, "++peer[%p] -> "LPX64" (%d)\n", \ + peer, peer->ibp_nid, atomic_read (&peer->ibp_refcount)); \ + atomic_inc(&peer->ibp_refcount); \ +} while (0) + +#define kib_peer_decref(peer) do { \ + LASSERTF(atomic_read(&peer->ibp_refcount) > 0, "%d\n", \ + atomic_read(&peer->ibp_refcount)); \ + CDEBUG(D_NET, "--peer[%p] -> "LPX64" (%d)\n", \ + peer, peer->ibp_nid, atomic_read (&peer->ibp_refcount)); \ + if (atomic_dec_and_test (&peer->ibp_refcount)) { \ + CDEBUG (D_NET, "destroying peer "LPX64" %p\n", \ + peer->ibp_nid, peer); \ + kibnal_destroy_peer (peer); \ + } \ +} while (0) + +/******************************************************************************/ + +static inline struct list_head * +kibnal_nid2peerlist (ptl_nid_t nid) +{ + unsigned int hash = ((unsigned int)nid) % kibnal_data.kib_peer_hash_size; + + return (&kibnal_data.kib_peers [hash]); +} + +static inline int +kibnal_peer_active(kib_peer_t *peer) +{ + /* Am I in the peer hash table? */ + return (!list_empty(&peer->ibp_list)); +} + +static inline void +kibnal_queue_tx_locked (kib_tx_t *tx, kib_conn_t *conn) +{ + /* CAVEAT EMPTOR: tx takes caller's ref on conn */ + + LASSERT (tx->tx_nsp > 0); /* work items set up */ + LASSERT (tx->tx_conn == NULL); /* only set here */ + + tx->tx_conn = conn; + tx->tx_deadline = jiffies + kibnal_tunables.kib_io_timeout * HZ; + list_add_tail(&tx->tx_list, &conn->ibc_tx_queue); +} + +#define KIBNAL_SERVICE_KEY_MASK (IB_SERVICE_RECORD_COMP_SERVICENAME | \ + IB_SERVICE_RECORD_COMP_SERVICEDATA8_1 | \ + IB_SERVICE_RECORD_COMP_SERVICEDATA8_2 | \ + IB_SERVICE_RECORD_COMP_SERVICEDATA8_3 | \ + IB_SERVICE_RECORD_COMP_SERVICEDATA8_4 | \ + IB_SERVICE_RECORD_COMP_SERVICEDATA8_5 | \ + IB_SERVICE_RECORD_COMP_SERVICEDATA8_6 | \ + IB_SERVICE_RECORD_COMP_SERVICEDATA8_7 | \ + IB_SERVICE_RECORD_COMP_SERVICEDATA8_8) + +static inline __u64* +kibnal_service_nid_field(IB_SERVICE_RECORD *srv) +{ + /* must be consistent with KIBNAL_SERVICE_KEY_MASK */ + return (__u64 *)srv->ServiceData8; +} + + +static inline void +kibnal_set_service_keys(IB_SERVICE_RECORD *srv, ptl_nid_t nid) +{ + LASSERT (strlen(IBNAL_SERVICE_NAME) < sizeof(srv->ServiceName)); + memset (srv->ServiceName, 0, sizeof(srv->ServiceName)); + strcpy (srv->ServiceName, IBNAL_SERVICE_NAME); + + *kibnal_service_nid_field(srv) = cpu_to_le64(nid); +} + +#if 0 +static inline void +kibnal_show_rdma_attr (kib_conn_t *conn) +{ + struct ib_qp_attribute qp_attr; + int rc; + + memset (&qp_attr, 0, sizeof(qp_attr)); + rc = ib_qp_query(conn->ibc_qp, &qp_attr); + if (rc != 0) { + CERROR ("Can't get qp attrs: %d\n", rc); + return; + } + + CWARN ("RDMA CAPABILITY: write %s read %s\n", + (qp_attr.valid_fields & TS_IB_QP_ATTRIBUTE_RDMA_ATOMIC_ENABLE) ? + (qp_attr.enable_rdma_write ? "enabled" : "disabled") : "invalid", + (qp_attr.valid_fields & TS_IB_QP_ATTRIBUTE_RDMA_ATOMIC_ENABLE) ? + (qp_attr.enable_rdma_read ? "enabled" : "disabled") : "invalid"); +} +#endif + +#if CONFIG_X86 +static inline __u64 +kibnal_page2phys (struct page *p) +{ + __u64 page_number = p - mem_map; + + return (page_number << PAGE_SHIFT); +} +#else +# error "no page->phys" +#endif + +/* CAVEAT EMPTOR: + * We rely on tx/rx descriptor alignment to allow us to use the lowest bit + * of the work request id as a flag to determine if the completion is for a + * transmit or a receive. 
It seems that that the CQ entry's 'op' field + * isn't always set correctly on completions that occur after QP teardown. */ + +static inline __u64 +kibnal_ptr2wreqid (void *ptr, int isrx) +{ + unsigned long lptr = (unsigned long)ptr; + + LASSERT ((lptr & 1) == 0); + return (__u64)(lptr | (isrx ? 1 : 0)); +} + +static inline void * +kibnal_wreqid2ptr (__u64 wreqid) +{ + return (void *)(((unsigned long)wreqid) & ~1UL); +} + +static inline int +kibnal_wreqid_is_rx (__u64 wreqid) +{ + return (wreqid & 1) != 0; +} + +static inline int +kibnal_whole_mem(void) +{ + return kibnal_data.kib_md.md_handle != NULL; +} + +extern kib_peer_t *kibnal_create_peer (ptl_nid_t nid); +extern void kibnal_destroy_peer (kib_peer_t *peer); +extern int kibnal_del_peer (ptl_nid_t nid, int single_share); +extern kib_peer_t *kibnal_find_peer_locked (ptl_nid_t nid); +extern void kibnal_unlink_peer_locked (kib_peer_t *peer); +extern int kibnal_close_stale_conns_locked (kib_peer_t *peer, + __u64 incarnation); +extern kib_conn_t *kibnal_create_conn (void); +extern void kibnal_put_conn (kib_conn_t *conn); +extern void kibnal_destroy_conn (kib_conn_t *conn); +void kibnal_listen_callback(IB_HANDLE cep, CM_CONN_INFO *info, void *arg); + +extern int kibnal_alloc_pages (kib_pages_t **pp, int npages, int access); +extern void kibnal_free_pages (kib_pages_t *p); + +extern void kibnal_check_sends (kib_conn_t *conn); +extern void kibnal_close_conn_locked (kib_conn_t *conn, int error); +extern void kibnal_destroy_conn (kib_conn_t *conn); +extern int kibnal_thread_start (int (*fn)(void *arg), void *arg); +extern int kibnal_scheduler(void *arg); +extern int kibnal_connd (void *arg); +extern void kibnal_init_tx_msg (kib_tx_t *tx, int type, int body_nob); +extern void kibnal_close_conn (kib_conn_t *conn, int why); +extern void kibnal_start_active_rdma (int type, int status, + kib_rx_t *rx, lib_msg_t *libmsg, + unsigned int niov, + struct iovec *iov, ptl_kiov_t *kiov, + size_t offset, size_t nob); + +void kibnal_ca_async_callback (void *ca_arg, IB_EVENT_RECORD *ev); +void kibnal_ca_callback (void *ca_arg, void *cq_arg); diff --git a/lustre/portals/knals/iibnal/iibnal_cb.c b/lustre/portals/knals/iibnal/iibnal_cb.c new file mode 100644 index 0000000..a827ba5 --- /dev/null +++ b/lustre/portals/knals/iibnal/iibnal_cb.c @@ -0,0 +1,3018 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2004 Cluster File Systems, Inc. + * Author: Eric Barton + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ * + */ + +#include "iibnal.h" + +/* + * LIB functions follow + * + */ +static void +kibnal_schedule_tx_done (kib_tx_t *tx) +{ + unsigned long flags; + + spin_lock_irqsave (&kibnal_data.kib_sched_lock, flags); + + list_add_tail(&tx->tx_list, &kibnal_data.kib_sched_txq); + wake_up (&kibnal_data.kib_sched_waitq); + + spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags); +} + +static void +kibnal_tx_done (kib_tx_t *tx) +{ + ptl_err_t ptlrc = (tx->tx_status == 0) ? PTL_OK : PTL_FAIL; + unsigned long flags; + int i; + FSTATUS frc; + + LASSERT (tx->tx_sending == 0); /* mustn't be awaiting callback */ + LASSERT (!tx->tx_passive_rdma_wait); /* mustn't be awaiting RDMA */ + + switch (tx->tx_mapped) { + default: + LBUG(); + + case KIB_TX_UNMAPPED: + break; + + case KIB_TX_MAPPED: + if (in_interrupt()) { + /* can't deregister memory in IRQ context... */ + kibnal_schedule_tx_done(tx); + return; + } + frc = iibt_deregister_memory(tx->tx_md.md_handle); + LASSERT (frc == FSUCCESS); + tx->tx_mapped = KIB_TX_UNMAPPED; + break; + +#if IBNAL_FMR + case KIB_TX_MAPPED_FMR: + if (in_interrupt() && tx->tx_status != 0) { + /* can't flush FMRs in IRQ context... */ + kibnal_schedule_tx_done(tx); + return; + } + + rc = ib_fmr_deregister(tx->tx_md.md_handle.fmr); + LASSERT (rc == 0); + + if (tx->tx_status != 0) + ib_fmr_pool_force_flush(kibnal_data.kib_fmr_pool); + tx->tx_mapped = KIB_TX_UNMAPPED; + break; +#endif + } + + for (i = 0; i < 2; i++) { + /* tx may have up to 2 libmsgs to finalise */ + if (tx->tx_libmsg[i] == NULL) + continue; + + lib_finalize (&kibnal_lib, NULL, tx->tx_libmsg[i], ptlrc); + tx->tx_libmsg[i] = NULL; + } + + if (tx->tx_conn != NULL) { + kibnal_put_conn (tx->tx_conn); + tx->tx_conn = NULL; + } + + tx->tx_nsp = 0; + tx->tx_passive_rdma = 0; + tx->tx_status = 0; + + spin_lock_irqsave (&kibnal_data.kib_tx_lock, flags); + + if (tx->tx_isnblk) { + list_add_tail (&tx->tx_list, &kibnal_data.kib_idle_nblk_txs); + } else { + list_add_tail (&tx->tx_list, &kibnal_data.kib_idle_txs); + wake_up (&kibnal_data.kib_idle_tx_waitq); + } + + spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags); +} + +static kib_tx_t * +kibnal_get_idle_tx (int may_block) +{ + unsigned long flags; + kib_tx_t *tx = NULL; + ENTRY; + + for (;;) { + spin_lock_irqsave (&kibnal_data.kib_tx_lock, flags); + + /* "normal" descriptor is free */ + if (!list_empty (&kibnal_data.kib_idle_txs)) { + tx = list_entry (kibnal_data.kib_idle_txs.next, + kib_tx_t, tx_list); + break; + } + + if (!may_block) { + /* may dip into reserve pool */ + if (list_empty (&kibnal_data.kib_idle_nblk_txs)) { + CERROR ("reserved tx desc pool exhausted\n"); + break; + } + + tx = list_entry (kibnal_data.kib_idle_nblk_txs.next, + kib_tx_t, tx_list); + break; + } + + /* block for idle tx */ + spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags); + + wait_event (kibnal_data.kib_idle_tx_waitq, + !list_empty (&kibnal_data.kib_idle_txs) || + kibnal_data.kib_shutdown); + } + + if (tx != NULL) { + list_del (&tx->tx_list); + + /* Allocate a new passive RDMA completion cookie. It might + * not be needed, but we've got a lock right now and we're + * unlikely to wrap... 
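+                 * The cookie is a 64-bit counter: it travels in ibrm_cookie with
+                 * RDMA requests and is echoed back in the completion message,
+                 * which is how kibnal_complete_passive_rdma() matches the peer's
+                 * PUT/GET_DONE to this tx.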
*/ + tx->tx_passive_rdma_cookie = kibnal_data.kib_next_tx_cookie++; + + LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED); + LASSERT (tx->tx_nsp == 0); + LASSERT (tx->tx_sending == 0); + LASSERT (tx->tx_status == 0); + LASSERT (tx->tx_conn == NULL); + LASSERT (!tx->tx_passive_rdma); + LASSERT (!tx->tx_passive_rdma_wait); + LASSERT (tx->tx_libmsg[0] == NULL); + LASSERT (tx->tx_libmsg[1] == NULL); + } + + spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags); + + RETURN(tx); +} + +static int +kibnal_dist(lib_nal_t *nal, ptl_nid_t nid, unsigned long *dist) +{ + /* I would guess that if kibnal_get_peer (nid) == NULL, + and we're not routing, then 'nid' is very distant :) */ + if ( nal->libnal_ni.ni_pid.nid == nid ) { + *dist = 0; + } else { + *dist = 1; + } + + return 0; +} + +static void +kibnal_complete_passive_rdma(kib_conn_t *conn, __u64 cookie, int status) +{ + struct list_head *ttmp; + unsigned long flags; + int idle; + + spin_lock_irqsave (&conn->ibc_lock, flags); + + list_for_each (ttmp, &conn->ibc_active_txs) { + kib_tx_t *tx = list_entry(ttmp, kib_tx_t, tx_list); + + LASSERT (tx->tx_passive_rdma || + !tx->tx_passive_rdma_wait); + + LASSERT (tx->tx_passive_rdma_wait || + tx->tx_sending != 0); + + if (!tx->tx_passive_rdma_wait || + tx->tx_passive_rdma_cookie != cookie) + continue; + + CDEBUG(D_NET, "Complete %p "LPD64": %d\n", tx, cookie, status); + + tx->tx_status = status; + tx->tx_passive_rdma_wait = 0; + idle = (tx->tx_sending == 0); + + if (idle) + list_del (&tx->tx_list); + + spin_unlock_irqrestore (&conn->ibc_lock, flags); + + /* I could be racing with tx callbacks. It's whoever + * _makes_ tx idle that frees it */ + if (idle) + kibnal_tx_done (tx); + return; + } + + spin_unlock_irqrestore (&conn->ibc_lock, flags); + + CERROR ("Unmatched (late?) 
RDMA completion "LPX64" from "LPX64"\n", + cookie, conn->ibc_peer->ibp_nid); +} + +static __u32 +kibnal_lkey(kib_pages_t *ibp) +{ + if (kibnal_whole_mem()) + return kibnal_data.kib_md.md_lkey; + + return ibp->ibp_lkey; +} + +static void +kibnal_post_rx (kib_rx_t *rx, int do_credits) +{ + kib_conn_t *conn = rx->rx_conn; + int rc = 0; + unsigned long flags; + FSTATUS frc; + ENTRY; + + rx->rx_gl = (IB_LOCAL_DATASEGMENT) { + .Address = rx->rx_vaddr, + .Length = IBNAL_MSG_SIZE, + .Lkey = kibnal_lkey(conn->ibc_rx_pages), + }; + + rx->rx_wrq = (IB_WORK_REQ) { + .Operation = WROpRecv, + .DSListDepth = 1, + .MessageLen = IBNAL_MSG_SIZE, + .WorkReqId = kibnal_ptr2wreqid(rx, 1), + .DSList = &rx->rx_gl, + }; + + KIB_ASSERT_CONN_STATE_RANGE(conn, IBNAL_CONN_ESTABLISHED, + IBNAL_CONN_DREP); + LASSERT (!rx->rx_posted); + rx->rx_posted = 1; + mb(); + + if (conn->ibc_state != IBNAL_CONN_ESTABLISHED) + rc = -ECONNABORTED; + else { + frc = iibt_postrecv(conn->ibc_qp, &rx->rx_wrq); + if (frc != FSUCCESS) { + CDEBUG(D_NET, "post failed %d\n", frc); + rc = -EINVAL; + } + CDEBUG(D_NET, "posted rx %p\n", &rx->rx_wrq); + } + + if (rc == 0) { + if (do_credits) { + spin_lock_irqsave(&conn->ibc_lock, flags); + conn->ibc_outstanding_credits++; + spin_unlock_irqrestore(&conn->ibc_lock, flags); + + kibnal_check_sends(conn); + } + EXIT; + return; + } + + if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) { + CERROR ("Error posting receive -> "LPX64": %d\n", + conn->ibc_peer->ibp_nid, rc); + kibnal_close_conn (rx->rx_conn, rc); + } else { + CDEBUG (D_NET, "Error posting receive -> "LPX64": %d\n", + conn->ibc_peer->ibp_nid, rc); + } + + /* Drop rx's ref */ + kibnal_put_conn (conn); + EXIT; +} + +#if IBNAL_CKSUM +static inline __u32 kibnal_cksum (void *ptr, int nob) +{ + char *c = ptr; + __u32 sum = 0; + + while (nob-- > 0) + sum = ((sum << 1) | (sum >> 31)) + *c++; + + return (sum); +} +#endif + +static void hexdump(char *string, void *ptr, int len) +{ + unsigned char *c = ptr; + int i; + + return; + + if (len < 0 || len > 2048) { + printk("XXX what the hell? %d\n",len); + return; + } + + printk("%d bytes of '%s' from 0x%p\n", len, string, ptr); + + for (i = 0; i < len;) { + printk("%02x",*(c++)); + i++; + if (!(i & 15)) { + printk("\n"); + } else if (!(i&1)) { + printk(" "); + } + } + + if(len & 15) { + printk("\n"); + } +} + +static void +kibnal_rx_callback (IB_WORK_COMPLETION *wc) +{ + kib_rx_t *rx = (kib_rx_t *)kibnal_wreqid2ptr(wc->WorkReqId); + kib_msg_t *msg = rx->rx_msg; + kib_conn_t *conn = rx->rx_conn; + int nob = wc->Length; + const int base_nob = offsetof(kib_msg_t, ibm_u); + int credits; + int flipped; + unsigned long flags; + __u32 i; +#if IBNAL_CKSUM + __u32 msg_cksum; + __u32 computed_cksum; +#endif + + /* we set the QP to erroring after we've finished disconnecting, + * maybe we should do so sooner. 
*/ + KIB_ASSERT_CONN_STATE_RANGE(conn, IBNAL_CONN_ESTABLISHED, + IBNAL_CONN_DISCONNECTED); + + CDEBUG(D_NET, "rx %p conn %p\n", rx, conn); + LASSERT (rx->rx_posted); + rx->rx_posted = 0; + mb(); + + /* receives complete with error in any case after we've started + * disconnecting */ + if (conn->ibc_state > IBNAL_CONN_ESTABLISHED) + goto failed; + + if (wc->Status != WRStatusSuccess) { + CERROR("Rx from "LPX64" failed: %d\n", + conn->ibc_peer->ibp_nid, wc->Status); + goto failed; + } + + if (nob < base_nob) { + CERROR ("Short rx from "LPX64": %d < expected %d\n", + conn->ibc_peer->ibp_nid, nob, base_nob); + goto failed; + } + + hexdump("rx", rx->rx_msg, sizeof(kib_msg_t)); + + /* Receiver does any byte flipping if necessary... */ + + if (msg->ibm_magic == IBNAL_MSG_MAGIC) { + flipped = 0; + } else { + if (msg->ibm_magic != __swab32(IBNAL_MSG_MAGIC)) { + CERROR ("Unrecognised magic: %08x from "LPX64"\n", + msg->ibm_magic, conn->ibc_peer->ibp_nid); + goto failed; + } + flipped = 1; + __swab16s (&msg->ibm_version); + LASSERT (sizeof(msg->ibm_type) == 1); + LASSERT (sizeof(msg->ibm_credits) == 1); + } + + if (msg->ibm_version != IBNAL_MSG_VERSION) { + CERROR ("Incompatible msg version %d (%d expected)\n", + msg->ibm_version, IBNAL_MSG_VERSION); + goto failed; + } + +#if IBNAL_CKSUM + if (nob != msg->ibm_nob) { + CERROR ("Unexpected # bytes %d (%d expected)\n", nob, msg->ibm_nob); + goto failed; + } + + msg_cksum = le32_to_cpu(msg->ibm_cksum); + msg->ibm_cksum = 0; + computed_cksum = kibnal_cksum (msg, nob); + + if (msg_cksum != computed_cksum) { + CERROR ("Checksum failure %d: (%d expected)\n", + computed_cksum, msg_cksum); +// goto failed; + } + CDEBUG(D_NET, "cksum %x, nob %d\n", computed_cksum, nob); +#endif + + /* Have I received credits that will let me send? 
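+         * ibm_credits returns receive buffers the peer has reposted; adding
+         * them to ibc_credits lets kibnal_check_sends() push more queued
+         * transmits, and a NOOP goes back once IBNAL_CREDIT_HIGHWATER credits
+         * of our own have accumulated.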
*/ + credits = msg->ibm_credits; + if (credits != 0) { + spin_lock_irqsave(&conn->ibc_lock, flags); + conn->ibc_credits += credits; + spin_unlock_irqrestore(&conn->ibc_lock, flags); + + kibnal_check_sends(conn); + } + + switch (msg->ibm_type) { + case IBNAL_MSG_NOOP: + kibnal_post_rx (rx, 1); + return; + + case IBNAL_MSG_IMMEDIATE: + if (nob < base_nob + sizeof (kib_immediate_msg_t)) { + CERROR ("Short IMMEDIATE from "LPX64": %d\n", + conn->ibc_peer->ibp_nid, nob); + goto failed; + } + break; + + case IBNAL_MSG_PUT_RDMA: + case IBNAL_MSG_GET_RDMA: + if (nob < base_nob + sizeof (kib_rdma_msg_t)) { + CERROR ("Short RDMA msg from "LPX64": %d\n", + conn->ibc_peer->ibp_nid, nob); + goto failed; + } + if (flipped) + __swab32(msg->ibm_u.rdma.ibrm_num_descs); + + CDEBUG(D_NET, "%d RDMA: cookie "LPX64":\n", + msg->ibm_type, msg->ibm_u.rdma.ibrm_cookie); + + if ((msg->ibm_u.rdma.ibrm_num_descs > PTL_MD_MAX_IOV) || + (kib_rdma_msg_len(msg->ibm_u.rdma.ibrm_num_descs) > + min(nob, IBNAL_MSG_SIZE))) { + CERROR ("num_descs %d too large\n", + msg->ibm_u.rdma.ibrm_num_descs); + goto failed; + } + + for(i = 0; i < msg->ibm_u.rdma.ibrm_num_descs; i++) { + kib_rdma_desc_t *desc = &msg->ibm_u.rdma.ibrm_desc[i]; + + if (flipped) { + __swab32(desc->rd_key); + __swab32(desc->rd_nob); + __swab64(desc->rd_addr); + } + + CDEBUG(D_NET, " key %x, " "addr "LPX64", nob %u\n", + desc->rd_key, desc->rd_addr, desc->rd_nob); + } + break; + + case IBNAL_MSG_PUT_DONE: + case IBNAL_MSG_GET_DONE: + if (nob < base_nob + sizeof (kib_completion_msg_t)) { + CERROR ("Short COMPLETION msg from "LPX64": %d\n", + conn->ibc_peer->ibp_nid, nob); + goto failed; + } + if (flipped) + __swab32s(&msg->ibm_u.completion.ibcm_status); + + CDEBUG(D_NET, "%d DONE: cookie "LPX64", status %d\n", + msg->ibm_type, msg->ibm_u.completion.ibcm_cookie, + msg->ibm_u.completion.ibcm_status); + + kibnal_complete_passive_rdma (conn, + msg->ibm_u.completion.ibcm_cookie, + msg->ibm_u.completion.ibcm_status); + kibnal_post_rx (rx, 1); + return; + + default: + CERROR ("Can't parse type from "LPX64": %d\n", + conn->ibc_peer->ibp_nid, msg->ibm_type); + goto failed; + } + + /* schedule for kibnal_rx() in thread context */ + spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags); + + list_add_tail (&rx->rx_list, &kibnal_data.kib_sched_rxq); + wake_up (&kibnal_data.kib_sched_waitq); + + spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags); + return; + + failed: + CDEBUG(D_NET, "rx %p conn %p\n", rx, conn); + kibnal_close_conn(conn, -ECONNABORTED); + + /* Don't re-post rx & drop its ref on conn */ + kibnal_put_conn(conn); +} + +void +kibnal_rx (kib_rx_t *rx) +{ + kib_msg_t *msg = rx->rx_msg; + + /* Clear flag so I can detect if I've sent an RDMA completion */ + rx->rx_rdma = 0; + + switch (msg->ibm_type) { + case IBNAL_MSG_GET_RDMA: + lib_parse(&kibnal_lib, &msg->ibm_u.rdma.ibrm_hdr, rx); + /* If the incoming get was matched, I'll have initiated the + * RDMA and the completion message... */ + if (rx->rx_rdma) + break; + + /* Otherwise, I'll send a failed completion now to prevent + * the peer's GET blocking for the full timeout. 
*/ + CERROR ("Completing unmatched RDMA GET from "LPX64"\n", + rx->rx_conn->ibc_peer->ibp_nid); + kibnal_start_active_rdma (IBNAL_MSG_GET_DONE, -EIO, + rx, NULL, 0, NULL, NULL, 0, 0); + break; + + case IBNAL_MSG_PUT_RDMA: + lib_parse(&kibnal_lib, &msg->ibm_u.rdma.ibrm_hdr, rx); + if (rx->rx_rdma) + break; + /* This is most unusual, since even if lib_parse() didn't + * match anything, it should have asked us to read (and + * discard) the payload. The portals header must be + * inconsistent with this message type, so it's the + * sender's fault for sending garbage and she can time + * herself out... */ + CERROR ("Uncompleted RMDA PUT from "LPX64"\n", + rx->rx_conn->ibc_peer->ibp_nid); + break; + + case IBNAL_MSG_IMMEDIATE: + lib_parse(&kibnal_lib, &msg->ibm_u.immediate.ibim_hdr, rx); + LASSERT (!rx->rx_rdma); + break; + + default: + LBUG(); + break; + } + + kibnal_post_rx (rx, 1); +} + +static struct page * +kibnal_kvaddr_to_page (unsigned long vaddr) +{ + struct page *page; + + if (vaddr >= VMALLOC_START && + vaddr < VMALLOC_END) + page = vmalloc_to_page ((void *)vaddr); +#if CONFIG_HIGHMEM + else if (vaddr >= PKMAP_BASE && + vaddr < (PKMAP_BASE + LAST_PKMAP * PAGE_SIZE)) + page = vmalloc_to_page ((void *)vaddr); + /* in 2.4 ^ just walks the page tables */ +#endif + else + page = virt_to_page (vaddr); + + if (!VALID_PAGE (page)) + page = NULL; + + return page; +} + +static void +kibnal_fill_ibrm(kib_tx_t *tx, struct page *page, unsigned long page_offset, + unsigned long len, int active) +{ + kib_rdma_msg_t *ibrm = &tx->tx_msg->ibm_u.rdma; + kib_rdma_desc_t *desc; + + LASSERTF(ibrm->ibrm_num_descs < PTL_MD_MAX_IOV, "%u\n", + ibrm->ibrm_num_descs); + + desc = &ibrm->ibrm_desc[ibrm->ibrm_num_descs]; + if (active) + desc->rd_key = kibnal_data.kib_md.md_lkey; + else + desc->rd_key = kibnal_data.kib_md.md_rkey; + desc->rd_nob = len; /*PAGE_SIZE - kiov->kiov_offset; */ + desc->rd_addr = kibnal_page2phys(page) + page_offset + + kibnal_data.kib_md.md_addr; + + ibrm->ibrm_num_descs++; +} + +static int +kibnal_map_rdma_iov(kib_tx_t *tx, unsigned long vaddr, int nob, int active) +{ + struct page *page; + int page_offset, len; + + while (nob > 0) { + page = kibnal_kvaddr_to_page(vaddr); + if (page == NULL) + return -EFAULT; + + page_offset = vaddr & (PAGE_SIZE - 1); + len = min(nob, (int)PAGE_SIZE - page_offset); + + kibnal_fill_ibrm(tx, page, page_offset, len, active); + nob -= len; + vaddr += len; + } + return 0; +} + +static int +kibnal_map_iov (kib_tx_t *tx, IB_ACCESS_CONTROL access, + int niov, struct iovec *iov, int offset, int nob, int active) + +{ + void *vaddr; + FSTATUS frc; + + LASSERT (nob > 0); + LASSERT (niov > 0); + LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED); + + while (offset >= iov->iov_len) { + offset -= iov->iov_len; + niov--; + iov++; + LASSERT (niov > 0); + } + + if (nob > iov->iov_len - offset) { + CERROR ("Can't map multiple vaddr fragments\n"); + return (-EMSGSIZE); + } + + /* our large contiguous iov could be backed by multiple physical + * pages. 
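+         * With whole-memory registration (kib_md set up at startup) we only
+         * build rdma descs against the global keys via kibnal_map_rdma_iov();
+         * otherwise the fragment is registered on the fly with
+         * iibt_register_memory().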
*/ + if (kibnal_whole_mem()) { + int rc; + tx->tx_msg->ibm_u.rdma.ibrm_num_descs = 0; + rc = kibnal_map_rdma_iov(tx, (unsigned long)iov->iov_base + + offset, nob, active); + if (rc != 0) { + CERROR ("Can't map iov: %d\n", rc); + return rc; + } + return 0; + } + + vaddr = (void *)(((unsigned long)iov->iov_base) + offset); + tx->tx_md.md_addr = (__u64)((unsigned long)vaddr); + + frc = iibt_register_memory(kibnal_data.kib_hca, vaddr, nob, + kibnal_data.kib_pd, access, + &tx->tx_md.md_handle, &tx->tx_md.md_lkey, + &tx->tx_md.md_rkey); + if (frc != 0) { + CERROR ("Can't map vaddr %p: %d\n", vaddr, frc); + return -EINVAL; + } + + tx->tx_mapped = KIB_TX_MAPPED; + return (0); +} + +static int +kibnal_map_kiov (kib_tx_t *tx, IB_ACCESS_CONTROL access, + int nkiov, ptl_kiov_t *kiov, + int offset, int nob, int active) +{ + __u64 *phys = NULL; + int page_offset; + int nphys; + int resid; + int phys_size = 0; + FSTATUS frc; + int i, rc = 0; + + CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob); + + LASSERT (nob > 0); + LASSERT (nkiov > 0); + LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED); + + while (offset >= kiov->kiov_len) { + offset -= kiov->kiov_len; + nkiov--; + kiov++; + LASSERT (nkiov > 0); + } + + page_offset = kiov->kiov_offset + offset; + nphys = 1; + + if (!kibnal_whole_mem()) { + phys_size = nkiov * sizeof (*phys); + PORTAL_ALLOC(phys, phys_size); + if (phys == NULL) { + CERROR ("Can't allocate tmp phys\n"); + return (-ENOMEM); + } + + phys[0] = kibnal_page2phys(kiov->kiov_page); + } else { + tx->tx_msg->ibm_u.rdma.ibrm_num_descs = 0; + kibnal_fill_ibrm(tx, kiov->kiov_page, kiov->kiov_offset, + kiov->kiov_len, active); + } + + resid = nob - (kiov->kiov_len - offset); + + while (resid > 0) { + kiov++; + nkiov--; + LASSERT (nkiov > 0); + + if (kiov->kiov_offset != 0 || + ((resid > PAGE_SIZE) && + kiov->kiov_len < PAGE_SIZE)) { + /* Can't have gaps */ + CERROR ("Can't make payload contiguous in I/O VM:" + "page %d, offset %d, len %d \n", nphys, + kiov->kiov_offset, kiov->kiov_len); + + for (i = -nphys; i < nkiov; i++) + { + CERROR("kiov[%d] %p +%d for %d\n", + i, kiov[i].kiov_page, kiov[i].kiov_offset, kiov[i].kiov_len); + } + + rc = -EINVAL; + goto out; + } + + if (nphys == PTL_MD_MAX_IOV) { + CERROR ("payload too big (%d)\n", nphys); + rc = -EMSGSIZE; + goto out; + } + + if (!kibnal_whole_mem()) { + LASSERT (nphys * sizeof (*phys) < phys_size); + phys[nphys] = kibnal_page2phys(kiov->kiov_page); + } else { + if (kib_rdma_msg_len(nphys) > IBNAL_MSG_SIZE) { + CERROR ("payload too big (%d)\n", nphys); + rc = -EMSGSIZE; + goto out; + } + kibnal_fill_ibrm(tx, kiov->kiov_page, + kiov->kiov_offset, kiov->kiov_len, + active); + } + + nphys ++; + resid -= PAGE_SIZE; + } + + if (kibnal_whole_mem()) + goto out; + +#if 0 + CWARN ("nphys %d, nob %d, page_offset %d\n", nphys, nob, page_offset); + for (i = 0; i < nphys; i++) + CWARN (" [%d] "LPX64"\n", i, phys[i]); +#endif + +#if IBNAL_FMR +#error "iibnal hasn't learned about FMR yet" + rc = ib_fmr_register_physical (kibnal_data.kib_fmr_pool, + phys, nphys, + &tx->tx_md.md_addr, + page_offset, + &tx->tx_md.md_handle.fmr, + &tx->tx_md.md_lkey, + &tx->tx_md.md_rkey); +#else + frc = iibt_register_physical_memory(kibnal_data.kib_hca, + IBNAL_RDMA_BASE, + phys, nphys, + 0, /* offset */ + kibnal_data.kib_pd, + access, + &tx->tx_md.md_handle, + &tx->tx_md.md_addr, + &tx->tx_md.md_lkey, + &tx->tx_md.md_rkey); +#endif + if (frc == FSUCCESS) { + CDEBUG(D_NET, "Mapped %d pages %d bytes @ offset %d: lkey %x, rkey %x\n", + nphys, nob, page_offset, 
tx->tx_md.md_lkey, tx->tx_md.md_rkey); +#if IBNAL_FMR + tx->tx_mapped = KIB_TX_MAPPED_FMR; +#else + tx->tx_mapped = KIB_TX_MAPPED; +#endif + } else { + CERROR ("Can't map phys: %d\n", rc); + rc = -EFAULT; + } + + out: + if (phys != NULL) + PORTAL_FREE(phys, phys_size); + return (rc); +} + +static kib_conn_t * +kibnal_find_conn_locked (kib_peer_t *peer) +{ + struct list_head *tmp; + + /* just return the first connection */ + list_for_each (tmp, &peer->ibp_conns) { + return (list_entry(tmp, kib_conn_t, ibc_list)); + } + + return (NULL); +} + +void +kibnal_check_sends (kib_conn_t *conn) +{ + unsigned long flags; + kib_tx_t *tx; + int rc; + int i; + int done; + int nwork; + ENTRY; + + spin_lock_irqsave (&conn->ibc_lock, flags); + + LASSERT (conn->ibc_nsends_posted <= IBNAL_MSG_QUEUE_SIZE); + + if (list_empty(&conn->ibc_tx_queue) && + conn->ibc_outstanding_credits >= IBNAL_CREDIT_HIGHWATER) { + spin_unlock_irqrestore(&conn->ibc_lock, flags); + + tx = kibnal_get_idle_tx(0); /* don't block */ + if (tx != NULL) + kibnal_init_tx_msg(tx, IBNAL_MSG_NOOP, 0); + + spin_lock_irqsave(&conn->ibc_lock, flags); + + if (tx != NULL) { + atomic_inc(&conn->ibc_refcount); + kibnal_queue_tx_locked(tx, conn); + } + } + + while (!list_empty (&conn->ibc_tx_queue)) { + tx = list_entry (conn->ibc_tx_queue.next, kib_tx_t, tx_list); + + /* We rely on this for QP sizing */ + LASSERT (tx->tx_nsp > 0 && tx->tx_nsp <= IBNAL_TX_MAX_SG); + + LASSERT (conn->ibc_outstanding_credits >= 0); + LASSERT (conn->ibc_outstanding_credits <= IBNAL_MSG_QUEUE_SIZE); + LASSERT (conn->ibc_credits >= 0); + LASSERT (conn->ibc_credits <= IBNAL_MSG_QUEUE_SIZE); + + /* Not on ibc_rdma_queue */ + LASSERT (!tx->tx_passive_rdma_wait); + + if (conn->ibc_nsends_posted == IBNAL_MSG_QUEUE_SIZE) + GOTO(out, 0); + + if (conn->ibc_credits == 0) /* no credits */ + GOTO(out, 1); + + if (conn->ibc_credits == 1 && /* last credit reserved for */ + conn->ibc_outstanding_credits == 0) /* giving back credits */ + GOTO(out, 2); + + list_del (&tx->tx_list); + + if (tx->tx_msg->ibm_type == IBNAL_MSG_NOOP && + (!list_empty(&conn->ibc_tx_queue) || + conn->ibc_outstanding_credits < IBNAL_CREDIT_HIGHWATER)) { + /* redundant NOOP */ + spin_unlock_irqrestore(&conn->ibc_lock, flags); + kibnal_tx_done(tx); + spin_lock_irqsave(&conn->ibc_lock, flags); + continue; + } + + tx->tx_msg->ibm_credits = conn->ibc_outstanding_credits; + conn->ibc_outstanding_credits = 0; + + conn->ibc_nsends_posted++; + conn->ibc_credits--; + + /* we only get a tx completion for the final rdma op */ + tx->tx_sending = min(tx->tx_nsp, 2); + tx->tx_passive_rdma_wait = tx->tx_passive_rdma; + list_add (&tx->tx_list, &conn->ibc_active_txs); +#if IBNAL_CKSUM + tx->tx_msg->ibm_cksum = 0; + tx->tx_msg->ibm_cksum = kibnal_cksum(tx->tx_msg, tx->tx_msg->ibm_nob); + CDEBUG(D_NET, "cksum %x, nob %d\n", tx->tx_msg->ibm_cksum, tx->tx_msg->ibm_nob); +#endif + spin_unlock_irqrestore (&conn->ibc_lock, flags); + + /* NB the gap between removing tx from the queue and sending it + * allows message re-ordering to occur */ + + LASSERT (tx->tx_nsp > 0); + + rc = -ECONNABORTED; + nwork = 0; + if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) { + tx->tx_status = 0; + /* Driver only accepts 1 item at a time */ + for (i = 0; i < tx->tx_nsp; i++) { + hexdump("tx", tx->tx_msg, sizeof(kib_msg_t)); + rc = iibt_postsend(conn->ibc_qp, + &tx->tx_wrq[i]); + if (rc != 0) + break; + if (wrq_signals_completion(&tx->tx_wrq[i])) + nwork++; + CDEBUG(D_NET, "posted tx wrq %p\n", + &tx->tx_wrq[i]); + } + } + + spin_lock_irqsave (&conn->ibc_lock, 
flags); + if (rc != 0) { + /* NB credits are transferred in the actual + * message, which can only be the last work item */ + conn->ibc_outstanding_credits += tx->tx_msg->ibm_credits; + conn->ibc_credits++; + conn->ibc_nsends_posted--; + + tx->tx_status = rc; + tx->tx_passive_rdma_wait = 0; + tx->tx_sending -= tx->tx_nsp - nwork; + + done = (tx->tx_sending == 0); + if (done) + list_del (&tx->tx_list); + + spin_unlock_irqrestore (&conn->ibc_lock, flags); + + if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) + CERROR ("Error %d posting transmit to "LPX64"\n", + rc, conn->ibc_peer->ibp_nid); + else + CDEBUG (D_NET, "Error %d posting transmit to " + LPX64"\n", rc, conn->ibc_peer->ibp_nid); + + kibnal_close_conn (conn, rc); + + if (done) + kibnal_tx_done (tx); + return; + } + + } + + EXIT; +out: + spin_unlock_irqrestore (&conn->ibc_lock, flags); +} + +static void +kibnal_tx_callback (IB_WORK_COMPLETION *wc) +{ + kib_tx_t *tx = (kib_tx_t *)kibnal_wreqid2ptr(wc->WorkReqId); + kib_conn_t *conn; + unsigned long flags; + int idle; + + conn = tx->tx_conn; + LASSERT (conn != NULL); + LASSERT (tx->tx_sending != 0); + + spin_lock_irqsave(&conn->ibc_lock, flags); + + CDEBUG(D_NET, "conn %p tx %p [%d/%d]: %d\n", conn, tx, + tx->tx_sending, tx->tx_nsp, wc->Status); + + /* I could be racing with rdma completion. Whoever makes 'tx' idle + * gets to free it, which also drops its ref on 'conn'. If it's + * not me, then I take an extra ref on conn so it can't disappear + * under me. */ + + tx->tx_sending--; + idle = (tx->tx_sending == 0) && /* This is the final callback */ + (!tx->tx_passive_rdma_wait); /* Not waiting for RDMA completion */ + if (idle) + list_del(&tx->tx_list); + + CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", + conn, conn->ibc_state, conn->ibc_peer->ibp_nid, + atomic_read (&conn->ibc_refcount)); + atomic_inc (&conn->ibc_refcount); + + if (tx->tx_sending == 0) + conn->ibc_nsends_posted--; + + if (wc->Status != WRStatusSuccess && + tx->tx_status == 0) + tx->tx_status = -ECONNABORTED; + + spin_unlock_irqrestore(&conn->ibc_lock, flags); + + if (idle) + kibnal_tx_done (tx); + + if (wc->Status != WRStatusSuccess) { + CERROR ("Tx completion to "LPX64" failed: %d\n", + conn->ibc_peer->ibp_nid, wc->Status); + kibnal_close_conn (conn, -ENETDOWN); + } else { + /* can I shovel some more sends out the door? */ + kibnal_check_sends(conn); + } + + kibnal_put_conn (conn); +} + +void +kibnal_ca_async_callback (void *ca_arg, IB_EVENT_RECORD *ev) +{ + /* XXX flesh out. 
this seems largely for async errors */ + CERROR("type: %d code: %u\n", ev->EventType, ev->EventCode); +} + +void +kibnal_ca_callback (void *ca_arg, void *cq_arg) +{ + IB_HANDLE cq = *(IB_HANDLE *)cq_arg; + IB_HANDLE ca = *(IB_HANDLE *)ca_arg; + IB_WORK_COMPLETION wc; + int armed = 0; + + CDEBUG(D_NET, "ca %p cq %p\n", ca, cq); + + for(;;) { + while (iibt_cq_poll(cq, &wc) == FSUCCESS) { + if (kibnal_wreqid_is_rx(wc.WorkReqId)) + kibnal_rx_callback(&wc); + else + kibnal_tx_callback(&wc); + } + if (armed) + return; + if (iibt_cq_rearm(cq, CQEventSelNextWC) != FSUCCESS) { + CERROR("rearm failed?\n"); + return; + } + armed = 1; + } +} + +void +kibnal_init_tx_msg (kib_tx_t *tx, int type, int body_nob) +{ + IB_LOCAL_DATASEGMENT *gl = &tx->tx_gl[tx->tx_nsp]; + IB_WORK_REQ *wrq = &tx->tx_wrq[tx->tx_nsp]; + int fence; + int nob = offsetof (kib_msg_t, ibm_u) + body_nob; + + LASSERT (tx->tx_nsp >= 0 && + tx->tx_nsp < sizeof(tx->tx_wrq)/sizeof(tx->tx_wrq[0])); + LASSERT (nob <= IBNAL_MSG_SIZE); + + tx->tx_msg->ibm_magic = IBNAL_MSG_MAGIC; + tx->tx_msg->ibm_version = IBNAL_MSG_VERSION; + tx->tx_msg->ibm_type = type; +#if IBNAL_CKSUM + tx->tx_msg->ibm_nob = nob; +#endif + /* Fence the message if it's bundled with an RDMA read */ + fence = (tx->tx_nsp > 0) && + (type == IBNAL_MSG_PUT_DONE); + + *gl = (IB_LOCAL_DATASEGMENT) { + .Address = tx->tx_vaddr, + .Length = IBNAL_MSG_SIZE, + .Lkey = kibnal_lkey(kibnal_data.kib_tx_pages), + }; + + wrq->WorkReqId = kibnal_ptr2wreqid(tx, 0); + wrq->Operation = WROpSend; + wrq->DSList = gl; + wrq->DSListDepth = 1; + wrq->MessageLen = nob; + wrq->Req.SendRC.ImmediateData = 0; + wrq->Req.SendRC.Options.s.SolicitedEvent = 1; + wrq->Req.SendRC.Options.s.SignaledCompletion = 1; + wrq->Req.SendRC.Options.s.ImmediateData = 0; + wrq->Req.SendRC.Options.s.Fence = fence; + + tx->tx_nsp++; +} + +static void +kibnal_queue_tx (kib_tx_t *tx, kib_conn_t *conn) +{ + unsigned long flags; + + spin_lock_irqsave(&conn->ibc_lock, flags); + + kibnal_queue_tx_locked (tx, conn); + + spin_unlock_irqrestore(&conn->ibc_lock, flags); + + kibnal_check_sends(conn); +} + +static void +kibnal_launch_tx (kib_tx_t *tx, ptl_nid_t nid) +{ + unsigned long flags; + kib_peer_t *peer; + kib_conn_t *conn; + rwlock_t *g_lock = &kibnal_data.kib_global_lock; + + /* If I get here, I've committed to send, so I complete the tx with + * failure on any problems */ + + LASSERT (tx->tx_conn == NULL); /* only set when assigned a conn */ + LASSERT (tx->tx_nsp > 0); /* work items have been set up */ + + read_lock (g_lock); + + peer = kibnal_find_peer_locked (nid); + if (peer == NULL) { + read_unlock (g_lock); + tx->tx_status = -EHOSTUNREACH; + kibnal_tx_done (tx); + return; + } + + conn = kibnal_find_conn_locked (peer); + if (conn != NULL) { + CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", + conn, conn->ibc_state, conn->ibc_peer->ibp_nid, + atomic_read (&conn->ibc_refcount)); + atomic_inc (&conn->ibc_refcount); /* 1 ref for the tx */ + read_unlock (g_lock); + + kibnal_queue_tx (tx, conn); + return; + } + + /* Making one or more connections; I'll need a write lock... 
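+         * The peer and conn lookups are repeated under the write lock since
+         * another thread may have created the connection while the read lock
+         * was dropped.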
*/ + read_unlock (g_lock); + write_lock_irqsave (g_lock, flags); + + peer = kibnal_find_peer_locked (nid); + if (peer == NULL) { + write_unlock_irqrestore (g_lock, flags); + tx->tx_status = -EHOSTUNREACH; + kibnal_tx_done (tx); + return; + } + + conn = kibnal_find_conn_locked (peer); + if (conn != NULL) { + /* Connection exists; queue message on it */ + CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", + conn, conn->ibc_state, conn->ibc_peer->ibp_nid, + atomic_read (&conn->ibc_refcount)); + atomic_inc (&conn->ibc_refcount); /* 1 ref for the tx */ + write_unlock_irqrestore (g_lock, flags); + + kibnal_queue_tx (tx, conn); + return; + } + + if (peer->ibp_connecting == 0) { + if (!time_after_eq(jiffies, peer->ibp_reconnect_time)) { + write_unlock_irqrestore (g_lock, flags); + tx->tx_status = -EHOSTUNREACH; + kibnal_tx_done (tx); + return; + } + + peer->ibp_connecting = 1; + kib_peer_addref(peer); /* extra ref for connd */ + + spin_lock (&kibnal_data.kib_connd_lock); + + list_add_tail (&peer->ibp_connd_list, + &kibnal_data.kib_connd_peers); + wake_up (&kibnal_data.kib_connd_waitq); + + spin_unlock (&kibnal_data.kib_connd_lock); + } + + /* A connection is being established; queue the message... */ + list_add_tail (&tx->tx_list, &peer->ibp_tx_queue); + + write_unlock_irqrestore (g_lock, flags); +} + +static ptl_err_t +kibnal_start_passive_rdma (int type, ptl_nid_t nid, + lib_msg_t *libmsg, ptl_hdr_t *hdr) +{ + int nob = libmsg->md->length; + kib_tx_t *tx; + kib_msg_t *ibmsg; + int rc; + IB_ACCESS_CONTROL access = {0,}; + + LASSERT (type == IBNAL_MSG_PUT_RDMA || type == IBNAL_MSG_GET_RDMA); + LASSERT (nob > 0); + LASSERT (!in_interrupt()); /* Mapping could block */ + + access.s.MWBindable = 1; + access.s.LocalWrite = 1; + access.s.RdmaRead = 1; + access.s.RdmaWrite = 1; + + tx = kibnal_get_idle_tx (1); /* May block; caller is an app thread */ + LASSERT (tx != NULL); + + if ((libmsg->md->options & PTL_MD_KIOV) == 0) + rc = kibnal_map_iov (tx, access, + libmsg->md->md_niov, + libmsg->md->md_iov.iov, + 0, nob, 0); + else + rc = kibnal_map_kiov (tx, access, + libmsg->md->md_niov, + libmsg->md->md_iov.kiov, + 0, nob, 0); + + if (rc != 0) { + CERROR ("Can't map RDMA for "LPX64": %d\n", nid, rc); + goto failed; + } + + if (type == IBNAL_MSG_GET_RDMA) { + /* reply gets finalized when tx completes */ + tx->tx_libmsg[1] = lib_create_reply_msg(&kibnal_lib, + nid, libmsg); + if (tx->tx_libmsg[1] == NULL) { + CERROR ("Can't create reply for GET -> "LPX64"\n", + nid); + rc = -ENOMEM; + goto failed; + } + } + + tx->tx_passive_rdma = 1; + + ibmsg = tx->tx_msg; + + ibmsg->ibm_u.rdma.ibrm_hdr = *hdr; + ibmsg->ibm_u.rdma.ibrm_cookie = tx->tx_passive_rdma_cookie; + /* map_kiov alrady filled the rdma descs for the whole_mem case */ + if (!kibnal_whole_mem()) { + ibmsg->ibm_u.rdma.ibrm_desc[0].rd_key = tx->tx_md.md_rkey; + ibmsg->ibm_u.rdma.ibrm_desc[0].rd_addr = tx->tx_md.md_addr; + ibmsg->ibm_u.rdma.ibrm_desc[0].rd_nob = nob; + ibmsg->ibm_u.rdma.ibrm_num_descs = 1; + } + + kibnal_init_tx_msg (tx, type, + kib_rdma_msg_len(ibmsg->ibm_u.rdma.ibrm_num_descs)); + + CDEBUG(D_NET, "Passive: %p cookie "LPX64", key %x, addr " + LPX64", nob %d\n", + tx, tx->tx_passive_rdma_cookie, tx->tx_md.md_rkey, + tx->tx_md.md_addr, nob); + + /* libmsg gets finalized when tx completes. 
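+         * (kibnal_tx_done() calls lib_finalize() on tx_libmsg[0] and, for a
+         * GET, on the reply message stashed in tx_libmsg[1]).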
*/ + tx->tx_libmsg[0] = libmsg; + + kibnal_launch_tx(tx, nid); + return (PTL_OK); + + failed: + tx->tx_status = rc; + kibnal_tx_done (tx); + return (PTL_FAIL); +} + +void +kibnal_start_active_rdma (int type, int status, + kib_rx_t *rx, lib_msg_t *libmsg, + unsigned int niov, + struct iovec *iov, ptl_kiov_t *kiov, + size_t offset, size_t nob) +{ + kib_msg_t *rxmsg = rx->rx_msg; + kib_msg_t *txmsg; + kib_tx_t *tx; + IB_ACCESS_CONTROL access = {0,}; + IB_WR_OP rdma_op; + int rc; + __u32 i; + + CDEBUG(D_NET, "type %d, status %d, niov %d, offset %d, nob %d\n", + type, status, niov, offset, nob); + + /* Called by scheduler */ + LASSERT (!in_interrupt ()); + + /* Either all pages or all vaddrs */ + LASSERT (!(kiov != NULL && iov != NULL)); + + /* No data if we're completing with failure */ + LASSERT (status == 0 || nob == 0); + + LASSERT (type == IBNAL_MSG_GET_DONE || + type == IBNAL_MSG_PUT_DONE); + + /* Flag I'm completing the RDMA. Even if I fail to send the + * completion message, I will have tried my best so further + * attempts shouldn't be tried. */ + LASSERT (!rx->rx_rdma); + rx->rx_rdma = 1; + + if (type == IBNAL_MSG_GET_DONE) { + rdma_op = WROpRdmaWrite; + LASSERT (rxmsg->ibm_type == IBNAL_MSG_GET_RDMA); + } else { + access.s.LocalWrite = 1; + rdma_op = WROpRdmaRead; + LASSERT (rxmsg->ibm_type == IBNAL_MSG_PUT_RDMA); + } + + tx = kibnal_get_idle_tx (0); /* Mustn't block */ + if (tx == NULL) { + CERROR ("tx descs exhausted on RDMA from "LPX64 + " completing locally with failure\n", + rx->rx_conn->ibc_peer->ibp_nid); + lib_finalize (&kibnal_lib, NULL, libmsg, PTL_NO_SPACE); + return; + } + LASSERT (tx->tx_nsp == 0); + + if (nob == 0) + GOTO(init_tx, 0); + + /* We actually need to transfer some data (the transfer + * size could get truncated to zero when the incoming + * message is matched) */ + if (kiov != NULL) + rc = kibnal_map_kiov (tx, access, niov, kiov, offset, nob, 1); + else + rc = kibnal_map_iov (tx, access, niov, iov, offset, nob, 1); + + if (rc != 0) { + CERROR ("Can't map RDMA -> "LPX64": %d\n", + rx->rx_conn->ibc_peer->ibp_nid, rc); + /* We'll skip the RDMA and complete with failure. */ + status = rc; + nob = 0; + GOTO(init_tx, rc); + } + + if (!kibnal_whole_mem()) { + tx->tx_msg->ibm_u.rdma.ibrm_desc[0].rd_key = tx->tx_md.md_lkey; + tx->tx_msg->ibm_u.rdma.ibrm_desc[0].rd_addr = tx->tx_md.md_addr; + tx->tx_msg->ibm_u.rdma.ibrm_desc[0].rd_nob = nob; + tx->tx_msg->ibm_u.rdma.ibrm_num_descs = 1; + } + + /* XXX ugh. different page-sized hosts. */ + if (tx->tx_msg->ibm_u.rdma.ibrm_num_descs != + rxmsg->ibm_u.rdma.ibrm_num_descs) { + CERROR("tx descs (%u) != rx descs (%u)\n", + tx->tx_msg->ibm_u.rdma.ibrm_num_descs, + rxmsg->ibm_u.rdma.ibrm_num_descs); + /* We'll skip the RDMA and complete with failure. */ + status = rc; + nob = 0; + GOTO(init_tx, rc); + } + + /* map_kiov filled in the rdma descs which describe our side of the + * rdma transfer. 
*/ + /* ibrm_num_descs was verified in rx_callback */ + for(i = 0; i < rxmsg->ibm_u.rdma.ibrm_num_descs; i++) { + kib_rdma_desc_t *ldesc, *rdesc; /* local, remote */ + IB_LOCAL_DATASEGMENT *ds = &tx->tx_gl[i]; + IB_WORK_REQ *wrq = &tx->tx_wrq[i]; + + ldesc = &tx->tx_msg->ibm_u.rdma.ibrm_desc[i]; + rdesc = &rxmsg->ibm_u.rdma.ibrm_desc[i]; + + ds->Address = ldesc->rd_addr; + ds->Length = ldesc->rd_nob; + ds->Lkey = ldesc->rd_key; + + memset(wrq, 0, sizeof(*wrq)); + wrq->WorkReqId = kibnal_ptr2wreqid(tx, 0); + wrq->Operation = rdma_op; + wrq->DSList = ds; + wrq->DSListDepth = 1; + wrq->MessageLen = ds->Length; + wrq->Req.SendRC.ImmediateData = 0; + wrq->Req.SendRC.Options.s.SolicitedEvent = 0; + wrq->Req.SendRC.Options.s.SignaledCompletion = 0; + wrq->Req.SendRC.Options.s.ImmediateData = 0; + wrq->Req.SendRC.Options.s.Fence = 0; + wrq->Req.SendRC.RemoteDS.Address = rdesc->rd_addr; + wrq->Req.SendRC.RemoteDS.Rkey = rdesc->rd_key; + + /* only the last rdma post triggers tx completion */ + if (i == rxmsg->ibm_u.rdma.ibrm_num_descs - 1) + wrq->Req.SendRC.Options.s.SignaledCompletion = 1; + + tx->tx_nsp++; + } + +init_tx: + txmsg = tx->tx_msg; + + txmsg->ibm_u.completion.ibcm_cookie = rxmsg->ibm_u.rdma.ibrm_cookie; + txmsg->ibm_u.completion.ibcm_status = status; + + kibnal_init_tx_msg(tx, type, sizeof (kib_completion_msg_t)); + + if (status == 0 && nob != 0) { + LASSERT (tx->tx_nsp > 1); + /* RDMA: libmsg gets finalized when the tx completes. This + * is after the completion message has been sent, which in + * turn is after the RDMA has finished. */ + tx->tx_libmsg[0] = libmsg; + } else { + LASSERT (tx->tx_nsp == 1); + /* No RDMA: local completion happens now! */ + CDEBUG(D_WARNING,"No data: immediate completion\n"); + lib_finalize (&kibnal_lib, NULL, libmsg, + status == 0 ? PTL_OK : PTL_FAIL); + } + + /* +1 ref for this tx... */ + CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", + rx->rx_conn, rx->rx_conn->ibc_state, + rx->rx_conn->ibc_peer->ibp_nid, + atomic_read (&rx->rx_conn->ibc_refcount)); + atomic_inc (&rx->rx_conn->ibc_refcount); + /* ...and queue it up */ + kibnal_queue_tx(tx, rx->rx_conn); +} + +static ptl_err_t +kibnal_sendmsg(lib_nal_t *nal, + void *private, + lib_msg_t *libmsg, + ptl_hdr_t *hdr, + int type, + ptl_nid_t nid, + ptl_pid_t pid, + unsigned int payload_niov, + struct iovec *payload_iov, + ptl_kiov_t *payload_kiov, + size_t payload_offset, + size_t payload_nob) +{ + kib_msg_t *ibmsg; + kib_tx_t *tx; + int nob; + + /* NB 'private' is different depending on what we're sending.... */ + + CDEBUG(D_NET, "sending "LPSZ" bytes in %d frags to nid:"LPX64 + " pid %d\n", payload_nob, payload_niov, nid , pid); + + LASSERT (payload_nob == 0 || payload_niov > 0); + LASSERT (payload_niov <= PTL_MD_MAX_IOV); + + /* Thread context if we're sending payload */ + LASSERT (!in_interrupt() || payload_niov == 0); + /* payload is either all vaddrs or all pages */ + LASSERT (!(payload_kiov != NULL && payload_iov != NULL)); + + switch (type) { + default: + LBUG(); + return (PTL_FAIL); + + case PTL_MSG_REPLY: { + /* reply's 'private' is the incoming receive */ + kib_rx_t *rx = private; + + /* RDMA reply expected? */ + if (rx->rx_msg->ibm_type == IBNAL_MSG_GET_RDMA) { + kibnal_start_active_rdma(IBNAL_MSG_GET_DONE, 0, + rx, libmsg, payload_niov, + payload_iov, payload_kiov, + payload_offset, payload_nob); + return (PTL_OK); + } + + /* Incoming message consistent with immediate reply? 
*/ + if (rx->rx_msg->ibm_type != IBNAL_MSG_IMMEDIATE) { + CERROR ("REPLY to "LPX64" bad opbm type %d!!!\n", + nid, rx->rx_msg->ibm_type); + return (PTL_FAIL); + } + + /* Will it fit in a message? */ + nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]); + if (nob >= IBNAL_MSG_SIZE) { + CERROR("REPLY for "LPX64" too big (RDMA not requested): %d\n", + nid, payload_nob); + return (PTL_FAIL); + } + break; + } + + case PTL_MSG_GET: + /* might the REPLY message be big enough to need RDMA? */ + nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[libmsg->md->length]); + if (nob > IBNAL_MSG_SIZE) + return (kibnal_start_passive_rdma(IBNAL_MSG_GET_RDMA, + nid, libmsg, hdr)); + break; + + case PTL_MSG_ACK: + LASSERT (payload_nob == 0); + break; + + case PTL_MSG_PUT: + /* Is the payload big enough to need RDMA? */ + nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]); + if (nob > IBNAL_MSG_SIZE) + return (kibnal_start_passive_rdma(IBNAL_MSG_PUT_RDMA, + nid, libmsg, hdr)); + + break; + } + + tx = kibnal_get_idle_tx(!(type == PTL_MSG_ACK || + type == PTL_MSG_REPLY || + in_interrupt())); + if (tx == NULL) { + CERROR ("Can't send %d to "LPX64": tx descs exhausted%s\n", + type, nid, in_interrupt() ? " (intr)" : ""); + return (PTL_NO_SPACE); + } + + ibmsg = tx->tx_msg; + ibmsg->ibm_u.immediate.ibim_hdr = *hdr; + + if (payload_nob > 0) { + if (payload_kiov != NULL) + lib_copy_kiov2buf(ibmsg->ibm_u.immediate.ibim_payload, + payload_niov, payload_kiov, + payload_offset, payload_nob); + else + lib_copy_iov2buf(ibmsg->ibm_u.immediate.ibim_payload, + payload_niov, payload_iov, + payload_offset, payload_nob); + } + + kibnal_init_tx_msg (tx, IBNAL_MSG_IMMEDIATE, + offsetof(kib_immediate_msg_t, + ibim_payload[payload_nob])); + + /* libmsg gets finalized when tx completes */ + tx->tx_libmsg[0] = libmsg; + + kibnal_launch_tx(tx, nid); + return (PTL_OK); +} + +static ptl_err_t +kibnal_send (lib_nal_t *nal, void *private, lib_msg_t *cookie, + ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, + unsigned int payload_niov, struct iovec *payload_iov, + size_t payload_offset, size_t payload_len) +{ + return (kibnal_sendmsg(nal, private, cookie, + hdr, type, nid, pid, + payload_niov, payload_iov, NULL, + payload_offset, payload_len)); +} + +static ptl_err_t +kibnal_send_pages (lib_nal_t *nal, void *private, lib_msg_t *cookie, + ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, + unsigned int payload_niov, ptl_kiov_t *payload_kiov, + size_t payload_offset, size_t payload_len) +{ + return (kibnal_sendmsg(nal, private, cookie, + hdr, type, nid, pid, + payload_niov, NULL, payload_kiov, + payload_offset, payload_len)); +} + +static ptl_err_t +kibnal_recvmsg (lib_nal_t *nal, void *private, lib_msg_t *libmsg, + unsigned int niov, struct iovec *iov, ptl_kiov_t *kiov, + size_t offset, size_t mlen, size_t rlen) +{ + kib_rx_t *rx = private; + kib_msg_t *rxmsg = rx->rx_msg; + int msg_nob; + + LASSERT (mlen <= rlen); + LASSERT (!in_interrupt ()); + /* Either all pages or all vaddrs */ + LASSERT (!(kiov != NULL && iov != NULL)); + + switch (rxmsg->ibm_type) { + default: + LBUG(); + return (PTL_FAIL); + + case IBNAL_MSG_IMMEDIATE: + msg_nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[rlen]); + if (msg_nob > IBNAL_MSG_SIZE) { + CERROR ("Immediate message from "LPX64" too big: %d\n", + rxmsg->ibm_u.immediate.ibim_hdr.src_nid, rlen); + return (PTL_FAIL); + } + + if (kiov != NULL) + lib_copy_buf2kiov(niov, kiov, offset, + rxmsg->ibm_u.immediate.ibim_payload, + mlen); + else + 
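/*
 * A minimal illustration of the offsetof() size test used by kibnal_sendmsg()
 * and kibnal_recvmsg() above to choose between an inline IMMEDIATE message and
 * a passive RDMA.  demo_msg and DEMO_MSG_SIZE are illustrative stand-ins, not
 * names from this patch:
 *
 *   struct demo_msg {
 *           int  hdr;               // stands in for the portals/IB header
 *           char payload[0];        // analogue of ibim_payload[]
 *   };
 *
 *   // bytes on the wire if the payload were carried inline
 *   int nob = offsetof(struct demo_msg, payload[payload_nob]);
 *
 *   if (nob > DEMO_MSG_SIZE)        // too big for a pre-posted rx buffer:
 *           start_passive_rdma();   //   advertise the buffer, let the peer RDMA
 *   else
 *           send_immediate();       // otherwise copy the payload into the message
 */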
lib_copy_buf2iov(niov, iov, offset, + rxmsg->ibm_u.immediate.ibim_payload, + mlen); + + lib_finalize (nal, NULL, libmsg, PTL_OK); + return (PTL_OK); + + case IBNAL_MSG_GET_RDMA: + /* We get called here just to discard any junk after the + * GET hdr. */ + LASSERT (libmsg == NULL); + lib_finalize (nal, NULL, libmsg, PTL_OK); + return (PTL_OK); + + case IBNAL_MSG_PUT_RDMA: + kibnal_start_active_rdma (IBNAL_MSG_PUT_DONE, 0, + rx, libmsg, + niov, iov, kiov, offset, mlen); + return (PTL_OK); + } +} + +static ptl_err_t +kibnal_recv (lib_nal_t *nal, void *private, lib_msg_t *msg, + unsigned int niov, struct iovec *iov, + size_t offset, size_t mlen, size_t rlen) +{ + return (kibnal_recvmsg (nal, private, msg, niov, iov, NULL, + offset, mlen, rlen)); +} + +static ptl_err_t +kibnal_recv_pages (lib_nal_t *nal, void *private, lib_msg_t *msg, + unsigned int niov, ptl_kiov_t *kiov, + size_t offset, size_t mlen, size_t rlen) +{ + return (kibnal_recvmsg (nal, private, msg, niov, NULL, kiov, + offset, mlen, rlen)); +} + +/***************************************************************************** + * the rest of this file concerns connection management. active connections + * start with connect_peer, passive connections start with passive_callback. + * active disconnects start with conn_close, cm_callback starts passive + * disconnects and contains the guts of how the disconnect state machine + * progresses. + *****************************************************************************/ + +int +kibnal_thread_start (int (*fn)(void *arg), void *arg) +{ + long pid = kernel_thread (fn, arg, 0); + + if (pid < 0) + return ((int)pid); + + atomic_inc (&kibnal_data.kib_nthreads); + return (0); +} + +static void +kibnal_thread_fini (void) +{ + atomic_dec (&kibnal_data.kib_nthreads); +} + +/* this can be called by anyone at any time to close a connection. if + * the connection is still established it heads to the connd to start + * the disconnection in a safe context. It has no effect if called + * on a connection that is already disconnecting */ +void +kibnal_close_conn_locked (kib_conn_t *conn, int error) +{ + /* This just does the immediate housekeeping, and schedules the + * connection for the connd to finish off. + * Caller holds kib_global_lock exclusively in irq context */ + kib_peer_t *peer = conn->ibc_peer; + + KIB_ASSERT_CONN_STATE_RANGE(conn, IBNAL_CONN_CONNECTING, + IBNAL_CONN_DISCONNECTED); + + if (conn->ibc_state > IBNAL_CONN_ESTABLISHED) + return; /* already disconnecting */ + + CDEBUG (error == 0 ? D_NET : D_ERROR, + "closing conn to "LPX64": error %d\n", peer->ibp_nid, error); + + if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) { + /* kib_connd_conns takes ibc_list's ref */ + list_del (&conn->ibc_list); + } else { + /* new ref for kib_connd_conns */ + CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", + conn, conn->ibc_state, conn->ibc_peer->ibp_nid, + atomic_read (&conn->ibc_refcount)); + atomic_inc (&conn->ibc_refcount); + } + + if (list_empty (&peer->ibp_conns) && + peer->ibp_persistence == 0) { + /* Non-persistent peer with no more conns...
*/ + kibnal_unlink_peer_locked (peer); + } + + conn->ibc_state = IBNAL_CONN_SEND_DREQ; + + spin_lock (&kibnal_data.kib_connd_lock); + + list_add_tail (&conn->ibc_list, &kibnal_data.kib_connd_conns); + wake_up (&kibnal_data.kib_connd_waitq); + + spin_unlock (&kibnal_data.kib_connd_lock); +} + +void +kibnal_close_conn (kib_conn_t *conn, int error) +{ + unsigned long flags; + + write_lock_irqsave (&kibnal_data.kib_global_lock, flags); + + kibnal_close_conn_locked (conn, error); + + write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); +} + +static void +kibnal_peer_connect_failed (kib_peer_t *peer, int active, int rc) +{ + LIST_HEAD (zombies); + kib_tx_t *tx; + unsigned long flags; + + LASSERT (rc != 0); + LASSERT (peer->ibp_reconnect_interval >= IBNAL_MIN_RECONNECT_INTERVAL); + + write_lock_irqsave (&kibnal_data.kib_global_lock, flags); + + LASSERT (peer->ibp_connecting != 0); + peer->ibp_connecting--; + + if (peer->ibp_connecting != 0) { + /* another connection attempt under way (loopback?)... */ + write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); + return; + } + + if (list_empty(&peer->ibp_conns)) { + /* Say when active connection can be re-attempted */ + peer->ibp_reconnect_time = jiffies + peer->ibp_reconnect_interval; + /* Increase reconnection interval */ + peer->ibp_reconnect_interval = MIN (peer->ibp_reconnect_interval * 2, + IBNAL_MAX_RECONNECT_INTERVAL); + + /* Take peer's blocked transmits; I'll complete + * them with error */ + while (!list_empty (&peer->ibp_tx_queue)) { + tx = list_entry (peer->ibp_tx_queue.next, + kib_tx_t, tx_list); + + list_del (&tx->tx_list); + list_add_tail (&tx->tx_list, &zombies); + } + + if (kibnal_peer_active(peer) && + (peer->ibp_persistence == 0)) { + /* failed connection attempt on non-persistent peer */ + kibnal_unlink_peer_locked (peer); + } + } else { + /* Can't have blocked transmits if there are connections */ + LASSERT (list_empty(&peer->ibp_tx_queue)); + } + + write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); + + if (!list_empty (&zombies)) + CERROR ("Deleting messages for "LPX64": connection failed\n", + peer->ibp_nid); + + while (!list_empty (&zombies)) { + tx = list_entry (zombies.next, kib_tx_t, tx_list); + + list_del (&tx->tx_list); + /* complete now */ + tx->tx_status = -EHOSTUNREACH; + kibnal_tx_done (tx); + } +} + +static void +kibnal_connreq_done (kib_conn_t *conn, int active, int status) +{ + int state = conn->ibc_state; + kib_peer_t *peer = conn->ibc_peer; + kib_tx_t *tx; + unsigned long flags; + int i; + + /* passive connection has no connreq & vice versa */ + LASSERTF(!active == !(conn->ibc_connreq != NULL), + "%d %p\n", active, conn->ibc_connreq); + if (active) { + PORTAL_FREE (conn->ibc_connreq, sizeof (*conn->ibc_connreq)); + conn->ibc_connreq = NULL; + } + + write_lock_irqsave (&kibnal_data.kib_global_lock, flags); + + LASSERT (peer->ibp_connecting != 0); + + if (status == 0) { + /* connection established... */ + KIB_ASSERT_CONN_STATE(conn, IBNAL_CONN_CONNECTING); + conn->ibc_state = IBNAL_CONN_ESTABLISHED; + + if (!kibnal_peer_active(peer)) { + /* ...but peer deleted meantime */ + status = -ECONNABORTED; + } + } else { + KIB_ASSERT_CONN_STATE_RANGE(conn, IBNAL_CONN_INIT_QP, + IBNAL_CONN_CONNECTING); + } + + if (status == 0) { + /* Everything worked!
*/ + + peer->ibp_connecting--; + + /* +1 ref for ibc_list; caller(== CM)'s ref remains until + * the IB_CM_IDLE callback */ + CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", + conn, conn->ibc_state, conn->ibc_peer->ibp_nid, + atomic_read (&conn->ibc_refcount)); + atomic_inc (&conn->ibc_refcount); + list_add (&conn->ibc_list, &peer->ibp_conns); + + /* reset reconnect interval for next attempt */ + peer->ibp_reconnect_interval = IBNAL_MIN_RECONNECT_INTERVAL; + + /* post blocked sends to the new connection */ + spin_lock (&conn->ibc_lock); + + while (!list_empty (&peer->ibp_tx_queue)) { + tx = list_entry (peer->ibp_tx_queue.next, + kib_tx_t, tx_list); + + list_del (&tx->tx_list); + + /* +1 ref for each tx */ + CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", + conn, conn->ibc_state, conn->ibc_peer->ibp_nid, + atomic_read (&conn->ibc_refcount)); + atomic_inc (&conn->ibc_refcount); + kibnal_queue_tx_locked (tx, conn); + } + + spin_unlock (&conn->ibc_lock); + + /* Nuke any dangling conns from a different peer instance... */ + kibnal_close_stale_conns_locked (conn->ibc_peer, + conn->ibc_incarnation); + + write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); + + /* queue up all the receives */ + for (i = 0; i < IBNAL_RX_MSGS; i++) { + /* +1 ref for rx desc */ + CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", + conn, conn->ibc_state, conn->ibc_peer->ibp_nid, + atomic_read (&conn->ibc_refcount)); + atomic_inc (&conn->ibc_refcount); + + CDEBUG(D_NET, "RX[%d] %p->%p - "LPX64"\n", + i, &conn->ibc_rxs[i], conn->ibc_rxs[i].rx_msg, + conn->ibc_rxs[i].rx_vaddr); + + kibnal_post_rx (&conn->ibc_rxs[i], 0); + } + + kibnal_check_sends (conn); + return; + } + + /* connection failed */ + if (state == IBNAL_CONN_CONNECTING) { + /* schedule for connd to close */ + kibnal_close_conn_locked (conn, status); + } else { + /* Don't have a CM comm_id; just wait for refs to drain */ + conn->ibc_state = IBNAL_CONN_DISCONNECTED; + } + + write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); + + kibnal_peer_connect_failed (conn->ibc_peer, active, status); + + /* If we didn't establish the connection we don't have to pass + * through the disconnect protocol before dropping the CM ref */ + if (state < IBNAL_CONN_CONNECTING) + kibnal_put_conn (conn); +} + +static int +kibnal_accept (kib_conn_t **connp, IB_HANDLE *cep, + ptl_nid_t nid, __u64 incarnation, int queue_depth) +{ + kib_conn_t *conn = kibnal_create_conn(); + kib_peer_t *peer; + kib_peer_t *peer2; + unsigned long flags; + + if (conn == NULL) + return (-ENOMEM); + + if (queue_depth != IBNAL_MSG_QUEUE_SIZE) { + CERROR("Can't accept "LPX64": bad queue depth %d (%d expected)\n", + nid, queue_depth, IBNAL_MSG_QUEUE_SIZE); + atomic_dec (&conn->ibc_refcount); + kibnal_destroy_conn(conn); + return (-EPROTO); + } + + /* assume 'nid' is a new peer */ + peer = kibnal_create_peer (nid); + if (peer == NULL) { + CDEBUG(D_NET, "--conn[%p] state %d -> "LPX64" (%d)\n", + conn, conn->ibc_state, conn->ibc_peer->ibp_nid, + atomic_read (&conn->ibc_refcount)); + atomic_dec (&conn->ibc_refcount); + kibnal_destroy_conn(conn); + return (-ENOMEM); + } + + write_lock_irqsave (&kibnal_data.kib_global_lock, flags); + + peer2 = kibnal_find_peer_locked(nid); + if (peer2 == NULL) { + /* peer table takes my ref on peer */ + list_add_tail (&peer->ibp_list, kibnal_nid2peerlist(nid)); + } else { + kib_peer_decref (peer); + peer = peer2; + } + + kib_peer_addref(peer); /* +1 ref for conn */ + peer->ibp_connecting++; + + write_unlock_irqrestore 
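/*
 * A standalone sketch of the reconnect throttling implemented by
 * kibnal_peer_connect_failed() and kibnal_connreq_done() above.  The names
 * 'interval' and 'next_attempt' are illustrative; the real fields are
 * ibp_reconnect_interval and ibp_reconnect_time:
 *
 *   on connect failure with no remaining conns:
 *           peer->next_attempt = jiffies + peer->interval;
 *           peer->interval     = MIN(peer->interval * 2,
 *                                    IBNAL_MAX_RECONNECT_INTERVAL);
 *
 *   on successful connection:
 *           peer->interval     = IBNAL_MIN_RECONNECT_INTERVAL;
 *
 *   kibnal_launch_tx() refuses to start a fresh connection attempt (failing
 *   the tx with -EHOSTUNREACH) until jiffies reaches peer->next_attempt, so
 *   repeated failures back off exponentially up to the configured cap.
 */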
(&kibnal_data.kib_global_lock, flags); + + conn->ibc_peer = peer; + conn->ibc_state = IBNAL_CONN_CONNECTING; + /* conn->ibc_cep is set when cm_accept is called */ + conn->ibc_incarnation = incarnation; + conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE; + + *connp = conn; + return (0); +} + +static void kibnal_set_qp_state(IB_HANDLE *qp, IB_QP_STATE state) +{ + IB_QP_ATTRIBUTES_MODIFY modify_attr = {0,}; + FSTATUS frc; + + modify_attr.RequestState = state; + + frc = iibt_qp_modify(qp, &modify_attr, NULL); + if (frc != FSUCCESS) + CERROR("couldn't set qp state to %d, error %d\n", state, frc); +} + +static void kibnal_flush_pending(kib_conn_t *conn) +{ + LIST_HEAD (zombies); + struct list_head *tmp; + struct list_head *nxt; + kib_tx_t *tx; + unsigned long flags; + int done; + + /* NB we wait until the connection has closed before completing + * outstanding passive RDMAs so we can be sure the network can't + * touch the mapped memory any more. */ + KIB_ASSERT_CONN_STATE(conn, IBNAL_CONN_DISCONNECTED); + + /* set the QP to the error state so that we get flush callbacks + * on our posted receives which can then drop their conn refs */ + kibnal_set_qp_state(conn->ibc_qp, QPStateError); + + spin_lock_irqsave (&conn->ibc_lock, flags); + + /* grab passive RDMAs not waiting for the tx callback */ + list_for_each_safe (tmp, nxt, &conn->ibc_active_txs) { + tx = list_entry (tmp, kib_tx_t, tx_list); + + LASSERT (tx->tx_passive_rdma || + !tx->tx_passive_rdma_wait); + + LASSERT (tx->tx_passive_rdma_wait || + tx->tx_sending != 0); + + /* still waiting for tx callback? */ + if (!tx->tx_passive_rdma_wait) + continue; + + tx->tx_status = -ECONNABORTED; + tx->tx_passive_rdma_wait = 0; + done = (tx->tx_sending == 0); + + if (!done) + continue; + + list_del (&tx->tx_list); + list_add (&tx->tx_list, &zombies); + } + + /* grab all blocked transmits */ + list_for_each_safe (tmp, nxt, &conn->ibc_tx_queue) { + tx = list_entry (tmp, kib_tx_t, tx_list); + + list_del (&tx->tx_list); + list_add (&tx->tx_list, &zombies); + } + + spin_unlock_irqrestore (&conn->ibc_lock, flags); + + while (!list_empty(&zombies)) { + tx = list_entry (zombies.next, kib_tx_t, tx_list); + + list_del(&tx->tx_list); + kibnal_tx_done (tx); + } +} + +static void +kibnal_reject (IB_HANDLE cep, uint16_t reason) +{ + CM_REJECT_INFO *rej; + + PORTAL_ALLOC(rej, sizeof(*rej)); + if (rej == NULL) /* PORTAL_ALLOC() will CERROR on failure */ + return; + + rej->Reason = reason; + iibt_cm_reject(cep, rej); + PORTAL_FREE(rej, sizeof(*rej)); +} + +static FSTATUS +kibnal_qp_rts(IB_HANDLE qp_handle, __u32 qpn, __u8 resp_res, + IB_PATH_RECORD *path, __u8 init_depth, __u32 send_psn) +{ + IB_QP_ATTRIBUTES_MODIFY modify_attr; + FSTATUS frc; + ENTRY; + + modify_attr = (IB_QP_ATTRIBUTES_MODIFY) { + .RequestState = QPStateReadyToRecv, + .RecvPSN = IBNAL_STARTING_PSN, + .DestQPNumber = qpn, + .ResponderResources = resp_res, + .MinRnrTimer = UsecToRnrNakTimer(2000), /* 20 ms */ + .Attrs = (IB_QP_ATTR_RECVPSN | + IB_QP_ATTR_DESTQPNUMBER | + IB_QP_ATTR_RESPONDERRESOURCES | + IB_QP_ATTR_DESTAV | + IB_QP_ATTR_PATHMTU | + IB_QP_ATTR_MINRNRTIMER), + }; + GetAVFromPath(0, path, &modify_attr.PathMTU, NULL, + &modify_attr.DestAV); + + frc = iibt_qp_modify(qp_handle, &modify_attr, NULL); + if (frc != FSUCCESS) + RETURN(frc); + + modify_attr = (IB_QP_ATTRIBUTES_MODIFY) { + .RequestState = QPStateReadyToSend, + .FlowControl = TRUE, + .InitiatorDepth = init_depth, + .SendPSN = send_psn, + .LocalAckTimeout = path->PktLifeTime + 2, /* 2 or 1? 
*/ + .RetryCount = IBNAL_RETRY, + .RnrRetryCount = IBNAL_RNR_RETRY, + .Attrs = (IB_QP_ATTR_FLOWCONTROL | + IB_QP_ATTR_INITIATORDEPTH | + IB_QP_ATTR_SENDPSN | + IB_QP_ATTR_LOCALACKTIMEOUT | + IB_QP_ATTR_RETRYCOUNT | + IB_QP_ATTR_RNRRETRYCOUNT), + }; + + frc = iibt_qp_modify(qp_handle, &modify_attr, NULL); + RETURN(frc); +} + +static void +kibnal_connect_reply (IB_HANDLE cep, CM_CONN_INFO *info, void *arg) +{ + IB_CA_ATTRIBUTES *ca_attr = &kibnal_data.kib_hca_attrs; + kib_conn_t *conn = arg; + kib_wire_connreq_t *wcr; + CM_REPLY_INFO *rep = &info->Info.Reply; + uint16_t reason; + FSTATUS frc; + + wcr = (kib_wire_connreq_t *)info->Info.Reply.PrivateData; + + if (wcr->wcr_magic != cpu_to_le32(IBNAL_MSG_MAGIC)) { + CERROR ("Can't connect "LPX64": bad magic %08x\n", + conn->ibc_peer->ibp_nid, le32_to_cpu(wcr->wcr_magic)); + GOTO(reject, reason = RC_USER_REJ); + } + + if (wcr->wcr_version != cpu_to_le16(IBNAL_MSG_VERSION)) { + CERROR ("Can't connect "LPX64": bad version %d\n", + conn->ibc_peer->ibp_nid, le16_to_cpu(wcr->wcr_magic)); + GOTO(reject, reason = RC_USER_REJ); + } + + if (wcr->wcr_queue_depth != cpu_to_le16(IBNAL_MSG_QUEUE_SIZE)) { + CERROR ("Can't connect "LPX64": bad queue depth %d\n", + conn->ibc_peer->ibp_nid, + le16_to_cpu(wcr->wcr_queue_depth)); + GOTO(reject, reason = RC_USER_REJ); + } + + if (le64_to_cpu(wcr->wcr_nid) != conn->ibc_peer->ibp_nid) { + CERROR ("Unexpected NID "LPX64" from "LPX64"\n", + le64_to_cpu(wcr->wcr_nid), conn->ibc_peer->ibp_nid); + GOTO(reject, reason = RC_USER_REJ); + } + + CDEBUG(D_NET, "Connection %p -> "LPX64" REP_RECEIVED.\n", + conn, conn->ibc_peer->ibp_nid); + + conn->ibc_incarnation = le64_to_cpu(wcr->wcr_incarnation); + conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE; + + frc = kibnal_qp_rts(conn->ibc_qp, rep->QPN, + min_t(__u8, rep->ArbInitiatorDepth, + ca_attr->MaxQPResponderResources), + &conn->ibc_connreq->cr_path, + min_t(__u8, rep->ArbResponderResources, + ca_attr->MaxQPInitiatorDepth), + rep->StartingPSN); + if (frc != FSUCCESS) { + CERROR("Connection %p -> "LPX64" QP RTS/RTR failed: %d\n", + conn, conn->ibc_peer->ibp_nid, frc); + GOTO(reject, reason = RC_NO_QP); + } + + /* the callback arguments are ignored for an active accept */ + conn->ibc_connreq->cr_discarded.Status = FSUCCESS; + frc = iibt_cm_accept(cep, &conn->ibc_connreq->cr_discarded, + NULL, NULL, NULL, NULL); + if (frc != FCM_CONNECT_ESTABLISHED) { + CERROR("Connection %p -> "LPX64" CMAccept failed: %d\n", + conn, conn->ibc_peer->ibp_nid, frc); + kibnal_connreq_done (conn, 1, -ECONNABORTED); + /* XXX don't call reject after accept fails? */ + return; + } + + CDEBUG(D_NET, "Connection %p -> "LPX64" Established\n", + conn, conn->ibc_peer->ibp_nid); + + kibnal_connreq_done (conn, 1, 0); + return; + +reject: + kibnal_reject(cep, reason); + kibnal_connreq_done (conn, 1, -EPROTO); +} + +/* ib_cm.h has a wealth of information on the CM procedures */ +static void +kibnal_cm_callback(IB_HANDLE cep, CM_CONN_INFO *info, void *arg) +{ + kib_conn_t *conn = arg; + + CDEBUG(D_NET, "status 0x%x\n", info->Status); + + /* Established Connection Notifier */ + switch (info->Status) { + default: + CERROR("unknown status %d on Connection %p -> "LPX64"\n", + info->Status, conn, conn->ibc_peer->ibp_nid); + LBUG(); + break; + + case FCM_CONNECT_REPLY: + kibnal_connect_reply(cep, info, arg); + break; + + case FCM_DISCONNECT_REQUEST: + /* XXX lock around these state management bits? 
*/ + if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) + kibnal_close_conn (conn, 0); + conn->ibc_state = IBNAL_CONN_DREP; + iibt_cm_disconnect(conn->ibc_cep, NULL, NULL); + break; + + /* these both guarantee that no more cm callbacks will occur */ + case FCM_DISCONNECTED: /* aka FCM_DISCONNECT_TIMEOUT */ + case FCM_DISCONNECT_REPLY: + CDEBUG(D_NET, "Connection %p -> "LPX64" disconnect done.\n", + conn, conn->ibc_peer->ibp_nid); + + conn->ibc_state = IBNAL_CONN_DISCONNECTED; + kibnal_flush_pending(conn); + kibnal_put_conn(conn); /* Lose CM's ref */ + break; + } + + return; +} + +static int +kibnal_set_cm_flags(IB_HANDLE cep) +{ + FSTATUS frc; + uint32 value = 1; + + frc = iibt_cm_modify_cep(cep, CM_FLAG_TIMEWAIT_CALLBACK, + (char *)&value, sizeof(value), 0); + if (frc != FSUCCESS) { + CERROR("error setting timeout callback: %d\n", frc); + return -1; + } + +#if 0 + frc = iibt_cm_modify_cep(cep, CM_FLAG_ASYNC_ACCEPT, (char *)&value, + sizeof(value), 0); + if (frc != FSUCCESS) { + CERROR("error setting async accept: %d\n", frc); + return -1; + } +#endif + + return 0; +} + +void +kibnal_listen_callback(IB_HANDLE cep, CM_CONN_INFO *info, void *arg) +{ + IB_CA_ATTRIBUTES *ca_attr = &kibnal_data.kib_hca_attrs; + IB_QP_ATTRIBUTES_QUERY *query; + CM_REQUEST_INFO *req; + CM_CONN_INFO *rep = NULL, *rcv = NULL; + kib_wire_connreq_t *wcr; + kib_conn_t *conn = NULL; + uint16_t reason = 0; + FSTATUS frc; + int rc = 0; + + LASSERT(cep); + LASSERT(info); + LASSERT(arg == NULL); /* no conn yet for passive */ + + CDEBUG(D_NET, "status 0x%x\n", info->Status); + + req = &info->Info.Request; + wcr = (kib_wire_connreq_t *)req->PrivateData; + + CDEBUG(D_NET, "%d from "LPX64"\n", info->Status, + le64_to_cpu(wcr->wcr_nid)); + + if (info->Status == FCM_CONNECT_CANCEL) + return; + + LASSERT (info->Status == FCM_CONNECT_REQUEST); + + if (wcr->wcr_magic != cpu_to_le32(IBNAL_MSG_MAGIC)) { + CERROR ("Can't accept: bad magic %08x\n", + le32_to_cpu(wcr->wcr_magic)); + GOTO(out, reason = RC_USER_REJ); + } + + if (wcr->wcr_version != cpu_to_le16(IBNAL_MSG_VERSION)) { + CERROR ("Can't accept: bad version %d\n", + le16_to_cpu(wcr->wcr_magic)); + GOTO(out, reason = RC_USER_REJ); + } + + rc = kibnal_accept(&conn, cep, + le64_to_cpu(wcr->wcr_nid), + le64_to_cpu(wcr->wcr_incarnation), + le16_to_cpu(wcr->wcr_queue_depth)); + if (rc != 0) { + CERROR ("Can't accept "LPX64": %d\n", + le64_to_cpu(wcr->wcr_nid), rc); + GOTO(out, reason = RC_NO_RESOURCES); + } + + frc = kibnal_qp_rts(conn->ibc_qp, req->CEPInfo.QPN, + min_t(__u8, req->CEPInfo.OfferedInitiatorDepth, + ca_attr->MaxQPResponderResources), + &req->PathInfo.Path, + min_t(__u8, req->CEPInfo.OfferedResponderResources, + ca_attr->MaxQPInitiatorDepth), + req->CEPInfo.StartingPSN); + + if (frc != FSUCCESS) { + CERROR ("Can't mark QP RTS/RTR "LPX64": %d\n", + le64_to_cpu(wcr->wcr_nid), frc); + GOTO(out, reason = RC_NO_QP); + } + + frc = iibt_qp_query(conn->ibc_qp, &conn->ibc_qp_attrs, NULL); + if (frc != FSUCCESS) { + CERROR ("Couldn't query qp attributes "LPX64": %d\n", + le64_to_cpu(wcr->wcr_nid), frc); + GOTO(out, reason = RC_NO_QP); + } + query = &conn->ibc_qp_attrs; + + PORTAL_ALLOC(rep, sizeof(*rep)); + PORTAL_ALLOC(rcv, sizeof(*rcv)); + if (rep == NULL || rcv == NULL) { + CERROR ("can't reply and receive buffers\n"); + GOTO(out, reason = RC_INSUFFICIENT_RESP_RES); + } + + /* don't try to deref this into the incoming wcr :) */ + wcr = (kib_wire_connreq_t *)rep->Info.Reply.PrivateData; + + rep->Info.Reply = (CM_REPLY_INFO) { + .QPN = query->QPNumber, + .QKey = query->Qkey, + 
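/*
 * Both sides of connection establishment validate the little-endian
 * connection request carried in the CM private data: kibnal_connect_reply()
 * checks the passive side's reply, and kibnal_listen_callback() checks the
 * active side's request.  A sketch of that check, assuming a wire struct
 * laid out like the fields used in this file (the real kib_wire_connreq_t
 * is defined in the NAL header, not in this hunk):
 *
 *   struct wire_connreq {
 *           __u32 wcr_magic;        // cpu_to_le32(IBNAL_MSG_MAGIC)
 *           __u16 wcr_version;      // cpu_to_le16(IBNAL_MSG_VERSION)
 *           __u16 wcr_queue_depth;  // cpu_to_le16(IBNAL_MSG_QUEUE_SIZE)
 *           __u64 wcr_nid;          // little-endian NID
 *           __u64 wcr_incarnation;  // little-endian incarnation stamp
 *   };
 *
 *   if (wcr->wcr_magic       != cpu_to_le32(IBNAL_MSG_MAGIC) ||
 *       wcr->wcr_version     != cpu_to_le16(IBNAL_MSG_VERSION) ||
 *       wcr->wcr_queue_depth != cpu_to_le16(IBNAL_MSG_QUEUE_SIZE))
 *           reject(cep, reason);    // RC_USER_REJ etc., as in the callers here
 */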
.StartingPSN = query->RecvPSN, + .EndToEndFlowControl = query->FlowControl, + /* XXX Hmm. */ + .ArbInitiatorDepth = query->InitiatorDepth, + .ArbResponderResources = query->ResponderResources, + .TargetAckDelay = 0, + .FailoverAccepted = 0, + .RnRRetryCount = req->CEPInfo.RnrRetryCount, + }; + + *wcr = (kib_wire_connreq_t) { + .wcr_magic = cpu_to_le32(IBNAL_MSG_MAGIC), + .wcr_version = cpu_to_le16(IBNAL_MSG_VERSION), + .wcr_queue_depth = cpu_to_le32(IBNAL_MSG_QUEUE_SIZE), + .wcr_nid = cpu_to_le64(kibnal_data.kib_nid), + .wcr_incarnation = cpu_to_le64(kibnal_data.kib_incarnation), + }; + + frc = iibt_cm_accept(cep, rep, rcv, kibnal_cm_callback, conn, + &conn->ibc_cep); + + PORTAL_FREE(rep, sizeof(*rep)); + PORTAL_FREE(rcv, sizeof(*rcv)); + + if (frc != FCM_CONNECT_ESTABLISHED) { + /* XXX it seems we don't call reject after this point? */ + CERROR("iibt_cm_accept() failed: %d, aborting\n", frc); + rc = -ECONNABORTED; + goto out; + } + + if (kibnal_set_cm_flags(conn->ibc_cep)) { + rc = -ECONNABORTED; + goto out; + } + + CDEBUG(D_WARNING, "Connection %p -> "LPX64" ESTABLISHED.\n", + conn, conn->ibc_peer->ibp_nid); + +out: + if (reason) { + kibnal_reject(cep, reason); + rc = -ECONNABORTED; + } + if (conn != NULL) + kibnal_connreq_done(conn, 0, rc); + + return; +} + +static void +dump_path_records(PATH_RESULTS *results) +{ + IB_PATH_RECORD *path; + int i; + + for(i = 0; i < results->NumPathRecords; i++) { + path = &results->PathRecords[i]; + CDEBUG(D_NET, "%d: sgid "LPX64":"LPX64" dgid " + LPX64":"LPX64" pkey %x\n", + i, + path->SGID.Type.Global.SubnetPrefix, + path->SGID.Type.Global.InterfaceID, + path->DGID.Type.Global.SubnetPrefix, + path->DGID.Type.Global.InterfaceID, + path->P_Key); + } +} + +static void +kibnal_pathreq_callback (void *arg, QUERY *query, + QUERY_RESULT_VALUES *query_res) +{ + IB_CA_ATTRIBUTES *ca_attr = &kibnal_data.kib_hca_attrs; + kib_conn_t *conn = arg; + PATH_RESULTS *path; + FSTATUS frc; + + if (query_res->Status != FSUCCESS || query_res->ResultDataSize == 0) { + CERROR ("status %d data size %d\n", query_res->Status, + query_res->ResultDataSize); + kibnal_connreq_done (conn, 1, -EINVAL); + return; + } + + path = (PATH_RESULTS *)query_res->QueryResult; + + if (path->NumPathRecords < 1) { + CERROR ("expected path records: %d\n", path->NumPathRecords); + kibnal_connreq_done (conn, 1, -EINVAL); + return; + } + + dump_path_records(path); + + /* just using the first. this is probably a horrible idea. 
*/ + conn->ibc_connreq->cr_path = path->PathRecords[0]; + + conn->ibc_cep = iibt_cm_create_cep(CM_RC_TYPE); + if (conn->ibc_cep == NULL) { + CERROR ("Can't create CEP\n"); + kibnal_connreq_done (conn, 1, -EINVAL); + return; + } + + if (kibnal_set_cm_flags(conn->ibc_cep)) { + kibnal_connreq_done (conn, 1, -EINVAL); + return; + } + + conn->ibc_connreq->cr_wcr = (kib_wire_connreq_t) { + .wcr_magic = cpu_to_le32(IBNAL_MSG_MAGIC), + .wcr_version = cpu_to_le16(IBNAL_MSG_VERSION), + .wcr_queue_depth = cpu_to_le16(IBNAL_MSG_QUEUE_SIZE), + .wcr_nid = cpu_to_le64(kibnal_data.kib_nid), + .wcr_incarnation = cpu_to_le64(kibnal_data.kib_incarnation), + }; + + conn->ibc_connreq->cr_cmreq = (CM_REQUEST_INFO) { + .SID = conn->ibc_connreq->cr_service.RID.ServiceID, + .CEPInfo = (CM_CEP_INFO) { + .CaGUID = kibnal_data.kib_hca_guids[0], + .EndToEndFlowControl = FALSE, + .PortGUID = conn->ibc_connreq->cr_path.SGID.Type.Global.InterfaceID, + .RetryCount = IBNAL_RETRY, + .RnrRetryCount = IBNAL_RNR_RETRY, + .AckTimeout = IBNAL_ACK_TIMEOUT, + .StartingPSN = IBNAL_STARTING_PSN, + .QPN = conn->ibc_qp_attrs.QPNumber, + .QKey = conn->ibc_qp_attrs.Qkey, + .OfferedResponderResources = ca_attr->MaxQPResponderResources, + .OfferedInitiatorDepth = ca_attr->MaxQPInitiatorDepth, + }, + .PathInfo = (CM_CEP_PATHINFO) { + .bSubnetLocal = TRUE, + .Path = conn->ibc_connreq->cr_path, + }, + }; + +#if 0 + /* XXX set timeout just like SDP!!!*/ + conn->ibc_connreq->cr_path.packet_life = 13; +#endif + /* Flag I'm getting involved with the CM... */ + conn->ibc_state = IBNAL_CONN_CONNECTING; + + CDEBUG(D_NET, "Connecting to, service id "LPX64", on "LPX64"\n", + conn->ibc_connreq->cr_service.RID.ServiceID, + *kibnal_service_nid_field(&conn->ibc_connreq->cr_service)); + + memset(conn->ibc_connreq->cr_cmreq.PrivateData, 0, + CM_REQUEST_INFO_USER_LEN); + memcpy(conn->ibc_connreq->cr_cmreq.PrivateData, + &conn->ibc_connreq->cr_wcr, sizeof(conn->ibc_connreq->cr_wcr)); + + /* kibnal_cm_callback gets my conn ref */ + frc = iibt_cm_connect(conn->ibc_cep, &conn->ibc_connreq->cr_cmreq, + kibnal_cm_callback, conn); + if (frc != FPENDING && frc != FSUCCESS) { + CERROR ("Connect: %d\n", frc); + /* Back out state change as connect failed */ + conn->ibc_state = IBNAL_CONN_INIT_QP; + kibnal_connreq_done (conn, 1, -EINVAL); + } +} + +static void +dump_service_records(SERVICE_RECORD_RESULTS *results) +{ + IB_SERVICE_RECORD *svc; + int i; + + for(i = 0; i < results->NumServiceRecords; i++) { + svc = &results->ServiceRecords[i]; + CDEBUG(D_NET, "%d: sid "LPX64" gid "LPX64":"LPX64" pkey %x\n", + i, + svc->RID.ServiceID, + svc->RID.ServiceGID.Type.Global.SubnetPrefix, + svc->RID.ServiceGID.Type.Global.InterfaceID, + svc->RID.ServiceP_Key); + } +} + + +static void +kibnal_service_get_callback (void *arg, QUERY *query, + QUERY_RESULT_VALUES *query_res) +{ + kib_conn_t *conn = arg; + SERVICE_RECORD_RESULTS *svc; + COMMAND_CONTROL_PARAMETERS sd_params; + QUERY path_query; + FSTATUS frc; + + if (query_res->Status != FSUCCESS || query_res->ResultDataSize == 0) { + CERROR ("status %d data size %d\n", query_res->Status, + query_res->ResultDataSize); + kibnal_connreq_done (conn, 1, -EINVAL); + return; + } + + svc = (SERVICE_RECORD_RESULTS *)query_res->QueryResult; + + if (svc->NumServiceRecords < 1) { + CERROR ("%d service records\n", svc->NumServiceRecords); + kibnal_connreq_done (conn, 1, -EINVAL); + return; + } + + dump_service_records(svc); + + conn->ibc_connreq->cr_service = svc->ServiceRecords[0]; + + CDEBUG(D_NET, "Got status %d, service id "LPX64", on 
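/*
 * Road map of the active-connection path visible in this file; every step is
 * asynchronous and runs in the previous step's completion callback:
 *
 *   kibnal_connect_peer()
 *     -> iibt_sd_query_port_fabric_information()   service record query
 *   kibnal_service_get_callback()                  picks ServiceRecords[0]
 *     -> iibt_sd_query_port_fabric_information()   path record query
 *   kibnal_pathreq_callback()                      picks PathRecords[0]
 *     -> iibt_cm_create_cep() + iibt_cm_connect()  sends the CM request
 *   kibnal_cm_callback() / kibnal_connect_reply()  REP received, QP -> RTS
 *     -> kibnal_connreq_done()                     conn established or failed
 */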
"LPX64"\n", + query_res->Status , conn->ibc_connreq->cr_service.RID.ServiceID, + *kibnal_service_nid_field(&conn->ibc_connreq->cr_service)); + + memset(&path_query, 0, sizeof(path_query)); + path_query.InputType = InputTypePortGuidPair; + path_query.OutputType = OutputTypePathRecord; + path_query.InputValue.PortGuidPair.SourcePortGuid = kibnal_data.kib_port_guid; + path_query.InputValue.PortGuidPair.DestPortGuid = conn->ibc_connreq->cr_service.RID.ServiceGID.Type.Global.InterfaceID; + + memset(&sd_params, 0, sizeof(sd_params)); + sd_params.RetryCount = IBNAL_RETRY; + sd_params.Timeout = 10 * 1000; /* wait 10 seconds */ + + /* kibnal_service_get_callback gets my conn ref */ + + frc = iibt_sd_query_port_fabric_information(kibnal_data.kib_sd, + kibnal_data.kib_port_guid, + &path_query, + kibnal_pathreq_callback, + &sd_params, conn); + if (frc == FPENDING) + return; + + CERROR ("Path record request failed: %d\n", frc); + kibnal_connreq_done (conn, 1, -EINVAL); +} + +static void +kibnal_connect_peer (kib_peer_t *peer) +{ + COMMAND_CONTROL_PARAMETERS sd_params; + QUERY query; + FSTATUS frc; + kib_conn_t *conn = kibnal_create_conn(); + + LASSERT (peer->ibp_connecting != 0); + + if (conn == NULL) { + CERROR ("Can't allocate conn\n"); + kibnal_peer_connect_failed (peer, 1, -ENOMEM); + return; + } + + conn->ibc_peer = peer; + kib_peer_addref(peer); + + PORTAL_ALLOC (conn->ibc_connreq, sizeof (*conn->ibc_connreq)); + if (conn->ibc_connreq == NULL) { + CERROR ("Can't allocate connreq\n"); + kibnal_connreq_done (conn, 1, -ENOMEM); + return; + } + + memset(conn->ibc_connreq, 0, sizeof (*conn->ibc_connreq)); + + kibnal_set_service_keys(&conn->ibc_connreq->cr_service, peer->ibp_nid); + + memset(&query, 0, sizeof(query)); + query.InputType = InputTypeServiceRecord; + query.OutputType = OutputTypeServiceRecord; + query.InputValue.ServiceRecordValue.ServiceRecord = conn->ibc_connreq->cr_service; + query.InputValue.ServiceRecordValue.ComponentMask = KIBNAL_SERVICE_KEY_MASK; + + memset(&sd_params, 0, sizeof(sd_params)); + sd_params.RetryCount = IBNAL_RETRY; + sd_params.Timeout = 10 * 1000; /* wait 10 seconds */ + + /* kibnal_service_get_callback gets my conn ref */ + frc = iibt_sd_query_port_fabric_information(kibnal_data.kib_sd, + kibnal_data.kib_port_guid, + &query, + kibnal_service_get_callback, + &sd_params, conn); + if (frc == FPENDING) + return; + + CERROR ("iibt_sd_query_port_fabric_information(): %d\n", frc); + kibnal_connreq_done (conn, 1, frc); +} + +static int +kibnal_conn_timed_out (kib_conn_t *conn) +{ + kib_tx_t *tx; + struct list_head *ttmp; + unsigned long flags; + + spin_lock_irqsave (&conn->ibc_lock, flags); + + list_for_each (ttmp, &conn->ibc_tx_queue) { + tx = list_entry (ttmp, kib_tx_t, tx_list); + + LASSERT (!tx->tx_passive_rdma_wait); + LASSERT (tx->tx_sending == 0); + + if (time_after_eq (jiffies, tx->tx_deadline)) { + spin_unlock_irqrestore (&conn->ibc_lock, flags); + return 1; + } + } + + list_for_each (ttmp, &conn->ibc_active_txs) { + tx = list_entry (ttmp, kib_tx_t, tx_list); + + LASSERT (tx->tx_passive_rdma || + !tx->tx_passive_rdma_wait); + + LASSERT (tx->tx_passive_rdma_wait || + tx->tx_sending != 0); + + if (time_after_eq (jiffies, tx->tx_deadline)) { + spin_unlock_irqrestore (&conn->ibc_lock, flags); + return 1; + } + } + + spin_unlock_irqrestore (&conn->ibc_lock, flags); + + return 0; +} + +static void +kibnal_check_conns (int idx) +{ + struct list_head *peers = &kibnal_data.kib_peers[idx]; + struct list_head *ptmp; + kib_peer_t *peer; + kib_conn_t *conn; + struct list_head 
*ctmp; + + again: + /* NB. We expect to have a look at all the peers and not find any + * rdmas to time out, so we just use a shared lock while we + * take a look... */ + read_lock (&kibnal_data.kib_global_lock); + + list_for_each (ptmp, peers) { + peer = list_entry (ptmp, kib_peer_t, ibp_list); + + list_for_each (ctmp, &peer->ibp_conns) { + conn = list_entry (ctmp, kib_conn_t, ibc_list); + + KIB_ASSERT_CONN_STATE(conn, IBNAL_CONN_ESTABLISHED); + + /* In case we have enough credits to return via a + * NOOP, but there were no non-blocking tx descs + * free to do it last time... */ + kibnal_check_sends(conn); + + if (!kibnal_conn_timed_out(conn)) + continue; + + CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", + conn, conn->ibc_state, peer->ibp_nid, + atomic_read (&conn->ibc_refcount)); + + atomic_inc (&conn->ibc_refcount); + read_unlock (&kibnal_data.kib_global_lock); + + CERROR("Timed out RDMA with "LPX64"\n", + peer->ibp_nid); + + kibnal_close_conn (conn, -ETIMEDOUT); + kibnal_put_conn (conn); + + /* start again now I've dropped the lock */ + goto again; + } + } + + read_unlock (&kibnal_data.kib_global_lock); +} + +static void +kib_connd_handle_state(kib_conn_t *conn) +{ + FSTATUS frc; + + switch (conn->ibc_state) { + /* all refs have gone, free and be done with it */ + case IBNAL_CONN_DISCONNECTED: + kibnal_destroy_conn (conn); + return; /* avoid put_conn */ + + case IBNAL_CONN_SEND_DREQ: + frc = iibt_cm_disconnect(conn->ibc_cep, NULL, NULL); + if (frc != FSUCCESS) /* XXX do real things */ + CERROR("disconnect failed: %d\n", frc); + conn->ibc_state = IBNAL_CONN_DREQ; + break; + + /* a callback got to the conn before we did */ + case IBNAL_CONN_DREP: + break; + + default: + CERROR ("Bad conn %p state: %d\n", conn, + conn->ibc_state); + LBUG(); + break; + } + + /* drop ref from close_conn */ + kibnal_put_conn(conn); +} + +int +kibnal_connd (void *arg) +{ + wait_queue_t wait; + unsigned long flags; + kib_conn_t *conn; + kib_peer_t *peer; + int timeout; + int i; + int peer_index = 0; + unsigned long deadline = jiffies; + + kportal_daemonize ("kibnal_connd"); + kportal_blockallsigs (); + + init_waitqueue_entry (&wait, current); + + spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags); + + for (;;) { + if (!list_empty (&kibnal_data.kib_connd_conns)) { + conn = list_entry (kibnal_data.kib_connd_conns.next, + kib_conn_t, ibc_list); + list_del (&conn->ibc_list); + + spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags); + kib_connd_handle_state(conn); + + spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags); + continue; + } + + if (!list_empty (&kibnal_data.kib_connd_peers)) { + peer = list_entry (kibnal_data.kib_connd_peers.next, + kib_peer_t, ibp_connd_list); + + list_del_init (&peer->ibp_connd_list); + spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags); + + kibnal_connect_peer (peer); + kib_peer_decref (peer); + + spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags); + } + + /* shut down and nobody left to reap... */ + if (kibnal_data.kib_shutdown && + atomic_read(&kibnal_data.kib_nconns) == 0) + break; + + spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags); + + /* careful with the jiffy wrap... 
*/ + while ((timeout = (int)(deadline - jiffies)) <= 0) { + const int n = 4; + const int p = 1; + int chunk = kibnal_data.kib_peer_hash_size; + + /* Time to check for RDMA timeouts on a few more + * peers: I do checks every 'p' seconds on a + * proportion of the peer table and I need to check + * every connection 'n' times within a timeout + * interval, to ensure I detect a timeout on any + * connection within (n+1)/n times the timeout + * interval. */ + + if (kibnal_tunables.kib_io_timeout > n * p) + chunk = (chunk * n * p) / + kibnal_tunables.kib_io_timeout; + if (chunk == 0) + chunk = 1; + + for (i = 0; i < chunk; i++) { + kibnal_check_conns (peer_index); + peer_index = (peer_index + 1) % + kibnal_data.kib_peer_hash_size; + } + + deadline += p * HZ; + } + + kibnal_data.kib_connd_waketime = jiffies + timeout; + + set_current_state (TASK_INTERRUPTIBLE); + add_wait_queue (&kibnal_data.kib_connd_waitq, &wait); + + if (!kibnal_data.kib_shutdown && + list_empty (&kibnal_data.kib_connd_conns) && + list_empty (&kibnal_data.kib_connd_peers)) + schedule_timeout (timeout); + + set_current_state (TASK_RUNNING); + remove_wait_queue (&kibnal_data.kib_connd_waitq, &wait); + + spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags); + } + + spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags); + + kibnal_thread_fini (); + return (0); +} + +int +kibnal_scheduler(void *arg) +{ + long id = (long)arg; + char name[16]; + kib_rx_t *rx; + kib_tx_t *tx; + unsigned long flags; + int rc; + int counter = 0; + int did_something; + + snprintf(name, sizeof(name), "kibnal_sd_%02ld", id); + kportal_daemonize(name); + kportal_blockallsigs(); + + spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags); + + for (;;) { + did_something = 0; + + while (!list_empty(&kibnal_data.kib_sched_txq)) { + tx = list_entry(kibnal_data.kib_sched_txq.next, + kib_tx_t, tx_list); + list_del(&tx->tx_list); + spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, + flags); + kibnal_tx_done(tx); + + spin_lock_irqsave(&kibnal_data.kib_sched_lock, + flags); + } + + if (!list_empty(&kibnal_data.kib_sched_rxq)) { + rx = list_entry(kibnal_data.kib_sched_rxq.next, + kib_rx_t, rx_list); + list_del(&rx->rx_list); + spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, + flags); + + kibnal_rx(rx); + + did_something = 1; + spin_lock_irqsave(&kibnal_data.kib_sched_lock, + flags); + } + + /* shut down and no receives to complete... 
*/ + if (kibnal_data.kib_shutdown && + atomic_read(&kibnal_data.kib_nconns) == 0) + break; + + /* nothing to do or hogging CPU */ + if (!did_something || counter++ == IBNAL_RESCHED) { + spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, + flags); + counter = 0; + + if (!did_something) { + rc = wait_event_interruptible( + kibnal_data.kib_sched_waitq, + !list_empty(&kibnal_data.kib_sched_txq) || + !list_empty(&kibnal_data.kib_sched_rxq) || + (kibnal_data.kib_shutdown && + atomic_read (&kibnal_data.kib_nconns) == 0)); + } else { + our_cond_resched(); + } + + spin_lock_irqsave(&kibnal_data.kib_sched_lock, + flags); + } + } + + spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags); + + kibnal_thread_fini(); + return (0); +} + + +lib_nal_t kibnal_lib = { + libnal_data: &kibnal_data, /* NAL private data */ + libnal_send: kibnal_send, + libnal_send_pages: kibnal_send_pages, + libnal_recv: kibnal_recv, + libnal_recv_pages: kibnal_recv_pages, + libnal_dist: kibnal_dist +}; diff --git a/lustre/portals/knals/openibnal/.cvsignore b/lustre/portals/knals/openibnal/.cvsignore new file mode 100644 index 0000000..5ed596b --- /dev/null +++ b/lustre/portals/knals/openibnal/.cvsignore @@ -0,0 +1,10 @@ +.deps +Makefile +.*.cmd +autoMakefile.in +autoMakefile +*.ko +*.mod.c +.*.flags +.tmp_versions +.depend diff --git a/lustre/portals/knals/openibnal/openibnal.c b/lustre/portals/knals/openibnal/openibnal.c index 6f66143..652eb34 100644 --- a/lustre/portals/knals/openibnal/openibnal.c +++ b/lustre/portals/knals/openibnal/openibnal.c @@ -23,26 +23,25 @@ #include "openibnal.h" -nal_t koibnal_api; -ptl_handle_ni_t koibnal_ni; -koib_data_t koibnal_data; -koib_tunables_t koibnal_tunables; +nal_t kibnal_api; +ptl_handle_ni_t kibnal_ni; +kib_data_t kibnal_data; +kib_tunables_t kibnal_tunables; #ifdef CONFIG_SYSCTL -#define OPENIBNAL_SYSCTL 202 +#define IBNAL_SYSCTL 202 -#define OPENIBNAL_SYSCTL_TIMEOUT 1 -#define OPENIBNAL_SYSCTL_ZERO_COPY 2 +#define IBNAL_SYSCTL_TIMEOUT 1 -static ctl_table koibnal_ctl_table[] = { - {OPENIBNAL_SYSCTL_TIMEOUT, "timeout", - &koibnal_tunables.koib_io_timeout, sizeof (int), +static ctl_table kibnal_ctl_table[] = { + {IBNAL_SYSCTL_TIMEOUT, "timeout", + &kibnal_tunables.kib_io_timeout, sizeof (int), 0644, NULL, &proc_dointvec}, { 0 } }; -static ctl_table koibnal_top_ctl_table[] = { - {OPENIBNAL_SYSCTL, "openibnal", NULL, 0, 0555, koibnal_ctl_table}, +static ctl_table kibnal_top_ctl_table[] = { + {IBNAL_SYSCTL, "openibnal", NULL, 0, 0555, kibnal_ctl_table}, { 0 } }; #endif @@ -66,167 +65,183 @@ print_service(struct ib_common_attrib_service *service, char *tag, int rc) "service id: "LPX64"\n" "name : %s\n" "NID : "LPX64"\n", tag, rc, - service->service_id, name, service->service_data64[0]); + service->service_id, name, + *kibnal_service_nid_field(service)); } void -koibnal_service_setunset_done (tTS_IB_CLIENT_QUERY_TID tid, int status, +kibnal_service_setunset_done (tTS_IB_CLIENT_QUERY_TID tid, int status, struct ib_common_attrib_service *service, void *arg) { *(int *)arg = status; - up (&koibnal_data.koib_nid_signal); + up (&kibnal_data.kib_nid_signal); } +#if IBNAL_CHECK_ADVERT +void +kibnal_check_advert (void) +{ + struct ib_common_attrib_service *svc; + __u64 tid; + int rc; + int rc2; + + PORTAL_ALLOC(svc, sizeof(*svc)); + if (svc == NULL) + return; + + memset (svc, 0, sizeof (*svc)); + kibnal_set_service_keys(svc, kibnal_data.kib_nid); + + rc = ib_service_get (kibnal_data.kib_device, + kibnal_data.kib_port, + svc, + KIBNAL_SERVICE_KEY_MASK, + kibnal_tunables.kib_io_timeout * HZ, + 
kibnal_service_setunset_done, &rc2, + &tid); + + if (rc != 0) { + CERROR ("Immediate error %d checking SM service\n", rc); + } else { + down (&kibnal_data.kib_nid_signal); + rc = rc2; + + if (rc != 0) + CERROR ("Error %d checking SM service\n", rc); + } + + PORTAL_FREE(svc, sizeof(*svc)); +} +#endif + int -koibnal_advertise (void) +kibnal_advertise (void) { + struct ib_common_attrib_service *svc; __u64 tid; int rc; int rc2; - LASSERT (koibnal_data.koib_nid != PTL_NID_ANY); + LASSERT (kibnal_data.kib_nid != PTL_NID_ANY); + + PORTAL_ALLOC(svc, sizeof(*svc)); + if (svc == NULL) + return (-ENOMEM); - memset (&koibnal_data.koib_service, 0, - sizeof (koibnal_data.koib_service)); + memset (svc, 0, sizeof (*svc)); - koibnal_data.koib_service.service_id - = koibnal_data.koib_cm_service_id; + svc->service_id = kibnal_data.kib_service_id; - rc = ib_cached_gid_get(koibnal_data.koib_device, - koibnal_data.koib_port, + rc = ib_cached_gid_get(kibnal_data.kib_device, + kibnal_data.kib_port, 0, - koibnal_data.koib_service.service_gid); + svc->service_gid); if (rc != 0) { CERROR ("Can't get port %d GID: %d\n", - koibnal_data.koib_port, rc); - return (rc); + kibnal_data.kib_port, rc); + goto out; } - rc = ib_cached_pkey_get(koibnal_data.koib_device, - koibnal_data.koib_port, + rc = ib_cached_pkey_get(kibnal_data.kib_device, + kibnal_data.kib_port, 0, - &koibnal_data.koib_service.service_pkey); + &svc->service_pkey); if (rc != 0) { CERROR ("Can't get port %d PKEY: %d\n", - koibnal_data.koib_port, rc); - return (rc); + kibnal_data.kib_port, rc); + goto out; } - koibnal_data.koib_service.service_lease = 0xffffffff; + svc->service_lease = 0xffffffff; - koibnal_set_service_keys(&koibnal_data.koib_service, koibnal_data.koib_nid); + kibnal_set_service_keys(svc, kibnal_data.kib_nid); CDEBUG(D_NET, "Advertising service id "LPX64" %s:"LPX64"\n", - koibnal_data.koib_service.service_id, - koibnal_data.koib_service.service_name, - *koibnal_service_nid_field(&koibnal_data.koib_service)); + svc->service_id, + svc->service_name, *kibnal_service_nid_field(svc)); - rc = ib_service_set (koibnal_data.koib_device, - koibnal_data.koib_port, - &koibnal_data.koib_service, + rc = ib_service_set (kibnal_data.kib_device, + kibnal_data.kib_port, + svc, IB_SA_SERVICE_COMP_MASK_ID | IB_SA_SERVICE_COMP_MASK_GID | IB_SA_SERVICE_COMP_MASK_PKEY | IB_SA_SERVICE_COMP_MASK_LEASE | - KOIBNAL_SERVICE_KEY_MASK, - koibnal_tunables.koib_io_timeout * HZ, - koibnal_service_setunset_done, &rc2, &tid); + KIBNAL_SERVICE_KEY_MASK, + kibnal_tunables.kib_io_timeout * HZ, + kibnal_service_setunset_done, &rc2, &tid); - if (rc == 0) { - down (&koibnal_data.koib_nid_signal); - rc = rc2; + if (rc != 0) { + CERROR ("Immediate error %d advertising NID "LPX64"\n", + rc, kibnal_data.kib_nid); + goto out; } - - if (rc != 0) - CERROR ("Error %d advertising SM service\n", rc); + down (&kibnal_data.kib_nid_signal); + + rc = rc2; + if (rc != 0) + CERROR ("Error %d advertising NID "LPX64"\n", + rc, kibnal_data.kib_nid); + out: + PORTAL_FREE(svc, sizeof(*svc)); return (rc); } -int -koibnal_unadvertise (int expect_success) +void +kibnal_unadvertise (int expect_success) { + struct ib_common_attrib_service *svc; __u64 tid; int rc; int rc2; - LASSERT (koibnal_data.koib_nid != PTL_NID_ANY); + LASSERT (kibnal_data.kib_nid != PTL_NID_ANY); - memset (&koibnal_data.koib_service, 0, - sizeof (koibnal_data.koib_service)); + PORTAL_ALLOC(svc, sizeof(*svc)); + if (svc == NULL) + return; - koibnal_set_service_keys(&koibnal_data.koib_service, koibnal_data.koib_nid); + memset (svc, 0, 
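/*
 * How peers find each other in this NAL: each node advertises an IB service
 * record carrying its NID, and connectors look the record up by the same
 * keys.  kibnal_set_service_keys() and KIBNAL_SERVICE_KEY_MASK live in the
 * header rather than in this patch, so the following is only an assumption
 * about what they do, based on how they are used here:
 *
 *   // fill in the lookup keys for 'nid'
 *   kibnal_set_service_keys(svc, nid);
 *       // roughly: set a well-known service name and store the NID in the
 *       // service data word that *kibnal_service_nid_field(svc) points at
 *
 *   // KIBNAL_SERVICE_KEY_MASK selects just those key components, so
 *   // ib_service_get()/ib_service_delete() with the same keys match only
 *   // the record advertised for 'nid'.
 */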
sizeof(*svc)); + + kibnal_set_service_keys(svc, kibnal_data.kib_nid); CDEBUG(D_NET, "Unadvertising service %s:"LPX64"\n", - koibnal_data.koib_service.service_name, - *koibnal_service_nid_field(&koibnal_data.koib_service)); - - rc = ib_service_delete (koibnal_data.koib_device, - koibnal_data.koib_port, - &koibnal_data.koib_service, - KOIBNAL_SERVICE_KEY_MASK, - koibnal_tunables.koib_io_timeout * HZ, - koibnal_service_setunset_done, &rc2, &tid); + svc->service_name, *kibnal_service_nid_field(svc)); + + rc = ib_service_delete (kibnal_data.kib_device, + kibnal_data.kib_port, + svc, + KIBNAL_SERVICE_KEY_MASK, + kibnal_tunables.kib_io_timeout * HZ, + kibnal_service_setunset_done, &rc2, &tid); if (rc != 0) { CERROR ("Immediate error %d unadvertising NID "LPX64"\n", - rc, koibnal_data.koib_nid); - return (rc); + rc, kibnal_data.kib_nid); + goto out; } - down (&koibnal_data.koib_nid_signal); + down (&kibnal_data.kib_nid_signal); if ((rc2 == 0) == !!expect_success) - return (0); + goto out; /* success: rc == 0 */ if (expect_success) CERROR("Error %d unadvertising NID "LPX64"\n", - rc, koibnal_data.koib_nid); + rc, kibnal_data.kib_nid); else CWARN("Removed conflicting NID "LPX64"\n", - koibnal_data.koib_nid); - - return (rc); -} - -int -koibnal_check_advert (void) -{ - __u64 tid; - int rc; - int rc2; - - static struct ib_common_attrib_service srv; - - memset (&srv, 0, sizeof (srv)); - - koibnal_set_service_keys(&srv, koibnal_data.koib_nid); - - rc = ib_service_get (koibnal_data.koib_device, - koibnal_data.koib_port, - &srv, - KOIBNAL_SERVICE_KEY_MASK, - koibnal_tunables.koib_io_timeout * HZ, - koibnal_service_setunset_done, &rc2, - &tid); - - if (rc != 0) { - CERROR ("Immediate error %d checking SM service\n", rc); - } else { - down (&koibnal_data.koib_nid_signal); - rc = rc2; - - if (rc != 0) - CERROR ("Error %d checking SM service\n", rc); - } - - return (rc); + kibnal_data.kib_nid); + out: + PORTAL_FREE(svc, sizeof(*svc)); } int -koibnal_set_mynid(ptl_nid_t nid) +kibnal_set_mynid(ptl_nid_t nid) { struct timeval tv; - lib_ni_t *ni = &koibnal_lib.libnal_ni; + lib_ni_t *ni = &kibnal_lib.libnal_ni; int rc; CDEBUG(D_IOCTL, "setting mynid to "LPX64" (old nid="LPX64")\n", @@ -234,75 +249,76 @@ koibnal_set_mynid(ptl_nid_t nid) do_gettimeofday(&tv); - down (&koibnal_data.koib_nid_mutex); + down (&kibnal_data.kib_nid_mutex); - if (nid == koibnal_data.koib_nid) { + if (nid == kibnal_data.kib_nid) { /* no change of NID */ - up (&koibnal_data.koib_nid_mutex); + up (&kibnal_data.kib_nid_mutex); return (0); } CDEBUG(D_NET, "NID "LPX64"("LPX64")\n", - koibnal_data.koib_nid, nid); + kibnal_data.kib_nid, nid); - if (koibnal_data.koib_nid != PTL_NID_ANY) { + if (kibnal_data.kib_nid != PTL_NID_ANY) { - koibnal_unadvertise (1); + kibnal_unadvertise (1); - rc = ib_cm_listen_stop (koibnal_data.koib_listen_handle); + rc = ib_cm_listen_stop (kibnal_data.kib_listen_handle); if (rc != 0) CERROR ("Error %d stopping listener\n", rc); } - koibnal_data.koib_nid = ni->ni_pid.nid = nid; - koibnal_data.koib_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec; + kibnal_data.kib_nid = ni->ni_pid.nid = nid; + kibnal_data.kib_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec; /* Delete all existing peers and their connections after new * NID/incarnation set to ensure no old connections in our brave * new world. 
*/ - koibnal_del_peer (PTL_NID_ANY, 0); - - rc = 0; - if (koibnal_data.koib_nid != PTL_NID_ANY) { - /* New NID installed */ + kibnal_del_peer (PTL_NID_ANY, 0); - /* remove any previous advert (crashed node etc) */ - koibnal_unadvertise(0); + if (kibnal_data.kib_nid == PTL_NID_ANY) { + /* No new NID to install */ + up (&kibnal_data.kib_nid_mutex); + return (0); + } + + /* remove any previous advert (crashed node etc) */ + kibnal_unadvertise(0); - /* Assign new service number */ - koibnal_data.koib_cm_service_id = ib_cm_service_assign(); - CDEBUG(D_NET, "service_id "LPX64"\n", koibnal_data.koib_cm_service_id); + /* Assign new service number */ + kibnal_data.kib_service_id = ib_cm_service_assign(); + CDEBUG(D_NET, "service_id "LPX64"\n", kibnal_data.kib_service_id); - rc = ib_cm_listen(koibnal_data.koib_cm_service_id, - TS_IB_CM_SERVICE_EXACT_MASK, - koibnal_passive_conn_callback, NULL, - &koibnal_data.koib_listen_handle); - if (rc != 0) { - CERROR ("ib_cm_listen error: %d\n", rc); - goto out; + rc = ib_cm_listen(kibnal_data.kib_service_id, + TS_IB_CM_SERVICE_EXACT_MASK, + kibnal_passive_conn_callback, NULL, + &kibnal_data.kib_listen_handle); + if (rc == 0) { + rc = kibnal_advertise(); + if (rc == 0) { +#if IBNAL_CHECK_ADVERT + kibnal_check_advert(); +#endif + up (&kibnal_data.kib_nid_mutex); + return (0); } - rc = koibnal_advertise(); - - koibnal_check_advert(); - } - - out: - if (rc != 0) { - koibnal_data.koib_nid = PTL_NID_ANY; + ib_cm_listen_stop(kibnal_data.kib_listen_handle); /* remove any peers that sprung up while I failed to * advertise myself */ - koibnal_del_peer (PTL_NID_ANY, 0); + kibnal_del_peer (PTL_NID_ANY, 0); } - - up (&koibnal_data.koib_nid_mutex); - return (0); + + kibnal_data.kib_nid = PTL_NID_ANY; + up (&kibnal_data.kib_nid_mutex); + return (rc); } -koib_peer_t * -koibnal_create_peer (ptl_nid_t nid) +kib_peer_t * +kibnal_create_peer (ptl_nid_t nid) { - koib_peer_t *peer; + kib_peer_t *peer; LASSERT (nid != PTL_NID_ANY); @@ -320,20 +336,20 @@ koibnal_create_peer (ptl_nid_t nid) INIT_LIST_HEAD (&peer->ibp_tx_queue); peer->ibp_reconnect_time = jiffies; - peer->ibp_reconnect_interval = OPENIBNAL_MIN_RECONNECT_INTERVAL; + peer->ibp_reconnect_interval = IBNAL_MIN_RECONNECT_INTERVAL; - atomic_inc (&koibnal_data.koib_npeers); + atomic_inc (&kibnal_data.kib_npeers); return (peer); } void -koibnal_destroy_peer (koib_peer_t *peer) +kibnal_destroy_peer (kib_peer_t *peer) { CDEBUG (D_NET, "peer "LPX64" %p deleted\n", peer->ibp_nid, peer); LASSERT (atomic_read (&peer->ibp_refcount) == 0); LASSERT (peer->ibp_persistence == 0); - LASSERT (!koibnal_peer_active(peer)); + LASSERT (!kibnal_peer_active(peer)); LASSERT (peer->ibp_connecting == 0); LASSERT (list_empty (&peer->ibp_conns)); LASSERT (list_empty (&peer->ibp_tx_queue)); @@ -344,11 +360,11 @@ koibnal_destroy_peer (koib_peer_t *peer) * they are destroyed, so we can be assured that _all_ state to do * with this peer has been cleaned up when its refcount drops to * zero. 
*/ - atomic_dec (&koibnal_data.koib_npeers); + atomic_dec (&kibnal_data.kib_npeers); } void -koibnal_put_peer (koib_peer_t *peer) +kibnal_put_peer (kib_peer_t *peer) { CDEBUG (D_OTHER, "putting peer[%p] -> "LPX64" (%d)\n", peer, peer->ibp_nid, @@ -358,19 +374,19 @@ koibnal_put_peer (koib_peer_t *peer) if (!atomic_dec_and_test (&peer->ibp_refcount)) return; - koibnal_destroy_peer (peer); + kibnal_destroy_peer (peer); } -koib_peer_t * -koibnal_find_peer_locked (ptl_nid_t nid) +kib_peer_t * +kibnal_find_peer_locked (ptl_nid_t nid) { - struct list_head *peer_list = koibnal_nid2peerlist (nid); + struct list_head *peer_list = kibnal_nid2peerlist (nid); struct list_head *tmp; - koib_peer_t *peer; + kib_peer_t *peer; list_for_each (tmp, peer_list) { - peer = list_entry (tmp, koib_peer_t, ibp_list); + peer = list_entry (tmp, kib_peer_t, ibp_list); LASSERT (peer->ibp_persistence != 0 || /* persistent peer */ peer->ibp_connecting != 0 || /* creating conns */ @@ -386,46 +402,46 @@ koibnal_find_peer_locked (ptl_nid_t nid) return (NULL); } -koib_peer_t * -koibnal_get_peer (ptl_nid_t nid) +kib_peer_t * +kibnal_get_peer (ptl_nid_t nid) { - koib_peer_t *peer; + kib_peer_t *peer; - read_lock (&koibnal_data.koib_global_lock); - peer = koibnal_find_peer_locked (nid); + read_lock (&kibnal_data.kib_global_lock); + peer = kibnal_find_peer_locked (nid); if (peer != NULL) /* +1 ref for caller? */ atomic_inc (&peer->ibp_refcount); - read_unlock (&koibnal_data.koib_global_lock); + read_unlock (&kibnal_data.kib_global_lock); return (peer); } void -koibnal_unlink_peer_locked (koib_peer_t *peer) +kibnal_unlink_peer_locked (kib_peer_t *peer) { LASSERT (peer->ibp_persistence == 0); LASSERT (list_empty(&peer->ibp_conns)); - LASSERT (koibnal_peer_active(peer)); + LASSERT (kibnal_peer_active(peer)); list_del_init (&peer->ibp_list); /* lose peerlist's ref */ - koibnal_put_peer (peer); + kibnal_put_peer (peer); } int -koibnal_get_peer_info (int index, ptl_nid_t *nidp, int *persistencep) +kibnal_get_peer_info (int index, ptl_nid_t *nidp, int *persistencep) { - koib_peer_t *peer; + kib_peer_t *peer; struct list_head *ptmp; int i; - read_lock (&koibnal_data.koib_global_lock); + read_lock (&kibnal_data.kib_global_lock); - for (i = 0; i < koibnal_data.koib_peer_hash_size; i++) { + for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) { - list_for_each (ptmp, &koibnal_data.koib_peers[i]) { + list_for_each (ptmp, &kibnal_data.kib_peers[i]) { - peer = list_entry (ptmp, koib_peer_t, ibp_list); + peer = list_entry (ptmp, kib_peer_t, ibp_list); LASSERT (peer->ibp_persistence != 0 || peer->ibp_connecting != 0 || !list_empty (&peer->ibp_conns)); @@ -436,53 +452,53 @@ koibnal_get_peer_info (int index, ptl_nid_t *nidp, int *persistencep) *nidp = peer->ibp_nid; *persistencep = peer->ibp_persistence; - read_unlock (&koibnal_data.koib_global_lock); + read_unlock (&kibnal_data.kib_global_lock); return (0); } } - read_unlock (&koibnal_data.koib_global_lock); + read_unlock (&kibnal_data.kib_global_lock); return (-ENOENT); } int -koibnal_add_persistent_peer (ptl_nid_t nid) +kibnal_add_persistent_peer (ptl_nid_t nid) { unsigned long flags; - koib_peer_t *peer; - koib_peer_t *peer2; + kib_peer_t *peer; + kib_peer_t *peer2; if (nid == PTL_NID_ANY) return (-EINVAL); - peer = koibnal_create_peer (nid); + peer = kibnal_create_peer (nid); if (peer == NULL) return (-ENOMEM); - write_lock_irqsave (&koibnal_data.koib_global_lock, flags); + write_lock_irqsave (&kibnal_data.kib_global_lock, flags); - peer2 = koibnal_find_peer_locked (nid); + peer2 = 
kibnal_find_peer_locked (nid); if (peer2 != NULL) { - koibnal_put_peer (peer); + kibnal_put_peer (peer); peer = peer2; } else { /* peer table takes existing ref on peer */ list_add_tail (&peer->ibp_list, - koibnal_nid2peerlist (nid)); + kibnal_nid2peerlist (nid)); } peer->ibp_persistence++; - write_unlock_irqrestore (&koibnal_data.koib_global_lock, flags); + write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); return (0); } void -koibnal_del_peer_locked (koib_peer_t *peer, int single_share) +kibnal_del_peer_locked (kib_peer_t *peer, int single_share) { struct list_head *ctmp; struct list_head *cnxt; - koib_conn_t *conn; + kib_conn_t *conn; if (!single_share) peer->ibp_persistence = 0; @@ -493,38 +509,38 @@ koibnal_del_peer_locked (koib_peer_t *peer, int single_share) return; list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) { - conn = list_entry(ctmp, koib_conn_t, ibc_list); + conn = list_entry(ctmp, kib_conn_t, ibc_list); - koibnal_close_conn_locked (conn, 0); + kibnal_close_conn_locked (conn, 0); } /* NB peer unlinks itself when last conn is closed */ } int -koibnal_del_peer (ptl_nid_t nid, int single_share) +kibnal_del_peer (ptl_nid_t nid, int single_share) { unsigned long flags; struct list_head *ptmp; struct list_head *pnxt; - koib_peer_t *peer; + kib_peer_t *peer; int lo; int hi; int i; int rc = -ENOENT; - write_lock_irqsave (&koibnal_data.koib_global_lock, flags); + write_lock_irqsave (&kibnal_data.kib_global_lock, flags); if (nid != PTL_NID_ANY) - lo = hi = koibnal_nid2peerlist(nid) - koibnal_data.koib_peers; + lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers; else { lo = 0; - hi = koibnal_data.koib_peer_hash_size - 1; + hi = kibnal_data.kib_peer_hash_size - 1; } for (i = lo; i <= hi; i++) { - list_for_each_safe (ptmp, pnxt, &koibnal_data.koib_peers[i]) { - peer = list_entry (ptmp, koib_peer_t, ibp_list); + list_for_each_safe (ptmp, pnxt, &kibnal_data.kib_peers[i]) { + peer = list_entry (ptmp, kib_peer_t, ibp_list); LASSERT (peer->ibp_persistence != 0 || peer->ibp_connecting != 0 || !list_empty (&peer->ibp_conns)); @@ -532,7 +548,7 @@ koibnal_del_peer (ptl_nid_t nid, int single_share) if (!(nid == PTL_NID_ANY || peer->ibp_nid == nid)) continue; - koibnal_del_peer_locked (peer, single_share); + kibnal_del_peer_locked (peer, single_share); rc = 0; /* matched something */ if (single_share) @@ -540,26 +556,26 @@ koibnal_del_peer (ptl_nid_t nid, int single_share) } } out: - write_unlock_irqrestore (&koibnal_data.koib_global_lock, flags); + write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); return (rc); } -koib_conn_t * -koibnal_get_conn_by_idx (int index) +kib_conn_t * +kibnal_get_conn_by_idx (int index) { - koib_peer_t *peer; + kib_peer_t *peer; struct list_head *ptmp; - koib_conn_t *conn; + kib_conn_t *conn; struct list_head *ctmp; int i; - read_lock (&koibnal_data.koib_global_lock); + read_lock (&kibnal_data.kib_global_lock); - for (i = 0; i < koibnal_data.koib_peer_hash_size; i++) { - list_for_each (ptmp, &koibnal_data.koib_peers[i]) { + for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) { + list_for_each (ptmp, &kibnal_data.kib_peers[i]) { - peer = list_entry (ptmp, koib_peer_t, ibp_list); + peer = list_entry (ptmp, kib_peer_t, ibp_list); LASSERT (peer->ibp_persistence > 0 || peer->ibp_connecting != 0 || !list_empty (&peer->ibp_conns)); @@ -568,25 +584,25 @@ koibnal_get_conn_by_idx (int index) if (index-- > 0) continue; - conn = list_entry (ctmp, koib_conn_t, ibc_list); + conn = list_entry (ctmp, kib_conn_t, ibc_list); CDEBUG(D_NET, "++conn[%p] 
state %d -> "LPX64" (%d)\n", conn, conn->ibc_state, conn->ibc_peer->ibp_nid, atomic_read (&conn->ibc_refcount)); atomic_inc (&conn->ibc_refcount); - read_unlock (&koibnal_data.koib_global_lock); + read_unlock (&kibnal_data.kib_global_lock); return (conn); } } } - read_unlock (&koibnal_data.koib_global_lock); + read_unlock (&kibnal_data.kib_global_lock); return (NULL); } -koib_conn_t * -koibnal_create_conn (void) +kib_conn_t * +kibnal_create_conn (void) { - koib_conn_t *conn; + kib_conn_t *conn; int i; __u64 vaddr = 0; __u64 vaddr_base; @@ -608,57 +624,57 @@ koibnal_create_conn (void) memset (conn, 0, sizeof (*conn)); INIT_LIST_HEAD (&conn->ibc_tx_queue); - INIT_LIST_HEAD (&conn->ibc_rdma_queue); + INIT_LIST_HEAD (&conn->ibc_active_txs); spin_lock_init (&conn->ibc_lock); - atomic_inc (&koibnal_data.koib_nconns); + atomic_inc (&kibnal_data.kib_nconns); /* well not really, but I call destroy() on failure, which decrements */ - PORTAL_ALLOC (conn->ibc_rxs, OPENIBNAL_RX_MSGS * sizeof (koib_rx_t)); + PORTAL_ALLOC (conn->ibc_rxs, IBNAL_RX_MSGS * sizeof (kib_rx_t)); if (conn->ibc_rxs == NULL) goto failed; - memset (conn->ibc_rxs, 0, OPENIBNAL_RX_MSGS * sizeof(koib_rx_t)); + memset (conn->ibc_rxs, 0, IBNAL_RX_MSGS * sizeof(kib_rx_t)); - rc = koibnal_alloc_pages(&conn->ibc_rx_pages, - OPENIBNAL_RX_MSG_PAGES, - IB_ACCESS_LOCAL_WRITE); + rc = kibnal_alloc_pages(&conn->ibc_rx_pages, + IBNAL_RX_MSG_PAGES, + IB_ACCESS_LOCAL_WRITE); if (rc != 0) goto failed; - vaddr_base = vaddr = conn->ibc_rx_pages->oibp_vaddr; + vaddr_base = vaddr = conn->ibc_rx_pages->ibp_vaddr; - for (i = ipage = page_offset = 0; i < OPENIBNAL_RX_MSGS; i++) { - struct page *page = conn->ibc_rx_pages->oibp_pages[ipage]; - koib_rx_t *rx = &conn->ibc_rxs[i]; + for (i = ipage = page_offset = 0; i < IBNAL_RX_MSGS; i++) { + struct page *page = conn->ibc_rx_pages->ibp_pages[ipage]; + kib_rx_t *rx = &conn->ibc_rxs[i]; rx->rx_conn = conn; rx->rx_vaddr = vaddr; - rx->rx_msg = (koib_msg_t *)(((char *)page_address(page)) + page_offset); + rx->rx_msg = (kib_msg_t *)(((char *)page_address(page)) + page_offset); - vaddr += OPENIBNAL_MSG_SIZE; - LASSERT (vaddr <= vaddr_base + OPENIBNAL_RX_MSG_BYTES); + vaddr += IBNAL_MSG_SIZE; + LASSERT (vaddr <= vaddr_base + IBNAL_RX_MSG_BYTES); - page_offset += OPENIBNAL_MSG_SIZE; + page_offset += IBNAL_MSG_SIZE; LASSERT (page_offset <= PAGE_SIZE); if (page_offset == PAGE_SIZE) { page_offset = 0; ipage++; - LASSERT (ipage <= OPENIBNAL_RX_MSG_PAGES); + LASSERT (ipage <= IBNAL_RX_MSG_PAGES); } } params.qp_create = (struct ib_qp_create_param) { .limit = { /* Sends have an optional RDMA */ - .max_outstanding_send_request = 2 * OPENIBNAL_MSG_QUEUE_SIZE, - .max_outstanding_receive_request = OPENIBNAL_MSG_QUEUE_SIZE, + .max_outstanding_send_request = 2 * IBNAL_MSG_QUEUE_SIZE, + .max_outstanding_receive_request = IBNAL_MSG_QUEUE_SIZE, .max_send_gather_element = 1, .max_receive_scatter_element = 1, }, - .pd = koibnal_data.koib_pd, - .send_queue = koibnal_data.koib_tx_cq, - .receive_queue = koibnal_data.koib_rx_cq, + .pd = kibnal_data.kib_pd, + .send_queue = kibnal_data.kib_cq, + .receive_queue = kibnal_data.kib_cq, .send_policy = IB_WQ_SIGNAL_SELECTABLE, .receive_policy = IB_WQ_SIGNAL_SELECTABLE, .rd_domain = 0, @@ -673,11 +689,11 @@ koibnal_create_conn (void) } /* Mark QP created */ - conn->ibc_state = OPENIBNAL_CONN_INIT_QP; + conn->ibc_state = IBNAL_CONN_INIT_QP; params.qp_attr = (struct ib_qp_attribute) { .state = IB_QP_STATE_INIT, - .port = koibnal_data.koib_port, + .port = kibnal_data.kib_port, .enable_rdma_read = 1, 
.enable_rdma_write = 1, .valid_fields = (IB_QP_ATTRIBUTE_STATE | @@ -696,12 +712,12 @@ koibnal_create_conn (void) return (conn); failed: - koibnal_destroy_conn (conn); + kibnal_destroy_conn (conn); return (NULL); } void -koibnal_destroy_conn (koib_conn_t *conn) +kibnal_destroy_conn (kib_conn_t *conn) { int rc; @@ -709,21 +725,21 @@ koibnal_destroy_conn (koib_conn_t *conn) LASSERT (atomic_read (&conn->ibc_refcount) == 0); LASSERT (list_empty(&conn->ibc_tx_queue)); - LASSERT (list_empty(&conn->ibc_rdma_queue)); + LASSERT (list_empty(&conn->ibc_active_txs)); LASSERT (conn->ibc_nsends_posted == 0); LASSERT (conn->ibc_connreq == NULL); switch (conn->ibc_state) { - case OPENIBNAL_CONN_ZOMBIE: + case IBNAL_CONN_ZOMBIE: /* called after connection sequence initiated */ - case OPENIBNAL_CONN_INIT_QP: + case IBNAL_CONN_INIT_QP: rc = ib_qp_destroy(conn->ibc_qp); if (rc != 0) CERROR("Can't destroy QP: %d\n", rc); /* fall through */ - case OPENIBNAL_CONN_INIT_NOTHING: + case IBNAL_CONN_INIT_NOTHING: break; default: @@ -731,30 +747,30 @@ koibnal_destroy_conn (koib_conn_t *conn) } if (conn->ibc_rx_pages != NULL) - koibnal_free_pages(conn->ibc_rx_pages); + kibnal_free_pages(conn->ibc_rx_pages); if (conn->ibc_rxs != NULL) PORTAL_FREE(conn->ibc_rxs, - OPENIBNAL_RX_MSGS * sizeof(koib_rx_t)); + IBNAL_RX_MSGS * sizeof(kib_rx_t)); if (conn->ibc_peer != NULL) - koibnal_put_peer(conn->ibc_peer); + kibnal_put_peer(conn->ibc_peer); PORTAL_FREE(conn, sizeof (*conn)); - atomic_dec(&koibnal_data.koib_nconns); + atomic_dec(&kibnal_data.kib_nconns); - if (atomic_read (&koibnal_data.koib_nconns) == 0 && - koibnal_data.koib_shutdown) { + if (atomic_read (&kibnal_data.kib_nconns) == 0 && + kibnal_data.kib_shutdown) { /* I just nuked the last connection on shutdown; wake up * everyone so they can exit. 
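 * (kibnal_api_shutdown() sets kib_shutdown and then waits for kib_nthreads
 * to reach zero, so both the scheduler and connd wait queues are woken here.)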
*/ - wake_up_all(&koibnal_data.koib_sched_waitq); - wake_up_all(&koibnal_data.koib_connd_waitq); + wake_up_all(&kibnal_data.kib_sched_waitq); + wake_up_all(&kibnal_data.kib_connd_waitq); } } void -koibnal_put_conn (koib_conn_t *conn) +kibnal_put_conn (kib_conn_t *conn) { unsigned long flags; @@ -767,44 +783,44 @@ koibnal_put_conn (koib_conn_t *conn) return; /* last ref only goes on zombies */ - LASSERT (conn->ibc_state == OPENIBNAL_CONN_ZOMBIE); + LASSERT (conn->ibc_state == IBNAL_CONN_ZOMBIE); - spin_lock_irqsave (&koibnal_data.koib_connd_lock, flags); + spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags); - list_add (&conn->ibc_list, &koibnal_data.koib_connd_conns); - wake_up (&koibnal_data.koib_connd_waitq); + list_add (&conn->ibc_list, &kibnal_data.kib_connd_conns); + wake_up (&kibnal_data.kib_connd_waitq); - spin_unlock_irqrestore (&koibnal_data.koib_connd_lock, flags); + spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags); } int -koibnal_close_peer_conns_locked (koib_peer_t *peer, int why) +kibnal_close_peer_conns_locked (kib_peer_t *peer, int why) { - koib_conn_t *conn; + kib_conn_t *conn; struct list_head *ctmp; struct list_head *cnxt; int count = 0; list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) { - conn = list_entry (ctmp, koib_conn_t, ibc_list); + conn = list_entry (ctmp, kib_conn_t, ibc_list); count++; - koibnal_close_conn_locked (conn, why); + kibnal_close_conn_locked (conn, why); } return (count); } int -koibnal_close_stale_conns_locked (koib_peer_t *peer, __u64 incarnation) +kibnal_close_stale_conns_locked (kib_peer_t *peer, __u64 incarnation) { - koib_conn_t *conn; + kib_conn_t *conn; struct list_head *ctmp; struct list_head *cnxt; int count = 0; list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) { - conn = list_entry (ctmp, koib_conn_t, ibc_list); + conn = list_entry (ctmp, kib_conn_t, ibc_list); if (conn->ibc_incarnation == incarnation) continue; @@ -813,17 +829,17 @@ koibnal_close_stale_conns_locked (koib_peer_t *peer, __u64 incarnation) peer->ibp_nid, conn->ibc_incarnation, incarnation); count++; - koibnal_close_conn_locked (conn, -ESTALE); + kibnal_close_conn_locked (conn, -ESTALE); } return (count); } int -koibnal_close_matching_conns (ptl_nid_t nid) +kibnal_close_matching_conns (ptl_nid_t nid) { unsigned long flags; - koib_peer_t *peer; + kib_peer_t *peer; struct list_head *ptmp; struct list_head *pnxt; int lo; @@ -831,19 +847,19 @@ koibnal_close_matching_conns (ptl_nid_t nid) int i; int count = 0; - write_lock_irqsave (&koibnal_data.koib_global_lock, flags); + write_lock_irqsave (&kibnal_data.kib_global_lock, flags); if (nid != PTL_NID_ANY) - lo = hi = koibnal_nid2peerlist(nid) - koibnal_data.koib_peers; + lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers; else { lo = 0; - hi = koibnal_data.koib_peer_hash_size - 1; + hi = kibnal_data.kib_peer_hash_size - 1; } for (i = lo; i <= hi; i++) { - list_for_each_safe (ptmp, pnxt, &koibnal_data.koib_peers[i]) { + list_for_each_safe (ptmp, pnxt, &kibnal_data.kib_peers[i]) { - peer = list_entry (ptmp, koib_peer_t, ibp_list); + peer = list_entry (ptmp, kib_peer_t, ibp_list); LASSERT (peer->ibp_persistence != 0 || peer->ibp_connecting != 0 || !list_empty (&peer->ibp_conns)); @@ -851,11 +867,11 @@ koibnal_close_matching_conns (ptl_nid_t nid) if (!(nid == PTL_NID_ANY || nid == peer->ibp_nid)) continue; - count += koibnal_close_peer_conns_locked (peer, 0); + count += kibnal_close_peer_conns_locked (peer, 0); } } - write_unlock_irqrestore (&koibnal_data.koib_global_lock, flags); + write_unlock_irqrestore 
(&kibnal_data.kib_global_lock, flags); /* wildcards always succeed */ if (nid == PTL_NID_ANY) @@ -865,7 +881,7 @@ koibnal_close_matching_conns (ptl_nid_t nid) } int -koibnal_cmd(struct portals_cfg *pcfg, void * private) +kibnal_cmd(struct portals_cfg *pcfg, void * private) { int rc = -EINVAL; @@ -876,8 +892,8 @@ koibnal_cmd(struct portals_cfg *pcfg, void * private) ptl_nid_t nid = 0; int share_count = 0; - rc = koibnal_get_peer_info(pcfg->pcfg_count, - &nid, &share_count); + rc = kibnal_get_peer_info(pcfg->pcfg_count, + &nid, &share_count); pcfg->pcfg_nid = nid; pcfg->pcfg_size = 0; pcfg->pcfg_id = 0; @@ -887,17 +903,17 @@ koibnal_cmd(struct portals_cfg *pcfg, void * private) break; } case NAL_CMD_ADD_PEER: { - rc = koibnal_add_persistent_peer (pcfg->pcfg_nid); + rc = kibnal_add_persistent_peer (pcfg->pcfg_nid); break; } case NAL_CMD_DEL_PEER: { - rc = koibnal_del_peer (pcfg->pcfg_nid, + rc = kibnal_del_peer (pcfg->pcfg_nid, /* flags == single_share */ pcfg->pcfg_flags != 0); break; } case NAL_CMD_GET_CONN: { - koib_conn_t *conn = koibnal_get_conn_by_idx (pcfg->pcfg_count); + kib_conn_t *conn = kibnal_get_conn_by_idx (pcfg->pcfg_count); if (conn == NULL) rc = -ENOENT; @@ -907,19 +923,19 @@ koibnal_cmd(struct portals_cfg *pcfg, void * private) pcfg->pcfg_id = 0; pcfg->pcfg_misc = 0; pcfg->pcfg_flags = 0; - koibnal_put_conn (conn); + kibnal_put_conn (conn); } break; } case NAL_CMD_CLOSE_CONNECTION: { - rc = koibnal_close_matching_conns (pcfg->pcfg_nid); + rc = kibnal_close_matching_conns (pcfg->pcfg_nid); break; } case NAL_CMD_REGISTER_MYNID: { if (pcfg->pcfg_nid == PTL_NID_ANY) rc = -EINVAL; else - rc = koibnal_set_mynid (pcfg->pcfg_nid); + rc = kibnal_set_mynid (pcfg->pcfg_nid); break; } } @@ -928,47 +944,47 @@ koibnal_cmd(struct portals_cfg *pcfg, void * private) } void -koibnal_free_pages (koib_pages_t *p) +kibnal_free_pages (kib_pages_t *p) { - int npages = p->oibp_npages; + int npages = p->ibp_npages; int rc; int i; - if (p->oibp_mapped) { - rc = ib_memory_deregister(p->oibp_handle); + if (p->ibp_mapped) { + rc = ib_memory_deregister(p->ibp_handle); if (rc != 0) CERROR ("Deregister error: %d\n", rc); } for (i = 0; i < npages; i++) - if (p->oibp_pages[i] != NULL) - __free_page(p->oibp_pages[i]); + if (p->ibp_pages[i] != NULL) + __free_page(p->ibp_pages[i]); - PORTAL_FREE (p, offsetof(koib_pages_t, oibp_pages[npages])); + PORTAL_FREE (p, offsetof(kib_pages_t, ibp_pages[npages])); } int -koibnal_alloc_pages (koib_pages_t **pp, int npages, int access) +kibnal_alloc_pages (kib_pages_t **pp, int npages, int access) { - koib_pages_t *p; + kib_pages_t *p; struct ib_physical_buffer *phys_pages; int i; int rc; - PORTAL_ALLOC(p, offsetof(koib_pages_t, oibp_pages[npages])); + PORTAL_ALLOC(p, offsetof(kib_pages_t, ibp_pages[npages])); if (p == NULL) { CERROR ("Can't allocate buffer %d\n", npages); return (-ENOMEM); } - memset (p, 0, offsetof(koib_pages_t, oibp_pages[npages])); - p->oibp_npages = npages; + memset (p, 0, offsetof(kib_pages_t, ibp_pages[npages])); + p->ibp_npages = npages; for (i = 0; i < npages; i++) { - p->oibp_pages[i] = alloc_page (GFP_KERNEL); - if (p->oibp_pages[i] == NULL) { + p->ibp_pages[i] = alloc_page (GFP_KERNEL); + if (p->ibp_pages[i] == NULL) { CERROR ("Can't allocate page %d of %d\n", i, npages); - koibnal_free_pages(p); + kibnal_free_pages(p); return (-ENOMEM); } } @@ -976,96 +992,96 @@ koibnal_alloc_pages (koib_pages_t **pp, int npages, int access) PORTAL_ALLOC(phys_pages, npages * sizeof(*phys_pages)); if (phys_pages == NULL) { CERROR ("Can't allocate physarray for 
%d pages\n", npages); - koibnal_free_pages(p); + kibnal_free_pages(p); return (-ENOMEM); } for (i = 0; i < npages; i++) { phys_pages[i].size = PAGE_SIZE; phys_pages[i].address = - koibnal_page2phys(p->oibp_pages[i]); + kibnal_page2phys(p->ibp_pages[i]); } - p->oibp_vaddr = 0; - rc = ib_memory_register_physical(koibnal_data.koib_pd, + p->ibp_vaddr = 0; + rc = ib_memory_register_physical(kibnal_data.kib_pd, phys_pages, npages, - &p->oibp_vaddr, + &p->ibp_vaddr, npages * PAGE_SIZE, 0, access, - &p->oibp_handle, - &p->oibp_lkey, - &p->oibp_rkey); + &p->ibp_handle, + &p->ibp_lkey, + &p->ibp_rkey); PORTAL_FREE(phys_pages, npages * sizeof(*phys_pages)); if (rc != 0) { CERROR ("Error %d mapping %d pages\n", rc, npages); - koibnal_free_pages(p); + kibnal_free_pages(p); return (rc); } - p->oibp_mapped = 1; + p->ibp_mapped = 1; *pp = p; return (0); } int -koibnal_setup_tx_descs (void) +kibnal_setup_tx_descs (void) { int ipage = 0; int page_offset = 0; __u64 vaddr; __u64 vaddr_base; struct page *page; - koib_tx_t *tx; + kib_tx_t *tx; int i; int rc; /* pre-mapped messages are not bigger than 1 page */ - LASSERT (OPENIBNAL_MSG_SIZE <= PAGE_SIZE); + LASSERT (IBNAL_MSG_SIZE <= PAGE_SIZE); /* No fancy arithmetic when we do the buffer calculations */ - LASSERT (PAGE_SIZE % OPENIBNAL_MSG_SIZE == 0); + LASSERT (PAGE_SIZE % IBNAL_MSG_SIZE == 0); - rc = koibnal_alloc_pages(&koibnal_data.koib_tx_pages, - OPENIBNAL_TX_MSG_PAGES, - 0); /* local read access only */ + rc = kibnal_alloc_pages(&kibnal_data.kib_tx_pages, + IBNAL_TX_MSG_PAGES, + 0); /* local read access only */ if (rc != 0) return (rc); - vaddr = vaddr_base = koibnal_data.koib_tx_pages->oibp_vaddr; + vaddr = vaddr_base = kibnal_data.kib_tx_pages->ibp_vaddr; - for (i = 0; i < OPENIBNAL_TX_MSGS; i++) { - page = koibnal_data.koib_tx_pages->oibp_pages[ipage]; - tx = &koibnal_data.koib_tx_descs[i]; + for (i = 0; i < IBNAL_TX_MSGS; i++) { + page = kibnal_data.kib_tx_pages->ibp_pages[ipage]; + tx = &kibnal_data.kib_tx_descs[i]; memset (tx, 0, sizeof(*tx)); /* zero flags etc */ - tx->tx_msg = (koib_msg_t *)(((char *)page_address(page)) + page_offset); + tx->tx_msg = (kib_msg_t *)(((char *)page_address(page)) + page_offset); tx->tx_vaddr = vaddr; - tx->tx_isnblk = (i >= OPENIBNAL_NTX); - tx->tx_mapped = KOIB_TX_UNMAPPED; + tx->tx_isnblk = (i >= IBNAL_NTX); + tx->tx_mapped = KIB_TX_UNMAPPED; CDEBUG(D_NET, "Tx[%d] %p->%p - "LPX64"\n", i, tx, tx->tx_msg, tx->tx_vaddr); if (tx->tx_isnblk) list_add (&tx->tx_list, - &koibnal_data.koib_idle_nblk_txs); + &kibnal_data.kib_idle_nblk_txs); else list_add (&tx->tx_list, - &koibnal_data.koib_idle_txs); + &kibnal_data.kib_idle_txs); - vaddr += OPENIBNAL_MSG_SIZE; - LASSERT (vaddr <= vaddr_base + OPENIBNAL_TX_MSG_BYTES); + vaddr += IBNAL_MSG_SIZE; + LASSERT (vaddr <= vaddr_base + IBNAL_TX_MSG_BYTES); - page_offset += OPENIBNAL_MSG_SIZE; + page_offset += IBNAL_MSG_SIZE; LASSERT (page_offset <= PAGE_SIZE); if (page_offset == PAGE_SIZE) { page_offset = 0; ipage++; - LASSERT (ipage <= OPENIBNAL_TX_MSG_PAGES); + LASSERT (ipage <= IBNAL_TX_MSG_PAGES); } } @@ -1073,7 +1089,7 @@ koibnal_setup_tx_descs (void) } void -koibnal_api_shutdown (nal_t *nal) +kibnal_api_shutdown (nal_t *nal) { int i; int rc; @@ -1087,119 +1103,113 @@ koibnal_api_shutdown (nal_t *nal) CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n", atomic_read (&portal_kmemory)); - LASSERT(nal == &koibnal_api); + LASSERT(nal == &kibnal_api); - switch (koibnal_data.koib_init) { + switch (kibnal_data.kib_init) { default: - CERROR ("Unexpected state %d\n", 
koibnal_data.koib_init); + CERROR ("Unexpected state %d\n", kibnal_data.kib_init); LBUG(); - case OPENIBNAL_INIT_ALL: + case IBNAL_INIT_ALL: /* stop calls to nal_cmd */ libcfs_nal_cmd_unregister(OPENIBNAL); /* No new peers */ /* resetting my NID to unadvertises me, removes my * listener and nukes all current peers */ - koibnal_set_mynid (PTL_NID_ANY); + kibnal_set_mynid (PTL_NID_ANY); /* Wait for all peer state to clean up */ i = 2; - while (atomic_read (&koibnal_data.koib_npeers) != 0) { + while (atomic_read (&kibnal_data.kib_npeers) != 0) { i++; CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */ "waiting for %d peers to close down\n", - atomic_read (&koibnal_data.koib_npeers)); + atomic_read (&kibnal_data.kib_npeers)); set_current_state (TASK_INTERRUPTIBLE); schedule_timeout (HZ); } /* fall through */ - case OPENIBNAL_INIT_TX_CQ: - rc = ib_cq_destroy (koibnal_data.koib_tx_cq); - if (rc != 0) - CERROR ("Destroy tx CQ error: %d\n", rc); - /* fall through */ - - case OPENIBNAL_INIT_RX_CQ: - rc = ib_cq_destroy (koibnal_data.koib_rx_cq); + case IBNAL_INIT_CQ: + rc = ib_cq_destroy (kibnal_data.kib_cq); if (rc != 0) - CERROR ("Destroy rx CQ error: %d\n", rc); + CERROR ("Destroy CQ error: %d\n", rc); /* fall through */ - case OPENIBNAL_INIT_TXD: - koibnal_free_pages (koibnal_data.koib_tx_pages); + case IBNAL_INIT_TXD: + kibnal_free_pages (kibnal_data.kib_tx_pages); /* fall through */ -#if OPENIBNAL_FMR - case OPENIBNAL_INIT_FMR: - rc = ib_fmr_pool_destroy (koibnal_data.koib_fmr_pool); +#if IBNAL_FMR + case IBNAL_INIT_FMR: + rc = ib_fmr_pool_destroy (kibnal_data.kib_fmr_pool); if (rc != 0) CERROR ("Destroy FMR pool error: %d\n", rc); /* fall through */ #endif - case OPENIBNAL_INIT_PD: - rc = ib_pd_destroy(koibnal_data.koib_pd); + case IBNAL_INIT_PD: + rc = ib_pd_destroy(kibnal_data.kib_pd); if (rc != 0) CERROR ("Destroy PD error: %d\n", rc); /* fall through */ - case OPENIBNAL_INIT_LIB: - lib_fini(&koibnal_lib); + case IBNAL_INIT_LIB: + lib_fini(&kibnal_lib); /* fall through */ - case OPENIBNAL_INIT_DATA: + case IBNAL_INIT_DATA: /* Module refcount only gets to zero when all peers * have been closed so all lists must be empty */ - LASSERT (atomic_read (&koibnal_data.koib_npeers) == 0); - LASSERT (koibnal_data.koib_peers != NULL); - for (i = 0; i < koibnal_data.koib_peer_hash_size; i++) { - LASSERT (list_empty (&koibnal_data.koib_peers[i])); + LASSERT (atomic_read (&kibnal_data.kib_npeers) == 0); + LASSERT (kibnal_data.kib_peers != NULL); + for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) { + LASSERT (list_empty (&kibnal_data.kib_peers[i])); } - LASSERT (atomic_read (&koibnal_data.koib_nconns) == 0); - LASSERT (list_empty (&koibnal_data.koib_sched_rxq)); - LASSERT (list_empty (&koibnal_data.koib_sched_txq)); - LASSERT (list_empty (&koibnal_data.koib_connd_conns)); - LASSERT (list_empty (&koibnal_data.koib_connd_peers)); + LASSERT (atomic_read (&kibnal_data.kib_nconns) == 0); + LASSERT (list_empty (&kibnal_data.kib_sched_rxq)); + LASSERT (list_empty (&kibnal_data.kib_sched_txq)); + LASSERT (list_empty (&kibnal_data.kib_connd_conns)); + LASSERT (list_empty (&kibnal_data.kib_connd_peers)); /* flag threads to terminate; wake and wait for them to die */ - koibnal_data.koib_shutdown = 1; - wake_up_all (&koibnal_data.koib_sched_waitq); - wake_up_all (&koibnal_data.koib_connd_waitq); + kibnal_data.kib_shutdown = 1; + wake_up_all (&kibnal_data.kib_sched_waitq); + wake_up_all (&kibnal_data.kib_connd_waitq); i = 2; - while (atomic_read (&koibnal_data.koib_nthreads) != 0) { + while 
(atomic_read (&kibnal_data.kib_nthreads) != 0) { i++; CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */ "Waiting for %d threads to terminate\n", - atomic_read (&koibnal_data.koib_nthreads)); + atomic_read (&kibnal_data.kib_nthreads)); set_current_state (TASK_INTERRUPTIBLE); schedule_timeout (HZ); } /* fall through */ - case OPENIBNAL_INIT_NOTHING: + case IBNAL_INIT_NOTHING: break; } - if (koibnal_data.koib_tx_descs != NULL) - PORTAL_FREE (koibnal_data.koib_tx_descs, - OPENIBNAL_TX_MSGS * sizeof(koib_tx_t)); + if (kibnal_data.kib_tx_descs != NULL) + PORTAL_FREE (kibnal_data.kib_tx_descs, + IBNAL_TX_MSGS * sizeof(kib_tx_t)); - if (koibnal_data.koib_peers != NULL) - PORTAL_FREE (koibnal_data.koib_peers, + if (kibnal_data.kib_peers != NULL) + PORTAL_FREE (kibnal_data.kib_peers, sizeof (struct list_head) * - koibnal_data.koib_peer_hash_size); + kibnal_data.kib_peer_hash_size); CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n", atomic_read (&portal_kmemory)); printk(KERN_INFO "Lustre: OpenIB NAL unloaded (final mem %d)\n", atomic_read(&portal_kmemory)); - koibnal_data.koib_init = OPENIBNAL_INIT_NOTHING; + kibnal_data.kib_init = IBNAL_INIT_NOTHING; } int -koibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid, +kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid, ptl_ni_limits_t *requested_limits, ptl_ni_limits_t *actual_limits) { @@ -1208,65 +1218,66 @@ koibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid, int rc; int i; - LASSERT (nal == &koibnal_api); + LASSERT (nal == &kibnal_api); if (nal->nal_refct != 0) { if (actual_limits != NULL) - *actual_limits = koibnal_lib.libnal_ni.ni_actual_limits; + *actual_limits = kibnal_lib.libnal_ni.ni_actual_limits; /* This module got the first ref */ PORTAL_MODULE_USE; return (PTL_OK); } - LASSERT (koibnal_data.koib_init == OPENIBNAL_INIT_NOTHING); + LASSERT (kibnal_data.kib_init == IBNAL_INIT_NOTHING); - memset (&koibnal_data, 0, sizeof (koibnal_data)); /* zero pointers, flags etc */ + memset (&kibnal_data, 0, sizeof (kibnal_data)); /* zero pointers, flags etc */ - init_MUTEX (&koibnal_data.koib_nid_mutex); - init_MUTEX_LOCKED (&koibnal_data.koib_nid_signal); - koibnal_data.koib_nid = PTL_NID_ANY; + init_MUTEX (&kibnal_data.kib_nid_mutex); + init_MUTEX_LOCKED (&kibnal_data.kib_nid_signal); + kibnal_data.kib_nid = PTL_NID_ANY; - rwlock_init(&koibnal_data.koib_global_lock); + rwlock_init(&kibnal_data.kib_global_lock); - koibnal_data.koib_peer_hash_size = OPENIBNAL_PEER_HASH_SIZE; - PORTAL_ALLOC (koibnal_data.koib_peers, - sizeof (struct list_head) * koibnal_data.koib_peer_hash_size); - if (koibnal_data.koib_peers == NULL) { + kibnal_data.kib_peer_hash_size = IBNAL_PEER_HASH_SIZE; + PORTAL_ALLOC (kibnal_data.kib_peers, + sizeof (struct list_head) * kibnal_data.kib_peer_hash_size); + if (kibnal_data.kib_peers == NULL) { goto failed; } - for (i = 0; i < koibnal_data.koib_peer_hash_size; i++) - INIT_LIST_HEAD(&koibnal_data.koib_peers[i]); - - spin_lock_init (&koibnal_data.koib_connd_lock); - INIT_LIST_HEAD (&koibnal_data.koib_connd_peers); - INIT_LIST_HEAD (&koibnal_data.koib_connd_conns); - init_waitqueue_head (&koibnal_data.koib_connd_waitq); - - spin_lock_init (&koibnal_data.koib_sched_lock); - INIT_LIST_HEAD (&koibnal_data.koib_sched_txq); - INIT_LIST_HEAD (&koibnal_data.koib_sched_rxq); - init_waitqueue_head (&koibnal_data.koib_sched_waitq); - - spin_lock_init (&koibnal_data.koib_tx_lock); - INIT_LIST_HEAD (&koibnal_data.koib_idle_txs); - INIT_LIST_HEAD (&koibnal_data.koib_idle_nblk_txs); - 
init_waitqueue_head(&koibnal_data.koib_idle_tx_waitq); - - PORTAL_ALLOC (koibnal_data.koib_tx_descs, - OPENIBNAL_TX_MSGS * sizeof(koib_tx_t)); - if (koibnal_data.koib_tx_descs == NULL) { + for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) + INIT_LIST_HEAD(&kibnal_data.kib_peers[i]); + + spin_lock_init (&kibnal_data.kib_connd_lock); + INIT_LIST_HEAD (&kibnal_data.kib_connd_peers); + INIT_LIST_HEAD (&kibnal_data.kib_connd_conns); + init_waitqueue_head (&kibnal_data.kib_connd_waitq); + + spin_lock_init (&kibnal_data.kib_sched_lock); + INIT_LIST_HEAD (&kibnal_data.kib_sched_txq); + INIT_LIST_HEAD (&kibnal_data.kib_sched_rxq); + init_waitqueue_head (&kibnal_data.kib_sched_waitq); + + spin_lock_init (&kibnal_data.kib_tx_lock); + INIT_LIST_HEAD (&kibnal_data.kib_idle_txs); + INIT_LIST_HEAD (&kibnal_data.kib_idle_nblk_txs); + init_waitqueue_head(&kibnal_data.kib_idle_tx_waitq); + + PORTAL_ALLOC (kibnal_data.kib_tx_descs, + IBNAL_TX_MSGS * sizeof(kib_tx_t)); + if (kibnal_data.kib_tx_descs == NULL) { CERROR ("Can't allocate tx descs\n"); goto failed; } /* lists/ptrs/locks initialised */ - koibnal_data.koib_init = OPENIBNAL_INIT_DATA; + kibnal_data.kib_init = IBNAL_INIT_DATA; /*****************************************************/ + process_id.pid = requested_pid; - process_id.nid = koibnal_data.koib_nid; + process_id.nid = kibnal_data.kib_nid; - rc = lib_init(&koibnal_lib, nal, process_id, + rc = lib_init(&kibnal_lib, nal, process_id, requested_limits, actual_limits); if (rc != PTL_OK) { CERROR("lib_init failed: error %d\n", rc); @@ -1274,11 +1285,11 @@ koibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid, } /* lib interface initialised */ - koibnal_data.koib_init = OPENIBNAL_INIT_LIB; + kibnal_data.kib_init = IBNAL_INIT_LIB; /*****************************************************/ - for (i = 0; i < OPENIBNAL_N_SCHED; i++) { - rc = koibnal_thread_start (koibnal_scheduler, (void *)i); + for (i = 0; i < IBNAL_N_SCHED; i++) { + rc = kibnal_thread_start (kibnal_scheduler, (void *)i); if (rc != 0) { CERROR("Can't spawn openibnal scheduler[%d]: %d\n", i, rc); @@ -1286,56 +1297,56 @@ koibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid, } } - rc = koibnal_thread_start (koibnal_connd, NULL); + rc = kibnal_thread_start (kibnal_connd, NULL); if (rc != 0) { CERROR ("Can't spawn openibnal connd: %d\n", rc); goto failed; } - koibnal_data.koib_device = ib_device_get_by_index(0); - if (koibnal_data.koib_device == NULL) { + kibnal_data.kib_device = ib_device_get_by_index(0); + if (kibnal_data.kib_device == NULL) { CERROR ("Can't open ib device 0\n"); goto failed; } - rc = ib_device_properties_get(koibnal_data.koib_device, - &koibnal_data.koib_device_props); + rc = ib_device_properties_get(kibnal_data.kib_device, + &kibnal_data.kib_device_props); if (rc != 0) { CERROR ("Can't get device props: %d\n", rc); goto failed; } CDEBUG(D_NET, "Max Initiator: %d Max Responder %d\n", - koibnal_data.koib_device_props.max_initiator_per_qp, - koibnal_data.koib_device_props.max_responder_per_qp); + kibnal_data.kib_device_props.max_initiator_per_qp, + kibnal_data.kib_device_props.max_responder_per_qp); - koibnal_data.koib_port = 0; + kibnal_data.kib_port = 0; for (i = 1; i <= 2; i++) { - rc = ib_port_properties_get(koibnal_data.koib_device, i, - &koibnal_data.koib_port_props); + rc = ib_port_properties_get(kibnal_data.kib_device, i, + &kibnal_data.kib_port_props); if (rc == 0) { - koibnal_data.koib_port = i; + kibnal_data.kib_port = i; break; } } - if (koibnal_data.koib_port == 0) { + if (kibnal_data.kib_port == 0) { 
CERROR ("Can't find a port\n"); goto failed; } - rc = ib_pd_create(koibnal_data.koib_device, - NULL, &koibnal_data.koib_pd); + rc = ib_pd_create(kibnal_data.kib_device, + NULL, &kibnal_data.kib_pd); if (rc != 0) { CERROR ("Can't create PD: %d\n", rc); goto failed; } /* flag PD initialised */ - koibnal_data.koib_init = OPENIBNAL_INIT_PD; + kibnal_data.kib_init = IBNAL_INIT_PD; /*****************************************************/ -#if OPENIBNAL_FMR +#if IBNAL_FMR { - const int pool_size = OPENIBNAL_NTX + OPENIBNAL_NTX_NBLK; + const int pool_size = IBNAL_NTX + IBNAL_NTX_NBLK; struct ib_fmr_pool_param params = { .max_pages_per_fmr = PTL_MTU/PAGE_SIZE, .access = (IB_ACCESS_LOCAL_WRITE | @@ -1347,8 +1358,8 @@ koibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid, .flush_arg = NULL, .cache = 1, }; - rc = ib_fmr_pool_create(koibnal_data.koib_pd, ¶ms, - &koibnal_data.koib_fmr_pool); + rc = ib_fmr_pool_create(kibnal_data.kib_pd, ¶ms, + &kibnal_data.kib_fmr_pool); if (rc != 0) { CERROR ("Can't create FMR pool size %d: %d\n", pool_size, rc); @@ -1357,84 +1368,56 @@ koibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid, } /* flag FMR pool initialised */ - koibnal_data.koib_init = OPENIBNAL_INIT_FMR; + kibnal_data.kib_init = IBNAL_INIT_FMR; #endif /*****************************************************/ - rc = koibnal_setup_tx_descs(); + rc = kibnal_setup_tx_descs(); if (rc != 0) { CERROR ("Can't register tx descs: %d\n", rc); goto failed; } /* flag TX descs initialised */ - koibnal_data.koib_init = OPENIBNAL_INIT_TXD; + kibnal_data.kib_init = IBNAL_INIT_TXD; /*****************************************************/ { struct ib_cq_callback callback = { - .context = OPENIBNAL_CALLBACK_CTXT, + .context = IBNAL_CALLBACK_CTXT, .policy = IB_CQ_PROVIDER_REARM, .function = { - .entry = koibnal_rx_callback, + .entry = kibnal_callback, }, .arg = NULL, }; - int nentries = OPENIBNAL_RX_CQ_ENTRIES; + int nentries = IBNAL_CQ_ENTRIES; - rc = ib_cq_create (koibnal_data.koib_device, + rc = ib_cq_create (kibnal_data.kib_device, &nentries, &callback, NULL, - &koibnal_data.koib_rx_cq); + &kibnal_data.kib_cq); if (rc != 0) { - CERROR ("Can't create RX CQ: %d\n", rc); + CERROR ("Can't create CQ: %d\n", rc); goto failed; } /* I only want solicited events */ - rc = ib_cq_request_notification(koibnal_data.koib_rx_cq, 1); + rc = ib_cq_request_notification(kibnal_data.kib_cq, 1); LASSERT (rc == 0); } - /* flag RX CQ initialised */ - koibnal_data.koib_init = OPENIBNAL_INIT_RX_CQ; - /*****************************************************/ - - { - struct ib_cq_callback callback = { - .context = OPENIBNAL_CALLBACK_CTXT, - .policy = IB_CQ_PROVIDER_REARM, - .function = { - .entry = koibnal_tx_callback, - }, - .arg = NULL, - }; - int nentries = OPENIBNAL_TX_CQ_ENTRIES; - - rc = ib_cq_create (koibnal_data.koib_device, - &nentries, &callback, NULL, - &koibnal_data.koib_tx_cq); - if (rc != 0) { - CERROR ("Can't create RX CQ: %d\n", rc); - goto failed; - } - - /* I only want solicited events */ - rc = ib_cq_request_notification(koibnal_data.koib_tx_cq, 1); - LASSERT (rc == 0); - } - - /* flag TX CQ initialised */ - koibnal_data.koib_init = OPENIBNAL_INIT_TX_CQ; + /* flag CQ initialised */ + kibnal_data.kib_init = IBNAL_INIT_CQ; /*****************************************************/ - rc = libcfs_nal_cmd_register(OPENIBNAL, &koibnal_cmd, NULL); + rc = libcfs_nal_cmd_register(OPENIBNAL, &kibnal_cmd, NULL); if (rc != 0) { CERROR ("Can't initialise command interface (rc = %d)\n", rc); goto failed; } /* flag everything initialised */ - 
koibnal_data.koib_init = OPENIBNAL_INIT_ALL; + kibnal_data.kib_init = IBNAL_INIT_ALL; /*****************************************************/ printk(KERN_INFO "Lustre: OpenIB NAL loaded " @@ -1443,44 +1426,44 @@ koibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid, return (PTL_OK); failed: - koibnal_api_shutdown (&koibnal_api); + kibnal_api_shutdown (&kibnal_api); return (PTL_FAIL); } void __exit -koibnal_module_fini (void) +kibnal_module_fini (void) { #ifdef CONFIG_SYSCTL - if (koibnal_tunables.koib_sysctl != NULL) - unregister_sysctl_table (koibnal_tunables.koib_sysctl); + if (kibnal_tunables.kib_sysctl != NULL) + unregister_sysctl_table (kibnal_tunables.kib_sysctl); #endif - PtlNIFini(koibnal_ni); + PtlNIFini(kibnal_ni); ptl_unregister_nal(OPENIBNAL); } int __init -koibnal_module_init (void) +kibnal_module_init (void) { int rc; /* the following must be sizeof(int) for proc_dointvec() */ - LASSERT(sizeof (koibnal_tunables.koib_io_timeout) == sizeof (int)); + LASSERT(sizeof (kibnal_tunables.kib_io_timeout) == sizeof (int)); - koibnal_api.nal_ni_init = koibnal_api_startup; - koibnal_api.nal_ni_fini = koibnal_api_shutdown; + kibnal_api.nal_ni_init = kibnal_api_startup; + kibnal_api.nal_ni_fini = kibnal_api_shutdown; /* Initialise dynamic tunables to defaults once only */ - koibnal_tunables.koib_io_timeout = OPENIBNAL_IO_TIMEOUT; + kibnal_tunables.kib_io_timeout = IBNAL_IO_TIMEOUT; - rc = ptl_register_nal(OPENIBNAL, &koibnal_api); + rc = ptl_register_nal(OPENIBNAL, &kibnal_api); if (rc != PTL_OK) { - CERROR("Can't register OPENIBNAL: %d\n", rc); + CERROR("Can't register IBNAL: %d\n", rc); return (-ENOMEM); /* or something... */ } /* Pure gateways want the NAL started up at module load time... */ - rc = PtlNIInit(OPENIBNAL, LUSTRE_SRV_PTL_PID, NULL, NULL, &koibnal_ni); + rc = PtlNIInit(OPENIBNAL, LUSTRE_SRV_PTL_PID, NULL, NULL, &kibnal_ni); if (rc != PTL_OK && rc != PTL_IFACE_DUP) { ptl_unregister_nal(OPENIBNAL); return (-ENODEV); @@ -1488,8 +1471,8 @@ koibnal_module_init (void) #ifdef CONFIG_SYSCTL /* Press on regardless even if registering sysctl doesn't work */ - koibnal_tunables.koib_sysctl = - register_sysctl_table (koibnal_top_ctl_table, 0); + kibnal_tunables.kib_sysctl = + register_sysctl_table (kibnal_top_ctl_table, 0); #endif return (0); } @@ -1498,6 +1481,6 @@ MODULE_AUTHOR("Cluster File Systems, Inc. "); MODULE_DESCRIPTION("Kernel OpenIB NAL v0.01"); MODULE_LICENSE("GPL"); -module_init(koibnal_module_init); -module_exit(koibnal_module_fini); +module_init(kibnal_module_init); +module_exit(kibnal_module_fini); diff --git a/lustre/portals/knals/openibnal/openibnal.h b/lustre/portals/knals/openibnal/openibnal.h index 301d3ae..f0610f2 100644 --- a/lustre/portals/knals/openibnal/openibnal.h +++ b/lustre/portals/knals/openibnal/openibnal.h @@ -48,7 +48,7 @@ #include #include -#define DEBUG_SUBSYSTEM S_OPENIBNAL +#define DEBUG_SUBSYSTEM S_IBNAL #include #include @@ -59,144 +59,140 @@ #include #include -#define OPENIBNAL_SERVICE_NAME "openibnal" +#define IBNAL_SERVICE_NAME "openibnal" #if CONFIG_SMP -# define OPENIBNAL_N_SCHED num_online_cpus() /* # schedulers */ +# define IBNAL_N_SCHED num_online_cpus() /* # schedulers */ #else -# define OPENIBNAL_N_SCHED 1 /* # schedulers */ +# define IBNAL_N_SCHED 1 /* # schedulers */ #endif -#define OPENIBNAL_MIN_RECONNECT_INTERVAL HZ /* first failed connection retry... */ -#define OPENIBNAL_MAX_RECONNECT_INTERVAL (60*HZ) /* ...exponentially increasing to this */ +#define IBNAL_MIN_RECONNECT_INTERVAL HZ /* first failed connection retry... 
*/ +#define IBNAL_MAX_RECONNECT_INTERVAL (60*HZ) /* ...exponentially increasing to this */ -#define OPENIBNAL_MSG_SIZE (4<<10) /* max size of queued messages (inc hdr) */ +#define IBNAL_MSG_SIZE (4<<10) /* max size of queued messages (inc hdr) */ -#define OPENIBNAL_MSG_QUEUE_SIZE 8 /* # messages in-flight */ -#define OPENIBNAL_CREDIT_HIGHWATER 6 /* when to eagerly return credits */ -#define OPENIBNAL_RETRY 7 /* # times to retry */ -#define OPENIBNAL_RNR_RETRY 7 /* */ -#define OPENIBNAL_CM_RETRY 7 /* # times to retry connection */ -#define OPENIBNAL_FLOW_CONTROL 1 -#define OPENIBNAL_RESPONDER_RESOURCES 8 +#define IBNAL_MSG_QUEUE_SIZE 8 /* # messages/RDMAs in-flight */ +#define IBNAL_CREDIT_HIGHWATER 6 /* when to eagerly return credits */ +#define IBNAL_RETRY 7 /* # times to retry */ +#define IBNAL_RNR_RETRY 7 /* */ +#define IBNAL_CM_RETRY 7 /* # times to retry connection */ +#define IBNAL_FLOW_CONTROL 1 +#define IBNAL_RESPONDER_RESOURCES 8 -#define OPENIBNAL_NTX 64 /* # tx descs */ -#define OPENIBNAL_NTX_NBLK 256 /* # reserved tx descs */ +#define IBNAL_NTX 64 /* # tx descs */ +#define IBNAL_NTX_NBLK 256 /* # reserved tx descs */ -#define OPENIBNAL_PEER_HASH_SIZE 101 /* # peer lists */ +#define IBNAL_PEER_HASH_SIZE 101 /* # peer lists */ -#define OPENIBNAL_RESCHED 100 /* # scheduler loops before reschedule */ +#define IBNAL_RESCHED 100 /* # scheduler loops before reschedule */ -#define OPENIBNAL_CONCURRENT_PEERS 1000 /* # nodes all talking at once to me */ +#define IBNAL_CONCURRENT_PEERS 1000 /* # nodes all talking at once to me */ /* default vals for runtime tunables */ -#define OPENIBNAL_IO_TIMEOUT 50 /* default comms timeout (seconds) */ +#define IBNAL_IO_TIMEOUT 50 /* default comms timeout (seconds) */ /************************/ /* derived constants... 
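 * (with the defaults above, IBNAL_TX_MSGS = 64 + 256 = 320, so the single
 *  CQ below is sized at 2*320 + 8*1000 = 8640 entries)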
*/ /* TX messages (shared by all connections) */ -#define OPENIBNAL_TX_MSGS (OPENIBNAL_NTX + OPENIBNAL_NTX_NBLK) -#define OPENIBNAL_TX_MSG_BYTES (OPENIBNAL_TX_MSGS * OPENIBNAL_MSG_SIZE) -#define OPENIBNAL_TX_MSG_PAGES ((OPENIBNAL_TX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE) - -/* we may have up to 2 completions per transmit */ -#define OPENIBNAL_TX_CQ_ENTRIES (2*OPENIBNAL_TX_MSGS) +#define IBNAL_TX_MSGS (IBNAL_NTX + IBNAL_NTX_NBLK) +#define IBNAL_TX_MSG_BYTES (IBNAL_TX_MSGS * IBNAL_MSG_SIZE) +#define IBNAL_TX_MSG_PAGES ((IBNAL_TX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE) /* RX messages (per connection) */ -#define OPENIBNAL_RX_MSGS OPENIBNAL_MSG_QUEUE_SIZE -#define OPENIBNAL_RX_MSG_BYTES (OPENIBNAL_RX_MSGS * OPENIBNAL_MSG_SIZE) -#define OPENIBNAL_RX_MSG_PAGES ((OPENIBNAL_RX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE) +#define IBNAL_RX_MSGS IBNAL_MSG_QUEUE_SIZE +#define IBNAL_RX_MSG_BYTES (IBNAL_RX_MSGS * IBNAL_MSG_SIZE) +#define IBNAL_RX_MSG_PAGES ((IBNAL_RX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE) -/* 1 completion per receive, per connection */ -#define OPENIBNAL_RX_CQ_ENTRIES (OPENIBNAL_RX_MSGS * OPENIBNAL_CONCURRENT_PEERS) +/* we may have up to 2 completions per transmit + + 1 completion per receive, per connection */ +#define IBNAL_CQ_ENTRIES ((2*IBNAL_TX_MSGS) + \ + (IBNAL_RX_MSGS * IBNAL_CONCURRENT_PEERS)) -#define OPENIBNAL_RDMA_BASE 0x0eeb0000 -#define OPENIBNAL_FMR 1 -#define OPENIBNAL_CKSUM 0 -//#define OPENIBNAL_CALLBACK_CTXT IB_CQ_CALLBACK_PROCESS -#define OPENIBNAL_CALLBACK_CTXT IB_CQ_CALLBACK_INTERRUPT +#define IBNAL_RDMA_BASE 0x0eeb0000 +#define IBNAL_FMR 1 +#define IBNAL_CKSUM 0 +//#define IBNAL_CALLBACK_CTXT IB_CQ_CALLBACK_PROCESS +#define IBNAL_CALLBACK_CTXT IB_CQ_CALLBACK_INTERRUPT typedef struct { - int koib_io_timeout; /* comms timeout (seconds) */ - struct ctl_table_header *koib_sysctl; /* sysctl interface */ -} koib_tunables_t; + int kib_io_timeout; /* comms timeout (seconds) */ + struct ctl_table_header *kib_sysctl; /* sysctl interface */ +} kib_tunables_t; typedef struct { - int oibp_npages; /* # pages */ - int oibp_mapped; /* mapped? */ - __u64 oibp_vaddr; /* mapped region vaddr */ - __u32 oibp_lkey; /* mapped region lkey */ - __u32 oibp_rkey; /* mapped region rkey */ - struct ib_mr *oibp_handle; /* mapped region handle */ - struct page *oibp_pages[0]; -} koib_pages_t; + int ibp_npages; /* # pages */ + int ibp_mapped; /* mapped? */ + __u64 ibp_vaddr; /* mapped region vaddr */ + __u32 ibp_lkey; /* mapped region lkey */ + __u32 ibp_rkey; /* mapped region rkey */ + struct ib_mr *ibp_handle; /* mapped region handle */ + struct page *ibp_pages[0]; +} kib_pages_t; typedef struct { - int koib_init; /* initialisation state */ - __u64 koib_incarnation; /* which one am I */ - int koib_shutdown; /* shut down? 
*/ - atomic_t koib_nthreads; /* # live threads */ - - __u64 koib_cm_service_id; /* service number I listen on */ - ptl_nid_t koib_nid; /* my NID */ - struct semaphore koib_nid_mutex; /* serialise NID ops */ - struct semaphore koib_nid_signal; /* signal completion */ - - rwlock_t koib_global_lock; /* stabilize peer/conn ops */ - - struct list_head *koib_peers; /* hash table of all my known peers */ - int koib_peer_hash_size; /* size of koib_peers */ - atomic_t koib_npeers; /* # peers extant */ - atomic_t koib_nconns; /* # connections extant */ - - struct list_head koib_connd_conns; /* connections to progress */ - struct list_head koib_connd_peers; /* peers waiting for a connection */ - wait_queue_head_t koib_connd_waitq; /* connection daemons sleep here */ - unsigned long koib_connd_waketime; /* when connd will wake */ - spinlock_t koib_connd_lock; /* serialise */ - - wait_queue_head_t koib_sched_waitq; /* schedulers sleep here */ - struct list_head koib_sched_txq; /* tx requiring attention */ - struct list_head koib_sched_rxq; /* rx requiring attention */ - spinlock_t koib_sched_lock; /* serialise */ + int kib_init; /* initialisation state */ + __u64 kib_incarnation; /* which one am I */ + int kib_shutdown; /* shut down? */ + atomic_t kib_nthreads; /* # live threads */ + + __u64 kib_service_id; /* service number I listen on */ + ptl_nid_t kib_nid; /* my NID */ + struct semaphore kib_nid_mutex; /* serialise NID ops */ + struct semaphore kib_nid_signal; /* signal completion */ + + rwlock_t kib_global_lock; /* stabilize peer/conn ops */ + + struct list_head *kib_peers; /* hash table of all my known peers */ + int kib_peer_hash_size; /* size of kib_peers */ + atomic_t kib_npeers; /* # peers extant */ + atomic_t kib_nconns; /* # connections extant */ + + struct list_head kib_connd_conns; /* connections to progress */ + struct list_head kib_connd_peers; /* peers waiting for a connection */ + wait_queue_head_t kib_connd_waitq; /* connection daemons sleep here */ + unsigned long kib_connd_waketime; /* when connd will wake */ + spinlock_t kib_connd_lock; /* serialise */ + + wait_queue_head_t kib_sched_waitq; /* schedulers sleep here */ + struct list_head kib_sched_txq; /* tx requiring attention */ + struct list_head kib_sched_rxq; /* rx requiring attention */ + spinlock_t kib_sched_lock; /* serialise */ - struct koib_tx *koib_tx_descs; /* all the tx descriptors */ - koib_pages_t *koib_tx_pages; /* premapped tx msg pages */ - - struct list_head koib_idle_txs; /* idle tx descriptors */ - struct list_head koib_idle_nblk_txs; /* idle reserved tx descriptors */ - wait_queue_head_t koib_idle_tx_waitq; /* block here for tx descriptor */ - __u64 koib_next_tx_cookie; /* RDMA completion cookie */ - spinlock_t koib_tx_lock; /* serialise */ + struct kib_tx *kib_tx_descs; /* all the tx descriptors */ + kib_pages_t *kib_tx_pages; /* premapped tx msg pages */ + + struct list_head kib_idle_txs; /* idle tx descriptors */ + struct list_head kib_idle_nblk_txs; /* idle reserved tx descriptors */ + wait_queue_head_t kib_idle_tx_waitq; /* block here for tx descriptor */ + __u64 kib_next_tx_cookie; /* RDMA completion cookie */ + spinlock_t kib_tx_lock; /* serialise */ - struct ib_device *koib_device; /* "the" device */ - struct ib_device_properties koib_device_props; /* its properties */ - int koib_port; /* port on the device */ - struct ib_port_properties koib_port_props; /* its properties */ - struct ib_pd *koib_pd; /* protection domain */ -#if OPENIBNAL_FMR - struct ib_fmr_pool *koib_fmr_pool; /* fast memory region 
pool */ + struct ib_device *kib_device; /* "the" device */ + struct ib_device_properties kib_device_props; /* its properties */ + int kib_port; /* port on the device */ + struct ib_port_properties kib_port_props; /* its properties */ + struct ib_pd *kib_pd; /* protection domain */ +#if IBNAL_FMR + struct ib_fmr_pool *kib_fmr_pool; /* fast memory region pool */ #endif - struct ib_cq *koib_rx_cq; /* receive completion queue */ - struct ib_cq *koib_tx_cq; /* transmit completion queue */ - void *koib_listen_handle; /* where I listen for connections */ - struct ib_common_attrib_service koib_service; /* SM service */ + struct ib_cq *kib_cq; /* completion queue */ + void *kib_listen_handle; /* where I listen for connections */ -} koib_data_t; - -#define OPENIBNAL_INIT_NOTHING 0 -#define OPENIBNAL_INIT_DATA 1 -#define OPENIBNAL_INIT_LIB 2 -#define OPENIBNAL_INIT_PD 3 -#define OPENIBNAL_INIT_FMR 4 -#define OPENIBNAL_INIT_TXD 5 -#define OPENIBNAL_INIT_RX_CQ 6 -#define OPENIBNAL_INIT_TX_CQ 7 -#define OPENIBNAL_INIT_ALL 8 +} kib_data_t; + +#define IBNAL_INIT_NOTHING 0 +#define IBNAL_INIT_DATA 1 +#define IBNAL_INIT_LIB 2 +#define IBNAL_INIT_PD 3 +#define IBNAL_INIT_FMR 4 +#define IBNAL_INIT_TXD 5 +#define IBNAL_INIT_CQ 6 +#define IBNAL_INIT_ALL 7 /************************************************************************ * Wire message structs. @@ -214,125 +210,125 @@ typedef struct __u32 md_lkey; __u32 md_rkey; __u64 md_addr; -} koib_md_t; +} kib_md_t; typedef struct { __u32 rd_key; /* remote key */ __u32 rd_nob; /* # of bytes */ __u64 rd_addr; /* remote io vaddr */ -} koib_rdma_desc_t; +} kib_rdma_desc_t; typedef struct { - ptl_hdr_t oibim_hdr; /* portals header */ - char oibim_payload[0]; /* piggy-backed payload */ -} koib_immediate_msg_t; + ptl_hdr_t ibim_hdr; /* portals header */ + char ibim_payload[0]; /* piggy-backed payload */ +} kib_immediate_msg_t; typedef struct { - ptl_hdr_t oibrm_hdr; /* portals header */ - __u64 oibrm_cookie; /* opaque completion cookie */ - koib_rdma_desc_t oibrm_desc; /* where to suck/blow */ -} koib_rdma_msg_t; + ptl_hdr_t ibrm_hdr; /* portals header */ + __u64 ibrm_cookie; /* opaque completion cookie */ + kib_rdma_desc_t ibrm_desc; /* where to suck/blow */ +} kib_rdma_msg_t; typedef struct { - __u64 oibcm_cookie; /* opaque completion cookie */ - __u32 oibcm_status; /* completion status */ -} koib_completion_msg_t; + __u64 ibcm_cookie; /* opaque completion cookie */ + __u32 ibcm_status; /* completion status */ +} kib_completion_msg_t; typedef struct { - __u32 oibm_magic; /* I'm an openibnal message */ - __u16 oibm_version; /* this is my version number */ - __u8 oibm_type; /* msg type */ - __u8 oibm_credits; /* returned credits */ -#if OPENIBNAL_CKSUM - __u32 oibm_nob; - __u32 oibm_cksum; + __u32 ibm_magic; /* I'm an openibnal message */ + __u16 ibm_version; /* this is my version number */ + __u8 ibm_type; /* msg type */ + __u8 ibm_credits; /* returned credits */ +#if IBNAL_CKSUM + __u32 ibm_nob; + __u32 ibm_cksum; #endif union { - koib_immediate_msg_t immediate; - koib_rdma_msg_t rdma; - koib_completion_msg_t completion; - } oibm_u; -} koib_msg_t; - -#define OPENIBNAL_MSG_MAGIC 0x0be91b91 /* unique magic */ -#define OPENIBNAL_MSG_VERSION 1 /* current protocol version */ - -#define OPENIBNAL_MSG_NOOP 0xd0 /* nothing (just credits) */ -#define OPENIBNAL_MSG_IMMEDIATE 0xd1 /* portals hdr + payload */ -#define OPENIBNAL_MSG_PUT_RDMA 0xd2 /* portals PUT hdr + source rdma desc */ -#define OPENIBNAL_MSG_PUT_DONE 0xd3 /* signal PUT rdma completion */ -#define 
OPENIBNAL_MSG_GET_RDMA 0xd4 /* portals GET hdr + sink rdma desc */ -#define OPENIBNAL_MSG_GET_DONE 0xd5 /* signal GET rdma completion */ + kib_immediate_msg_t immediate; + kib_rdma_msg_t rdma; + kib_completion_msg_t completion; + } ibm_u; +} kib_msg_t; + +#define IBNAL_MSG_MAGIC 0x0be91b91 /* unique magic */ +#define IBNAL_MSG_VERSION 1 /* current protocol version */ + +#define IBNAL_MSG_NOOP 0xd0 /* nothing (just credits) */ +#define IBNAL_MSG_IMMEDIATE 0xd1 /* portals hdr + payload */ +#define IBNAL_MSG_PUT_RDMA 0xd2 /* portals PUT hdr + source rdma desc */ +#define IBNAL_MSG_PUT_DONE 0xd3 /* signal PUT rdma completion */ +#define IBNAL_MSG_GET_RDMA 0xd4 /* portals GET hdr + sink rdma desc */ +#define IBNAL_MSG_GET_DONE 0xd5 /* signal GET rdma completion */ /***********************************************************************/ -typedef struct koib_rx /* receive message */ +typedef struct kib_rx /* receive message */ { struct list_head rx_list; /* queue for attention */ - struct koib_conn *rx_conn; /* owning conn */ + struct kib_conn *rx_conn; /* owning conn */ int rx_rdma; /* RDMA completion posted? */ int rx_posted; /* posted? */ __u64 rx_vaddr; /* pre-mapped buffer (hca vaddr) */ - koib_msg_t *rx_msg; /* pre-mapped buffer (host vaddr) */ + kib_msg_t *rx_msg; /* pre-mapped buffer (host vaddr) */ struct ib_receive_param rx_sp; /* receive work item */ struct ib_gather_scatter rx_gl; /* and it's memory */ -} koib_rx_t; +} kib_rx_t; -typedef struct koib_tx /* transmit message */ +typedef struct kib_tx /* transmit message */ { struct list_head tx_list; /* queue on idle_txs ibc_tx_queue etc. */ int tx_isnblk; /* I'm reserved for non-blocking sends */ - struct koib_conn *tx_conn; /* owning conn */ + struct kib_conn *tx_conn; /* owning conn */ int tx_mapped; /* mapped for RDMA? */ int tx_sending; /* # tx callbacks outstanding */ int tx_status; /* completion status */ - int tx_passive_rdma; /* waiting for peer to RDMA? */ - int tx_passive_rdma_wait; /* on ibc_rdma_queue */ - unsigned long tx_passive_rdma_deadline; /* completion deadline */ + unsigned long tx_deadline; /* completion deadline */ + int tx_passive_rdma; /* peer sucks/blows */ + int tx_passive_rdma_wait; /* waiting for peer to complete */ __u64 tx_passive_rdma_cookie; /* completion cookie */ lib_msg_t *tx_libmsg[2]; /* lib msgs to finalize on completion */ - koib_md_t tx_md; /* RDMA mapping (active/passive) */ + kib_md_t tx_md; /* RDMA mapping (active/passive) */ __u64 tx_vaddr; /* pre-mapped buffer (hca vaddr) */ - koib_msg_t *tx_msg; /* pre-mapped buffer (host vaddr) */ + kib_msg_t *tx_msg; /* pre-mapped buffer (host vaddr) */ int tx_nsp; /* # send work items */ struct ib_send_param tx_sp[2]; /* send work items... 
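 * (two slots per tx: cf. kibnal_create_conn() sizing
 *  max_outstanding_send_request at 2 * IBNAL_MSG_QUEUE_SIZE with the
 *  note "Sends have an optional RDMA")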
*/ struct ib_gather_scatter tx_gl[2]; /* ...and their memory */ -} koib_tx_t; +} kib_tx_t; -#define KOIB_TX_UNMAPPED 0 -#define KOIB_TX_MAPPED 1 -#define KOIB_TX_MAPPED_FMR 2 +#define KIB_TX_UNMAPPED 0 +#define KIB_TX_MAPPED 1 +#define KIB_TX_MAPPED_FMR 2 -typedef struct koib_wire_connreq +typedef struct kib_wire_connreq { __u32 wcr_magic; /* I'm an openibnal connreq */ __u16 wcr_version; /* this is my version number */ __u16 wcr_queue_depth; /* this is my receive queue size */ __u64 wcr_nid; /* peer's NID */ __u64 wcr_incarnation; /* peer's incarnation */ -} koib_wire_connreq_t; +} kib_wire_connreq_t; -typedef struct koib_connreq +typedef struct kib_connreq { /* connection-in-progress */ - struct koib_conn *cr_conn; - koib_wire_connreq_t cr_wcr; + struct kib_conn *cr_conn; + kib_wire_connreq_t cr_wcr; __u64 cr_tid; struct ib_common_attrib_service cr_service; tTS_IB_GID cr_gid; struct ib_path_record cr_path; struct ib_cm_active_param cr_connparam; -} koib_connreq_t; +} kib_connreq_t; -typedef struct koib_conn +typedef struct kib_conn { - struct koib_peer *ibc_peer; /* owning peer */ + struct kib_peer *ibc_peer; /* owning peer */ struct list_head ibc_list; /* stash on peer's conn list */ __u64 ibc_incarnation; /* which instance of the peer */ atomic_t ibc_refcount; /* # users */ @@ -342,27 +338,27 @@ typedef struct koib_conn int ibc_credits; /* # credits I have */ int ibc_outstanding_credits; /* # credits to return */ struct list_head ibc_tx_queue; /* send queue */ - struct list_head ibc_rdma_queue; /* tx awaiting RDMA completion */ + struct list_head ibc_active_txs; /* active tx awaiting completion */ spinlock_t ibc_lock; /* serialise */ - koib_rx_t *ibc_rxs; /* the rx descs */ - koib_pages_t *ibc_rx_pages; /* premapped rx msg pages */ + kib_rx_t *ibc_rxs; /* the rx descs */ + kib_pages_t *ibc_rx_pages; /* premapped rx msg pages */ struct ib_qp *ibc_qp; /* queue pair */ __u32 ibc_qpn; /* queue pair number */ tTS_IB_CM_COMM_ID ibc_comm_id; /* connection ID? 
*/ - koib_connreq_t *ibc_connreq; /* connection request state */ -} koib_conn_t; + kib_connreq_t *ibc_connreq; /* connection request state */ +} kib_conn_t; -#define OPENIBNAL_CONN_INIT_NOTHING 0 /* initial state */ -#define OPENIBNAL_CONN_INIT_QP 1 /* ibc_qp set up */ -#define OPENIBNAL_CONN_CONNECTING 2 /* started to connect */ -#define OPENIBNAL_CONN_ESTABLISHED 3 /* connection established */ -#define OPENIBNAL_CONN_DEATHROW 4 /* waiting to be closed */ -#define OPENIBNAL_CONN_ZOMBIE 5 /* waiting to be freed */ +#define IBNAL_CONN_INIT_NOTHING 0 /* initial state */ +#define IBNAL_CONN_INIT_QP 1 /* ibc_qp set up */ +#define IBNAL_CONN_CONNECTING 2 /* started to connect */ +#define IBNAL_CONN_ESTABLISHED 3 /* connection established */ +#define IBNAL_CONN_DEATHROW 4 /* waiting to be closed */ +#define IBNAL_CONN_ZOMBIE 5 /* waiting to be freed */ -typedef struct koib_peer +typedef struct kib_peer { struct list_head ibp_list; /* stash on global peer list */ - struct list_head ibp_connd_list; /* schedule on koib_connd_peers */ + struct list_head ibp_connd_list; /* schedule on kib_connd_peers */ ptl_nid_t ibp_nid; /* who's on the other end(s) */ atomic_t ibp_refcount; /* # users */ int ibp_persistence; /* "known" peer refs */ @@ -371,30 +367,30 @@ typedef struct koib_peer int ibp_connecting; /* connecting+accepting */ unsigned long ibp_reconnect_time; /* when reconnect may be attempted */ unsigned long ibp_reconnect_interval; /* exponential backoff */ -} koib_peer_t; +} kib_peer_t; -extern lib_nal_t koibnal_lib; -extern koib_data_t koibnal_data; -extern koib_tunables_t koibnal_tunables; +extern lib_nal_t kibnal_lib; +extern kib_data_t kibnal_data; +extern kib_tunables_t kibnal_tunables; static inline struct list_head * -koibnal_nid2peerlist (ptl_nid_t nid) +kibnal_nid2peerlist (ptl_nid_t nid) { - unsigned int hash = ((unsigned int)nid) % koibnal_data.koib_peer_hash_size; + unsigned int hash = ((unsigned int)nid) % kibnal_data.kib_peer_hash_size; - return (&koibnal_data.koib_peers [hash]); + return (&kibnal_data.kib_peers [hash]); } static inline int -koibnal_peer_active(koib_peer_t *peer) +kibnal_peer_active(kib_peer_t *peer) { /* Am I in the peer hash table? 
*/ return (!list_empty(&peer->ibp_list)); } static inline void -koibnal_queue_tx_locked (koib_tx_t *tx, koib_conn_t *conn) +kibnal_queue_tx_locked (kib_tx_t *tx, kib_conn_t *conn) { /* CAVEAT EMPTOR: tx takes caller's ref on conn */ @@ -402,40 +398,41 @@ koibnal_queue_tx_locked (koib_tx_t *tx, koib_conn_t *conn) LASSERT (tx->tx_conn == NULL); /* only set here */ tx->tx_conn = conn; + tx->tx_deadline = jiffies + kibnal_tunables.kib_io_timeout * HZ; list_add_tail(&tx->tx_list, &conn->ibc_tx_queue); } -#define KOIBNAL_SERVICE_KEY_MASK (IB_SA_SERVICE_COMP_MASK_NAME | \ - IB_SA_SERVICE_COMP_MASK_DATA8_1 | \ - IB_SA_SERVICE_COMP_MASK_DATA8_2 | \ - IB_SA_SERVICE_COMP_MASK_DATA8_3 | \ - IB_SA_SERVICE_COMP_MASK_DATA8_4 | \ - IB_SA_SERVICE_COMP_MASK_DATA8_5 | \ - IB_SA_SERVICE_COMP_MASK_DATA8_6 | \ - IB_SA_SERVICE_COMP_MASK_DATA8_7 | \ - IB_SA_SERVICE_COMP_MASK_DATA8_8) +#define KIBNAL_SERVICE_KEY_MASK (IB_SA_SERVICE_COMP_MASK_NAME | \ + IB_SA_SERVICE_COMP_MASK_DATA8_1 | \ + IB_SA_SERVICE_COMP_MASK_DATA8_2 | \ + IB_SA_SERVICE_COMP_MASK_DATA8_3 | \ + IB_SA_SERVICE_COMP_MASK_DATA8_4 | \ + IB_SA_SERVICE_COMP_MASK_DATA8_5 | \ + IB_SA_SERVICE_COMP_MASK_DATA8_6 | \ + IB_SA_SERVICE_COMP_MASK_DATA8_7 | \ + IB_SA_SERVICE_COMP_MASK_DATA8_8) static inline __u64* -koibnal_service_nid_field(struct ib_common_attrib_service *srv) +kibnal_service_nid_field(struct ib_common_attrib_service *srv) { - /* must be consistent with KOIBNAL_SERVICE_KEY_MASK */ + /* must be consistent with KIBNAL_SERVICE_KEY_MASK */ return (__u64 *)srv->service_data8; } static inline void -koibnal_set_service_keys(struct ib_common_attrib_service *srv, ptl_nid_t nid) +kibnal_set_service_keys(struct ib_common_attrib_service *srv, ptl_nid_t nid) { - LASSERT (strlen (OPENIBNAL_SERVICE_NAME) < sizeof(srv->service_name)); + LASSERT (strlen (IBNAL_SERVICE_NAME) < sizeof(srv->service_name)); memset (srv->service_name, 0, sizeof(srv->service_name)); - strcpy (srv->service_name, OPENIBNAL_SERVICE_NAME); + strcpy (srv->service_name, IBNAL_SERVICE_NAME); - *koibnal_service_nid_field(srv) = cpu_to_le64(nid); + *kibnal_service_nid_field(srv) = cpu_to_le64(nid); } #if 0 static inline void -koibnal_show_rdma_attr (koib_conn_t *conn) +kibnal_show_rdma_attr (kib_conn_t *conn) { struct ib_qp_attribute qp_attr; int rc; @@ -457,7 +454,7 @@ koibnal_show_rdma_attr (koib_conn_t *conn) #if CONFIG_X86 static inline __u64 -koibnal_page2phys (struct page *p) +kibnal_page2phys (struct page *p) { __u64 page_number = p - mem_map; @@ -467,42 +464,69 @@ koibnal_page2phys (struct page *p) # error "no page->phys" #endif -extern koib_peer_t *koibnal_create_peer (ptl_nid_t nid); -extern void koibnal_put_peer (koib_peer_t *peer); -extern int koibnal_del_peer (ptl_nid_t nid, int single_share); -extern koib_peer_t *koibnal_find_peer_locked (ptl_nid_t nid); -extern void koibnal_unlink_peer_locked (koib_peer_t *peer); -extern int koibnal_close_stale_conns_locked (koib_peer_t *peer, +/* CAVEAT EMPTOR: + * We rely on tx/rx descriptor alignment to allow us to use the lowest bit + * of the work request id as a flag to determine if the completion is for a + * transmit or a receive. It seems that that the CQ entry's 'op' field + * isn't always set correctly on completions that occur after QP teardown. */ + +static inline __u64 +kibnal_ptr2wreqid (void *ptr, int isrx) +{ + unsigned long lptr = (unsigned long)ptr; + + LASSERT ((lptr & 1) == 0); + return (__u64)(lptr | (isrx ? 
1 : 0)); +} + +static inline void * +kibnal_wreqid2ptr (__u64 wreqid) +{ + return (void *)(((unsigned long)wreqid) & ~1UL); +} + +static inline int +kibnal_wreqid_is_rx (__u64 wreqid) +{ + return (wreqid & 1) != 0; +} + +extern kib_peer_t *kibnal_create_peer (ptl_nid_t nid); +extern void kibnal_put_peer (kib_peer_t *peer); +extern int kibnal_del_peer (ptl_nid_t nid, int single_share); +extern kib_peer_t *kibnal_find_peer_locked (ptl_nid_t nid); +extern void kibnal_unlink_peer_locked (kib_peer_t *peer); +extern int kibnal_close_stale_conns_locked (kib_peer_t *peer, __u64 incarnation); -extern koib_conn_t *koibnal_create_conn (void); -extern void koibnal_put_conn (koib_conn_t *conn); -extern void koibnal_destroy_conn (koib_conn_t *conn); -extern int koibnal_alloc_pages (koib_pages_t **pp, int npages, int access); -extern void koibnal_free_pages (koib_pages_t *p); +extern kib_conn_t *kibnal_create_conn (void); +extern void kibnal_put_conn (kib_conn_t *conn); +extern void kibnal_destroy_conn (kib_conn_t *conn); +extern int kibnal_alloc_pages (kib_pages_t **pp, int npages, int access); +extern void kibnal_free_pages (kib_pages_t *p); -extern void koibnal_check_sends (koib_conn_t *conn); +extern void kibnal_check_sends (kib_conn_t *conn); extern tTS_IB_CM_CALLBACK_RETURN -koibnal_conn_callback (tTS_IB_CM_EVENT event, tTS_IB_CM_COMM_ID cid, +kibnal_conn_callback (tTS_IB_CM_EVENT event, tTS_IB_CM_COMM_ID cid, void *param, void *arg); extern tTS_IB_CM_CALLBACK_RETURN -koibnal_passive_conn_callback (tTS_IB_CM_EVENT event, tTS_IB_CM_COMM_ID cid, +kibnal_passive_conn_callback (tTS_IB_CM_EVENT event, tTS_IB_CM_COMM_ID cid, void *param, void *arg); -extern void koibnal_close_conn_locked (koib_conn_t *conn, int error); -extern void koibnal_destroy_conn (koib_conn_t *conn); -extern int koibnal_thread_start (int (*fn)(void *arg), void *arg); -extern int koibnal_scheduler(void *arg); -extern int koibnal_connd (void *arg); -extern void koibnal_rx_callback (struct ib_cq *cq, struct ib_cq_entry *e, void *arg); -extern void koibnal_tx_callback (struct ib_cq *cq, struct ib_cq_entry *e, void *arg); -extern void koibnal_init_tx_msg (koib_tx_t *tx, int type, int body_nob); -extern int koibnal_close_conn (koib_conn_t *conn, int why); -extern void koibnal_start_active_rdma (int type, int status, - koib_rx_t *rx, lib_msg_t *libmsg, - unsigned int niov, - struct iovec *iov, ptl_kiov_t *kiov, - size_t offset, size_t nob); +extern void kibnal_close_conn_locked (kib_conn_t *conn, int error); +extern void kibnal_destroy_conn (kib_conn_t *conn); +extern int kibnal_thread_start (int (*fn)(void *arg), void *arg); +extern int kibnal_scheduler(void *arg); +extern int kibnal_connd (void *arg); +extern void kibnal_callback (struct ib_cq *cq, struct ib_cq_entry *e, void *arg); +extern void kibnal_init_tx_msg (kib_tx_t *tx, int type, int body_nob); +extern int kibnal_close_conn (kib_conn_t *conn, int why); +extern void kibnal_start_active_rdma (int type, int status, + kib_rx_t *rx, lib_msg_t *libmsg, + unsigned int niov, + struct iovec *iov, ptl_kiov_t *kiov, + size_t offset, size_t nob); + diff --git a/lustre/portals/knals/openibnal/openibnal_cb.c b/lustre/portals/knals/openibnal/openibnal_cb.c index 79bf37a..d774853 100644 --- a/lustre/portals/knals/openibnal/openibnal_cb.c +++ b/lustre/portals/knals/openibnal/openibnal_cb.c @@ -28,20 +28,20 @@ * */ void -koibnal_schedule_tx_done (koib_tx_t *tx) +kibnal_schedule_tx_done (kib_tx_t *tx) { unsigned long flags; - spin_lock_irqsave (&koibnal_data.koib_sched_lock, flags); + 
spin_lock_irqsave (&kibnal_data.kib_sched_lock, flags); - list_add_tail(&tx->tx_list, &koibnal_data.koib_sched_txq); - wake_up (&koibnal_data.koib_sched_waitq); + list_add_tail(&tx->tx_list, &kibnal_data.kib_sched_txq); + wake_up (&kibnal_data.kib_sched_waitq); - spin_unlock_irqrestore(&koibnal_data.koib_sched_lock, flags); + spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags); } void -koibnal_tx_done (koib_tx_t *tx) +kibnal_tx_done (kib_tx_t *tx) { ptl_err_t ptlrc = (tx->tx_status == 0) ? PTL_OK : PTL_FAIL; unsigned long flags; @@ -49,31 +49,31 @@ koibnal_tx_done (koib_tx_t *tx) int rc; LASSERT (tx->tx_sending == 0); /* mustn't be awaiting callback */ - LASSERT (!tx->tx_passive_rdma_wait); /* mustn't be on ibc_rdma_queue */ + LASSERT (!tx->tx_passive_rdma_wait); /* mustn't be awaiting RDMA */ switch (tx->tx_mapped) { default: LBUG(); - case KOIB_TX_UNMAPPED: + case KIB_TX_UNMAPPED: break; - case KOIB_TX_MAPPED: + case KIB_TX_MAPPED: if (in_interrupt()) { /* can't deregister memory in IRQ context... */ - koibnal_schedule_tx_done(tx); + kibnal_schedule_tx_done(tx); return; } rc = ib_memory_deregister(tx->tx_md.md_handle.mr); LASSERT (rc == 0); - tx->tx_mapped = KOIB_TX_UNMAPPED; + tx->tx_mapped = KIB_TX_UNMAPPED; break; -#if OPENIBNAL_FMR - case KOIB_TX_MAPPED_FMR: +#if IBNAL_FMR + case KIB_TX_MAPPED_FMR: if (in_interrupt() && tx->tx_status != 0) { /* can't flush FMRs in IRQ context... */ - koibnal_schedule_tx_done(tx); + kibnal_schedule_tx_done(tx); return; } @@ -81,8 +81,8 @@ koibnal_tx_done (koib_tx_t *tx) LASSERT (rc == 0); if (tx->tx_status != 0) - ib_fmr_pool_force_flush(koibnal_data.koib_fmr_pool); - tx->tx_mapped = KOIB_TX_UNMAPPED; + ib_fmr_pool_force_flush(kibnal_data.kib_fmr_pool); + tx->tx_mapped = KIB_TX_UNMAPPED; break; #endif } @@ -92,12 +92,12 @@ koibnal_tx_done (koib_tx_t *tx) if (tx->tx_libmsg[i] == NULL) continue; - lib_finalize (&koibnal_lib, NULL, tx->tx_libmsg[i], ptlrc); + lib_finalize (&kibnal_lib, NULL, tx->tx_libmsg[i], ptlrc); tx->tx_libmsg[i] = NULL; } if (tx->tx_conn != NULL) { - koibnal_put_conn (tx->tx_conn); + kibnal_put_conn (tx->tx_conn); tx->tx_conn = NULL; } @@ -105,52 +105,52 @@ koibnal_tx_done (koib_tx_t *tx) tx->tx_passive_rdma = 0; tx->tx_status = 0; - spin_lock_irqsave (&koibnal_data.koib_tx_lock, flags); + spin_lock_irqsave (&kibnal_data.kib_tx_lock, flags); if (tx->tx_isnblk) { - list_add_tail (&tx->tx_list, &koibnal_data.koib_idle_nblk_txs); + list_add_tail (&tx->tx_list, &kibnal_data.kib_idle_nblk_txs); } else { - list_add_tail (&tx->tx_list, &koibnal_data.koib_idle_txs); - wake_up (&koibnal_data.koib_idle_tx_waitq); + list_add_tail (&tx->tx_list, &kibnal_data.kib_idle_txs); + wake_up (&kibnal_data.kib_idle_tx_waitq); } - spin_unlock_irqrestore (&koibnal_data.koib_tx_lock, flags); + spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags); } -koib_tx_t * -koibnal_get_idle_tx (int may_block) +kib_tx_t * +kibnal_get_idle_tx (int may_block) { - unsigned long flags; - koib_tx_t *tx = NULL; + unsigned long flags; + kib_tx_t *tx = NULL; for (;;) { - spin_lock_irqsave (&koibnal_data.koib_tx_lock, flags); + spin_lock_irqsave (&kibnal_data.kib_tx_lock, flags); /* "normal" descriptor is free */ - if (!list_empty (&koibnal_data.koib_idle_txs)) { - tx = list_entry (koibnal_data.koib_idle_txs.next, - koib_tx_t, tx_list); + if (!list_empty (&kibnal_data.kib_idle_txs)) { + tx = list_entry (kibnal_data.kib_idle_txs.next, + kib_tx_t, tx_list); break; } if (!may_block) { /* may dip into reserve pool */ - if (list_empty (&koibnal_data.koib_idle_nblk_txs)) { 
+ if (list_empty (&kibnal_data.kib_idle_nblk_txs)) { CERROR ("reserved tx desc pool exhausted\n"); break; } - tx = list_entry (koibnal_data.koib_idle_nblk_txs.next, - koib_tx_t, tx_list); + tx = list_entry (kibnal_data.kib_idle_nblk_txs.next, + kib_tx_t, tx_list); break; } /* block for idle tx */ - spin_unlock_irqrestore (&koibnal_data.koib_tx_lock, flags); + spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags); - wait_event (koibnal_data.koib_idle_tx_waitq, - !list_empty (&koibnal_data.koib_idle_txs) || - koibnal_data.koib_shutdown); + wait_event (kibnal_data.kib_idle_tx_waitq, + !list_empty (&kibnal_data.kib_idle_txs) || + kibnal_data.kib_shutdown); } if (tx != NULL) { @@ -159,9 +159,9 @@ koibnal_get_idle_tx (int may_block) /* Allocate a new passive RDMA completion cookie. It might * not be needed, but we've got a lock right now and we're * unlikely to wrap... */ - tx->tx_passive_rdma_cookie = koibnal_data.koib_next_tx_cookie++; + tx->tx_passive_rdma_cookie = kibnal_data.kib_next_tx_cookie++; - LASSERT (tx->tx_mapped == KOIB_TX_UNMAPPED); + LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED); LASSERT (tx->tx_nsp == 0); LASSERT (tx->tx_sending == 0); LASSERT (tx->tx_status == 0); @@ -172,15 +172,15 @@ koibnal_get_idle_tx (int may_block) LASSERT (tx->tx_libmsg[1] == NULL); } - spin_unlock_irqrestore (&koibnal_data.koib_tx_lock, flags); + spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags); return (tx); } int -koibnal_dist(lib_nal_t *nal, ptl_nid_t nid, unsigned long *dist) +kibnal_dist(lib_nal_t *nal, ptl_nid_t nid, unsigned long *dist) { - /* I would guess that if koibnal_get_peer (nid) == NULL, + /* I would guess that if kibnal_get_peer (nid) == NULL, and we're not routing, then 'nid' is very distant :) */ if ( nal->libnal_ni.ni_pid.nid == nid ) { *dist = 0; @@ -192,7 +192,7 @@ koibnal_dist(lib_nal_t *nal, ptl_nid_t nid, unsigned long *dist) } void -koibnal_complete_passive_rdma(koib_conn_t *conn, __u64 cookie, int status) +kibnal_complete_passive_rdma(kib_conn_t *conn, __u64 cookie, int status) { struct list_head *ttmp; unsigned long flags; @@ -200,30 +200,34 @@ koibnal_complete_passive_rdma(koib_conn_t *conn, __u64 cookie, int status) spin_lock_irqsave (&conn->ibc_lock, flags); - list_for_each (ttmp, &conn->ibc_rdma_queue) { - koib_tx_t *tx = list_entry(ttmp, koib_tx_t, tx_list); - - LASSERT (tx->tx_passive_rdma); - LASSERT (tx->tx_passive_rdma_wait); + list_for_each (ttmp, &conn->ibc_active_txs) { + kib_tx_t *tx = list_entry(ttmp, kib_tx_t, tx_list); - if (tx->tx_passive_rdma_cookie != cookie) - continue; + LASSERT (tx->tx_passive_rdma || + !tx->tx_passive_rdma_wait); - CDEBUG(D_NET, "Complete %p "LPD64"\n", tx, cookie); + LASSERT (tx->tx_passive_rdma_wait || + tx->tx_sending != 0); - list_del (&tx->tx_list); + if (!tx->tx_passive_rdma_wait || + tx->tx_passive_rdma_cookie != cookie) + continue; + + CDEBUG(D_NET, "Complete %p "LPD64": %d\n", tx, cookie, status); + tx->tx_status = status; tx->tx_passive_rdma_wait = 0; idle = (tx->tx_sending == 0); - tx->tx_status = status; + if (idle) + list_del (&tx->tx_list); spin_unlock_irqrestore (&conn->ibc_lock, flags); /* I could be racing with tx callbacks. 
It's whoever * _makes_ tx idle that frees it */ if (idle) - koibnal_tx_done (tx); + kibnal_tx_done (tx); return; } @@ -234,32 +238,32 @@ koibnal_complete_passive_rdma(koib_conn_t *conn, __u64 cookie, int status) } void -koibnal_post_rx (koib_rx_t *rx, int do_credits) +kibnal_post_rx (kib_rx_t *rx, int do_credits) { - koib_conn_t *conn = rx->rx_conn; + kib_conn_t *conn = rx->rx_conn; int rc; unsigned long flags; rx->rx_gl = (struct ib_gather_scatter) { .address = rx->rx_vaddr, - .length = OPENIBNAL_MSG_SIZE, - .key = conn->ibc_rx_pages->oibp_lkey, + .length = IBNAL_MSG_SIZE, + .key = conn->ibc_rx_pages->ibp_lkey, }; - + rx->rx_sp = (struct ib_receive_param) { - .work_request_id = (__u64)(unsigned long)rx, + .work_request_id = kibnal_ptr2wreqid(rx, 1), .scatter_list = &rx->rx_gl, .num_scatter_entries = 1, .device_specific = NULL, .signaled = 1, }; - LASSERT (conn->ibc_state >= OPENIBNAL_CONN_ESTABLISHED); + LASSERT (conn->ibc_state >= IBNAL_CONN_ESTABLISHED); LASSERT (!rx->rx_posted); rx->rx_posted = 1; mb(); - if (conn->ibc_state != OPENIBNAL_CONN_ESTABLISHED) + if (conn->ibc_state != IBNAL_CONN_ESTABLISHED) rc = -ECONNABORTED; else rc = ib_receive (conn->ibc_qp, &rx->rx_sp, 1); @@ -270,26 +274,26 @@ koibnal_post_rx (koib_rx_t *rx, int do_credits) conn->ibc_outstanding_credits++; spin_unlock_irqrestore(&conn->ibc_lock, flags); - koibnal_check_sends(conn); + kibnal_check_sends(conn); } return; } - if (conn->ibc_state == OPENIBNAL_CONN_ESTABLISHED) { + if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) { CERROR ("Error posting receive -> "LPX64": %d\n", conn->ibc_peer->ibp_nid, rc); - koibnal_close_conn (rx->rx_conn, rc); + kibnal_close_conn (rx->rx_conn, rc); } else { CDEBUG (D_NET, "Error posting receive -> "LPX64": %d\n", conn->ibc_peer->ibp_nid, rc); } /* Drop rx's ref */ - koibnal_put_conn (conn); + kibnal_put_conn (conn); } -#if OPENIBNAL_CKSUM -__u32 koibnal_cksum (void *ptr, int nob) +#if IBNAL_CKSUM +__u32 kibnal_cksum (void *ptr, int nob) { char *c = ptr; __u32 sum = 0; @@ -302,17 +306,17 @@ __u32 koibnal_cksum (void *ptr, int nob) #endif void -koibnal_rx_callback (struct ib_cq *cq, struct ib_cq_entry *e, void *arg) +kibnal_rx_callback (struct ib_cq_entry *e) { - koib_rx_t *rx = (koib_rx_t *)((unsigned long)e->work_request_id); - koib_msg_t *msg = rx->rx_msg; - koib_conn_t *conn = rx->rx_conn; + kib_rx_t *rx = (kib_rx_t *)kibnal_wreqid2ptr(e->work_request_id); + kib_msg_t *msg = rx->rx_msg; + kib_conn_t *conn = rx->rx_conn; int nob = e->bytes_transferred; - const int base_nob = offsetof(koib_msg_t, oibm_u); + const int base_nob = offsetof(kib_msg_t, ibm_u); int credits; int flipped; unsigned long flags; -#if OPENIBNAL_CKSUM +#if IBNAL_CKSUM __u32 msg_cksum; __u32 computed_cksum; #endif @@ -324,11 +328,11 @@ koibnal_rx_callback (struct ib_cq *cq, struct ib_cq_entry *e, void *arg) /* receives complete with error in any case after we've started * closing the QP */ - if (conn->ibc_state >= OPENIBNAL_CONN_DEATHROW) + if (conn->ibc_state >= IBNAL_CONN_DEATHROW) goto failed; /* We don't post receives until the conn is established */ - LASSERT (conn->ibc_state == OPENIBNAL_CONN_ESTABLISHED); + LASSERT (conn->ibc_state == IBNAL_CONN_ESTABLISHED); if (e->status != IB_COMPLETION_STATUS_SUCCESS) { CERROR("Rx from "LPX64" failed: %d\n", @@ -344,35 +348,35 @@ koibnal_rx_callback (struct ib_cq *cq, struct ib_cq_entry *e, void *arg) /* Receiver does any byte flipping if necessary... 
*/ - if (msg->oibm_magic == OPENIBNAL_MSG_MAGIC) { + if (msg->ibm_magic == IBNAL_MSG_MAGIC) { flipped = 0; } else { - if (msg->oibm_magic != __swab32(OPENIBNAL_MSG_MAGIC)) { + if (msg->ibm_magic != __swab32(IBNAL_MSG_MAGIC)) { CERROR ("Unrecognised magic: %08x from "LPX64"\n", - msg->oibm_magic, conn->ibc_peer->ibp_nid); + msg->ibm_magic, conn->ibc_peer->ibp_nid); goto failed; } flipped = 1; - __swab16s (&msg->oibm_version); - LASSERT (sizeof(msg->oibm_type) == 1); - LASSERT (sizeof(msg->oibm_credits) == 1); + __swab16s (&msg->ibm_version); + LASSERT (sizeof(msg->ibm_type) == 1); + LASSERT (sizeof(msg->ibm_credits) == 1); } - if (msg->oibm_version != OPENIBNAL_MSG_VERSION) { + if (msg->ibm_version != IBNAL_MSG_VERSION) { CERROR ("Incompatible msg version %d (%d expected)\n", - msg->oibm_version, OPENIBNAL_MSG_VERSION); + msg->ibm_version, IBNAL_MSG_VERSION); goto failed; } -#if OPENIBNAL_CKSUM - if (nob != msg->oibm_nob) { - CERROR ("Unexpected # bytes %d (%d expected)\n", nob, msg->oibm_nob); +#if IBNAL_CKSUM + if (nob != msg->ibm_nob) { + CERROR ("Unexpected # bytes %d (%d expected)\n", nob, msg->ibm_nob); goto failed; } - msg_cksum = le32_to_cpu(msg->oibm_cksum); - msg->oibm_cksum = 0; - computed_cksum = koibnal_cksum (msg, nob); + msg_cksum = le32_to_cpu(msg->ibm_cksum); + msg->ibm_cksum = 0; + computed_cksum = kibnal_cksum (msg, nob); if (msg_cksum != computed_cksum) { CERROR ("Checksum failure %d: (%d expected)\n", @@ -383,101 +387,101 @@ koibnal_rx_callback (struct ib_cq *cq, struct ib_cq_entry *e, void *arg) #endif /* Have I received credits that will let me send? */ - credits = msg->oibm_credits; + credits = msg->ibm_credits; if (credits != 0) { spin_lock_irqsave(&conn->ibc_lock, flags); conn->ibc_credits += credits; spin_unlock_irqrestore(&conn->ibc_lock, flags); - koibnal_check_sends(conn); + kibnal_check_sends(conn); } - switch (msg->oibm_type) { - case OPENIBNAL_MSG_NOOP: - koibnal_post_rx (rx, 1); + switch (msg->ibm_type) { + case IBNAL_MSG_NOOP: + kibnal_post_rx (rx, 1); return; - case OPENIBNAL_MSG_IMMEDIATE: - if (nob < base_nob + sizeof (koib_immediate_msg_t)) { + case IBNAL_MSG_IMMEDIATE: + if (nob < base_nob + sizeof (kib_immediate_msg_t)) { CERROR ("Short IMMEDIATE from "LPX64": %d\n", conn->ibc_peer->ibp_nid, nob); goto failed; } break; - case OPENIBNAL_MSG_PUT_RDMA: - case OPENIBNAL_MSG_GET_RDMA: - if (nob < base_nob + sizeof (koib_rdma_msg_t)) { + case IBNAL_MSG_PUT_RDMA: + case IBNAL_MSG_GET_RDMA: + if (nob < base_nob + sizeof (kib_rdma_msg_t)) { CERROR ("Short RDMA msg from "LPX64": %d\n", conn->ibc_peer->ibp_nid, nob); goto failed; } if (flipped) { - __swab32s(&msg->oibm_u.rdma.oibrm_desc.rd_key); - __swab32s(&msg->oibm_u.rdma.oibrm_desc.rd_nob); - __swab64s(&msg->oibm_u.rdma.oibrm_desc.rd_addr); + __swab32s(&msg->ibm_u.rdma.ibrm_desc.rd_key); + __swab32s(&msg->ibm_u.rdma.ibrm_desc.rd_nob); + __swab64s(&msg->ibm_u.rdma.ibrm_desc.rd_addr); } CDEBUG(D_NET, "%d RDMA: cookie "LPX64", key %x, addr "LPX64", nob %d\n", - msg->oibm_type, msg->oibm_u.rdma.oibrm_cookie, - msg->oibm_u.rdma.oibrm_desc.rd_key, - msg->oibm_u.rdma.oibrm_desc.rd_addr, - msg->oibm_u.rdma.oibrm_desc.rd_nob); + msg->ibm_type, msg->ibm_u.rdma.ibrm_cookie, + msg->ibm_u.rdma.ibrm_desc.rd_key, + msg->ibm_u.rdma.ibrm_desc.rd_addr, + msg->ibm_u.rdma.ibrm_desc.rd_nob); break; - case OPENIBNAL_MSG_PUT_DONE: - case OPENIBNAL_MSG_GET_DONE: - if (nob < base_nob + sizeof (koib_completion_msg_t)) { + case IBNAL_MSG_PUT_DONE: + case IBNAL_MSG_GET_DONE: + if (nob < base_nob + sizeof (kib_completion_msg_t)) { 
CERROR ("Short COMPLETION msg from "LPX64": %d\n", conn->ibc_peer->ibp_nid, nob); goto failed; } if (flipped) - __swab32s(&msg->oibm_u.completion.oibcm_status); + __swab32s(&msg->ibm_u.completion.ibcm_status); CDEBUG(D_NET, "%d DONE: cookie "LPX64", status %d\n", - msg->oibm_type, msg->oibm_u.completion.oibcm_cookie, - msg->oibm_u.completion.oibcm_status); + msg->ibm_type, msg->ibm_u.completion.ibcm_cookie, + msg->ibm_u.completion.ibcm_status); - koibnal_complete_passive_rdma (conn, - msg->oibm_u.completion.oibcm_cookie, - msg->oibm_u.completion.oibcm_status); - koibnal_post_rx (rx, 1); + kibnal_complete_passive_rdma (conn, + msg->ibm_u.completion.ibcm_cookie, + msg->ibm_u.completion.ibcm_status); + kibnal_post_rx (rx, 1); return; default: CERROR ("Can't parse type from "LPX64": %d\n", - conn->ibc_peer->ibp_nid, msg->oibm_type); + conn->ibc_peer->ibp_nid, msg->ibm_type); goto failed; } - /* schedule for koibnal_rx() in thread context */ - spin_lock_irqsave(&koibnal_data.koib_sched_lock, flags); + /* schedule for kibnal_rx() in thread context */ + spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags); - list_add_tail (&rx->rx_list, &koibnal_data.koib_sched_rxq); - wake_up (&koibnal_data.koib_sched_waitq); + list_add_tail (&rx->rx_list, &kibnal_data.kib_sched_rxq); + wake_up (&kibnal_data.kib_sched_waitq); - spin_unlock_irqrestore(&koibnal_data.koib_sched_lock, flags); + spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags); return; failed: CDEBUG(D_NET, "rx %p conn %p\n", rx, conn); - koibnal_close_conn(conn, -ECONNABORTED); + kibnal_close_conn(conn, -ECONNABORTED); /* Don't re-post rx & drop its ref on conn */ - koibnal_put_conn(conn); + kibnal_put_conn(conn); } void -koibnal_rx (koib_rx_t *rx) +kibnal_rx (kib_rx_t *rx) { - koib_msg_t *msg = rx->rx_msg; + kib_msg_t *msg = rx->rx_msg; /* Clear flag so I can detect if I've sent an RDMA completion */ rx->rx_rdma = 0; - switch (msg->oibm_type) { - case OPENIBNAL_MSG_GET_RDMA: - lib_parse(&koibnal_lib, &msg->oibm_u.rdma.oibrm_hdr, rx); + switch (msg->ibm_type) { + case IBNAL_MSG_GET_RDMA: + lib_parse(&kibnal_lib, &msg->ibm_u.rdma.ibrm_hdr, rx); /* If the incoming get was matched, I'll have initiated the * RDMA and the completion message... */ if (rx->rx_rdma) @@ -487,12 +491,12 @@ koibnal_rx (koib_rx_t *rx) * the peer's GET blocking for the full timeout. 
*/ CERROR ("Completing unmatched RDMA GET from "LPX64"\n", rx->rx_conn->ibc_peer->ibp_nid); - koibnal_start_active_rdma (OPENIBNAL_MSG_GET_DONE, -EIO, - rx, NULL, 0, NULL, NULL, 0, 0); + kibnal_start_active_rdma (IBNAL_MSG_GET_DONE, -EIO, + rx, NULL, 0, NULL, NULL, 0, 0); break; - case OPENIBNAL_MSG_PUT_RDMA: - lib_parse(&koibnal_lib, &msg->oibm_u.rdma.oibrm_hdr, rx); + case IBNAL_MSG_PUT_RDMA: + lib_parse(&kibnal_lib, &msg->ibm_u.rdma.ibrm_hdr, rx); if (rx->rx_rdma) break; /* This is most unusual, since even if lib_parse() didn't @@ -505,8 +509,8 @@ koibnal_rx (koib_rx_t *rx) rx->rx_conn->ibc_peer->ibp_nid); break; - case OPENIBNAL_MSG_IMMEDIATE: - lib_parse(&koibnal_lib, &msg->oibm_u.immediate.oibim_hdr, rx); + case IBNAL_MSG_IMMEDIATE: + lib_parse(&kibnal_lib, &msg->ibm_u.immediate.ibim_hdr, rx); LASSERT (!rx->rx_rdma); break; @@ -515,12 +519,12 @@ koibnal_rx (koib_rx_t *rx) break; } - koibnal_post_rx (rx, 1); + kibnal_post_rx (rx, 1); } #if 0 int -koibnal_kvaddr_to_phys (unsigned long vaddr, __u64 *physp) +kibnal_kvaddr_to_phys (unsigned long vaddr, __u64 *physp) { struct page *page; @@ -531,7 +535,7 @@ koibnal_kvaddr_to_phys (unsigned long vaddr, __u64 *physp) else if (vaddr >= PKMAP_BASE && vaddr < (PKMAP_BASE + LAST_PKMAP * PAGE_SIZE)) page = vmalloc_to_page ((void *)vaddr); - /* in 2.4 ^ just walks the page tables */ + /* in 2.4 ^ just walks the page tables */ #endif else page = virt_to_page (vaddr); @@ -540,13 +544,13 @@ koibnal_kvaddr_to_phys (unsigned long vaddr, __u64 *physp) !VALID_PAGE (page)) return (-EFAULT); - *physp = koibnal_page2phys(page) + (vaddr & (PAGE_SIZE - 1)); + *physp = kibnal_page2phys(page) + (vaddr & (PAGE_SIZE - 1)); return (0); } #endif int -koibnal_map_iov (koib_tx_t *tx, enum ib_memory_access access, +kibnal_map_iov (kib_tx_t *tx, enum ib_memory_access access, int niov, struct iovec *iov, int offset, int nob) { @@ -555,7 +559,7 @@ koibnal_map_iov (koib_tx_t *tx, enum ib_memory_access access, LASSERT (nob > 0); LASSERT (niov > 0); - LASSERT (tx->tx_mapped == KOIB_TX_UNMAPPED); + LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED); while (offset >= iov->iov_len) { offset -= iov->iov_len; @@ -572,7 +576,7 @@ koibnal_map_iov (koib_tx_t *tx, enum ib_memory_access access, vaddr = (void *)(((unsigned long)iov->iov_base) + offset); tx->tx_md.md_addr = (__u64)((unsigned long)vaddr); - rc = ib_memory_register (koibnal_data.koib_pd, + rc = ib_memory_register (kibnal_data.kib_pd, vaddr, nob, access, &tx->tx_md.md_handle.mr, @@ -584,21 +588,21 @@ koibnal_map_iov (koib_tx_t *tx, enum ib_memory_access access, return (rc); } - tx->tx_mapped = KOIB_TX_MAPPED; + tx->tx_mapped = KIB_TX_MAPPED; return (0); } int -koibnal_map_kiov (koib_tx_t *tx, enum ib_memory_access access, +kibnal_map_kiov (kib_tx_t *tx, enum ib_memory_access access, int nkiov, ptl_kiov_t *kiov, int offset, int nob) { -#if OPENIBNAL_FMR +#if IBNAL_FMR __u64 *phys; - const int mapped = KOIB_TX_MAPPED_FMR; + const int mapped = KIB_TX_MAPPED_FMR; #else struct ib_physical_buffer *phys; - const int mapped = KOIB_TX_MAPPED; + const int mapped = KIB_TX_MAPPED; #endif int page_offset; int nphys; @@ -610,7 +614,7 @@ koibnal_map_kiov (koib_tx_t *tx, enum ib_memory_access access, LASSERT (nob > 0); LASSERT (nkiov > 0); - LASSERT (tx->tx_mapped == KOIB_TX_UNMAPPED); + LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED); while (offset >= kiov->kiov_len) { offset -= kiov->kiov_len; @@ -627,10 +631,10 @@ koibnal_map_kiov (koib_tx_t *tx, enum ib_memory_access access, } page_offset = kiov->kiov_offset + offset; -#if OPENIBNAL_FMR - phys[0] 
= koibnal_page2phys(kiov->kiov_page); +#if IBNAL_FMR + phys[0] = kibnal_page2phys(kiov->kiov_page); #else - phys[0].address = koibnal_page2phys(kiov->kiov_page); + phys[0].address = kibnal_page2phys(kiov->kiov_page); phys[0].size = PAGE_SIZE; #endif nphys = 1; @@ -667,10 +671,10 @@ koibnal_map_kiov (koib_tx_t *tx, enum ib_memory_access access, } LASSERT (nphys * sizeof (*phys) < phys_size); -#if OPENIBNAL_FMR - phys[nphys] = koibnal_page2phys(kiov->kiov_page); +#if IBNAL_FMR + phys[nphys] = kibnal_page2phys(kiov->kiov_page); #else - phys[nphys].address = koibnal_page2phys(kiov->kiov_page); + phys[nphys].address = kibnal_page2phys(kiov->kiov_page); phys[nphys].size = PAGE_SIZE; #endif nphys++; @@ -683,10 +687,10 @@ koibnal_map_kiov (koib_tx_t *tx, enum ib_memory_access access, for (rc = 0; rc < nphys; rc++) CWARN (" [%d] "LPX64" / %d\n", rc, phys[rc].address, phys[rc].size); #endif - tx->tx_md.md_addr = OPENIBNAL_RDMA_BASE; + tx->tx_md.md_addr = IBNAL_RDMA_BASE; -#if OPENIBNAL_FMR - rc = ib_fmr_register_physical (koibnal_data.koib_fmr_pool, +#if IBNAL_FMR + rc = ib_fmr_register_physical (kibnal_data.kib_fmr_pool, phys, nphys, &tx->tx_md.md_addr, page_offset, @@ -694,7 +698,7 @@ koibnal_map_kiov (koib_tx_t *tx, enum ib_memory_access access, &tx->tx_md.md_lkey, &tx->tx_md.md_rkey); #else - rc = ib_memory_register_physical (koibnal_data.koib_pd, + rc = ib_memory_register_physical (kibnal_data.kib_pd, phys, nphys, &tx->tx_md.md_addr, nob, page_offset, @@ -717,24 +721,24 @@ koibnal_map_kiov (koib_tx_t *tx, enum ib_memory_access access, return (rc); } -koib_conn_t * -koibnal_find_conn_locked (koib_peer_t *peer) +kib_conn_t * +kibnal_find_conn_locked (kib_peer_t *peer) { struct list_head *tmp; /* just return the first connection */ list_for_each (tmp, &peer->ibp_conns) { - return (list_entry(tmp, koib_conn_t, ibc_list)); + return (list_entry(tmp, kib_conn_t, ibc_list)); } return (NULL); } void -koibnal_check_sends (koib_conn_t *conn) +kibnal_check_sends (kib_conn_t *conn) { unsigned long flags; - koib_tx_t *tx; + kib_tx_t *tx; int rc; int i; int done; @@ -742,39 +746,39 @@ koibnal_check_sends (koib_conn_t *conn) spin_lock_irqsave (&conn->ibc_lock, flags); + LASSERT (conn->ibc_nsends_posted <= IBNAL_MSG_QUEUE_SIZE); + if (list_empty(&conn->ibc_tx_queue) && - conn->ibc_outstanding_credits >= OPENIBNAL_CREDIT_HIGHWATER) { + conn->ibc_outstanding_credits >= IBNAL_CREDIT_HIGHWATER) { spin_unlock_irqrestore(&conn->ibc_lock, flags); - - tx = koibnal_get_idle_tx(0); /* don't block */ + + tx = kibnal_get_idle_tx(0); /* don't block */ if (tx != NULL) - koibnal_init_tx_msg(tx, OPENIBNAL_MSG_NOOP, 0); + kibnal_init_tx_msg(tx, IBNAL_MSG_NOOP, 0); spin_lock_irqsave(&conn->ibc_lock, flags); - + if (tx != NULL) { atomic_inc(&conn->ibc_refcount); - koibnal_queue_tx_locked(tx, conn); + kibnal_queue_tx_locked(tx, conn); } } - LASSERT (conn->ibc_nsends_posted <= OPENIBNAL_MSG_QUEUE_SIZE); - while (!list_empty (&conn->ibc_tx_queue)) { - tx = list_entry (conn->ibc_tx_queue.next, koib_tx_t, tx_list); + tx = list_entry (conn->ibc_tx_queue.next, kib_tx_t, tx_list); /* We rely on this for QP sizing */ LASSERT (tx->tx_nsp > 0 && tx->tx_nsp <= 2); LASSERT (conn->ibc_outstanding_credits >= 0); - LASSERT (conn->ibc_outstanding_credits <= OPENIBNAL_MSG_QUEUE_SIZE); + LASSERT (conn->ibc_outstanding_credits <= IBNAL_MSG_QUEUE_SIZE); LASSERT (conn->ibc_credits >= 0); - LASSERT (conn->ibc_credits <= OPENIBNAL_MSG_QUEUE_SIZE); + LASSERT (conn->ibc_credits <= IBNAL_MSG_QUEUE_SIZE); /* Not on ibc_rdma_queue */ LASSERT 
(!tx->tx_passive_rdma_wait); - if (conn->ibc_nsends_posted == OPENIBNAL_MSG_QUEUE_SIZE) + if (conn->ibc_nsends_posted == IBNAL_MSG_QUEUE_SIZE) break; if (conn->ibc_credits == 0) /* no credits */ @@ -786,37 +790,29 @@ koibnal_check_sends (koib_conn_t *conn) list_del (&tx->tx_list); - if (tx->tx_msg->oibm_type == OPENIBNAL_MSG_NOOP && + if (tx->tx_msg->ibm_type == IBNAL_MSG_NOOP && (!list_empty(&conn->ibc_tx_queue) || - conn->ibc_outstanding_credits < OPENIBNAL_CREDIT_HIGHWATER)) { - /* Redundant NOOP */ + conn->ibc_outstanding_credits < IBNAL_CREDIT_HIGHWATER)) { + /* redundant NOOP */ spin_unlock_irqrestore(&conn->ibc_lock, flags); - koibnal_tx_done(tx); + kibnal_tx_done(tx); spin_lock_irqsave(&conn->ibc_lock, flags); continue; } - - /* incoming RDMA completion can find this one now */ - if (tx->tx_passive_rdma) { - list_add (&tx->tx_list, &conn->ibc_rdma_queue); - tx->tx_passive_rdma_wait = 1; - tx->tx_passive_rdma_deadline = - jiffies + koibnal_tunables.koib_io_timeout * HZ; - } - tx->tx_msg->oibm_credits = conn->ibc_outstanding_credits; + tx->tx_msg->ibm_credits = conn->ibc_outstanding_credits; conn->ibc_outstanding_credits = 0; - /* use the free memory barrier when we unlock to ensure - * sending set before we can get the tx callback. */ conn->ibc_nsends_posted++; conn->ibc_credits--; - tx->tx_sending = tx->tx_nsp; -#if OPENIBNAL_CKSUM - tx->tx_msg->oibm_cksum = 0; - tx->tx_msg->oibm_cksum = koibnal_cksum(tx->tx_msg, tx->tx_msg->oibm_nob); - CDEBUG(D_NET, "cksum %x, nob %d\n", tx->tx_msg->oibm_cksum, tx->tx_msg->oibm_nob); + tx->tx_sending = tx->tx_nsp; + tx->tx_passive_rdma_wait = tx->tx_passive_rdma; + list_add (&tx->tx_list, &conn->ibc_active_txs); +#if IBNAL_CKSUM + tx->tx_msg->ibm_cksum = 0; + tx->tx_msg->ibm_cksum = kibnal_cksum(tx->tx_msg, tx->tx_msg->ibm_nob); + CDEBUG(D_NET, "cksum %x, nob %d\n", tx->tx_msg->ibm_cksum, tx->tx_msg->ibm_nob); #endif spin_unlock_irqrestore (&conn->ibc_lock, flags); @@ -827,7 +823,7 @@ koibnal_check_sends (koib_conn_t *conn) rc = -ECONNABORTED; nwork = 0; - if (conn->ibc_state == OPENIBNAL_CONN_ESTABLISHED) { + if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) { tx->tx_status = 0; /* Driver only accepts 1 item at a time */ for (i = 0; i < tx->tx_nsp; i++) { @@ -842,31 +838,31 @@ koibnal_check_sends (koib_conn_t *conn) if (rc != 0) { /* NB credits are transferred in the actual * message, which can only be the last work item */ - conn->ibc_outstanding_credits += tx->tx_msg->oibm_credits; + conn->ibc_outstanding_credits += tx->tx_msg->ibm_credits; conn->ibc_credits++; conn->ibc_nsends_posted--; - tx->tx_sending -= tx->tx_nsp - nwork; + tx->tx_status = rc; + tx->tx_passive_rdma_wait = 0; + tx->tx_sending -= tx->tx_nsp - nwork; + done = (tx->tx_sending == 0); - - if (tx->tx_passive_rdma) { - tx->tx_passive_rdma_wait = 0; + if (done) list_del (&tx->tx_list); - } spin_unlock_irqrestore (&conn->ibc_lock, flags); - if (conn->ibc_state == OPENIBNAL_CONN_ESTABLISHED) + if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) CERROR ("Error %d posting transmit to "LPX64"\n", rc, conn->ibc_peer->ibp_nid); else CDEBUG (D_NET, "Error %d posting transmit to " LPX64"\n", rc, conn->ibc_peer->ibp_nid); - koibnal_close_conn (conn, rc); + kibnal_close_conn (conn, rc); if (done) - koibnal_tx_done (tx); + kibnal_tx_done (tx); return; } @@ -876,10 +872,10 @@ koibnal_check_sends (koib_conn_t *conn) } void -koibnal_tx_callback (struct ib_cq *cq, struct ib_cq_entry *e, void *arg) +kibnal_tx_callback (struct ib_cq_entry *e) { - koib_tx_t *tx = (koib_tx_t *)((unsigned 
long)e->work_request_id); - koib_conn_t *conn; + kib_tx_t *tx = (kib_tx_t *)kibnal_wreqid2ptr(e->work_request_id); + kib_conn_t *conn; unsigned long flags; int idle; @@ -901,6 +897,8 @@ koibnal_tx_callback (struct ib_cq *cq, struct ib_cq_entry *e, void *arg) tx->tx_sending--; idle = (tx->tx_sending == 0) && /* This is the final callback */ (!tx->tx_passive_rdma_wait); /* Not waiting for RDMA completion */ + if (idle) + list_del(&tx->tx_list); CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", conn, conn->ibc_state, conn->ibc_peer->ibp_nid, @@ -917,53 +915,62 @@ koibnal_tx_callback (struct ib_cq *cq, struct ib_cq_entry *e, void *arg) spin_unlock_irqrestore(&conn->ibc_lock, flags); if (idle) - koibnal_tx_done (tx); + kibnal_tx_done (tx); if (e->status != IB_COMPLETION_STATUS_SUCCESS) { CERROR ("Tx completion to "LPX64" failed: %d\n", conn->ibc_peer->ibp_nid, e->status); - koibnal_close_conn (conn, -ENETDOWN); + kibnal_close_conn (conn, -ENETDOWN); } else { /* can I shovel some more sends out the door? */ - koibnal_check_sends(conn); + kibnal_check_sends(conn); } - koibnal_put_conn (conn); + kibnal_put_conn (conn); } void -koibnal_init_tx_msg (koib_tx_t *tx, int type, int body_nob) +kibnal_callback (struct ib_cq *cq, struct ib_cq_entry *e, void *arg) +{ + if (kibnal_wreqid_is_rx(e->work_request_id)) + kibnal_rx_callback (e); + else + kibnal_tx_callback (e); +} + +void +kibnal_init_tx_msg (kib_tx_t *tx, int type, int body_nob) { struct ib_gather_scatter *gl = &tx->tx_gl[tx->tx_nsp]; struct ib_send_param *sp = &tx->tx_sp[tx->tx_nsp]; int fence; - int nob = offsetof (koib_msg_t, oibm_u) + body_nob; + int nob = offsetof (kib_msg_t, ibm_u) + body_nob; LASSERT (tx->tx_nsp >= 0 && tx->tx_nsp < sizeof(tx->tx_sp)/sizeof(tx->tx_sp[0])); - LASSERT (nob <= OPENIBNAL_MSG_SIZE); + LASSERT (nob <= IBNAL_MSG_SIZE); - tx->tx_msg->oibm_magic = OPENIBNAL_MSG_MAGIC; - tx->tx_msg->oibm_version = OPENIBNAL_MSG_VERSION; - tx->tx_msg->oibm_type = type; -#if OPENIBNAL_CKSUM - tx->tx_msg->oibm_nob = nob; + tx->tx_msg->ibm_magic = IBNAL_MSG_MAGIC; + tx->tx_msg->ibm_version = IBNAL_MSG_VERSION; + tx->tx_msg->ibm_type = type; +#if IBNAL_CKSUM + tx->tx_msg->ibm_nob = nob; #endif /* Fence the message if it's bundled with an RDMA read */ fence = (tx->tx_nsp > 0) && - (type == OPENIBNAL_MSG_PUT_DONE); + (type == IBNAL_MSG_PUT_DONE); *gl = (struct ib_gather_scatter) { .address = tx->tx_vaddr, .length = nob, - .key = koibnal_data.koib_tx_pages->oibp_lkey, + .key = kibnal_data.kib_tx_pages->ibp_lkey, }; /* NB If this is an RDMA read, the completion message must wait for * the RDMA to complete. Sends wait for previous RDMA writes * anyway... 
*/ *sp = (struct ib_send_param) { - .work_request_id = (__u64)((unsigned long)tx), + .work_request_id = kibnal_ptr2wreqid(tx, 0), .op = IB_OP_SEND, .gather_list = gl, .num_gather_entries = 1, @@ -979,26 +986,26 @@ koibnal_init_tx_msg (koib_tx_t *tx, int type, int body_nob) } void -koibnal_queue_tx (koib_tx_t *tx, koib_conn_t *conn) +kibnal_queue_tx (kib_tx_t *tx, kib_conn_t *conn) { unsigned long flags; spin_lock_irqsave(&conn->ibc_lock, flags); - koibnal_queue_tx_locked (tx, conn); + kibnal_queue_tx_locked (tx, conn); spin_unlock_irqrestore(&conn->ibc_lock, flags); - koibnal_check_sends(conn); + kibnal_check_sends(conn); } void -koibnal_launch_tx (koib_tx_t *tx, ptl_nid_t nid) +kibnal_launch_tx (kib_tx_t *tx, ptl_nid_t nid) { unsigned long flags; - koib_peer_t *peer; - koib_conn_t *conn; - rwlock_t *g_lock = &koibnal_data.koib_global_lock; + kib_peer_t *peer; + kib_conn_t *conn; + rwlock_t *g_lock = &kibnal_data.kib_global_lock; /* If I get here, I've committed to send, so I complete the tx with * failure on any problems */ @@ -1008,15 +1015,15 @@ koibnal_launch_tx (koib_tx_t *tx, ptl_nid_t nid) read_lock (g_lock); - peer = koibnal_find_peer_locked (nid); + peer = kibnal_find_peer_locked (nid); if (peer == NULL) { read_unlock (g_lock); tx->tx_status = -EHOSTUNREACH; - koibnal_tx_done (tx); + kibnal_tx_done (tx); return; } - conn = koibnal_find_conn_locked (peer); + conn = kibnal_find_conn_locked (peer); if (conn != NULL) { CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", conn, conn->ibc_state, conn->ibc_peer->ibp_nid, @@ -1024,7 +1031,7 @@ koibnal_launch_tx (koib_tx_t *tx, ptl_nid_t nid) atomic_inc (&conn->ibc_refcount); /* 1 ref for the tx */ read_unlock (g_lock); - koibnal_queue_tx (tx, conn); + kibnal_queue_tx (tx, conn); return; } @@ -1032,15 +1039,15 @@ koibnal_launch_tx (koib_tx_t *tx, ptl_nid_t nid) read_unlock (g_lock); write_lock_irqsave (g_lock, flags); - peer = koibnal_find_peer_locked (nid); + peer = kibnal_find_peer_locked (nid); if (peer == NULL) { write_unlock_irqrestore (g_lock, flags); tx->tx_status = -EHOSTUNREACH; - koibnal_tx_done (tx); + kibnal_tx_done (tx); return; } - conn = koibnal_find_conn_locked (peer); + conn = kibnal_find_conn_locked (peer); if (conn != NULL) { /* Connection exists; queue message on it */ CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", @@ -1049,7 +1056,7 @@ koibnal_launch_tx (koib_tx_t *tx, ptl_nid_t nid) atomic_inc (&conn->ibc_refcount); /* 1 ref for the tx */ write_unlock_irqrestore (g_lock, flags); - koibnal_queue_tx (tx, conn); + kibnal_queue_tx (tx, conn); return; } @@ -1057,20 +1064,20 @@ koibnal_launch_tx (koib_tx_t *tx, ptl_nid_t nid) if (!time_after_eq(jiffies, peer->ibp_reconnect_time)) { write_unlock_irqrestore (g_lock, flags); tx->tx_status = -EHOSTUNREACH; - koibnal_tx_done (tx); + kibnal_tx_done (tx); return; } peer->ibp_connecting = 1; atomic_inc (&peer->ibp_refcount); /* extra ref for connd */ - spin_lock (&koibnal_data.koib_connd_lock); + spin_lock (&kibnal_data.kib_connd_lock); list_add_tail (&peer->ibp_connd_list, - &koibnal_data.koib_connd_peers); - wake_up (&koibnal_data.koib_connd_waitq); + &kibnal_data.kib_connd_peers); + wake_up (&kibnal_data.kib_connd_waitq); - spin_unlock (&koibnal_data.koib_connd_lock); + spin_unlock (&kibnal_data.kib_connd_lock); } /* A connection is being established; queue the message... 
*/ @@ -1080,49 +1087,49 @@ koibnal_launch_tx (koib_tx_t *tx, ptl_nid_t nid) } ptl_err_t -koibnal_start_passive_rdma (int type, ptl_nid_t nid, +kibnal_start_passive_rdma (int type, ptl_nid_t nid, lib_msg_t *libmsg, ptl_hdr_t *hdr) { int nob = libmsg->md->length; - koib_tx_t *tx; - koib_msg_t *oibmsg; + kib_tx_t *tx; + kib_msg_t *ibmsg; int rc; int access; - LASSERT (type == OPENIBNAL_MSG_PUT_RDMA || - type == OPENIBNAL_MSG_GET_RDMA); + LASSERT (type == IBNAL_MSG_PUT_RDMA || + type == IBNAL_MSG_GET_RDMA); LASSERT (nob > 0); LASSERT (!in_interrupt()); /* Mapping could block */ - if (type == OPENIBNAL_MSG_PUT_RDMA) { + if (type == IBNAL_MSG_PUT_RDMA) { access = IB_ACCESS_REMOTE_READ; } else { access = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE; } - tx = koibnal_get_idle_tx (1); /* May block; caller is an app thread */ + tx = kibnal_get_idle_tx (1); /* May block; caller is an app thread */ LASSERT (tx != NULL); if ((libmsg->md->options & PTL_MD_KIOV) == 0) - rc = koibnal_map_iov (tx, access, - libmsg->md->md_niov, - libmsg->md->md_iov.iov, - 0, nob); + rc = kibnal_map_iov (tx, access, + libmsg->md->md_niov, + libmsg->md->md_iov.iov, + 0, nob); else - rc = koibnal_map_kiov (tx, access, - libmsg->md->md_niov, - libmsg->md->md_iov.kiov, - 0, nob); + rc = kibnal_map_kiov (tx, access, + libmsg->md->md_niov, + libmsg->md->md_iov.kiov, + 0, nob); if (rc != 0) { CERROR ("Can't map RDMA for "LPX64": %d\n", nid, rc); goto failed; } - if (type == OPENIBNAL_MSG_GET_RDMA) { + if (type == IBNAL_MSG_GET_RDMA) { /* reply gets finalized when tx completes */ - tx->tx_libmsg[1] = lib_create_reply_msg(&koibnal_lib, + tx->tx_libmsg[1] = lib_create_reply_msg(&kibnal_lib, nid, libmsg); if (tx->tx_libmsg[1] == NULL) { CERROR ("Can't create reply for GET -> "LPX64"\n", @@ -1134,15 +1141,15 @@ koibnal_start_passive_rdma (int type, ptl_nid_t nid, tx->tx_passive_rdma = 1; - oibmsg = tx->tx_msg; + ibmsg = tx->tx_msg; - oibmsg->oibm_u.rdma.oibrm_hdr = *hdr; - oibmsg->oibm_u.rdma.oibrm_cookie = tx->tx_passive_rdma_cookie; - oibmsg->oibm_u.rdma.oibrm_desc.rd_key = tx->tx_md.md_rkey; - oibmsg->oibm_u.rdma.oibrm_desc.rd_addr = tx->tx_md.md_addr; - oibmsg->oibm_u.rdma.oibrm_desc.rd_nob = nob; + ibmsg->ibm_u.rdma.ibrm_hdr = *hdr; + ibmsg->ibm_u.rdma.ibrm_cookie = tx->tx_passive_rdma_cookie; + ibmsg->ibm_u.rdma.ibrm_desc.rd_key = tx->tx_md.md_rkey; + ibmsg->ibm_u.rdma.ibrm_desc.rd_addr = tx->tx_md.md_addr; + ibmsg->ibm_u.rdma.ibrm_desc.rd_nob = nob; - koibnal_init_tx_msg (tx, type, sizeof (koib_rdma_msg_t)); + kibnal_init_tx_msg (tx, type, sizeof (kib_rdma_msg_t)); CDEBUG(D_NET, "Passive: %p cookie "LPX64", key %x, addr " LPX64", nob %d\n", @@ -1152,25 +1159,25 @@ koibnal_start_passive_rdma (int type, ptl_nid_t nid, /* libmsg gets finalized when tx completes. 
*/ tx->tx_libmsg[0] = libmsg; - koibnal_launch_tx(tx, nid); + kibnal_launch_tx(tx, nid); return (PTL_OK); failed: tx->tx_status = rc; - koibnal_tx_done (tx); + kibnal_tx_done (tx); return (PTL_FAIL); } void -koibnal_start_active_rdma (int type, int status, - koib_rx_t *rx, lib_msg_t *libmsg, +kibnal_start_active_rdma (int type, int status, + kib_rx_t *rx, lib_msg_t *libmsg, unsigned int niov, struct iovec *iov, ptl_kiov_t *kiov, size_t offset, size_t nob) { - koib_msg_t *rxmsg = rx->rx_msg; - koib_msg_t *txmsg; - koib_tx_t *tx; + kib_msg_t *rxmsg = rx->rx_msg; + kib_msg_t *txmsg; + kib_tx_t *tx; int access; int rdma_op; int rc; @@ -1187,8 +1194,8 @@ koibnal_start_active_rdma (int type, int status, /* No data if we're completing with failure */ LASSERT (status == 0 || nob == 0); - LASSERT (type == OPENIBNAL_MSG_GET_DONE || - type == OPENIBNAL_MSG_PUT_DONE); + LASSERT (type == IBNAL_MSG_GET_DONE || + type == IBNAL_MSG_PUT_DONE); /* Flag I'm completing the RDMA. Even if I fail to send the * completion message, I will have tried my best so further @@ -1196,22 +1203,22 @@ koibnal_start_active_rdma (int type, int status, LASSERT (!rx->rx_rdma); rx->rx_rdma = 1; - if (type == OPENIBNAL_MSG_GET_DONE) { + if (type == IBNAL_MSG_GET_DONE) { access = 0; rdma_op = IB_OP_RDMA_WRITE; - LASSERT (rxmsg->oibm_type == OPENIBNAL_MSG_GET_RDMA); + LASSERT (rxmsg->ibm_type == IBNAL_MSG_GET_RDMA); } else { access = IB_ACCESS_LOCAL_WRITE; rdma_op = IB_OP_RDMA_READ; - LASSERT (rxmsg->oibm_type == OPENIBNAL_MSG_PUT_RDMA); + LASSERT (rxmsg->ibm_type == IBNAL_MSG_PUT_RDMA); } - tx = koibnal_get_idle_tx (0); /* Mustn't block */ + tx = kibnal_get_idle_tx (0); /* Mustn't block */ if (tx == NULL) { CERROR ("tx descs exhausted on RDMA from "LPX64 " completing locally with failure\n", - rx->rx_conn->ibc_peer->ibp_nid); - lib_finalize (&koibnal_lib, NULL, libmsg, PTL_NO_SPACE); + rx->rx_conn->ibc_peer->ibp_nid); + lib_finalize (&kibnal_lib, NULL, libmsg, PTL_NO_SPACE); return; } LASSERT (tx->tx_nsp == 0); @@ -1222,11 +1229,11 @@ koibnal_start_active_rdma (int type, int status, * message is matched) */ if (kiov != NULL) - rc = koibnal_map_kiov (tx, access, - niov, kiov, offset, nob); + rc = kibnal_map_kiov (tx, access, + niov, kiov, offset, nob); else - rc = koibnal_map_iov (tx, access, - niov, iov, offset, nob); + rc = kibnal_map_iov (tx, access, + niov, iov, offset, nob); if (rc != 0) { CERROR ("Can't map RDMA -> "LPX64": %d\n", @@ -1242,12 +1249,12 @@ koibnal_start_active_rdma (int type, int status, }; tx->tx_sp[0] = (struct ib_send_param) { - .work_request_id = (__u64)((unsigned long)tx), + .work_request_id = kibnal_ptr2wreqid(tx, 0), .op = rdma_op, .gather_list = &tx->tx_gl[0], .num_gather_entries = 1, - .remote_address = rxmsg->oibm_u.rdma.oibrm_desc.rd_addr, - .rkey = rxmsg->oibm_u.rdma.oibrm_desc.rd_key, + .remote_address = rxmsg->ibm_u.rdma.ibrm_desc.rd_addr, + .rkey = rxmsg->ibm_u.rdma.ibrm_desc.rd_key, .device_specific = NULL, .solicited_event = 0, .signaled = 1, @@ -1262,10 +1269,10 @@ koibnal_start_active_rdma (int type, int status, txmsg = tx->tx_msg; - txmsg->oibm_u.completion.oibcm_cookie = rxmsg->oibm_u.rdma.oibrm_cookie; - txmsg->oibm_u.completion.oibcm_status = status; + txmsg->ibm_u.completion.ibcm_cookie = rxmsg->ibm_u.rdma.ibrm_cookie; + txmsg->ibm_u.completion.ibcm_status = status; - koibnal_init_tx_msg(tx, type, sizeof (koib_completion_msg_t)); + kibnal_init_tx_msg(tx, type, sizeof (kib_completion_msg_t)); if (status == 0 && nob != 0) { LASSERT (tx->tx_nsp > 1); @@ -1277,7 +1284,7 @@ 
koibnal_start_active_rdma (int type, int status, LASSERT (tx->tx_nsp == 1); /* No RDMA: local completion happens now! */ CDEBUG(D_WARNING,"No data: immediate completion\n"); - lib_finalize (&koibnal_lib, NULL, libmsg, + lib_finalize (&kibnal_lib, NULL, libmsg, status == 0 ? PTL_OK : PTL_FAIL); } @@ -1288,11 +1295,11 @@ koibnal_start_active_rdma (int type, int status, atomic_read (&rx->rx_conn->ibc_refcount)); atomic_inc (&rx->rx_conn->ibc_refcount); /* ...and queue it up */ - koibnal_queue_tx(tx, rx->rx_conn); + kibnal_queue_tx(tx, rx->rx_conn); } ptl_err_t -koibnal_sendmsg(lib_nal_t *nal, +kibnal_sendmsg(lib_nal_t *nal, void *private, lib_msg_t *libmsg, ptl_hdr_t *hdr, @@ -1305,8 +1312,8 @@ koibnal_sendmsg(lib_nal_t *nal, size_t payload_offset, size_t payload_nob) { - koib_msg_t *oibmsg; - koib_tx_t *tx; + kib_msg_t *ibmsg; + kib_tx_t *tx; int nob; /* NB 'private' is different depending on what we're sending.... */ @@ -1329,27 +1336,27 @@ koibnal_sendmsg(lib_nal_t *nal, case PTL_MSG_REPLY: { /* reply's 'private' is the incoming receive */ - koib_rx_t *rx = private; + kib_rx_t *rx = private; /* RDMA reply expected? */ - if (rx->rx_msg->oibm_type == OPENIBNAL_MSG_GET_RDMA) { - koibnal_start_active_rdma(OPENIBNAL_MSG_GET_DONE, 0, - rx, libmsg, payload_niov, - payload_iov, payload_kiov, - payload_offset, payload_nob); + if (rx->rx_msg->ibm_type == IBNAL_MSG_GET_RDMA) { + kibnal_start_active_rdma(IBNAL_MSG_GET_DONE, 0, + rx, libmsg, payload_niov, + payload_iov, payload_kiov, + payload_offset, payload_nob); return (PTL_OK); } /* Incoming message consistent with immediate reply? */ - if (rx->rx_msg->oibm_type != OPENIBNAL_MSG_IMMEDIATE) { + if (rx->rx_msg->ibm_type != IBNAL_MSG_IMMEDIATE) { CERROR ("REPLY to "LPX64" bad opbm type %d!!!\n", - nid, rx->rx_msg->oibm_type); + nid, rx->rx_msg->ibm_type); return (PTL_FAIL); } /* Will it fit in a message? */ - nob = offsetof(koib_msg_t, oibm_u.immediate.oibim_payload[payload_nob]); - if (nob >= OPENIBNAL_MSG_SIZE) { + nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]); + if (nob >= IBNAL_MSG_SIZE) { CERROR("REPLY for "LPX64" too big (RDMA not requested): %d\n", nid, payload_nob); return (PTL_FAIL); @@ -1359,10 +1366,10 @@ koibnal_sendmsg(lib_nal_t *nal, case PTL_MSG_GET: /* might the REPLY message be big enough to need RDMA? */ - nob = offsetof(koib_msg_t, oibm_u.immediate.oibim_payload[libmsg->md->length]); - if (nob > OPENIBNAL_MSG_SIZE) - return (koibnal_start_passive_rdma(OPENIBNAL_MSG_GET_RDMA, - nid, libmsg, hdr)); + nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[libmsg->md->length]); + if (nob > IBNAL_MSG_SIZE) + return (kibnal_start_passive_rdma(IBNAL_MSG_GET_RDMA, + nid, libmsg, hdr)); break; case PTL_MSG_ACK: @@ -1371,181 +1378,181 @@ koibnal_sendmsg(lib_nal_t *nal, case PTL_MSG_PUT: /* Is the payload big enough to need RDMA? */ - nob = offsetof(koib_msg_t, oibm_u.immediate.oibim_payload[payload_nob]); - if (nob > OPENIBNAL_MSG_SIZE) - return (koibnal_start_passive_rdma(OPENIBNAL_MSG_PUT_RDMA, - nid, libmsg, hdr)); + nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]); + if (nob > IBNAL_MSG_SIZE) + return (kibnal_start_passive_rdma(IBNAL_MSG_PUT_RDMA, + nid, libmsg, hdr)); break; } - tx = koibnal_get_idle_tx(!(type == PTL_MSG_ACK || - type == PTL_MSG_REPLY || - in_interrupt())); + tx = kibnal_get_idle_tx(!(type == PTL_MSG_ACK || + type == PTL_MSG_REPLY || + in_interrupt())); if (tx == NULL) { CERROR ("Can't send %d to "LPX64": tx descs exhausted%s\n", type, nid, in_interrupt() ? 
" (intr)" : ""); return (PTL_NO_SPACE); } - oibmsg = tx->tx_msg; - oibmsg->oibm_u.immediate.oibim_hdr = *hdr; + ibmsg = tx->tx_msg; + ibmsg->ibm_u.immediate.ibim_hdr = *hdr; if (payload_nob > 0) { if (payload_kiov != NULL) - lib_copy_kiov2buf(oibmsg->oibm_u.immediate.oibim_payload, + lib_copy_kiov2buf(ibmsg->ibm_u.immediate.ibim_payload, payload_niov, payload_kiov, payload_offset, payload_nob); else - lib_copy_iov2buf(oibmsg->oibm_u.immediate.oibim_payload, + lib_copy_iov2buf(ibmsg->ibm_u.immediate.ibim_payload, payload_niov, payload_iov, payload_offset, payload_nob); } - koibnal_init_tx_msg (tx, OPENIBNAL_MSG_IMMEDIATE, - offsetof(koib_immediate_msg_t, - oibim_payload[payload_nob])); + kibnal_init_tx_msg (tx, IBNAL_MSG_IMMEDIATE, + offsetof(kib_immediate_msg_t, + ibim_payload[payload_nob])); /* libmsg gets finalized when tx completes */ tx->tx_libmsg[0] = libmsg; - koibnal_launch_tx(tx, nid); + kibnal_launch_tx(tx, nid); return (PTL_OK); } ptl_err_t -koibnal_send (lib_nal_t *nal, void *private, lib_msg_t *cookie, +kibnal_send (lib_nal_t *nal, void *private, lib_msg_t *cookie, ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, unsigned int payload_niov, struct iovec *payload_iov, size_t payload_offset, size_t payload_len) { - return (koibnal_sendmsg(nal, private, cookie, - hdr, type, nid, pid, - payload_niov, payload_iov, NULL, - payload_offset, payload_len)); + return (kibnal_sendmsg(nal, private, cookie, + hdr, type, nid, pid, + payload_niov, payload_iov, NULL, + payload_offset, payload_len)); } ptl_err_t -koibnal_send_pages (lib_nal_t *nal, void *private, lib_msg_t *cookie, +kibnal_send_pages (lib_nal_t *nal, void *private, lib_msg_t *cookie, ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, unsigned int payload_niov, ptl_kiov_t *payload_kiov, size_t payload_offset, size_t payload_len) { - return (koibnal_sendmsg(nal, private, cookie, - hdr, type, nid, pid, - payload_niov, NULL, payload_kiov, - payload_offset, payload_len)); + return (kibnal_sendmsg(nal, private, cookie, + hdr, type, nid, pid, + payload_niov, NULL, payload_kiov, + payload_offset, payload_len)); } ptl_err_t -koibnal_recvmsg (lib_nal_t *nal, void *private, lib_msg_t *libmsg, +kibnal_recvmsg (lib_nal_t *nal, void *private, lib_msg_t *libmsg, unsigned int niov, struct iovec *iov, ptl_kiov_t *kiov, size_t offset, size_t mlen, size_t rlen) { - koib_rx_t *rx = private; - koib_msg_t *rxmsg = rx->rx_msg; - int msg_nob; + kib_rx_t *rx = private; + kib_msg_t *rxmsg = rx->rx_msg; + int msg_nob; LASSERT (mlen <= rlen); LASSERT (!in_interrupt ()); /* Either all pages or all vaddrs */ LASSERT (!(kiov != NULL && iov != NULL)); - switch (rxmsg->oibm_type) { + switch (rxmsg->ibm_type) { default: LBUG(); return (PTL_FAIL); - case OPENIBNAL_MSG_IMMEDIATE: - msg_nob = offsetof(koib_msg_t, oibm_u.immediate.oibim_payload[rlen]); - if (msg_nob > OPENIBNAL_MSG_SIZE) { + case IBNAL_MSG_IMMEDIATE: + msg_nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[rlen]); + if (msg_nob > IBNAL_MSG_SIZE) { CERROR ("Immediate message from "LPX64" too big: %d\n", - rxmsg->oibm_u.immediate.oibim_hdr.src_nid, rlen); + rxmsg->ibm_u.immediate.ibim_hdr.src_nid, rlen); return (PTL_FAIL); } if (kiov != NULL) lib_copy_buf2kiov(niov, kiov, offset, - rxmsg->oibm_u.immediate.oibim_payload, + rxmsg->ibm_u.immediate.ibim_payload, mlen); else lib_copy_buf2iov(niov, iov, offset, - rxmsg->oibm_u.immediate.oibim_payload, + rxmsg->ibm_u.immediate.ibim_payload, mlen); lib_finalize (nal, NULL, libmsg, PTL_OK); return (PTL_OK); - case OPENIBNAL_MSG_GET_RDMA: + case 
IBNAL_MSG_GET_RDMA: /* We get called here just to discard any junk after the * GET hdr. */ LASSERT (libmsg == NULL); lib_finalize (nal, NULL, libmsg, PTL_OK); return (PTL_OK); - case OPENIBNAL_MSG_PUT_RDMA: - koibnal_start_active_rdma (OPENIBNAL_MSG_PUT_DONE, 0, - rx, libmsg, - niov, iov, kiov, offset, mlen); + case IBNAL_MSG_PUT_RDMA: + kibnal_start_active_rdma (IBNAL_MSG_PUT_DONE, 0, + rx, libmsg, + niov, iov, kiov, offset, mlen); return (PTL_OK); } } ptl_err_t -koibnal_recv (lib_nal_t *nal, void *private, lib_msg_t *msg, +kibnal_recv (lib_nal_t *nal, void *private, lib_msg_t *msg, unsigned int niov, struct iovec *iov, size_t offset, size_t mlen, size_t rlen) { - return (koibnal_recvmsg (nal, private, msg, niov, iov, NULL, - offset, mlen, rlen)); + return (kibnal_recvmsg (nal, private, msg, niov, iov, NULL, + offset, mlen, rlen)); } ptl_err_t -koibnal_recv_pages (lib_nal_t *nal, void *private, lib_msg_t *msg, +kibnal_recv_pages (lib_nal_t *nal, void *private, lib_msg_t *msg, unsigned int niov, ptl_kiov_t *kiov, size_t offset, size_t mlen, size_t rlen) { - return (koibnal_recvmsg (nal, private, msg, niov, NULL, kiov, - offset, mlen, rlen)); + return (kibnal_recvmsg (nal, private, msg, niov, NULL, kiov, + offset, mlen, rlen)); } int -koibnal_thread_start (int (*fn)(void *arg), void *arg) +kibnal_thread_start (int (*fn)(void *arg), void *arg) { long pid = kernel_thread (fn, arg, 0); if (pid < 0) return ((int)pid); - atomic_inc (&koibnal_data.koib_nthreads); + atomic_inc (&kibnal_data.kib_nthreads); return (0); } void -koibnal_thread_fini (void) +kibnal_thread_fini (void) { - atomic_dec (&koibnal_data.koib_nthreads); + atomic_dec (&kibnal_data.kib_nthreads); } void -koibnal_close_conn_locked (koib_conn_t *conn, int error) +kibnal_close_conn_locked (kib_conn_t *conn, int error) { /* This just does the immmediate housekeeping, and schedules the * connection for the connd to finish off. - * Caller holds koib_global_lock exclusively in irq context */ - koib_peer_t *peer = conn->ibc_peer; + * Caller holds kib_global_lock exclusively in irq context */ + kib_peer_t *peer = conn->ibc_peer; CDEBUG (error == 0 ? D_NET : D_ERROR, "closing conn to "LPX64": error %d\n", peer->ibp_nid, error); - LASSERT (conn->ibc_state == OPENIBNAL_CONN_ESTABLISHED || - conn->ibc_state == OPENIBNAL_CONN_CONNECTING); + LASSERT (conn->ibc_state == IBNAL_CONN_ESTABLISHED || + conn->ibc_state == IBNAL_CONN_CONNECTING); - if (conn->ibc_state == OPENIBNAL_CONN_ESTABLISHED) { - /* koib_connd_conns takes ibc_list's ref */ + if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) { + /* kib_connd_conns takes ibc_list's ref */ list_del (&conn->ibc_list); } else { - /* new ref for koib_connd_conns */ + /* new ref for kib_connd_conns */ CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", conn, conn->ibc_state, conn->ibc_peer->ibp_nid, atomic_read (&conn->ibc_refcount)); @@ -1555,57 +1562,57 @@ koibnal_close_conn_locked (koib_conn_t *conn, int error) if (list_empty (&peer->ibp_conns) && peer->ibp_persistence == 0) { /* Non-persistent peer with no more conns... 
*/ - koibnal_unlink_peer_locked (peer); + kibnal_unlink_peer_locked (peer); } - conn->ibc_state = OPENIBNAL_CONN_DEATHROW; + conn->ibc_state = IBNAL_CONN_DEATHROW; /* Schedule conn for closing/destruction */ - spin_lock (&koibnal_data.koib_connd_lock); + spin_lock (&kibnal_data.kib_connd_lock); - list_add_tail (&conn->ibc_list, &koibnal_data.koib_connd_conns); - wake_up (&koibnal_data.koib_connd_waitq); + list_add_tail (&conn->ibc_list, &kibnal_data.kib_connd_conns); + wake_up (&kibnal_data.kib_connd_waitq); - spin_unlock (&koibnal_data.koib_connd_lock); + spin_unlock (&kibnal_data.kib_connd_lock); } int -koibnal_close_conn (koib_conn_t *conn, int why) +kibnal_close_conn (kib_conn_t *conn, int why) { unsigned long flags; int count = 0; - write_lock_irqsave (&koibnal_data.koib_global_lock, flags); + write_lock_irqsave (&kibnal_data.kib_global_lock, flags); - LASSERT (conn->ibc_state >= OPENIBNAL_CONN_CONNECTING); + LASSERT (conn->ibc_state >= IBNAL_CONN_CONNECTING); - if (conn->ibc_state <= OPENIBNAL_CONN_ESTABLISHED) { + if (conn->ibc_state <= IBNAL_CONN_ESTABLISHED) { count = 1; - koibnal_close_conn_locked (conn, why); + kibnal_close_conn_locked (conn, why); } - write_unlock_irqrestore (&koibnal_data.koib_global_lock, flags); + write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); return (count); } void -koibnal_peer_connect_failed (koib_peer_t *peer, int active, int rc) +kibnal_peer_connect_failed (kib_peer_t *peer, int active, int rc) { LIST_HEAD (zombies); - koib_tx_t *tx; + kib_tx_t *tx; unsigned long flags; LASSERT (rc != 0); - LASSERT (peer->ibp_reconnect_interval >= OPENIBNAL_MIN_RECONNECT_INTERVAL); + LASSERT (peer->ibp_reconnect_interval >= IBNAL_MIN_RECONNECT_INTERVAL); - write_lock_irqsave (&koibnal_data.koib_global_lock, flags); + write_lock_irqsave (&kibnal_data.kib_global_lock, flags); LASSERT (peer->ibp_connecting != 0); peer->ibp_connecting--; if (peer->ibp_connecting != 0) { /* another connection attempt under way (loopback?)... 
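kibnal_close_conn_locked() above does only work that is safe with the global lock held in IRQ context: it moves the connection to the DEATHROW state, queues it on kib_connd_conns under kib_connd_lock and wakes the connd thread, which performs the real disconnect and destruction later. The pthread model below sketches that hand-off plus the connd's "shut down and nothing left to reap" exit test; schedule_close, reaper and condemned are invented names, not the driver's code.

    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct conn {
        struct conn *next;
        int          id;
    };

    static struct conn     *condemned;      /* connections queued for teardown */
    static int              shutting_down;
    static pthread_mutex_t  lock   = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t   wakeup = PTHREAD_COND_INITIALIZER;

    /* Fast path, may run where blocking is forbidden: just queue and wake. */
    static void schedule_close(struct conn *c)
    {
        pthread_mutex_lock(&lock);
        c->next = condemned;
        condemned = c;
        pthread_cond_signal(&wakeup);
        pthread_mutex_unlock(&lock);
    }

    /* The reaper thread: does the slow teardown out of the caller's context. */
    static void *reaper(void *arg)
    {
        (void)arg;
        pthread_mutex_lock(&lock);
        for (;;) {
            while (condemned == NULL && !shutting_down)
                pthread_cond_wait(&wakeup, &lock);
            if (condemned == NULL)          /* shut down, nothing left to reap */
                break;

            struct conn *c = condemned;
            condemned = c->next;
            pthread_mutex_unlock(&lock);

            printf("destroying conn %d\n", c->id);   /* slow work, unlocked */
            free(c);

            pthread_mutex_lock(&lock);
        }
        pthread_mutex_unlock(&lock);
        return NULL;
    }

    int main(void)
    {
        pthread_t tid;
        struct conn *c = malloc(sizeof(*c));

        c->id = 1;
        pthread_create(&tid, NULL, reaper, NULL);
        schedule_close(c);

        pthread_mutex_lock(&lock);
        shutting_down = 1;
        pthread_cond_signal(&wakeup);
        pthread_mutex_unlock(&lock);

        pthread_join(tid, NULL);
        return 0;
    }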
*/ - write_unlock_irqrestore (&koibnal_data.koib_global_lock, flags); + write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); return; } @@ -1614,50 +1621,50 @@ koibnal_peer_connect_failed (koib_peer_t *peer, int active, int rc) peer->ibp_reconnect_time = jiffies + peer->ibp_reconnect_interval; /* Increase reconnection interval */ peer->ibp_reconnect_interval = MIN (peer->ibp_reconnect_interval * 2, - OPENIBNAL_MAX_RECONNECT_INTERVAL); + IBNAL_MAX_RECONNECT_INTERVAL); /* Take peer's blocked blocked transmits; I'll complete * them with error */ while (!list_empty (&peer->ibp_tx_queue)) { tx = list_entry (peer->ibp_tx_queue.next, - koib_tx_t, tx_list); + kib_tx_t, tx_list); list_del (&tx->tx_list); list_add_tail (&tx->tx_list, &zombies); } - if (koibnal_peer_active(peer) && + if (kibnal_peer_active(peer) && (peer->ibp_persistence == 0)) { /* failed connection attempt on non-persistent peer */ - koibnal_unlink_peer_locked (peer); + kibnal_unlink_peer_locked (peer); } } else { /* Can't have blocked transmits if there are connections */ LASSERT (list_empty(&peer->ibp_tx_queue)); } - write_unlock_irqrestore (&koibnal_data.koib_global_lock, flags); + write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); if (!list_empty (&zombies)) CERROR ("Deleting messages for "LPX64": connection failed\n", peer->ibp_nid); while (!list_empty (&zombies)) { - tx = list_entry (zombies.next, koib_tx_t, tx_list); + tx = list_entry (zombies.next, kib_tx_t, tx_list); list_del (&tx->tx_list); /* complete now */ tx->tx_status = -EHOSTUNREACH; - koibnal_tx_done (tx); + kibnal_tx_done (tx); } } void -koibnal_connreq_done (koib_conn_t *conn, int active, int status) +kibnal_connreq_done (kib_conn_t *conn, int active, int status) { int state = conn->ibc_state; - koib_peer_t *peer = conn->ibc_peer; - koib_tx_t *tx; + kib_peer_t *peer = conn->ibc_peer; + kib_tx_t *tx; unsigned long flags; int rc; int i; @@ -1669,31 +1676,31 @@ koibnal_connreq_done (koib_conn_t *conn, int active, int status) conn->ibc_connreq = NULL; } - if (state == OPENIBNAL_CONN_CONNECTING) { + if (state == IBNAL_CONN_CONNECTING) { /* Install common (active/passive) callback for * disconnect/idle notification if I got as far as getting * a CM comm_id */ rc = tsIbCmCallbackModify(conn->ibc_comm_id, - koibnal_conn_callback, conn); + kibnal_conn_callback, conn); LASSERT (rc == 0); } - write_lock_irqsave (&koibnal_data.koib_global_lock, flags); + write_lock_irqsave (&kibnal_data.kib_global_lock, flags); LASSERT (peer->ibp_connecting != 0); if (status == 0) { /* connection established... 
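kibnal_peer_connect_failed() above schedules the next attempt at the current reconnect interval and then doubles the interval, clamped to a ceiling; kibnal_connreq_done() resets it to the minimum once a connection is actually established. A standalone sketch of that capped exponential backoff follows, with made-up tick values standing in for the IBNAL_MIN/MAX_RECONNECT_INTERVAL constants.

    #include <stdio.h>

    #define MIN_RECONNECT_INTERVAL   1     /* assumed: 1 tick */
    #define MAX_RECONNECT_INTERVAL  60     /* assumed: 60 ticks */
    #define MIN(a, b) ((a) < (b) ? (a) : (b))

    struct peer {
        unsigned long reconnect_interval;
        unsigned long reconnect_time;
    };

    /* Failed attempt: schedule the next try, then back off. */
    static void connect_failed(struct peer *p, unsigned long now)
    {
        p->reconnect_time     = now + p->reconnect_interval;
        p->reconnect_interval = MIN(p->reconnect_interval * 2,
                                    MAX_RECONNECT_INTERVAL);
    }

    /* Successful connection: retry promptly if it drops later. */
    static void connect_ok(struct peer *p)
    {
        p->reconnect_interval = MIN_RECONNECT_INTERVAL;
    }

    int main(void)
    {
        struct peer p = { .reconnect_interval = MIN_RECONNECT_INTERVAL };

        for (unsigned long now = 0; now < 8; now++)
            connect_failed(&p, now);
        printf("interval capped at %lu\n", p.reconnect_interval);

        connect_ok(&p);
        printf("reset to %lu after success\n", p.reconnect_interval);
        return 0;
    }

Doubling with a ceiling keeps retry traffic to an unreachable peer bounded while still reconnecting quickly after a short outage.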
*/ - LASSERT (state == OPENIBNAL_CONN_CONNECTING); - conn->ibc_state = OPENIBNAL_CONN_ESTABLISHED; + LASSERT (state == IBNAL_CONN_CONNECTING); + conn->ibc_state = IBNAL_CONN_ESTABLISHED; - if (!koibnal_peer_active(peer)) { + if (!kibnal_peer_active(peer)) { /* ...but peer deleted meantime */ status = -ECONNABORTED; } } else { - LASSERT (state == OPENIBNAL_CONN_INIT_QP || - state == OPENIBNAL_CONN_CONNECTING); + LASSERT (state == IBNAL_CONN_INIT_QP || + state == IBNAL_CONN_CONNECTING); } if (status == 0) { @@ -1710,14 +1717,14 @@ koibnal_connreq_done (koib_conn_t *conn, int active, int status) list_add (&conn->ibc_list, &peer->ibp_conns); /* reset reconnect interval for next attempt */ - peer->ibp_reconnect_interval = OPENIBNAL_MIN_RECONNECT_INTERVAL; + peer->ibp_reconnect_interval = IBNAL_MIN_RECONNECT_INTERVAL; /* post blocked sends to the new connection */ spin_lock (&conn->ibc_lock); while (!list_empty (&peer->ibp_tx_queue)) { tx = list_entry (peer->ibp_tx_queue.next, - koib_tx_t, tx_list); + kib_tx_t, tx_list); list_del (&tx->tx_list); @@ -1726,19 +1733,19 @@ koibnal_connreq_done (koib_conn_t *conn, int active, int status) conn, conn->ibc_state, conn->ibc_peer->ibp_nid, atomic_read (&conn->ibc_refcount)); atomic_inc (&conn->ibc_refcount); - koibnal_queue_tx_locked (tx, conn); + kibnal_queue_tx_locked (tx, conn); } spin_unlock (&conn->ibc_lock); /* Nuke any dangling conns from a different peer instance... */ - koibnal_close_stale_conns_locked (conn->ibc_peer, - conn->ibc_incarnation); + kibnal_close_stale_conns_locked (conn->ibc_peer, + conn->ibc_incarnation); - write_unlock_irqrestore (&koibnal_data.koib_global_lock, flags); + write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); /* queue up all the receives */ - for (i = 0; i < OPENIBNAL_RX_MSGS; i++) { + for (i = 0; i < IBNAL_RX_MSGS; i++) { /* +1 ref for rx desc */ CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", conn, conn->ibc_state, conn->ibc_peer->ibp_nid, @@ -1749,71 +1756,71 @@ koibnal_connreq_done (koib_conn_t *conn, int active, int status) i, &conn->ibc_rxs[i], conn->ibc_rxs[i].rx_msg, conn->ibc_rxs[i].rx_vaddr); - koibnal_post_rx (&conn->ibc_rxs[i], 0); + kibnal_post_rx (&conn->ibc_rxs[i], 0); } - koibnal_check_sends (conn); + kibnal_check_sends (conn); return; } /* connection failed */ - if (state == OPENIBNAL_CONN_CONNECTING) { + if (state == IBNAL_CONN_CONNECTING) { /* schedule for connd to close */ - koibnal_close_conn_locked (conn, status); + kibnal_close_conn_locked (conn, status); } else { /* Don't have a CM comm_id; just wait for refs to drain */ - conn->ibc_state = OPENIBNAL_CONN_ZOMBIE; + conn->ibc_state = IBNAL_CONN_ZOMBIE; } - write_unlock_irqrestore (&koibnal_data.koib_global_lock, flags); + write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); - koibnal_peer_connect_failed (conn->ibc_peer, active, status); + kibnal_peer_connect_failed (conn->ibc_peer, active, status); - if (state != OPENIBNAL_CONN_CONNECTING) { + if (state != IBNAL_CONN_CONNECTING) { /* drop caller's ref if we're not waiting for the * IB_CM_IDLE callback */ - koibnal_put_conn (conn); + kibnal_put_conn (conn); } } int -koibnal_accept (koib_conn_t **connp, tTS_IB_CM_COMM_ID cid, +kibnal_accept (kib_conn_t **connp, tTS_IB_CM_COMM_ID cid, ptl_nid_t nid, __u64 incarnation, int queue_depth) { - koib_conn_t *conn = koibnal_create_conn(); - koib_peer_t *peer; - koib_peer_t *peer2; + kib_conn_t *conn = kibnal_create_conn(); + kib_peer_t *peer; + kib_peer_t *peer2; unsigned long flags; if (conn == NULL) return (-ENOMEM); - if 
(queue_depth != OPENIBNAL_MSG_QUEUE_SIZE) { + if (queue_depth != IBNAL_MSG_QUEUE_SIZE) { CERROR("Can't accept "LPX64": bad queue depth %d (%d expected)\n", - nid, queue_depth, OPENIBNAL_MSG_QUEUE_SIZE); + nid, queue_depth, IBNAL_MSG_QUEUE_SIZE); return (-EPROTO); } /* assume 'nid' is a new peer */ - peer = koibnal_create_peer (nid); + peer = kibnal_create_peer (nid); if (peer == NULL) { CDEBUG(D_NET, "--conn[%p] state %d -> "LPX64" (%d)\n", conn, conn->ibc_state, conn->ibc_peer->ibp_nid, atomic_read (&conn->ibc_refcount)); atomic_dec (&conn->ibc_refcount); - koibnal_destroy_conn(conn); + kibnal_destroy_conn(conn); return (-ENOMEM); } - write_lock_irqsave (&koibnal_data.koib_global_lock, flags); + write_lock_irqsave (&kibnal_data.kib_global_lock, flags); - peer2 = koibnal_find_peer_locked(nid); + peer2 = kibnal_find_peer_locked(nid); if (peer2 == NULL) { /* peer table takes my ref on peer */ list_add_tail (&peer->ibp_list, - koibnal_nid2peerlist(nid)); + kibnal_nid2peerlist(nid)); } else { - koibnal_put_peer (peer); + kibnal_put_peer (peer); peer = peer2; } @@ -1821,20 +1828,20 @@ koibnal_accept (koib_conn_t **connp, tTS_IB_CM_COMM_ID cid, atomic_inc (&peer->ibp_refcount); peer->ibp_connecting++; - write_unlock_irqrestore (&koibnal_data.koib_global_lock, flags); + write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); conn->ibc_peer = peer; - conn->ibc_state = OPENIBNAL_CONN_CONNECTING; + conn->ibc_state = IBNAL_CONN_CONNECTING; conn->ibc_comm_id = cid; conn->ibc_incarnation = incarnation; - conn->ibc_credits = OPENIBNAL_MSG_QUEUE_SIZE; + conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE; *connp = conn; return (0); } tTS_IB_CM_CALLBACK_RETURN -koibnal_idle_conn_callback (tTS_IB_CM_EVENT event, +kibnal_idle_conn_callback (tTS_IB_CM_EVENT event, tTS_IB_CM_COMM_ID cid, void *param, void *arg) @@ -1846,13 +1853,19 @@ koibnal_idle_conn_callback (tTS_IB_CM_EVENT event, } tTS_IB_CM_CALLBACK_RETURN -koibnal_conn_callback (tTS_IB_CM_EVENT event, +kibnal_conn_callback (tTS_IB_CM_EVENT event, tTS_IB_CM_COMM_ID cid, void *param, void *arg) { - koib_conn_t *conn = arg; - int rc; + kib_conn_t *conn = arg; + LIST_HEAD (zombies); + struct list_head *tmp; + struct list_head *nxt; + kib_tx_t *tx; + unsigned long flags; + int done; + int rc; /* Established Connection Notifier */ @@ -1860,24 +1873,72 @@ koibnal_conn_callback (tTS_IB_CM_EVENT event, default: CERROR("Connection %p -> "LPX64" ERROR %d\n", conn, conn->ibc_peer->ibp_nid, event); - koibnal_close_conn (conn, -ECONNABORTED); + kibnal_close_conn (conn, -ECONNABORTED); break; case TS_IB_CM_DISCONNECTED: CDEBUG(D_WARNING, "Connection %p -> "LPX64" DISCONNECTED.\n", conn, conn->ibc_peer->ibp_nid); - koibnal_close_conn (conn, 0); + kibnal_close_conn (conn, 0); break; case TS_IB_CM_IDLE: CDEBUG(D_NET, "Connection %p -> "LPX64" IDLE.\n", conn, conn->ibc_peer->ibp_nid); - koibnal_put_conn (conn); /* Lose CM's ref */ + kibnal_put_conn (conn); /* Lose CM's ref */ /* LASSERT (no further callbacks) */ rc = tsIbCmCallbackModify(cid, - koibnal_idle_conn_callback, conn); + kibnal_idle_conn_callback, conn); LASSERT (rc == 0); + + /* NB we wait until the connection has closed before + * completing outstanding passive RDMAs so we can be sure + * the network can't touch the mapped memory any more. 
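When the CM later reports the connection IDLE, the patch sweeps every transmit that can no longer complete (passive RDMAs still flagged as waiting, plus everything left on the send queue) onto a private zombies list while holding ibc_lock, and only finalizes them once the lock is dropped. The sketch below models that collect-under-lock, complete-outside-lock pattern on a single list; abort_active_txs and the waiting flag are invented names, not the driver's.

    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct tx {
        struct tx *next;
        int        waiting;    /* only the connection could complete this tx */
        int        status;
    };

    static pthread_mutex_t conn_lock = PTHREAD_MUTEX_INITIALIZER;
    static struct tx *active_txs;              /* protected by conn_lock */

    /* Fail every tx that only the (now dead) connection could complete,
     * but run the finalization after the lock has been dropped. */
    static void abort_active_txs(void)
    {
        struct tx *zombies = NULL, *tx, **prev;

        pthread_mutex_lock(&conn_lock);
        prev = &active_txs;
        while ((tx = *prev) != NULL) {
            if (!tx->waiting) {                /* its send completion will finish it */
                prev = &tx->next;
                continue;
            }
            *prev      = tx->next;             /* unlink ... */
            tx->status = -1;                   /* ... mark failed ... */
            tx->next   = zombies;              /* ... and move to the zombie list */
            zombies    = tx;
        }
        pthread_mutex_unlock(&conn_lock);

        while ((tx = zombies) != NULL) {       /* finalize outside the lock */
            zombies = tx->next;
            printf("tx done, status %d\n", tx->status);
            free(tx);
        }
    }

    int main(void)
    {
        for (int i = 0; i < 3; i++) {
            struct tx *tx = calloc(1, sizeof(*tx));
            tx->waiting = (i != 1);
            tx->next = active_txs;
            active_txs = tx;
        }
        abort_active_txs();
        return 0;
    }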
*/ + + spin_lock_irqsave (&conn->ibc_lock, flags); + + /* grab passive RDMAs not waiting for the tx callback */ + list_for_each_safe (tmp, nxt, &conn->ibc_active_txs) { + tx = list_entry (tmp, kib_tx_t, tx_list); + + LASSERT (tx->tx_passive_rdma || + !tx->tx_passive_rdma_wait); + + LASSERT (tx->tx_passive_rdma_wait || + tx->tx_sending != 0); + + /* still waiting for tx callback? */ + if (!tx->tx_passive_rdma_wait) + continue; + + tx->tx_status = -ECONNABORTED; + tx->tx_passive_rdma_wait = 0; + done = (tx->tx_sending == 0); + + if (!done) + continue; + + list_del (&tx->tx_list); + list_add (&tx->tx_list, &zombies); + } + + /* grab all blocked transmits */ + list_for_each_safe (tmp, nxt, &conn->ibc_tx_queue) { + tx = list_entry (tmp, kib_tx_t, tx_list); + + list_del (&tx->tx_list); + list_add (&tx->tx_list, &zombies); + } + + spin_unlock_irqrestore (&conn->ibc_lock, flags); + + while (!list_empty(&zombies)) { + tx = list_entry (zombies.next, kib_tx_t, tx_list); + + list_del(&tx->tx_list); + kibnal_tx_done (tx); + } break; } @@ -1885,12 +1946,12 @@ koibnal_conn_callback (tTS_IB_CM_EVENT event, } tTS_IB_CM_CALLBACK_RETURN -koibnal_passive_conn_callback (tTS_IB_CM_EVENT event, +kibnal_passive_conn_callback (tTS_IB_CM_EVENT event, tTS_IB_CM_COMM_ID cid, void *param, void *arg) { - koib_conn_t *conn = arg; + kib_conn_t *conn = arg; int rc; switch (event) { @@ -1903,12 +1964,12 @@ koibnal_passive_conn_callback (tTS_IB_CM_EVENT event, CERROR ("Unexpected event %p -> "LPX64": %d\n", conn, conn->ibc_peer->ibp_nid, event); - koibnal_connreq_done (conn, 0, -ECONNABORTED); + kibnal_connreq_done (conn, 0, -ECONNABORTED); break; case TS_IB_CM_REQ_RECEIVED: { struct ib_cm_req_received_param *req = param; - koib_wire_connreq_t *wcr = req->remote_private_data; + kib_wire_connreq_t *wcr = req->remote_private_data; LASSERT (conn == NULL); @@ -1920,23 +1981,23 @@ koibnal_passive_conn_callback (tTS_IB_CM_EVENT event, return TS_IB_CM_CALLBACK_ABORT; } - if (wcr->wcr_magic != cpu_to_le32(OPENIBNAL_MSG_MAGIC)) { + if (wcr->wcr_magic != cpu_to_le32(IBNAL_MSG_MAGIC)) { CERROR ("Can't accept LID %04x: bad magic %08x\n", req->dlid, le32_to_cpu(wcr->wcr_magic)); return TS_IB_CM_CALLBACK_ABORT; } - if (wcr->wcr_version != cpu_to_le16(OPENIBNAL_MSG_VERSION)) { + if (wcr->wcr_version != cpu_to_le16(IBNAL_MSG_VERSION)) { CERROR ("Can't accept LID %04x: bad version %d\n", req->dlid, le16_to_cpu(wcr->wcr_magic)); return TS_IB_CM_CALLBACK_ABORT; } - rc = koibnal_accept(&conn, - cid, - le64_to_cpu(wcr->wcr_nid), - le64_to_cpu(wcr->wcr_incarnation), - le16_to_cpu(wcr->wcr_queue_depth)); + rc = kibnal_accept(&conn, + cid, + le64_to_cpu(wcr->wcr_nid), + le64_to_cpu(wcr->wcr_incarnation), + le16_to_cpu(wcr->wcr_queue_depth)); if (rc != 0) { CERROR ("Can't accept "LPX64": %d\n", le64_to_cpu(wcr->wcr_nid), rc); @@ -1945,23 +2006,23 @@ koibnal_passive_conn_callback (tTS_IB_CM_EVENT event, /* update 'arg' for next callback */ rc = tsIbCmCallbackModify(cid, - koibnal_passive_conn_callback, conn); + kibnal_passive_conn_callback, conn); LASSERT (rc == 0); req->accept_param.qp = conn->ibc_qp; - *((koib_wire_connreq_t *)req->accept_param.reply_private_data) - = (koib_wire_connreq_t) { - .wcr_magic = cpu_to_le32(OPENIBNAL_MSG_MAGIC), - .wcr_version = cpu_to_le16(OPENIBNAL_MSG_VERSION), - .wcr_queue_depth = cpu_to_le32(OPENIBNAL_MSG_QUEUE_SIZE), - .wcr_nid = cpu_to_le64(koibnal_data.koib_nid), - .wcr_incarnation = cpu_to_le64(koibnal_data.koib_incarnation), + *((kib_wire_connreq_t *)req->accept_param.reply_private_data) + = 
(kib_wire_connreq_t) { + .wcr_magic = cpu_to_le32(IBNAL_MSG_MAGIC), + .wcr_version = cpu_to_le16(IBNAL_MSG_VERSION), + .wcr_queue_depth = cpu_to_le32(IBNAL_MSG_QUEUE_SIZE), + .wcr_nid = cpu_to_le64(kibnal_data.kib_nid), + .wcr_incarnation = cpu_to_le64(kibnal_data.kib_incarnation), }; - req->accept_param.reply_private_data_len = sizeof(koib_wire_connreq_t); - req->accept_param.responder_resources = OPENIBNAL_RESPONDER_RESOURCES; - req->accept_param.initiator_depth = OPENIBNAL_RESPONDER_RESOURCES; - req->accept_param.rnr_retry_count = OPENIBNAL_RNR_RETRY; - req->accept_param.flow_control = OPENIBNAL_FLOW_CONTROL; + req->accept_param.reply_private_data_len = sizeof(kib_wire_connreq_t); + req->accept_param.responder_resources = IBNAL_RESPONDER_RESOURCES; + req->accept_param.initiator_depth = IBNAL_RESPONDER_RESOURCES; + req->accept_param.rnr_retry_count = IBNAL_RNR_RETRY; + req->accept_param.flow_control = IBNAL_FLOW_CONTROL; CDEBUG(D_NET, "Proceeding\n"); break; @@ -1972,60 +2033,60 @@ koibnal_passive_conn_callback (tTS_IB_CM_EVENT event, CDEBUG(D_WARNING, "Connection %p -> "LPX64" ESTABLISHED.\n", conn, conn->ibc_peer->ibp_nid); - koibnal_connreq_done (conn, 0, 0); + kibnal_connreq_done (conn, 0, 0); break; } - /* NB if the connreq is done, we switch to koibnal_conn_callback */ + /* NB if the connreq is done, we switch to kibnal_conn_callback */ return TS_IB_CM_CALLBACK_PROCEED; } tTS_IB_CM_CALLBACK_RETURN -koibnal_active_conn_callback (tTS_IB_CM_EVENT event, +kibnal_active_conn_callback (tTS_IB_CM_EVENT event, tTS_IB_CM_COMM_ID cid, void *param, void *arg) { - koib_conn_t *conn = arg; + kib_conn_t *conn = arg; switch (event) { case TS_IB_CM_REP_RECEIVED: { struct ib_cm_rep_received_param *rep = param; - koib_wire_connreq_t *wcr = rep->remote_private_data; + kib_wire_connreq_t *wcr = rep->remote_private_data; if (rep->remote_private_data_len < sizeof (*wcr)) { CERROR ("Short reply from "LPX64": %d\n", conn->ibc_peer->ibp_nid, rep->remote_private_data_len); - koibnal_connreq_done (conn, 1, -EPROTO); + kibnal_connreq_done (conn, 1, -EPROTO); break; } - if (wcr->wcr_magic != cpu_to_le32(OPENIBNAL_MSG_MAGIC)) { + if (wcr->wcr_magic != cpu_to_le32(IBNAL_MSG_MAGIC)) { CERROR ("Can't connect "LPX64": bad magic %08x\n", conn->ibc_peer->ibp_nid, le32_to_cpu(wcr->wcr_magic)); - koibnal_connreq_done (conn, 1, -EPROTO); + kibnal_connreq_done (conn, 1, -EPROTO); break; } - if (wcr->wcr_version != cpu_to_le16(OPENIBNAL_MSG_VERSION)) { + if (wcr->wcr_version != cpu_to_le16(IBNAL_MSG_VERSION)) { CERROR ("Can't connect "LPX64": bad version %d\n", conn->ibc_peer->ibp_nid, le16_to_cpu(wcr->wcr_magic)); - koibnal_connreq_done (conn, 1, -EPROTO); + kibnal_connreq_done (conn, 1, -EPROTO); break; } - if (wcr->wcr_queue_depth != cpu_to_le16(OPENIBNAL_MSG_QUEUE_SIZE)) { + if (wcr->wcr_queue_depth != cpu_to_le16(IBNAL_MSG_QUEUE_SIZE)) { CERROR ("Can't connect "LPX64": bad queue depth %d\n", conn->ibc_peer->ibp_nid, le16_to_cpu(wcr->wcr_queue_depth)); - koibnal_connreq_done (conn, 1, -EPROTO); + kibnal_connreq_done (conn, 1, -EPROTO); break; } if (le64_to_cpu(wcr->wcr_nid) != conn->ibc_peer->ibp_nid) { CERROR ("Unexpected NID "LPX64" from "LPX64"\n", le64_to_cpu(wcr->wcr_nid), conn->ibc_peer->ibp_nid); - koibnal_connreq_done (conn, 1, -EPROTO); + kibnal_connreq_done (conn, 1, -EPROTO); break; } @@ -2033,7 +2094,7 @@ koibnal_active_conn_callback (tTS_IB_CM_EVENT event, conn, conn->ibc_peer->ibp_nid); conn->ibc_incarnation = le64_to_cpu(wcr->wcr_incarnation); - conn->ibc_credits = OPENIBNAL_MSG_QUEUE_SIZE; + 
conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE; break; } @@ -2041,86 +2102,86 @@ koibnal_active_conn_callback (tTS_IB_CM_EVENT event, CDEBUG(D_WARNING, "Connection %p -> "LPX64" Established\n", conn, conn->ibc_peer->ibp_nid); - koibnal_connreq_done (conn, 1, 0); + kibnal_connreq_done (conn, 1, 0); break; case TS_IB_CM_IDLE: CERROR("Connection %p -> "LPX64" IDLE\n", conn, conn->ibc_peer->ibp_nid); /* Back out state change: I'm disengaged from CM */ - conn->ibc_state = OPENIBNAL_CONN_INIT_QP; + conn->ibc_state = IBNAL_CONN_INIT_QP; - koibnal_connreq_done (conn, 1, -ECONNABORTED); + kibnal_connreq_done (conn, 1, -ECONNABORTED); break; default: CERROR("Connection %p -> "LPX64" ERROR %d\n", conn, conn->ibc_peer->ibp_nid, event); - koibnal_connreq_done (conn, 1, -ECONNABORTED); + kibnal_connreq_done (conn, 1, -ECONNABORTED); break; } - /* NB if the connreq is done, we switch to koibnal_conn_callback */ + /* NB if the connreq is done, we switch to kibnal_conn_callback */ return TS_IB_CM_CALLBACK_PROCEED; } int -koibnal_pathreq_callback (tTS_IB_CLIENT_QUERY_TID tid, int status, +kibnal_pathreq_callback (tTS_IB_CLIENT_QUERY_TID tid, int status, struct ib_path_record *resp, int remaining, void *arg) { - koib_conn_t *conn = arg; + kib_conn_t *conn = arg; if (status != 0) { CERROR ("status %d\n", status); - koibnal_connreq_done (conn, 1, status); + kibnal_connreq_done (conn, 1, status); goto out; } conn->ibc_connreq->cr_path = *resp; - conn->ibc_connreq->cr_wcr = (koib_wire_connreq_t) { - .wcr_magic = cpu_to_le32(OPENIBNAL_MSG_MAGIC), - .wcr_version = cpu_to_le16(OPENIBNAL_MSG_VERSION), - .wcr_queue_depth = cpu_to_le16(OPENIBNAL_MSG_QUEUE_SIZE), - .wcr_nid = cpu_to_le64(koibnal_data.koib_nid), - .wcr_incarnation = cpu_to_le64(koibnal_data.koib_incarnation), + conn->ibc_connreq->cr_wcr = (kib_wire_connreq_t) { + .wcr_magic = cpu_to_le32(IBNAL_MSG_MAGIC), + .wcr_version = cpu_to_le16(IBNAL_MSG_VERSION), + .wcr_queue_depth = cpu_to_le16(IBNAL_MSG_QUEUE_SIZE), + .wcr_nid = cpu_to_le64(kibnal_data.kib_nid), + .wcr_incarnation = cpu_to_le64(kibnal_data.kib_incarnation), }; conn->ibc_connreq->cr_connparam = (struct ib_cm_active_param) { .qp = conn->ibc_qp, .req_private_data = &conn->ibc_connreq->cr_wcr, .req_private_data_len = sizeof(conn->ibc_connreq->cr_wcr), - .responder_resources = OPENIBNAL_RESPONDER_RESOURCES, - .initiator_depth = OPENIBNAL_RESPONDER_RESOURCES, - .retry_count = OPENIBNAL_RETRY, - .rnr_retry_count = OPENIBNAL_RNR_RETRY, - .cm_response_timeout = koibnal_tunables.koib_io_timeout, - .max_cm_retries = OPENIBNAL_CM_RETRY, - .flow_control = OPENIBNAL_FLOW_CONTROL, + .responder_resources = IBNAL_RESPONDER_RESOURCES, + .initiator_depth = IBNAL_RESPONDER_RESOURCES, + .retry_count = IBNAL_RETRY, + .rnr_retry_count = IBNAL_RNR_RETRY, + .cm_response_timeout = kibnal_tunables.kib_io_timeout, + .max_cm_retries = IBNAL_CM_RETRY, + .flow_control = IBNAL_FLOW_CONTROL, }; /* XXX set timeout just like SDP!!!*/ conn->ibc_connreq->cr_path.packet_life = 13; /* Flag I'm getting involved with the CM... 
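Both the passive and active callbacks above validate the little-endian connection-request blob (length, magic, protocol version, queue depth, and on the active side the peer NID) before any credits are granted. The user-space sketch below shows that style of check; the constants and field names are placeholders, and glibc's le*toh/htole* helpers stand in for the kernel's le*_to_cpu/cpu_to_le*.

    #include <endian.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    #define WIRE_MAGIC     0x0be91b91u     /* assumed */
    #define WIRE_VERSION   1               /* assumed */
    #define MSG_QUEUE_SIZE 8               /* assumed */

    typedef struct {                       /* little-endian on the wire */
        uint32_t wcr_magic;
        uint16_t wcr_version;
        uint16_t wcr_queue_depth;
        uint64_t wcr_nid;
    } wire_connreq_t;

    static int check_connreq(const void *data, size_t len, uint64_t expected_nid)
    {
        wire_connreq_t wcr;

        if (len < sizeof(wcr))
            return -1;                                    /* short blob */
        memcpy(&wcr, data, sizeof(wcr));

        if (le32toh(wcr.wcr_magic) != WIRE_MAGIC)
            return -1;                                    /* not our protocol */
        if (le16toh(wcr.wcr_version) != WIRE_VERSION)
            return -1;                                    /* incompatible version */
        if (le16toh(wcr.wcr_queue_depth) != MSG_QUEUE_SIZE)
            return -1;                                    /* credit mismatch */
        if (le64toh(wcr.wcr_nid) != expected_nid)
            return -1;                                    /* wrong peer */
        return 0;
    }

    int main(void)
    {
        wire_connreq_t wcr = {
            .wcr_magic       = htole32(WIRE_MAGIC),
            .wcr_version     = htole16(WIRE_VERSION),
            .wcr_queue_depth = htole16(MSG_QUEUE_SIZE),
            .wcr_nid         = htole64(0x1234),
        };
        printf("%s\n", check_connreq(&wcr, sizeof(wcr), 0x1234) == 0 ?
               "accepted" : "rejected");
        return 0;
    }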
*/ - conn->ibc_state = OPENIBNAL_CONN_CONNECTING; + conn->ibc_state = IBNAL_CONN_CONNECTING; CDEBUG(D_NET, "Connecting to, service id "LPX64", on "LPX64"\n", conn->ibc_connreq->cr_service.service_id, - *koibnal_service_nid_field(&conn->ibc_connreq->cr_service)); + *kibnal_service_nid_field(&conn->ibc_connreq->cr_service)); - /* koibnal_connect_callback gets my conn ref */ + /* kibnal_connect_callback gets my conn ref */ status = ib_cm_connect (&conn->ibc_connreq->cr_connparam, &conn->ibc_connreq->cr_path, NULL, conn->ibc_connreq->cr_service.service_id, 0, - koibnal_active_conn_callback, conn, + kibnal_active_conn_callback, conn, &conn->ibc_comm_id); if (status != 0) { CERROR ("Connect: %d\n", status); /* Back out state change: I've not got a CM comm_id yet... */ - conn->ibc_state = OPENIBNAL_CONN_INIT_QP; - koibnal_connreq_done (conn, 1, status); + conn->ibc_state = IBNAL_CONN_INIT_QP; + kibnal_connreq_done (conn, 1, status); } out: @@ -2129,58 +2190,58 @@ koibnal_pathreq_callback (tTS_IB_CLIENT_QUERY_TID tid, int status, } void -koibnal_service_get_callback (tTS_IB_CLIENT_QUERY_TID tid, int status, - struct ib_common_attrib_service *resp, void *arg) +kibnal_service_get_callback (tTS_IB_CLIENT_QUERY_TID tid, int status, + struct ib_common_attrib_service *resp, void *arg) { - koib_conn_t *conn = arg; + kib_conn_t *conn = arg; if (status != 0) { CERROR ("status %d\n", status); - koibnal_connreq_done (conn, 1, status); + kibnal_connreq_done (conn, 1, status); return; } CDEBUG(D_NET, "Got status %d, service id "LPX64", on "LPX64"\n", status, resp->service_id, - *koibnal_service_nid_field(resp)); + *kibnal_service_nid_field(resp)); conn->ibc_connreq->cr_service = *resp; - status = ib_cached_gid_get(koibnal_data.koib_device, - koibnal_data.koib_port, 0, + status = ib_cached_gid_get(kibnal_data.kib_device, + kibnal_data.kib_port, 0, conn->ibc_connreq->cr_gid); LASSERT (status == 0); - /* koibnal_pathreq_callback gets my conn ref */ - status = tsIbPathRecordRequest (koibnal_data.koib_device, - koibnal_data.koib_port, + /* kibnal_pathreq_callback gets my conn ref */ + status = tsIbPathRecordRequest (kibnal_data.kib_device, + kibnal_data.kib_port, conn->ibc_connreq->cr_gid, conn->ibc_connreq->cr_service.service_gid, conn->ibc_connreq->cr_service.service_pkey, 0, - koibnal_tunables.koib_io_timeout * HZ, + kibnal_tunables.kib_io_timeout * HZ, 0, - koibnal_pathreq_callback, conn, + kibnal_pathreq_callback, conn, &conn->ibc_connreq->cr_tid); if (status == 0) return; CERROR ("Path record request: %d\n", status); - koibnal_connreq_done (conn, 1, status); + kibnal_connreq_done (conn, 1, status); } void -koibnal_connect_peer (koib_peer_t *peer) +kibnal_connect_peer (kib_peer_t *peer) { - koib_conn_t *conn = koibnal_create_conn(); + kib_conn_t *conn = kibnal_create_conn(); int rc; LASSERT (peer->ibp_connecting != 0); if (conn == NULL) { CERROR ("Can't allocate conn\n"); - koibnal_peer_connect_failed (peer, 1, -ENOMEM); + kibnal_peer_connect_failed (peer, 1, -ENOMEM); return; } @@ -2190,85 +2251,101 @@ koibnal_connect_peer (koib_peer_t *peer) PORTAL_ALLOC (conn->ibc_connreq, sizeof (*conn->ibc_connreq)); if (conn->ibc_connreq == NULL) { CERROR ("Can't allocate connreq\n"); - koibnal_connreq_done (conn, 1, -ENOMEM); + kibnal_connreq_done (conn, 1, -ENOMEM); return; } memset(conn->ibc_connreq, 0, sizeof (*conn->ibc_connreq)); - koibnal_set_service_keys(&conn->ibc_connreq->cr_service, peer->ibp_nid); + kibnal_set_service_keys(&conn->ibc_connreq->cr_service, peer->ibp_nid); - /* koibnal_service_get_callback gets 
my conn ref */ - rc = ib_service_get (koibnal_data.koib_device, - koibnal_data.koib_port, + /* kibnal_service_get_callback gets my conn ref */ + rc = ib_service_get (kibnal_data.kib_device, + kibnal_data.kib_port, &conn->ibc_connreq->cr_service, - KOIBNAL_SERVICE_KEY_MASK, - koibnal_tunables.koib_io_timeout * HZ, - koibnal_service_get_callback, conn, + KIBNAL_SERVICE_KEY_MASK, + kibnal_tunables.kib_io_timeout * HZ, + kibnal_service_get_callback, conn, &conn->ibc_connreq->cr_tid); if (rc == 0) return; CERROR ("ib_service_get: %d\n", rc); - koibnal_connreq_done (conn, 1, rc); + kibnal_connreq_done (conn, 1, rc); } int -koibnal_conn_timed_out (koib_conn_t *conn) +kibnal_conn_timed_out (kib_conn_t *conn) { - koib_tx_t *tx; + kib_tx_t *tx; struct list_head *ttmp; unsigned long flags; - int rc = 0; spin_lock_irqsave (&conn->ibc_lock, flags); - list_for_each (ttmp, &conn->ibc_rdma_queue) { - tx = list_entry (ttmp, koib_tx_t, tx_list); + list_for_each (ttmp, &conn->ibc_tx_queue) { + tx = list_entry (ttmp, kib_tx_t, tx_list); - LASSERT (tx->tx_passive_rdma); - LASSERT (tx->tx_passive_rdma_wait); + LASSERT (!tx->tx_passive_rdma_wait); + LASSERT (tx->tx_sending == 0); - if (time_after_eq (jiffies, tx->tx_passive_rdma_deadline)) { - rc = 1; - break; + if (time_after_eq (jiffies, tx->tx_deadline)) { + spin_unlock_irqrestore (&conn->ibc_lock, flags); + return 1; } } + + list_for_each (ttmp, &conn->ibc_active_txs) { + tx = list_entry (ttmp, kib_tx_t, tx_list); + + LASSERT (tx->tx_passive_rdma || + !tx->tx_passive_rdma_wait); + + LASSERT (tx->tx_passive_rdma_wait || + tx->tx_sending != 0); + + if (time_after_eq (jiffies, tx->tx_deadline)) { + spin_unlock_irqrestore (&conn->ibc_lock, flags); + return 1; + } + } + spin_unlock_irqrestore (&conn->ibc_lock, flags); - return rc; + return 0; } void -koibnal_check_conns (int idx) +kibnal_check_conns (int idx) { - struct list_head *peers = &koibnal_data.koib_peers[idx]; + struct list_head *peers = &kibnal_data.kib_peers[idx]; struct list_head *ptmp; - koib_peer_t *peer; - koib_conn_t *conn; + kib_peer_t *peer; + kib_conn_t *conn; struct list_head *ctmp; again: /* NB. We expect to have a look at all the peers and not find any * rdmas to time out, so we just use a shared lock while we * take a look... */ - read_lock (&koibnal_data.koib_global_lock); + read_lock (&kibnal_data.kib_global_lock); list_for_each (ptmp, peers) { - peer = list_entry (ptmp, koib_peer_t, ibp_list); + peer = list_entry (ptmp, kib_peer_t, ibp_list); list_for_each (ctmp, &peer->ibp_conns) { - conn = list_entry (ctmp, koib_conn_t, ibc_list); + conn = list_entry (ctmp, kib_conn_t, ibc_list); + + LASSERT (conn->ibc_state == IBNAL_CONN_ESTABLISHED); - LASSERT (conn->ibc_state == OPENIBNAL_CONN_ESTABLISHED); /* In case we have enough credits to return via a * NOOP, but there were no non-blocking tx descs * free to do it last time... 
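kibnal_conn_timed_out() above now checks both the not-yet-sent queue and the active list under ibc_lock and reports a timeout as soon as any descriptor's deadline has passed, dropping the lock before returning. A simplified sketch of that scan over two singly linked lists; a plain comparison stands in for the kernel's wrap-safe time_after_eq(), and the field names are illustrative.

    #include <pthread.h>
    #include <stdio.h>

    struct tx {
        struct tx    *next;
        unsigned long deadline;        /* absolute time this tx must finish by */
    };

    struct conn {
        pthread_mutex_t lock;
        struct tx      *tx_queue;      /* queued, not yet sent */
        struct tx      *active_txs;    /* sent, awaiting completion */
    };

    /* Return 1 if any tx on either list has passed its deadline. */
    static int conn_timed_out(struct conn *c, unsigned long now)
    {
        struct tx *tx;

        pthread_mutex_lock(&c->lock);

        for (tx = c->tx_queue; tx != NULL; tx = tx->next)
            if (now >= tx->deadline) {
                pthread_mutex_unlock(&c->lock);
                return 1;
            }

        for (tx = c->active_txs; tx != NULL; tx = tx->next)
            if (now >= tx->deadline) {
                pthread_mutex_unlock(&c->lock);
                return 1;
            }

        pthread_mutex_unlock(&c->lock);
        return 0;
    }

    int main(void)
    {
        struct tx   a = { NULL, 100 };
        struct conn c;

        pthread_mutex_init(&c.lock, NULL);
        c.tx_queue   = &a;
        c.active_txs = NULL;

        printf("%d %d\n", conn_timed_out(&c, 50), conn_timed_out(&c, 100));
        return 0;
    }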
*/ - koibnal_check_sends(conn); + kibnal_check_sends(conn); - if (!koibnal_conn_timed_out(conn)) + if (!kibnal_conn_timed_out(conn)) continue; CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", @@ -2276,108 +2353,76 @@ koibnal_check_conns (int idx) atomic_read (&conn->ibc_refcount)); atomic_inc (&conn->ibc_refcount); - read_unlock (&koibnal_data.koib_global_lock); + read_unlock (&kibnal_data.kib_global_lock); CERROR("Timed out RDMA with "LPX64"\n", peer->ibp_nid); - koibnal_close_conn (conn, -ETIMEDOUT); - koibnal_put_conn (conn); + kibnal_close_conn (conn, -ETIMEDOUT); + kibnal_put_conn (conn); /* start again now I've dropped the lock */ goto again; } } - read_unlock (&koibnal_data.koib_global_lock); + read_unlock (&kibnal_data.kib_global_lock); } void -koibnal_terminate_conn (koib_conn_t *conn) +kibnal_terminate_conn (kib_conn_t *conn) { - unsigned long flags; int rc; - int done; CDEBUG(D_NET, "conn %p\n", conn); - LASSERT (conn->ibc_state == OPENIBNAL_CONN_DEATHROW); - conn->ibc_state = OPENIBNAL_CONN_ZOMBIE; + LASSERT (conn->ibc_state == IBNAL_CONN_DEATHROW); + conn->ibc_state = IBNAL_CONN_ZOMBIE; rc = ib_cm_disconnect (conn->ibc_comm_id); if (rc != 0) CERROR ("Error %d disconnecting conn %p -> "LPX64"\n", rc, conn, conn->ibc_peer->ibp_nid); - - /* complete blocked passive RDMAs */ - spin_lock_irqsave (&conn->ibc_lock, flags); - - while (!list_empty (&conn->ibc_rdma_queue)) { - koib_tx_t *tx = list_entry (conn->ibc_rdma_queue.next, - koib_tx_t, tx_list); - - LASSERT (tx->tx_passive_rdma); - LASSERT (tx->tx_passive_rdma_wait); - - list_del (&tx->tx_list); - - tx->tx_passive_rdma_wait = 0; - done = (tx->tx_sending == 0); - - tx->tx_status = -ECONNABORTED; - - spin_unlock_irqrestore (&conn->ibc_lock, flags); - - if (done) - koibnal_tx_done (tx); - - spin_lock_irqsave (&conn->ibc_lock, flags); - } - - spin_unlock_irqrestore (&conn->ibc_lock, flags); - - /* Complete all blocked transmits */ - koibnal_check_sends(conn); } int -koibnal_connd (void *arg) +kibnal_connd (void *arg) { wait_queue_t wait; unsigned long flags; - koib_conn_t *conn; - koib_peer_t *peer; + kib_conn_t *conn; + kib_peer_t *peer; int timeout; int i; int peer_index = 0; unsigned long deadline = jiffies; - kportal_daemonize ("koibnal_connd"); + kportal_daemonize ("kibnal_connd"); kportal_blockallsigs (); init_waitqueue_entry (&wait, current); - spin_lock_irqsave (&koibnal_data.koib_connd_lock, flags); + spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags); for (;;) { - if (!list_empty (&koibnal_data.koib_connd_conns)) { - conn = list_entry (koibnal_data.koib_connd_conns.next, - koib_conn_t, ibc_list); + if (!list_empty (&kibnal_data.kib_connd_conns)) { + conn = list_entry (kibnal_data.kib_connd_conns.next, + kib_conn_t, ibc_list); list_del (&conn->ibc_list); - spin_unlock_irqrestore (&koibnal_data.koib_connd_lock, flags); + spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags); switch (conn->ibc_state) { - case OPENIBNAL_CONN_DEATHROW: + case IBNAL_CONN_DEATHROW: LASSERT (conn->ibc_comm_id != TS_IB_CM_COMM_ID_INVALID); /* Disconnect: conn becomes a zombie in the * callback and last ref reschedules it * here... 
*/ - koibnal_terminate_conn(conn); - koibnal_put_conn (conn); + kibnal_terminate_conn(conn); + kibnal_put_conn (conn); break; - case OPENIBNAL_CONN_ZOMBIE: - koibnal_destroy_conn (conn); + case IBNAL_CONN_ZOMBIE: + kibnal_destroy_conn (conn); break; default: @@ -2386,35 +2431,35 @@ koibnal_connd (void *arg) LBUG(); } - spin_lock_irqsave (&koibnal_data.koib_connd_lock, flags); + spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags); continue; } - if (!list_empty (&koibnal_data.koib_connd_peers)) { - peer = list_entry (koibnal_data.koib_connd_peers.next, - koib_peer_t, ibp_connd_list); + if (!list_empty (&kibnal_data.kib_connd_peers)) { + peer = list_entry (kibnal_data.kib_connd_peers.next, + kib_peer_t, ibp_connd_list); list_del_init (&peer->ibp_connd_list); - spin_unlock_irqrestore (&koibnal_data.koib_connd_lock, flags); + spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags); - koibnal_connect_peer (peer); - koibnal_put_peer (peer); + kibnal_connect_peer (peer); + kibnal_put_peer (peer); - spin_lock_irqsave (&koibnal_data.koib_connd_lock, flags); + spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags); } /* shut down and nobody left to reap... */ - if (koibnal_data.koib_shutdown && - atomic_read(&koibnal_data.koib_nconns) == 0) + if (kibnal_data.kib_shutdown && + atomic_read(&kibnal_data.kib_nconns) == 0) break; - spin_unlock_irqrestore (&koibnal_data.koib_connd_lock, flags); + spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags); /* careful with the jiffy wrap... */ while ((timeout = (int)(deadline - jiffies)) <= 0) { const int n = 4; const int p = 1; - int chunk = koibnal_data.koib_peer_hash_size; + int chunk = kibnal_data.kib_peer_hash_size; /* Time to check for RDMA timeouts on a few more * peers: I do checks every 'p' seconds on a @@ -2424,129 +2469,129 @@ koibnal_connd (void *arg) * connection within (n+1)/n times the timeout * interval. 
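Rather than scanning the whole peer table on each wakeup, the connd (which wakes every p seconds) checks only a chunk of hash buckets sized so that every connection is still examined about n times per I/O timeout, which is what bounds detection latency at roughly (n+1)/n times the timeout. Here is that arithmetic in isolation; PEER_HASH_SIZE, the example timeouts and timeout_check_chunk are invented for the sketch.

    #include <stdio.h>

    #define PEER_HASH_SIZE 101             /* assumed number of hash buckets */

    /* Buckets to check per wakeup: with a wakeup every 'p' seconds this
     * covers the whole table roughly every timeout/n seconds. */
    static int timeout_check_chunk(int timeout, int n, int p)
    {
        int chunk = PEER_HASH_SIZE;

        if (timeout > n * p)
            chunk = (chunk * n * p) / timeout;
        if (chunk == 0)
            chunk = 1;
        return chunk;
    }

    int main(void)
    {
        const int n = 4, p = 1;            /* the same constants the connd uses */
        int peer_index = 0;

        for (int timeout = 5; timeout <= 50; timeout += 45) {
            int chunk = timeout_check_chunk(timeout, n, p);

            printf("timeout %2ds -> check %d buckets per wakeup\n",
                   timeout, chunk);

            /* round-robin over the table, as the connd does */
            for (int i = 0; i < chunk; i++)
                peer_index = (peer_index + 1) % PEER_HASH_SIZE;
        }
        return 0;
    }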
*/ - if (koibnal_tunables.koib_io_timeout > n * p) + if (kibnal_tunables.kib_io_timeout > n * p) chunk = (chunk * n * p) / - koibnal_tunables.koib_io_timeout; + kibnal_tunables.kib_io_timeout; if (chunk == 0) chunk = 1; for (i = 0; i < chunk; i++) { - koibnal_check_conns (peer_index); + kibnal_check_conns (peer_index); peer_index = (peer_index + 1) % - koibnal_data.koib_peer_hash_size; + kibnal_data.kib_peer_hash_size; } deadline += p * HZ; } - koibnal_data.koib_connd_waketime = jiffies + timeout; + kibnal_data.kib_connd_waketime = jiffies + timeout; set_current_state (TASK_INTERRUPTIBLE); - add_wait_queue (&koibnal_data.koib_connd_waitq, &wait); + add_wait_queue (&kibnal_data.kib_connd_waitq, &wait); - if (!koibnal_data.koib_shutdown && - list_empty (&koibnal_data.koib_connd_conns) && - list_empty (&koibnal_data.koib_connd_peers)) + if (!kibnal_data.kib_shutdown && + list_empty (&kibnal_data.kib_connd_conns) && + list_empty (&kibnal_data.kib_connd_peers)) schedule_timeout (timeout); set_current_state (TASK_RUNNING); - remove_wait_queue (&koibnal_data.koib_connd_waitq, &wait); + remove_wait_queue (&kibnal_data.kib_connd_waitq, &wait); - spin_lock_irqsave (&koibnal_data.koib_connd_lock, flags); + spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags); } - spin_unlock_irqrestore (&koibnal_data.koib_connd_lock, flags); + spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags); - koibnal_thread_fini (); + kibnal_thread_fini (); return (0); } int -koibnal_scheduler(void *arg) +kibnal_scheduler(void *arg) { long id = (long)arg; char name[16]; - koib_rx_t *rx; - koib_tx_t *tx; + kib_rx_t *rx; + kib_tx_t *tx; unsigned long flags; int rc; int counter = 0; int did_something; - snprintf(name, sizeof(name), "koibnal_sd_%02ld", id); + snprintf(name, sizeof(name), "kibnal_sd_%02ld", id); kportal_daemonize(name); kportal_blockallsigs(); - spin_lock_irqsave(&koibnal_data.koib_sched_lock, flags); + spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags); for (;;) { did_something = 0; - while (!list_empty(&koibnal_data.koib_sched_txq)) { - tx = list_entry(koibnal_data.koib_sched_txq.next, - koib_tx_t, tx_list); + while (!list_empty(&kibnal_data.kib_sched_txq)) { + tx = list_entry(kibnal_data.kib_sched_txq.next, + kib_tx_t, tx_list); list_del(&tx->tx_list); - spin_unlock_irqrestore(&koibnal_data.koib_sched_lock, + spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags); - koibnal_tx_done(tx); + kibnal_tx_done(tx); - spin_lock_irqsave(&koibnal_data.koib_sched_lock, + spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags); } - if (!list_empty(&koibnal_data.koib_sched_rxq)) { - rx = list_entry(koibnal_data.koib_sched_rxq.next, - koib_rx_t, rx_list); + if (!list_empty(&kibnal_data.kib_sched_rxq)) { + rx = list_entry(kibnal_data.kib_sched_rxq.next, + kib_rx_t, rx_list); list_del(&rx->rx_list); - spin_unlock_irqrestore(&koibnal_data.koib_sched_lock, + spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags); - koibnal_rx(rx); + kibnal_rx(rx); did_something = 1; - spin_lock_irqsave(&koibnal_data.koib_sched_lock, + spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags); } /* shut down and no receives to complete... 
*/ - if (koibnal_data.koib_shutdown && - atomic_read(&koibnal_data.koib_nconns) == 0) + if (kibnal_data.kib_shutdown && + atomic_read(&kibnal_data.kib_nconns) == 0) break; /* nothing to do or hogging CPU */ - if (!did_something || counter++ == OPENIBNAL_RESCHED) { - spin_unlock_irqrestore(&koibnal_data.koib_sched_lock, + if (!did_something || counter++ == IBNAL_RESCHED) { + spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags); counter = 0; if (!did_something) { rc = wait_event_interruptible( - koibnal_data.koib_sched_waitq, - !list_empty(&koibnal_data.koib_sched_txq) || - !list_empty(&koibnal_data.koib_sched_rxq) || - (koibnal_data.koib_shutdown && - atomic_read (&koibnal_data.koib_nconns) == 0)); + kibnal_data.kib_sched_waitq, + !list_empty(&kibnal_data.kib_sched_txq) || + !list_empty(&kibnal_data.kib_sched_rxq) || + (kibnal_data.kib_shutdown && + atomic_read (&kibnal_data.kib_nconns) == 0)); } else { our_cond_resched(); } - spin_lock_irqsave(&koibnal_data.koib_sched_lock, + spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags); } } - spin_unlock_irqrestore(&koibnal_data.koib_sched_lock, flags); + spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags); - koibnal_thread_fini(); + kibnal_thread_fini(); return (0); } -lib_nal_t koibnal_lib = { - libnal_data: &koibnal_data, /* NAL private data */ - libnal_send: koibnal_send, - libnal_send_pages: koibnal_send_pages, - libnal_recv: koibnal_recv, - libnal_recv_pages: koibnal_recv_pages, - libnal_dist: koibnal_dist +lib_nal_t kibnal_lib = { + libnal_data: &kibnal_data, /* NAL private data */ + libnal_send: kibnal_send, + libnal_send_pages: kibnal_send_pages, + libnal_recv: kibnal_recv, + libnal_recv_pages: kibnal_recv_pages, + libnal_dist: kibnal_dist }; diff --git a/lustre/portals/knals/qswnal/qswnal.c b/lustre/portals/knals/qswnal/qswnal.c index 16123c2..5aff4e9 100644 --- a/lustre/portals/knals/qswnal/qswnal.c +++ b/lustre/portals/knals/qswnal/qswnal.c @@ -40,10 +40,10 @@ kpr_nal_interface_t kqswnal_router_interface = { #define QSWNAL_SYSCTL 201 #define QSWNAL_SYSCTL_OPTIMIZED_GETS 1 -#define QSWNAL_SYSCTL_COPY_SMALL_FWD 2 +#define QSWNAL_SYSCTL_OPTIMIZED_PUTS 2 static ctl_table kqswnal_ctl_table[] = { - {QSWNAL_SYSCTL_OPTIMIZED_GETS, "optimized_puts", + {QSWNAL_SYSCTL_OPTIMIZED_PUTS, "optimized_puts", &kqswnal_tunables.kqn_optimized_puts, sizeof (int), 0644, NULL, &proc_dointvec}, {QSWNAL_SYSCTL_OPTIMIZED_GETS, "optimized_gets", @@ -121,6 +121,8 @@ static void kqswnal_shutdown(nal_t *nal) { unsigned long flags; + kqswnal_tx_t *ktx; + kqswnal_rx_t *krx; int do_lib_fini = 0; /* NB The first ref was this module! */ @@ -267,37 +269,25 @@ kqswnal_shutdown(nal_t *nal) * ep_dvma_release() get fixed (and releases any mappings in the * region), we can delete all the code from here --------> */ - if (kqswnal_data.kqn_txds != NULL) { - int i; + for (ktx = kqswnal_data.kqn_txds; ktx != NULL; ktx = ktx->ktx_alloclist) { + /* If ktx has a buffer, it got mapped; unmap now. NB only + * the pre-mapped stuff is still mapped since all tx descs + * must be idle */ - for (i = 0; i < KQSW_NTXMSGS + KQSW_NNBLK_TXMSGS; i++) { - kqswnal_tx_t *ktx = &kqswnal_data.kqn_txds[i]; - - /* If ktx has a buffer, it got mapped; unmap now. 
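The qswnal shutdown loop here, and the startup code further on, replace the single large arrays of transmit and receive descriptors with per-descriptor allocations chained through ktx_alloclist/krx_alloclist, so startup can fail partway through and shutdown simply pops descriptors until the stack is empty. A minimal sketch of that allocation-stack pattern, with a generic descriptor type and invented names:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    struct desc {
        struct desc *alloclist;        /* chains every allocated descriptor */
        char         payload[64];      /* whatever the descriptor carries */
    };

    static struct desc *all_descs;     /* head of the allocation stack */

    static int alloc_descs(int n)
    {
        for (int i = 0; i < n; i++) {
            struct desc *d = malloc(sizeof(*d));

            if (d == NULL)
                return -1;             /* caller just runs the shutdown path */
            memset(d, 0, sizeof(*d));
            d->alloclist = all_descs;  /* push */
            all_descs = d;
        }
        return 0;
    }

    static void free_descs(void)
    {
        while (all_descs != NULL) {    /* pop until the stack is empty */
            struct desc *d = all_descs;

            all_descs = d->alloclist;
            free(d);
        }
    }

    int main(void)
    {
        if (alloc_descs(8) != 0)
            fprintf(stderr, "partial allocation, cleaning up\n");
        free_descs();
        return 0;
    }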
- * NB only the pre-mapped stuff is still mapped - * since all tx descs must be idle */ - - if (ktx->ktx_buffer != NULL) - ep_dvma_unload(kqswnal_data.kqn_ep, - kqswnal_data.kqn_ep_tx_nmh, - &ktx->ktx_ebuffer); - } + if (ktx->ktx_buffer != NULL) + ep_dvma_unload(kqswnal_data.kqn_ep, + kqswnal_data.kqn_ep_tx_nmh, + &ktx->ktx_ebuffer); } - if (kqswnal_data.kqn_rxds != NULL) { - int i; - - for (i = 0; i < KQSW_NRXMSGS_SMALL + KQSW_NRXMSGS_LARGE; i++) { - kqswnal_rx_t *krx = &kqswnal_data.kqn_rxds[i]; - - /* If krx_kiov[0].kiov_page got allocated, it got mapped. - * NB subsequent pages get merged */ + for (krx = kqswnal_data.kqn_rxds; krx != NULL; krx = krx->krx_alloclist) { + /* If krx_kiov[0].kiov_page got allocated, it got mapped. + * NB subsequent pages get merged */ - if (krx->krx_kiov[0].kiov_page != NULL) - ep_dvma_unload(kqswnal_data.kqn_ep, - kqswnal_data.kqn_ep_rx_nmh, - &krx->krx_elanbuffer); - } + if (krx->krx_kiov[0].kiov_page != NULL) + ep_dvma_unload(kqswnal_data.kqn_ep, + kqswnal_data.kqn_ep_rx_nmh, + &krx->krx_elanbuffer); } /* <----------- to here */ @@ -330,41 +320,26 @@ kqswnal_shutdown(nal_t *nal) } #endif - if (kqswnal_data.kqn_txds != NULL) - { - int i; + while (kqswnal_data.kqn_txds != NULL) { + ktx = kqswnal_data.kqn_txds; - for (i = 0; i < KQSW_NTXMSGS + KQSW_NNBLK_TXMSGS; i++) - { - kqswnal_tx_t *ktx = &kqswnal_data.kqn_txds[i]; - - if (ktx->ktx_buffer != NULL) - PORTAL_FREE(ktx->ktx_buffer, - KQSW_TX_BUFFER_SIZE); - } + if (ktx->ktx_buffer != NULL) + PORTAL_FREE(ktx->ktx_buffer, KQSW_TX_BUFFER_SIZE); - PORTAL_FREE(kqswnal_data.kqn_txds, - sizeof (kqswnal_tx_t) * (KQSW_NTXMSGS + - KQSW_NNBLK_TXMSGS)); + kqswnal_data.kqn_txds = ktx->ktx_alloclist; + PORTAL_FREE(ktx, sizeof(*ktx)); } - if (kqswnal_data.kqn_rxds != NULL) - { - int i; - int j; + while (kqswnal_data.kqn_rxds != NULL) { + int i; - for (i = 0; i < KQSW_NRXMSGS_SMALL + KQSW_NRXMSGS_LARGE; i++) - { - kqswnal_rx_t *krx = &kqswnal_data.kqn_rxds[i]; + krx = kqswnal_data.kqn_rxds; + for (i = 0; i < krx->krx_npages; i++) + if (krx->krx_kiov[i].kiov_page != NULL) + __free_page (krx->krx_kiov[i].kiov_page); - for (j = 0; j < krx->krx_npages; j++) - if (krx->krx_kiov[j].kiov_page != NULL) - __free_page (krx->krx_kiov[j].kiov_page); - } - - PORTAL_FREE(kqswnal_data.kqn_rxds, - sizeof(kqswnal_rx_t) * (KQSW_NRXMSGS_SMALL + - KQSW_NRXMSGS_LARGE)); + kqswnal_data.kqn_rxds = krx->krx_alloclist; + PORTAL_FREE(krx, sizeof (*krx)); } /* resets flags, pointers to NULL etc */ @@ -388,6 +363,8 @@ kqswnal_startup (nal_t *nal, ptl_pid_t requested_pid, #endif int rc; int i; + kqswnal_rx_t *krx; + kqswnal_tx_t *ktx; int elan_page_idx; ptl_process_id_t my_process_id; int pkmem = atomic_read(&portal_kmemory); @@ -560,23 +537,22 @@ kqswnal_startup (nal_t *nal, ptl_pid_t requested_pid, /**********************************************************************/ /* Allocate/Initialise transmit descriptors */ - PORTAL_ALLOC(kqswnal_data.kqn_txds, - sizeof(kqswnal_tx_t) * (KQSW_NTXMSGS + KQSW_NNBLK_TXMSGS)); - if (kqswnal_data.kqn_txds == NULL) - { - kqswnal_shutdown (nal); - return (PTL_NO_SPACE); - } - - /* clear flags, null pointers etc */ - memset(kqswnal_data.kqn_txds, 0, - sizeof(kqswnal_tx_t) * (KQSW_NTXMSGS + KQSW_NNBLK_TXMSGS)); + kqswnal_data.kqn_txds = NULL; for (i = 0; i < (KQSW_NTXMSGS + KQSW_NNBLK_TXMSGS); i++) { int premapped_pages; - kqswnal_tx_t *ktx = &kqswnal_data.kqn_txds[i]; int basepage = i * KQSW_NTXMSGPAGES; + PORTAL_ALLOC (ktx, sizeof(*ktx)); + if (ktx == NULL) { + kqswnal_shutdown (nal); + return (PTL_NO_SPACE); + } + 
+ memset(ktx, 0, sizeof(*ktx)); /* NULL pointers; zero flags */ + ktx->ktx_alloclist = kqswnal_data.kqn_txds; + kqswnal_data.kqn_txds = ktx; + PORTAL_ALLOC (ktx->ktx_buffer, KQSW_TX_BUFFER_SIZE); if (ktx->ktx_buffer == NULL) { @@ -615,18 +591,7 @@ kqswnal_startup (nal_t *nal, ptl_pid_t requested_pid, /**********************************************************************/ /* Allocate/Initialise receive descriptors */ - - PORTAL_ALLOC (kqswnal_data.kqn_rxds, - sizeof (kqswnal_rx_t) * (KQSW_NRXMSGS_SMALL + KQSW_NRXMSGS_LARGE)); - if (kqswnal_data.kqn_rxds == NULL) - { - kqswnal_shutdown (nal); - return (PTL_NO_SPACE); - } - - memset(kqswnal_data.kqn_rxds, 0, /* clear flags, null pointers etc */ - sizeof(kqswnal_rx_t) * (KQSW_NRXMSGS_SMALL+KQSW_NRXMSGS_LARGE)); - + kqswnal_data.kqn_rxds = NULL; elan_page_idx = 0; for (i = 0; i < KQSW_NRXMSGS_SMALL + KQSW_NRXMSGS_LARGE; i++) { @@ -636,7 +601,16 @@ kqswnal_startup (nal_t *nal, ptl_pid_t requested_pid, E3_Addr elanbuffer; #endif int j; - kqswnal_rx_t *krx = &kqswnal_data.kqn_rxds[i]; + + PORTAL_ALLOC(krx, sizeof(*krx)); + if (krx == NULL) { + kqswnal_shutdown(nal); + return (PTL_NO_SPACE); + } + + memset(krx, 0, sizeof(*krx)); /* clear flags, null pointers etc */ + krx->krx_alloclist = kqswnal_data.kqn_rxds; + kqswnal_data.kqn_rxds = krx; if (i < KQSW_NRXMSGS_SMALL) { @@ -717,10 +691,7 @@ kqswnal_startup (nal_t *nal, ptl_pid_t requested_pid, /**********************************************************************/ /* Queue receives, now that it's OK to run their completion callbacks */ - for (i = 0; i < KQSW_NRXMSGS_SMALL + KQSW_NRXMSGS_LARGE; i++) - { - kqswnal_rx_t *krx = &kqswnal_data.kqn_rxds[i]; - + for (krx = kqswnal_data.kqn_rxds; krx != NULL; krx = krx->krx_alloclist) { /* NB this enqueue can allocate/sleep (attr == 0) */ krx->krx_state = KRX_POSTED; #if MULTIRAIL_EKC diff --git a/lustre/portals/knals/qswnal/qswnal.h b/lustre/portals/knals/qswnal/qswnal.h index 438edc6..b08d710 100644 --- a/lustre/portals/knals/qswnal/qswnal.h +++ b/lustre/portals/knals/qswnal/qswnal.h @@ -99,10 +99,10 @@ typedef unsigned long kqsw_csum_t; #define KQSW_TX_MAXCONTIG (1<<10) /* largest payload that gets made contiguous on transmit */ #define KQSW_NTXMSGS 8 /* # normal transmit messages */ -#define KQSW_NNBLK_TXMSGS 256 /* # reserved transmit messages if can't block */ +#define KQSW_NNBLK_TXMSGS 512 /* # reserved transmit messages if can't block */ #define KQSW_NRXMSGS_LARGE 64 /* # large receive buffers */ -#define KQSW_EP_ENVELOPES_LARGE 128 /* # large ep envelopes */ +#define KQSW_EP_ENVELOPES_LARGE 256 /* # large ep envelopes */ #define KQSW_NRXMSGS_SMALL 256 /* # small receive buffers */ #define KQSW_EP_ENVELOPES_SMALL 2048 /* # small ep envelopes */ @@ -144,9 +144,10 @@ typedef struct #endif } kqswnal_remotemd_t; -typedef struct +typedef struct kqswnal_rx { struct list_head krx_list; /* enqueue -> thread */ + struct kqswnal_rx *krx_alloclist; /* stack in kqn_rxds */ EP_RCVR *krx_eprx; /* port to post receives to */ EP_RXD *krx_rxd; /* receive descriptor (for repost) */ #if MULTIRAIL_EKC @@ -169,10 +170,11 @@ typedef struct #define KRX_COMPLETING 3 /* waiting to be completed */ -typedef struct +typedef struct kqswnal_tx { struct list_head ktx_list; /* enqueue idle/active */ struct list_head ktx_delayed_list; /* enqueue delayedtxds */ + struct kqswnal_tx *ktx_alloclist; /* stack in kqn_txds */ unsigned int ktx_isnblk:1; /* reserved descriptor? 
*/ unsigned int ktx_state:7; /* What I'm doing */ unsigned int ktx_firsttmpfrag:1; /* ktx_frags[0] is in my ebuffer ? 0 : 1 */ @@ -222,8 +224,8 @@ typedef struct char kqn_shuttingdown; /* I'm trying to shut down */ atomic_t kqn_nthreads; /* # threads running */ - kqswnal_rx_t *kqn_rxds; /* all the receive descriptors */ - kqswnal_tx_t *kqn_txds; /* all the transmit descriptors */ + kqswnal_rx_t *kqn_rxds; /* stack of all the receive descriptors */ + kqswnal_tx_t *kqn_txds; /* stack of all the transmit descriptors */ struct list_head kqn_idletxds; /* transmit descriptors free to use */ struct list_head kqn_nblk_idletxds; /* reserved free transmit descriptors */ diff --git a/lustre/portals/knals/scimacnal/scimacnal.c b/lustre/portals/knals/scimacnal/scimacnal.c index 75188e9..e77bd8e 100644 --- a/lustre/portals/knals/scimacnal/scimacnal.c +++ b/lustre/portals/knals/scimacnal/scimacnal.c @@ -205,7 +205,7 @@ static int kscimacnal_startup(nal_t *nal, ptl_pid_t requested_pid, } kscimacnal_data.ksci_nid = (ptl_nid_t)(ntohl(mac_physaddr)); - process_id.pid = requested_pid; + process_id.pid = 0; process_id.nid = kscimacnal_data.ksci_nid; CDEBUG(D_NET, "calling lib_init with nid "LPX64"\n", diff --git a/lustre/portals/knals/socknal/socknal.c b/lustre/portals/knals/socknal/socknal.c index 2a0ef11..7642770 100644 --- a/lustre/portals/knals/socknal/socknal.c +++ b/lustre/portals/knals/socknal/socknal.c @@ -1226,9 +1226,9 @@ ksocknal_create_conn (ksock_route_t *route, struct socket *sock, int type) conn2->ksnc_type != conn->ksnc_type || conn2->ksnc_incarnation != incarnation) continue; - + CWARN("Not creating duplicate connection to " - "%u.%u.%u.%u type %d\n", + "%u.%u.%u.%u type %d\n", HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_type); rc = -EALREADY; goto failed_2; @@ -1260,6 +1260,9 @@ ksocknal_create_conn (ksock_route_t *route, struct socket *sock, int type) break; } + /* Give conn a ref on sock->file since we're going to return success */ + get_file(sock->file); + conn->ksnc_peer = peer; /* conn takes my ref on peer */ conn->ksnc_incarnation = incarnation; peer->ksnp_last_alive = jiffies; @@ -1311,9 +1314,9 @@ ksocknal_create_conn (ksock_route_t *route, struct socket *sock, int type) ksocknal_putconnsock(conn); } - CWARN("New conn nid:"LPX64" [type:%d] %u.%u.%u.%u -> %u.%u.%u.%u/%d" + CWARN("New conn nid:"LPX64" %u.%u.%u.%u -> %u.%u.%u.%u/%d" " incarnation:"LPX64" sched[%d]/%d\n", - nid, conn->ksnc_type, HIPQUAD(conn->ksnc_myipaddr), + nid, HIPQUAD(conn->ksnc_myipaddr), HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port, incarnation, (int)(conn->ksnc_scheduler - ksocknal_data.ksnd_schedulers), irq); @@ -2054,8 +2057,7 @@ ksocknal_cmd(struct portals_cfg *pcfg, void * private) rc = -EINVAL; break; } - if (rc != 0) - fput (sock->file); + fput (sock->file); break; } case NAL_CMD_CLOSE_CONNECTION: { diff --git a/lustre/portals/knals/socknal/socknal.h b/lustre/portals/knals/socknal/socknal.h index 0a5266a..b8bbefd 100644 --- a/lustre/portals/knals/socknal/socknal.h +++ b/lustre/portals/knals/socknal/socknal.h @@ -66,9 +66,7 @@ #include #include #include -#include -#include #define SOCKNAL_N_AUTOCONNECTD 4 /* # socknal autoconnect daemons */ #define SOCKNAL_MIN_RECONNECT_INTERVAL HZ /* first failed connection retry... 
*/ diff --git a/lustre/portals/knals/socknal/socknal_cb.c b/lustre/portals/knals/socknal/socknal_cb.c index b22d501..762133e 100644 --- a/lustre/portals/knals/socknal/socknal_cb.c +++ b/lustre/portals/knals/socknal/socknal_cb.c @@ -2324,17 +2324,34 @@ ksocknal_setup_sock (struct socket *sock) return (0); } -int -ksocknal_connect_peer (ksock_route_t *route, int type) +static int +ksocknal_connect_sock(struct socket **sockp, int *may_retry, + ksock_route_t *route, int local_port) { - struct sockaddr_in ipaddr; - mm_segment_t oldmm = get_fs(); - struct timeval tv; - int fd; + struct sockaddr_in locaddr; + struct sockaddr_in srvaddr; struct socket *sock; int rc; - + int option; + mm_segment_t oldmm = get_fs(); + struct timeval tv; + + memset(&locaddr, 0, sizeof(locaddr)); + locaddr.sin_family = AF_INET; + locaddr.sin_port = htons(local_port); + locaddr.sin_addr.s_addr = + (route->ksnr_myipaddr != 0) ? htonl(route->ksnr_myipaddr) + : INADDR_ANY; + + memset (&srvaddr, 0, sizeof (srvaddr)); + srvaddr.sin_family = AF_INET; + srvaddr.sin_port = htons (route->ksnr_port); + srvaddr.sin_addr.s_addr = htonl (route->ksnr_ipaddr); + + *may_retry = 0; + rc = sock_create (PF_INET, SOCK_STREAM, 0, &sock); + *sockp = sock; if (rc != 0) { CERROR ("Can't create autoconnect socket: %d\n", rc); return (rc); @@ -2344,17 +2361,23 @@ ksocknal_connect_peer (ksock_route_t *route, int type) * from userspace. And we actually need the sock->file refcounting * that this gives you :) */ - fd = sock_map_fd (sock); - if (fd < 0) { + rc = sock_map_fd (sock); + if (rc < 0) { sock_release (sock); - CERROR ("sock_map_fd error %d\n", fd); - return (fd); + CERROR ("sock_map_fd error %d\n", rc); + return (rc); } - /* NB the fd now owns the ref on sock->file */ + /* NB the file descriptor (rc) now owns the ref on sock->file */ LASSERT (sock->file != NULL); LASSERT (file_count(sock->file) == 1); + get_file(sock->file); /* extra ref makes sock->file */ + sys_close(rc); /* survive this close */ + + /* Still got a single ref on sock->file */ + LASSERT (file_count(sock->file) == 1); + /* Set the socket timeouts, so our connection attempt completes in * finite time */ tv.tv_sec = ksocknal_tunables.ksnd_io_timeout; @@ -2367,7 +2390,7 @@ ksocknal_connect_peer (ksock_route_t *route, int type) if (rc != 0) { CERROR ("Can't set send timeout %d: %d\n", ksocknal_tunables.ksnd_io_timeout, rc); - goto out; + goto failed; } set_fs (KERNEL_DS); @@ -2377,53 +2400,83 @@ ksocknal_connect_peer (ksock_route_t *route, int type) if (rc != 0) { CERROR ("Can't set receive timeout %d: %d\n", ksocknal_tunables.ksnd_io_timeout, rc); - goto out; + goto failed; } - if (route->ksnr_myipaddr != 0) { - /* Bind to the local IP address */ - memset (&ipaddr, 0, sizeof (ipaddr)); - ipaddr.sin_family = AF_INET; - ipaddr.sin_port = htons (0); /* ANY */ - ipaddr.sin_addr.s_addr = htonl(route->ksnr_myipaddr); + set_fs (KERNEL_DS); + option = 1; + rc = sock_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, + (char *)&option, sizeof (option)); + set_fs (oldmm); + if (rc != 0) { + CERROR("Can't set SO_REUSEADDR for socket: %d\n", rc); + goto failed; + } - rc = sock->ops->bind (sock, (struct sockaddr *)&ipaddr, - sizeof (ipaddr)); - if (rc != 0) { - CERROR ("Can't bind to local IP %u.%u.%u.%u: %d\n", - HIPQUAD(route->ksnr_myipaddr), rc); - goto out; - } + rc = sock->ops->bind(sock, + (struct sockaddr *)&locaddr, sizeof(locaddr)); + if (rc == -EADDRINUSE) { + CDEBUG(D_NET, "Port %d already in use\n", local_port); + *may_retry = 1; + goto failed; } - - memset (&ipaddr, 0, sizeof 
(ipaddr)); - ipaddr.sin_family = AF_INET; - ipaddr.sin_port = htons (route->ksnr_port); - ipaddr.sin_addr.s_addr = htonl (route->ksnr_ipaddr); - - rc = sock->ops->connect (sock, (struct sockaddr *)&ipaddr, - sizeof (ipaddr), sock->file->f_flags); if (rc != 0) { - CERROR ("Can't connect to nid "LPX64 - " local IP: %u.%u.%u.%u," - " remote IP: %u.%u.%u.%u/%d: %d\n", - route->ksnr_peer->ksnp_nid, - HIPQUAD(route->ksnr_myipaddr), - HIPQUAD(route->ksnr_ipaddr), - route->ksnr_port, rc); - goto out; + CERROR("Error trying to bind to reserved port %d: %d\n", + local_port, rc); + goto failed; } - rc = ksocknal_create_conn (route, sock, type); - if (rc == 0) { - /* Take an extra ref on sock->file to compensate for the - * upcoming close which will lose fd's ref on it. */ - get_file (sock->file); + rc = sock->ops->connect(sock, + (struct sockaddr *)&srvaddr, sizeof(srvaddr), + sock->file->f_flags); + if (rc == 0) + return 0; + + /* EADDRNOTAVAIL probably means we're already connected to the same + * peer/port on the same local port on a differently typed + * connection. Let our caller retry with a different local + * port... */ + *may_retry = (rc == -EADDRNOTAVAIL); + + CDEBUG(*may_retry ? D_NET : D_ERROR, + "Error %d connecting %u.%u.%u.%u/%d -> %u.%u.%u.%u/%d\n", rc, + HIPQUAD(route->ksnr_myipaddr), local_port, + HIPQUAD(route->ksnr_ipaddr), route->ksnr_port); + + failed: + fput(sock->file); + return rc; +} + +int +ksocknal_connect_peer (ksock_route_t *route, int type) +{ + struct socket *sock; + int rc; + int port; + int may_retry; + + /* Iterate through reserved ports. When typed connections are + * used, we will need to bind to multiple ports, but we only know + * this at connect time. But, by that time we've already called + * bind() so we need a new socket. 
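ksocknal_connect_peer() now walks the reserved ports from 1023 down to 513 and ksocknal_connect_sock() binds each candidate explicitly, treating EADDRINUSE (port already taken) and EADDRNOTAVAIL (this exact four-tuple already in use, typically by a different connection type to the same peer) as "try the next port", while any other error aborts. The user-space sketch below shows the same bind-then-connect retry loop with plain BSD sockets; the peer address is made up, error handling is reduced to the two retryable cases, and binding ports below 1024 needs root (CAP_NET_BIND_SERVICE).

    #include <arpa/inet.h>
    #include <errno.h>
    #include <netinet/in.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/socket.h>
    #include <unistd.h>

    /* Connect from a specific local port.  Returns the fd, or -1 with
     * *may_retry set when the caller should simply try the next port. */
    static int connect_from_port(struct sockaddr_in *srv, int local_port,
                                 int *may_retry)
    {
        struct sockaddr_in loc;
        int fd, on = 1;

        *may_retry = 0;
        fd = socket(AF_INET, SOCK_STREAM, 0);
        if (fd < 0)
            return -1;

        setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on));

        memset(&loc, 0, sizeof(loc));
        loc.sin_family      = AF_INET;
        loc.sin_port        = htons(local_port);
        loc.sin_addr.s_addr = htonl(INADDR_ANY);

        if (bind(fd, (struct sockaddr *)&loc, sizeof(loc)) < 0) {
            *may_retry = (errno == EADDRINUSE);    /* port taken: next one */
            close(fd);
            return -1;
        }

        if (connect(fd, (struct sockaddr *)srv, sizeof(*srv)) < 0) {
            *may_retry = (errno == EADDRNOTAVAIL); /* 4-tuple in use: next one */
            close(fd);
            return -1;
        }
        return fd;
    }

    int main(void)
    {
        struct sockaddr_in srv;
        int may_retry, fd = -1;

        memset(&srv, 0, sizeof(srv));
        srv.sin_family = AF_INET;
        srv.sin_port   = htons(988);               /* assumed peer port */
        inet_pton(AF_INET, "127.0.0.1", &srv.sin_addr);

        for (int port = 1023; port > 512; port--) {    /* reserved port range */
            fd = connect_from_port(&srv, port, &may_retry);
            if (fd >= 0 || !may_retry)
                break;
        }
        puts(fd >= 0 ? "connected" : "connect failed");
        if (fd >= 0)
            close(fd);
        return 0;
    }

Connecting from a reserved source port is what lets the remote acceptor apply its privileged-port sanity check on incoming connections.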
*/ + + for (port = 1023; port > 512; --port) { + + rc = ksocknal_connect_sock(&sock, &may_retry, route, port); + + if (rc == 0) { + rc = ksocknal_create_conn(route, sock, type); + fput(sock->file); + return rc; + } + + if (!may_retry) + return rc; } - out: - sys_close (fd); - return (rc); + CERROR("Out of ports trying to bind to a reserved port\n"); + return (-EADDRINUSE); } void @@ -2443,7 +2496,6 @@ ksocknal_autoconnect (ksock_route_t *route) LASSERT (type < SOCKNAL_CONN_NTYPES); rc = ksocknal_connect_peer (route, type); - if (rc != 0) break; diff --git a/lustre/portals/libcfs/debug.c b/lustre/portals/libcfs/debug.c index c56f76f..f571958 100644 --- a/lustre/portals/libcfs/debug.c +++ b/lustre/portals/libcfs/debug.c @@ -60,7 +60,7 @@ #endif unsigned int portal_subsystem_debug = ~0 - (S_PORTALS | S_QSWNAL | S_SOCKNAL | - S_GMNAL | S_OPENIBNAL); + S_GMNAL | S_IBNAL); EXPORT_SYMBOL(portal_subsystem_debug); unsigned int portal_debug = (D_WARNING | D_DLMTRACE | D_ERROR | D_EMERG | D_HA | @@ -97,6 +97,7 @@ int portals_do_debug_dumplog(void *arg) snprintf(debug_file_name, sizeof(debug_file_path) - 1, "%s.%ld.%ld", debug_file_path, CURRENT_SECONDS, (long)arg); + printk(KERN_ALERT "LustreError: dumping log to %s\n", debug_file_name); tracefile_dump_all_pages(debug_file_name); current->journal_info = journal_info; @@ -180,7 +181,7 @@ int portals_debug_clear_buffer(void) int portals_debug_mark_buffer(char *text) { CDEBUG(D_TRACE,"***************************************************\n"); - CWARN("DEBUG MARKER: %s\n", text); + CDEBUG(D_WARNING, "DEBUG MARKER: %s\n", text); CDEBUG(D_TRACE,"***************************************************\n"); return 0; @@ -251,62 +252,46 @@ void portals_run_lbug_upcall(char *file, const char *fn, const int line) char *portals_nid2str(int nal, ptl_nid_t nid, char *str) { if (nid == PTL_NID_ANY) { - snprintf(str, PTL_NALFMT_SIZE - 1, "%s", - "PTL_NID_ANY"); + snprintf(str, PTL_NALFMT_SIZE, "%s", "PTL_NID_ANY"); return str; } switch(nal){ /* XXX this could be a nal method of some sort, 'cept it's config * dependent whether (say) socknal NIDs are actually IP addresses... */ -#ifndef CRAY_PORTALS +#if !CRAY_PORTALS case TCPNAL: /* userspace NAL */ + case IIBNAL: case OPENIBNAL: case SOCKNAL: - snprintf(str, PTL_NALFMT_SIZE - 1, "%u:%u.%u.%u.%u", + snprintf(str, PTL_NALFMT_SIZE, "%u:%u.%u.%u.%u", (__u32)(nid >> 32), HIPQUAD(nid)); break; case QSWNAL: case GMNAL: - snprintf(str, PTL_NALFMT_SIZE - 1, "%u:%u", + snprintf(str, PTL_NALFMT_SIZE, "%u:%u", (__u32)(nid >> 32), (__u32)nid); break; #endif default: - snprintf(str, PTL_NALFMT_SIZE - 1, "?%d? %llx", + snprintf(str, PTL_NALFMT_SIZE, "?%x? %llx", nal, (long long)nid); break; } return str; } -/* bug #4615 */ + char *portals_id2str(int nal, ptl_process_id_t id, char *str) { - switch(nal){ -#ifndef CRAY_PORTALS - case TCPNAL: - /* userspace NAL */ - case OPENIBNAL: - case SOCKNAL: - snprintf(str, PTL_NALFMT_SIZE - 1, "%u:%u.%u.%u.%u,%u", - (__u32)(id.nid >> 32), HIPQUAD((id.nid)) , id.pid); - break; - case QSWNAL: - case GMNAL: - snprintf(str, PTL_NALFMT_SIZE - 1, "%u:%u,%u", - (__u32)(id.nid >> 32), (__u32)id.nid, id.pid); - break; -#endif - default: - snprintf(str, PTL_NALFMT_SIZE - 1, "?%d? 
%llx,%lx", - nal, (long long)id.nid, (long)id.pid ); - break; - } + int len; + + portals_nid2str(nal, id.nid, str); + len = strlen(str); + snprintf(str + len, PTL_NALFMT_SIZE, "-%u", id.pid); return str; } - #ifdef __KERNEL__ char stack_backtrace[LUSTRE_TRACE_SIZE]; spinlock_t stack_backtrace_lock = SPIN_LOCK_UNLOCKED; diff --git a/lustre/portals/libcfs/module.c b/lustre/portals/libcfs/module.c index 3703013..a2422e3 100644 --- a/lustre/portals/libcfs/module.c +++ b/lustre/portals/libcfs/module.c @@ -327,6 +327,8 @@ libcfs_nal_cmd(struct portals_cfg *pcfg) CDEBUG(D_IOCTL, "calling handler nal: %d, cmd: %d\n", nal, pcfg->pcfg_command); rc = cmd->nch_handler(pcfg, cmd->nch_private); + } else { + CERROR("invalid nal: %d, cmd: %d\n", nal, pcfg->pcfg_command); } up(&nal_cmd_sem); @@ -413,15 +415,15 @@ static int libcfs_ioctl(struct inode *inode, struct file *file, portals_debug_mark_buffer(data->ioc_inlbuf1); RETURN(0); #if LWT_SUPPORT - case IOC_PORTAL_LWT_CONTROL: + case IOC_PORTAL_LWT_CONTROL: err = lwt_control (data->ioc_flags, data->ioc_misc); break; - + case IOC_PORTAL_LWT_SNAPSHOT: { cycles_t now; int ncpu; int total_size; - + err = lwt_snapshot (&now, &ncpu, &total_size, data->ioc_pbuf1, data->ioc_plen1); data->ioc_nid = now; @@ -429,15 +431,15 @@ static int libcfs_ioctl(struct inode *inode, struct file *file, data->ioc_misc = total_size; /* Hedge against broken user/kernel typedefs (e.g. cycles_t) */ - data->ioc_nid = sizeof(lwt_event_t); - data->ioc_nid2 = offsetof(lwt_event_t, lwte_where); + data->ioc_nid2 = sizeof(lwt_event_t); + data->ioc_nid3 = offsetof(lwt_event_t, lwte_where); if (err == 0 && copy_to_user((char *)arg, data, sizeof (*data))) err = -EFAULT; break; } - + case IOC_PORTAL_LWT_LOOKUP_STRING: err = lwt_lookup_string (&data->ioc_count, data->ioc_pbuf1, data->ioc_pbuf2, data->ioc_plen2); @@ -456,7 +458,7 @@ static int libcfs_ioctl(struct inode *inode, struct file *file, break; } - if (copy_from_user(&pcfg, (void *)data->ioc_pbuf1, + if (copy_from_user(&pcfg, (void *)data->ioc_pbuf1, sizeof(pcfg))) { err = -EFAULT; break; @@ -467,7 +469,7 @@ static int libcfs_ioctl(struct inode *inode, struct file *file, err = libcfs_nal_cmd(&pcfg); if (err == 0 && - copy_to_user((char *)data->ioc_pbuf1, &pcfg, + copy_to_user((char *)data->ioc_pbuf1, &pcfg, sizeof (pcfg))) err = -EFAULT; break; diff --git a/lustre/portals/libcfs/tracefile.c b/lustre/portals/libcfs/tracefile.c index 562abcf..5759316 100644 --- a/lustre/portals/libcfs/tracefile.c +++ b/lustre/portals/libcfs/tracefile.c @@ -38,7 +38,6 @@ #include #include -#include #include #define TCD_MAX_PAGES 1280 @@ -190,7 +189,7 @@ static void print_to_console(struct ptldebug_header *hdr, int mask, char *buf, prefix = "Lustre"; ptype = KERN_INFO; } - + printk("%s%s: %d:%d:(%s:%d:%s()) %.*s", ptype, prefix, hdr->ph_pid, hdr->ph_extern_pid, file, hdr->ph_line_num, fn, len, buf); } @@ -455,7 +454,7 @@ int tracefile_dump_all_pages(char *filename) if (IS_ERR(filp)) { rc = PTR_ERR(filp); printk(KERN_ERR "LustreError: can't open %s for dump: rc %d\n", - filename, rc); + filename, rc); goto out; } @@ -773,6 +772,7 @@ int trace_write_debug_size(struct file *file, const char *buffer, "(%lu).\n", max * smp_num_cpus, num_physpages / 5 * 4); return count; } + for (i = 0; i < NR_CPUS; i++) { struct trace_cpu_data *tcd; tcd = &trace_data[i].tcd; diff --git a/lustre/portals/portals/lib-move.c b/lustre/portals/portals/lib-move.c index 13451d9..d584f1c 100644 --- a/lustre/portals/portals/lib-move.c +++ b/lustre/portals/portals/lib-move.c @@ -83,7 +83,8 
@@ lib_match_md(lib_nal_t *nal, int index, int op_mask, me->match_id.nid != src_nid) continue; - CDEBUG(D_NET,"match_id.pid [%x], src_pid [%x]\n", me->match_id.pid, src_pid); + CDEBUG(D_NET, "match_id.pid [%x], src_pid [%x]\n", + me->match_id.pid, src_pid); if (me->match_id.pid != PTL_PID_ANY && me->match_id.pid != src_pid) diff --git a/lustre/portals/portals/module.c b/lustre/portals/portals/module.c index eb41dfd..61ef372 100644 --- a/lustre/portals/portals/module.c +++ b/lustre/portals/portals/module.c @@ -83,7 +83,8 @@ static int kportal_ioctl(struct portal_ioctl_data *data, CDEBUG (D_IOCTL, "Getting nid for nal [%d]\n", data->ioc_nal); - err = PtlNIInit(data->ioc_nal, LUSTRE_SRV_PTL_PID, NULL, NULL, &nih); + err = PtlNIInit(data->ioc_nal, LUSTRE_SRV_PTL_PID, NULL, + NULL, &nih); if (!(err == PTL_OK || err == PTL_IFACE_DUP)) RETURN (-EINVAL); @@ -104,7 +105,8 @@ static int kportal_ioctl(struct portal_ioctl_data *data, CDEBUG (D_IOCTL, "fail nid: [%d] "LPU64" count %d\n", data->ioc_nal, data->ioc_nid, data->ioc_count); - err = PtlNIInit(data->ioc_nal, LUSTRE_SRV_PTL_PID, NULL, NULL, &nih); + err = PtlNIInit(data->ioc_nal, LUSTRE_SRV_PTL_PID, NULL, + NULL, &nih); if (!(err == PTL_OK || err == PTL_IFACE_DUP)) return (-EINVAL); diff --git a/lustre/portals/router/proc.c b/lustre/portals/router/proc.c index 0fe3b90..a1397d2 100644 --- a/lustre/portals/router/proc.c +++ b/lustre/portals/router/proc.c @@ -132,7 +132,7 @@ static int kpr_proc_routes_read(char *page, char **start, off_t off, *start = page + prd->skip; user_len = -prd->skip; - for (; prd->curr != &kpr_routes; prd->curr = prd->curr->next) { + while ((prd->curr != NULL) && (prd->curr != &kpr_routes)) { re = list_entry(prd->curr, kpr_route_entry_t, kpre_list); ge = re->kpre_gateway; @@ -144,11 +144,20 @@ static int kpr_proc_routes_read(char *page, char **start, off_t off, chunk_len += line_len; user_len += line_len; - /* The route table will exceed one page */ - if ((chunk_len > (PAGE_SIZE - 80)) || (user_len > count)) { - prd->curr = prd->curr->next; - break; + /* Abort the route list changed */ + if (prd->curr->next == NULL) { + prd->curr = NULL; + read_unlock(&kpr_rwlock); + return sprintf(page, "\nError: Routes Changed\n"); } + + prd->curr = prd->curr->next; + + /* The route table will exceed one page, break the while loop + * so the function can be re-called with a new page. 
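
The reworked routes reader has to honour the 2.4-style read_proc contract: emit at most one page per call, publish the unread portion through *start, and only report EOF once the whole table has been written; that is why it carries prd->curr and prd->skip between calls and bails out if the route list changes underneath it. For a file that fits in a single page the contract reduces to the following generic kernel-side sketch (illustrative names, not the routes reader itself):

/* kernel context: sprintf() comes from <linux/kernel.h> */
static int example_read_proc(char *page, char **start, off_t off,
                             int count, int *eof, void *data)
{
        int len = sprintf(page, "hello from /proc\n");

        *eof = 1;                       /* nothing more after this page */
        if (off >= len)
                return 0;
        *start = page + off;            /* hand back only the unread tail */
        return (len - off > count) ? count : len - off;
}
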
+ */ + if ((chunk_len > (PAGE_SIZE - 80)) || (user_len > count)) + break; } *eof = 0; diff --git a/lustre/portals/unals/connection.c b/lustre/portals/unals/connection.c index ed8dc08..b399fcf 100644 --- a/lustre/portals/unals/connection.c +++ b/lustre/portals/unals/connection.c @@ -331,10 +331,17 @@ connection force_tcp_connection(manager m, { connection conn; struct sockaddr_in addr; + struct sockaddr_in locaddr; unsigned int id[2]; struct timeval tv; __u64 incarnation; + int fd; + int option; + int rc; + int rport; + ptl_nid_t peernid = PTL_NID_ANY; + port = tcpnal_acceptor_port; id[0] = ip; @@ -343,49 +350,82 @@ connection force_tcp_connection(manager m, pthread_mutex_lock(&m->conn_lock); conn = hash_table_find(m->connections, id); - if (!conn) { - int fd; - int option; - ptl_nid_t peernid = PTL_NID_ANY; - - bzero((char *) &addr, sizeof(addr)); - addr.sin_family = AF_INET; - addr.sin_addr.s_addr = htonl(ip); - addr.sin_port = htons(port); - - if ((fd = socket(AF_INET, SOCK_STREAM, 0)) < 0) { - perror("tcpnal socket failed"); - exit(-1); - } - if (connect(fd, (struct sockaddr *)&addr, - sizeof(struct sockaddr_in))) { - perror("tcpnal connect"); - return(0); - } + if (conn) + goto out; + memset(&addr, 0, sizeof(addr)); + addr.sin_family = AF_INET; + addr.sin_addr.s_addr = htonl(ip); + addr.sin_port = htons(port); + + memset(&locaddr, 0, sizeof(locaddr)); + locaddr.sin_family = AF_INET; + locaddr.sin_addr.s_addr = INADDR_ANY; + + for (rport = IPPORT_RESERVED - 1; rport > IPPORT_RESERVED / 2; --rport) { + fd = socket(AF_INET, SOCK_STREAM, 0); + if (fd < 0) { + perror("tcpnal socket failed"); + goto out; + } + + option = 1; + rc = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, + &option, sizeof(option)); + if (rc != 0) { + perror ("Can't set SO_REUSEADDR for socket"); + close(fd); + goto out; + } + + locaddr.sin_port = htons(rport); + rc = bind(fd, (struct sockaddr *)&locaddr, sizeof(locaddr)); + if (rc == 0 || errno == EACCES) { + rc = connect(fd, (struct sockaddr *)&addr, + sizeof(struct sockaddr_in)); + if (rc == 0) { + break; + } else if (errno != EADDRINUSE && errno != EADDRNOTAVAIL) { + perror("Error connecting to remote host"); + close(fd); + goto out; + } + } else if (errno != EADDRINUSE) { + perror("Error binding to privileged port"); + close(fd); + goto out; + } + close(fd); + } + + if (rport == IPPORT_RESERVED / 2) { + fprintf(stderr, "Out of ports trying to bind to a reserved port\n"); + goto out; + } + #if 1 - option = 1; - setsockopt(fd, SOL_TCP, TCP_NODELAY, &option, sizeof(option)); - option = 1<<20; - setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &option, sizeof(option)); - option = 1<<20; - setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &option, sizeof(option)); + option = 1; + setsockopt(fd, SOL_TCP, TCP_NODELAY, &option, sizeof(option)); + option = 1<<20; + setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &option, sizeof(option)); + option = 1<<20; + setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &option, sizeof(option)); #endif - gettimeofday(&tv, NULL); - incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec; + gettimeofday(&tv, NULL); + incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec; - /* say hello */ - if (tcpnal_hello(fd, &peernid, SOCKNAL_CONN_ANY, incarnation)) + /* say hello */ + if (tcpnal_hello(fd, &peernid, SOCKNAL_CONN_ANY, incarnation)) exit(-1); + + conn = allocate_connection(m, ip, port, fd); + + /* let nal thread know this event right away */ + if (conn) + procbridge_wakeup_nal(pb); - conn = allocate_connection(m, ip, port, fd); - - /* let nal thread know this event right away */ - 
if (conn) - procbridge_wakeup_nal(pb); - } - +out: pthread_mutex_unlock(&m->conn_lock); return (conn); } diff --git a/lustre/portals/unals/dispatch.h b/lustre/portals/unals/dispatch.h index 34dd070..a8f916d9 100644 --- a/lustre/portals/unals/dispatch.h +++ b/lustre/portals/unals/dispatch.h @@ -37,3 +37,10 @@ void remove_io_handler (io_handler i); void init_unix_timer(void); void select_timer_block(when until); when now(void); + +/* + * hacking for CFS internal MPI testing + */ +#if !CRAY_PORTALS +#define ENABLE_SELECT_DISPATCH +#endif diff --git a/lustre/portals/unals/procapi.c b/lustre/portals/unals/procapi.c index f3843d7..6b471c0 100644 --- a/lustre/portals/unals/procapi.c +++ b/lustre/portals/unals/procapi.c @@ -107,6 +107,10 @@ nal_t procapi_nal = { ptl_nid_t tcpnal_mynid; +#ifdef ENABLE_SELECT_DISPATCH +procbridge __global_procbridge = NULL; +#endif + /* Function: procbridge_startup * * Arguments: pid: requested process id (port offset) @@ -163,6 +167,10 @@ int procbridge_startup (nal_t *nal, ptl_pid_t requested_pid, return PTL_FAIL; } +#ifdef ENABLE_SELECT_DISPATCH + __global_procbridge = p; +#endif + /* create nal thread */ if (pthread_create(&p->t, NULL, nal_thread, &args)) { perror("nal_init: pthread_create"); diff --git a/lustre/portals/unals/select.c b/lustre/portals/unals/select.c index c4ccae1..09e1542 100644 --- a/lustre/portals/unals/select.c +++ b/lustre/portals/unals/select.c @@ -34,8 +34,12 @@ #include #include #include +#include +#include +#include #include #include +#include static struct timeval beginning_of_epoch; @@ -95,40 +99,22 @@ void remove_io_handler (io_handler i) i->disabled=1; } -static void set_flag(io_handler n,fd_set *fds) +static void set_flag(io_handler n,fd_set *r, fd_set *w, fd_set *e) { - if (n->type & READ_HANDLER) FD_SET(n->fd, &fds[0]); - if (n->type & WRITE_HANDLER) FD_SET(n->fd,&fds[1]); - if (n->type & EXCEPTION_HANDLER) FD_SET(n->fd, &fds[2]); + if (n->type & READ_HANDLER) FD_SET(n->fd, r); + if (n->type & WRITE_HANDLER) FD_SET(n->fd, w); + if (n->type & EXCEPTION_HANDLER) FD_SET(n->fd, e); } - -/* Function: select_timer_block - * Arguments: until: an absolute time when the select should return - * - * This function dispatches the various file descriptors' handler - * functions, if the kernel indicates there is io available. 
- */ -void select_timer_block(when until) +static int prepare_fd_sets(fd_set *r, fd_set *w, fd_set *e) { - fd_set fds[3]; - struct timeval timeout; - struct timeval *timeout_pointer; - int result; io_handler j; io_handler *k; + int max = 0; - /* TODO: loop until the entire interval is expired*/ - if (until){ - when interval=until-now(); - timeout.tv_sec=(interval>>32); - timeout.tv_usec=((interval<<32)/1000000)>>32; - timeout_pointer=&timeout; - } else timeout_pointer=0; - - FD_ZERO(&fds[0]); - FD_ZERO(&fds[1]); - FD_ZERO(&fds[2]); + FD_ZERO(r); + FD_ZERO(w); + FD_ZERO(e); for (k=&io_handlers;*k;){ if ((*k)->disabled){ j=*k; @@ -136,24 +122,291 @@ void select_timer_block(when until) free(j); } if (*k) { - set_flag(*k,fds); + set_flag(*k,r,w,e); + if ((*k)->fd > max) + max = (*k)->fd; k=&(*k)->next; } } + return max + 1; +} + +static int execute_callbacks(fd_set *r, fd_set *w, fd_set *e) +{ + io_handler j; + int n = 0, t; + + for (j = io_handlers; j; j = j->next) { + if (j->disabled) + continue; + + t = 0; + if (FD_ISSET(j->fd, r) && (j->type & READ_HANDLER)) { + FD_CLR(j->fd, r); + t++; + } + if (FD_ISSET(j->fd, w) && (j->type & WRITE_HANDLER)) { + FD_CLR(j->fd, w); + t++; + } + if (FD_ISSET(j->fd, e) && (j->type & EXCEPTION_HANDLER)) { + FD_CLR(j->fd, e); + t++; + } + if (t == 0) + continue; + + if (!(*j->function)(j->argument)) + j->disabled = 1; + + n += t; + } + + return n; +} - result=select(FD_SETSIZE, &fds[0], &fds[1], &fds[2], timeout_pointer); +#ifdef ENABLE_SELECT_DISPATCH - if (result > 0) - for (j=io_handlers;j;j=j->next){ - if (!(j->disabled) && - ((FD_ISSET(j->fd, &fds[0]) && (j->type & READ_HANDLER)) || - (FD_ISSET(j->fd, &fds[1]) && (j->type & WRITE_HANDLER)) || - (FD_ISSET(j->fd, &fds[2]) && (j->type & EXCEPTION_HANDLER)))){ - if (!(*j->function)(j->argument)) - j->disabled=1; +static struct { + pthread_mutex_t mutex; + pthread_cond_t cond; + int submitted; + int nready; + int maxfd; + fd_set *rset; + fd_set *wset; + fd_set *eset; + struct timeval *timeout; + struct timeval submit_time; +} fd_extra = { + PTHREAD_MUTEX_INITIALIZER, + PTHREAD_COND_INITIALIZER, + 0, 0, 0, + NULL, NULL, NULL, NULL, +}; + +extern int liblustre_wait_event(int timeout); +extern procbridge __global_procbridge; + +/* + * this will intercept syscall select() of user apps + * such as MPI libs. 
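
fd_extra couples a mutex and a condition variable with a submitted flag: the application thread publishes its fd sets, wakes the NAL thread through procbridge_wakeup_nal(), and then sleeps on the condition variable until the NAL thread clears submitted. Stripped of the portals details, the handshake is the usual condition-variable pattern; a standalone sketch with illustrative names (the real code also re-runs liblustre_wait_event() around the wait):

#include <pthread.h>

static pthread_mutex_t req_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  req_done = PTHREAD_COND_INITIALIZER;
static int             req_submitted;

/* application thread: publish a request, then sleep until it is serviced */
static void submit_and_wait(void)
{
        pthread_mutex_lock(&req_lock);
        req_submitted = 1;
        while (req_submitted)           /* loop guards against spurious wakeups */
                pthread_cond_wait(&req_done, &req_lock);
        pthread_mutex_unlock(&req_lock);
}

/* NAL thread: service the request, clear the flag, wake the submitter */
static void complete_request(void)
{
        pthread_mutex_lock(&req_lock);
        req_submitted = 0;
        pthread_cond_signal(&req_done);
        pthread_mutex_unlock(&req_lock);
}
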
+ */ +int select(int n, fd_set *rset, fd_set *wset, fd_set *eset, + struct timeval *timeout) +{ + LASSERT(fd_extra.submitted == 0); + + fd_extra.nready = 0; + fd_extra.maxfd = n; + fd_extra.rset = rset; + fd_extra.wset = wset; + fd_extra.eset = eset; + fd_extra.timeout = timeout; + + liblustre_wait_event(0); + pthread_mutex_lock(&fd_extra.mutex); + gettimeofday(&fd_extra.submit_time, NULL); + fd_extra.submitted = 1; + LASSERT(__global_procbridge); + procbridge_wakeup_nal(__global_procbridge); + +again: + if (fd_extra.submitted) + pthread_cond_wait(&fd_extra.cond, &fd_extra.mutex); + pthread_mutex_unlock(&fd_extra.mutex); + + liblustre_wait_event(0); + + pthread_mutex_lock(&fd_extra.mutex); + if (fd_extra.submitted) + goto again; + pthread_mutex_unlock(&fd_extra.mutex); + + LASSERT(fd_extra.nready >= 0); + LASSERT(fd_extra.submitted == 0); + return fd_extra.nready; +} + +static int merge_fds(int max, fd_set *rset, fd_set *wset, fd_set *eset) +{ + int i; + + LASSERT(rset); + LASSERT(wset); + LASSERT(eset); + + for (i = 0; i < __FD_SETSIZE/__NFDBITS; i++) { + LASSERT(!fd_extra.rset || + !(__FDS_BITS(rset)[i] & __FDS_BITS(fd_extra.rset)[i])); + LASSERT(!fd_extra.wset || + !(__FDS_BITS(wset)[i] & __FDS_BITS(fd_extra.wset)[i])); + LASSERT(!fd_extra.eset || + !(__FDS_BITS(eset)[i] & __FDS_BITS(fd_extra.eset)[i])); + + if (fd_extra.rset && __FDS_BITS(fd_extra.rset)[i]) + __FDS_BITS(rset)[i] |= __FDS_BITS(fd_extra.rset)[i]; + if (fd_extra.wset && __FDS_BITS(fd_extra.wset)[i]) + __FDS_BITS(wset)[i] |= __FDS_BITS(fd_extra.wset)[i]; + if (fd_extra.eset && __FDS_BITS(fd_extra.eset)[i]) + __FDS_BITS(eset)[i] |= __FDS_BITS(fd_extra.eset)[i]; + } + + return (fd_extra.maxfd > max ? fd_extra.maxfd : max); +} + +static inline +int timeval_ge(struct timeval *tv1, struct timeval *tv2) +{ + LASSERT(tv1 && tv2); + return ((tv1->tv_sec - tv2->tv_sec) * 1000000 + + (tv1->tv_usec - tv2->tv_usec) >= 0); +} + +/* + * choose the most recent timeout value + */ +static struct timeval *choose_timeout(struct timeval *tv1, + struct timeval *tv2) +{ + if (!tv1) + return tv2; + else if (!tv2) + return tv1; + + if (timeval_ge(tv1, tv2)) + return tv2; + else + return tv1; +} + +/* Function: select_timer_block + * Arguments: until: an absolute time when the select should return + * + * This function dispatches the various file descriptors' handler + * functions, if the kernel indicates there is io available. 
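
merge_fds() ORs the intercepted application's fd_sets into the NAL's own sets so that a single kernel select() watches both sides; the LASSERTs merely check that the two sides never claim the same descriptor. The word-at-a-time loop depends on glibc's __FDS_BITS/__NFDBITS internals; a portable sketch of the same merge (illustrative names):

#include <sys/select.h>

static int fdset_merge(fd_set *dst, fd_set *src, int dst_max, int src_max)
{
        int fd;

        /* OR src into dst one descriptor at a time */
        for (fd = 0; fd < src_max; fd++)
                if (FD_ISSET(fd, src))
                        FD_SET(fd, dst);

        return (src_max > dst_max) ? src_max : dst_max;   /* new nfds */
}
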
+ */ +void select_timer_block(when until) +{ + fd_set fds[3]; + struct timeval timeout; + struct timeval *timeout_pointer, *select_timeout; + int max, nready, nexec; + int fd_handling; + +again: + if (until) { + when interval; + + interval = until - now(); + timeout.tv_sec = (interval >> 32); + timeout.tv_usec = ((interval << 32) / 1000000) >> 32; + timeout_pointer = &timeout; + } else + timeout_pointer = NULL; + + fd_handling = 0; + max = prepare_fd_sets(&fds[0], &fds[1], &fds[2]); + select_timeout = timeout_pointer; + + pthread_mutex_lock(&fd_extra.mutex); + fd_handling = fd_extra.submitted; + pthread_mutex_unlock(&fd_extra.mutex); + if (fd_handling) { + max = merge_fds(max, &fds[0], &fds[1], &fds[2]); + select_timeout = choose_timeout(timeout_pointer, fd_extra.timeout); + } + + /* XXX only compile for linux */ +#if __WORDSIZE == 64 + nready = syscall(SYS_select, max, &fds[0], &fds[1], &fds[2], + select_timeout); +#else + nready = syscall(SYS__newselect, max, &fds[0], &fds[1], &fds[2], + select_timeout); +#endif + if (nready < 0) { + CERROR("select return err %d, errno %d\n", nready, errno); + return; + } + + if (nready) { + nexec = execute_callbacks(&fds[0], &fds[1], &fds[2]); + nready -= nexec; + } else + nexec = 0; + + /* even both nready & nexec are 0, we still need try to wakeup + * upper thread since it may have timed out + */ + if (fd_handling) { + LASSERT(nready >= 0); + + pthread_mutex_lock(&fd_extra.mutex); + if (nready) { + if (fd_extra.rset) + *fd_extra.rset = fds[0]; + if (fd_extra.wset) + *fd_extra.wset = fds[1]; + if (fd_extra.eset) + *fd_extra.eset = fds[2]; + fd_extra.nready = nready; + fd_extra.submitted = 0; + } else { + struct timeval t; + + fd_extra.nready = 0; + if (fd_extra.timeout) { + gettimeofday(&t, NULL); + if (timeval_ge(&t, &fd_extra.submit_time)) + fd_extra.submitted = 0; } } + + pthread_cond_signal(&fd_extra.cond); + pthread_mutex_unlock(&fd_extra.mutex); + } + + /* haven't found portals event, go back to loop if time + * is not expired */ + if (!nexec) { + if (timeout_pointer == NULL || now() >= until) + goto again; + } +} + +#else /* !ENABLE_SELECT_DISPATCH */ + +/* Function: select_timer_block + * Arguments: until: an absolute time when the select should return + * + * This function dispatches the various file descriptors' handler + * functions, if the kernel indicates there is io available. 
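
Because the library now exports its own select(), select_timer_block() cannot call the libc wrapper back; it has to trap straight to the kernel, choosing SYS_select on 64-bit Linux and SYS__newselect on 32-bit x86, exactly as the #if __WORDSIZE test above does. A standalone sketch of that helper (the wrapper name is illustrative, and syscall numbers vary by architecture):

#include <stdint.h>        /* pulls in __WORDSIZE on glibc */
#include <sys/select.h>
#include <sys/syscall.h>
#include <unistd.h>

static int raw_select(int nfds, fd_set *r, fd_set *w, fd_set *e,
                      struct timeval *tv)
{
#if __WORDSIZE == 64
        return syscall(SYS_select, nfds, r, w, e, tv);
#else
        return syscall(SYS__newselect, nfds, r, w, e, tv);
#endif
}
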
+ */ +void select_timer_block(when until) +{ + fd_set fds[3]; + struct timeval timeout; + struct timeval *timeout_pointer; + int max, nready; + +again: + if (until) { + when interval; + interval = until - now(); + timeout.tv_sec = (interval >> 32); + timeout.tv_usec = ((interval << 32) / 1000000) >> 32; + timeout_pointer = &timeout; + } else + timeout_pointer = NULL; + + max = prepare_fd_sets(&fds[0], &fds[1], &fds[2]); + + nready = select(max, &fds[0], &fds[1], &fds[2], timeout_pointer); + if (nready > 0) + execute_callbacks(&fds[0], &fds[1], &fds[2]); } +#endif /* ENABLE_SELECT_DISPATCH */ /* Function: init_unix_timer() * is called to initialize the library diff --git a/lustre/portals/unals/tcpnal.c b/lustre/portals/unals/tcpnal.c index 6e9cca9..abb6d01 100644 --- a/lustre/portals/unals/tcpnal.c +++ b/lustre/portals/unals/tcpnal.c @@ -251,8 +251,6 @@ int tcpnal_init(bridge b) newly created junk */ return(PTL_NAL_FAILED); } - /* XXX cfs hack */ -// b->lib_nal->libnal_ni.ni_pid.pid=0; b->lower=m; return(PTL_OK); } diff --git a/lustre/portals/utils/acceptor.c b/lustre/portals/utils/acceptor.c index 8aea457..524d128 100644 --- a/lustre/portals/utils/acceptor.c +++ b/lustre/portals/utils/acceptor.c @@ -89,7 +89,11 @@ show_connection (int fd, __u32 net_ip) void usage (char *myname) { - fprintf (stderr, "Usage: %s [-N nal_id] port\n", myname); + fprintf (stderr, + "Usage: %s [-N nal_id] [-p] [-l] port\n\n" + " -l\tKeep stdin/stdout open\n" + " -p\tAllow connections from non-privileged ports\n", + myname); exit (1); } @@ -100,24 +104,27 @@ int main(int argc, char **argv) int c; int noclose = 0; int nal = SOCKNAL; + int rport; + int require_privports = 1; - while ((c = getopt (argc, argv, "N:l")) != -1) - switch (c) - { - case 'l': - noclose = 1; - break; - + while ((c = getopt (argc, argv, "N:lp")) != -1) { + switch (c) { case 'N': if (sscanf(optarg, "%d", &nal) != 1 || nal < 0 || nal > NAL_MAX_NR) usage(argv[0]); break; - + case 'l': + noclose = 1; + break; + case 'p': + require_privports = 0; + break; default: usage (argv[0]); break; } + } if (optind >= argc) usage (argv[0]); @@ -162,7 +169,7 @@ int main(int argc, char **argv) exit(1); } - rc = daemon(1, noclose); + rc = daemon(0, noclose); if (rc < 0) { perror("daemon(): "); exit(1); @@ -180,8 +187,8 @@ int main(int argc, char **argv) struct portals_cfg pcfg; #ifdef HAVE_LIBWRAP struct request_info request; - char addrstr[INET_ADDRSTRLEN]; #endif + char addrstr[INET_ADDRSTRLEN]; cfd = accept(fd, (struct sockaddr *)&clntaddr, &len); if ( cfd < 0 ) { @@ -203,6 +210,18 @@ int main(int argc, char **argv) continue; } #endif + + if (require_privports && ntohs(clntaddr.sin_port) >= IPPORT_RESERVED) { + inet_ntop(AF_INET, &clntaddr.sin_addr, + addrstr, INET_ADDRSTRLEN); + syslog(LOG_ERR, "Closing non-privileged connection from %s:%d\n", + addrstr, ntohs(clntaddr.sin_port)); + rc = close(cfd); + if (rc) + perror ("close un-privileged client failed"); + continue; + } + show_connection (cfd, clntaddr.sin_addr.s_addr); PCFG_INIT(pcfg, NAL_CMD_REGISTER_PEER_FD); diff --git a/lustre/portals/utils/debug.c b/lustre/portals/utils/debug.c index 36d8a04..5b65f24 100644 --- a/lustre/portals/utils/debug.c +++ b/lustre/portals/utils/debug.c @@ -29,9 +29,12 @@ #include #include +#ifdef HAVE_NETDB_H #include +#endif #include #include +#include "ioctl.h" #include #include #include @@ -45,12 +48,15 @@ #include #include +#ifdef HAVE_LINUX_VERSION_H #include #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) #define BUG() /* workaround for module.h includes */ #include 
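
On the accepting side, the new -p switch controls whether the acceptor insists that the peer connected from a reserved port: accept() fills in the client's source address, and a source port at or above IPPORT_RESERVED (1024) means the connecting process could not have been running as root. A user-space sketch of that check (illustrative names, not the acceptor's code):

#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>
#include <unistd.h>

static int accept_privileged_only(int listen_fd)
{
        struct sockaddr_in cln;
        socklen_t len = sizeof(cln);
        int fd = accept(listen_fd, (struct sockaddr *)&cln, &len);

        if (fd < 0)
                return -1;

        if (ntohs(cln.sin_port) >= IPPORT_RESERVED) {
                close(fd);              /* reject non-privileged peer */
                return -1;
        }
        return fd;
}
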
#endif +#endif /* !HAVE_LINUX_VERSION_H */ + #include #include @@ -62,7 +68,7 @@ static char rawbuf[8192]; static char *buf = rawbuf; static int max = 8192; -//static int g_pfd = -1; +/*static int g_pfd = -1;*/ static int subsystem_mask = ~0; static int debug_mask = ~0; @@ -72,7 +78,7 @@ static const char *portal_debug_subsystems[] = {"undefined", "mdc", "mds", "osc", "ost", "class", "log", "llite", "rpc", "mgmt", "portals", "libcfs", "socknal", "qswnal", "pinger", "filter", "ptlbd", "echo", "ldlm", "lov", "gmnal", "router", "cobd", - "openibnal", "lmv", "smfs", "cmobd", NULL}; + "ibnal", NULL}; static const char *portal_debug_masks[] = {"trace", "inode", "super", "ext2", "malloc", "cache", "info", "ioctl", "blocks", "net", "warning", "buffs", "other", "dentry", "portals", @@ -371,15 +377,24 @@ int jt_dbg_debug_kernel(int argc, char **argv) fprintf(stderr, "usage: %s [file] [raw]\n", argv[0]); return 0; } - sprintf(filename, "%s.%lu.%u", argc > 1 ? argv[1] : "/tmp/lustre-log", - time(NULL), getpid()); - if (argc > 2) + if (argc > 2) { raw = atoi(argv[2]); + } else if (argc > 1 && (argv[1][0] == '0' || argv[1][0] == '1')) { + raw = atoi(argv[1]); + argc--; + } else { + sprintf(filename, "%s.%lu.%u", argc > 1 ? argv[1] : + "/tmp/lustre-log", time(NULL), getpid()); + } + unlink(filename); fd = open("/proc/sys/portals/dump_kernel", O_WRONLY); if (fd < 0) { + if (errno == ENOENT) /* no dump file created */ + return 0; + fprintf(stderr, "open(dump_kernel) failed: %s\n", strerror(errno)); return 1; @@ -477,25 +492,25 @@ const char debug_daemon_usage[]="usage: debug_daemon {start file [MB]|stop}\n"; int jt_dbg_debug_daemon(int argc, char **argv) { int rc, fd; - + if (argc <= 1) { fprintf(stderr, debug_daemon_usage); return 0; } - + fd = open("/proc/sys/portals/daemon_file", O_WRONLY); if (fd < 0) { fprintf(stderr, "open(daemon_file) failed: %s\n", strerror(errno)); return 1; } - + if (strcasecmp(argv[1], "start") == 0) { if (argc != 3) { fprintf(stderr, debug_daemon_usage); return 1; } - + rc = write(fd, argv[2], strlen(argv[2])); if (rc != strlen(argv[2])) { fprintf(stderr, "write(%s) failed: %s\n", argv[2], @@ -515,7 +530,7 @@ int jt_dbg_debug_daemon(int argc, char **argv) fprintf(stderr, debug_daemon_usage); return 1; } - + close(fd); return 0; } @@ -611,7 +626,6 @@ static struct mod_paths { {"obdfilter", "lustre/obdfilter"}, {"extN", "lustre/extN"}, {"lov", "lustre/lov"}, - {"lmv", "lustre/lmv"}, {"fsfilt_ext3", "lustre/lvfs"}, {"fsfilt_extN", "lustre/lvfs"}, {"fsfilt_reiserfs", "lustre/lvfs"}, @@ -623,13 +637,13 @@ static struct mod_paths { {"ptlbd", "lustre/ptlbd"}, {"mgmt_svc", "lustre/mgmt"}, {"mgmt_cli", "lustre/mgmt"}, - {"cobd", "lustre/cobd"}, - {"cmobd", "lustre/cmobd"}, + {"conf_obd", "lustre/obdclass"}, {NULL, NULL} }; static int jt_dbg_modules_2_4(int argc, char **argv) { +#ifdef HAVE_LINUX_VERSION_H #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) struct mod_paths *mp; char *path = ".."; @@ -665,9 +679,9 @@ static int jt_dbg_modules_2_4(int argc, char **argv) } return 0; -#else /* Headers are 2.6-only */ +#endif /* Headers are 2.6-only */ +#endif /* !HAVE_LINUX_VERSION_H */ return -EINVAL; -#endif } static int jt_dbg_modules_2_5(int argc, char **argv) diff --git a/lustre/portals/utils/portals.c b/lustre/portals/utils/portals.c index 1bde59f..d5d29dc 100644 --- a/lustre/portals/utils/portals.c +++ b/lustre/portals/utils/portals.c @@ -22,13 +22,17 @@ #include #include +#ifdef HAVE_NETDB_H #include +#endif #include +#ifdef HAVE_NETINET_TCP_H #include -#include +#endif #include 
#include #include +#include "ioctl.h" #include #include #include @@ -54,10 +58,6 @@ unsigned int portal_printk; static unsigned int g_nal = 0; -static int g_socket_txmem = 0; -static int g_socket_rxmem = 0; -static int g_socket_nonagle = 1; - typedef struct { char *name; @@ -70,6 +70,7 @@ static name2num_t nalnames[] = { {"elan", QSWNAL}, {"gm", GMNAL}, {"openib", OPENIBNAL}, + {"iib", IIBNAL}, {NULL, -1} }; @@ -209,6 +210,7 @@ nal2name (int nal) return ((e == NULL) ? "???" : e->name); } +#ifdef HAVE_GETHOSTBYNAME static struct hostent * ptl_gethostbyname(char * hname) { struct hostent *he; @@ -229,6 +231,7 @@ ptl_gethostbyname(char * hname) { } return he; } +#endif int ptl_parse_port (int *port, char *str) @@ -295,7 +298,9 @@ ptl_parse_ipquad (__u32 *ipaddrp, char *str) int ptl_parse_ipaddr (__u32 *ipaddrp, char *str) { +#ifdef HAVE_GETHOSTBYNAME struct hostent *he; +#endif if (!strcmp (str, "_all_")) { @@ -305,7 +310,8 @@ ptl_parse_ipaddr (__u32 *ipaddrp, char *str) if (ptl_parse_ipquad(ipaddrp, str) == 0) return (0); - + +#if HAVE_GETHOSTBYNAME if ((('a' <= str[0] && str[0] <= 'z') || ('A' <= str[0] && str[0] <= 'Z')) && (he = ptl_gethostbyname (str)) != NULL) @@ -315,6 +321,7 @@ ptl_parse_ipaddr (__u32 *ipaddrp, char *str) *ipaddrp = ntohl(addr); /* HOST byte order */ return (0); } +#endif return (-1); } @@ -322,6 +329,7 @@ ptl_parse_ipaddr (__u32 *ipaddrp, char *str) char * ptl_ipaddr_2_str (__u32 ipaddr, char *str, int lookup) { +#ifdef HAVE_GETHOSTBYNAME __u32 net_ip; struct hostent *he; @@ -333,7 +341,8 @@ ptl_ipaddr_2_str (__u32 ipaddr, char *str, int lookup) return (str); } } - +#endif + sprintf (str, "%d.%d.%d.%d", (ipaddr >> 24) & 0xff, (ipaddr >> 16) & 0xff, (ipaddr >> 8) & 0xff, ipaddr & 0xff); @@ -386,6 +395,7 @@ char * ptl_nid2str (char *buffer, ptl_nid_t nid) { __u64 nid64 = ptl_nid2u64(nid); +#ifdef HAVE_GETHOSTBYNAME struct hostent *he = 0; /* Don't try to resolve NIDs that are e.g. Elan host IDs. 
Assume @@ -400,6 +410,7 @@ ptl_nid2str (char *buffer, ptl_nid_t nid) if (he != NULL) sprintf(buffer, "%#x:%s", (int)(nid64 >> 32), he->h_name); else +#endif /* HAVE_GETHOSTBYNAME */ sprintf(buffer, LPX64, nid64); return (buffer); @@ -524,7 +535,6 @@ int jt_ptl_network(int argc, char **argv) return (-1); } - int jt_ptl_print_interfaces (int argc, char **argv) { @@ -563,6 +573,9 @@ jt_ptl_add_interface (int argc, char **argv) __u32 ipaddr; int rc; __u32 netmask = 0xffffff00; + int i; + int count; + char *end; if (argc < 2 || argc > 3) { fprintf (stderr, "usage: %s ipaddr [netmask]\n", argv[0]); @@ -576,13 +589,19 @@ jt_ptl_add_interface (int argc, char **argv) fprintf (stderr, "Can't parse ip: %s\n", argv[1]); return -1; } - - if (argc > 2 && - ptl_parse_ipquad(&netmask, argv[2]) != 0) { - fprintf (stderr, "Can't parse netmask: %s\n", argv[2]); - return -1; + + if (argc > 2 ) { + count = strtol(argv[2], &end, 0); + if (count > 0 && count < 32 && *end == 0) { + netmask = 0; + for (i = count; i > 0; i--) + netmask = netmask|(1<<(32-i)); + } else if (ptl_parse_ipquad(&netmask, argv[2]) != 0) { + fprintf (stderr, "Can't parse netmask: %s\n", argv[2]); + return -1; + } } - + PCFG_INIT(pcfg, NAL_CMD_ADD_INTERFACE); pcfg.pcfg_id = ipaddr; pcfg.pcfg_misc = netmask; @@ -593,7 +612,7 @@ jt_ptl_add_interface (int argc, char **argv) strerror (errno)); return -1; } - + return 0; } @@ -627,11 +646,11 @@ jt_ptl_del_interface (int argc, char **argv) strerror (errno)); return -1; } - + return 0; } -int +int jt_ptl_print_peers (int argc, char **argv) { struct portals_cfg pcfg; @@ -639,7 +658,7 @@ jt_ptl_print_peers (int argc, char **argv) int index; int rc; - if (!g_nal_is_compatible (argv[0], SOCKNAL, OPENIBNAL, 0)) + if (!g_nal_is_compatible (argv[0], SOCKNAL, OPENIBNAL, IIBNAL, 0)) return -1; for (index = 0;;index++) { @@ -675,7 +694,7 @@ jt_ptl_add_peer (int argc, char **argv) int port = 0; int rc; - if (!g_nal_is_compatible (argv[0], SOCKNAL, OPENIBNAL, 0)) + if (!g_nal_is_compatible (argv[0], SOCKNAL, OPENIBNAL, IIBNAL, 0)) return -1; if (g_nal_is_compatible(NULL, SOCKNAL, 0)) { @@ -685,7 +704,7 @@ jt_ptl_add_peer (int argc, char **argv) return 0; } } else if (argc != 2) { - fprintf (stderr, "usage(openib): %s nid\n", argv[0]); + fprintf (stderr, "usage(openib,iib): %s nid\n", argv[0]); return 0; } @@ -732,7 +751,7 @@ jt_ptl_del_peer (int argc, char **argv) int argidx; int rc; - if (!g_nal_is_compatible (argv[0], SOCKNAL, OPENIBNAL, 0)) + if (!g_nal_is_compatible (argv[0], SOCKNAL, OPENIBNAL, IIBNAL, 0)) return -1; if (g_nal_is_compatible(NULL, SOCKNAL, 0)) { @@ -764,7 +783,7 @@ jt_ptl_del_peer (int argc, char **argv) } if (argc > argidx) { - if (!strcmp (argv[3], "single_share")) { + if (!strcmp (argv[argidx], "single_share")) { single_share = 1; } else { fprintf (stderr, "Unrecognised arg %s'\n", argv[3]); @@ -795,7 +814,7 @@ jt_ptl_print_connections (int argc, char **argv) int index; int rc; - if (!g_nal_is_compatible (argv[0], SOCKNAL, OPENIBNAL, 0)) + if (!g_nal_is_compatible (argv[0], SOCKNAL, OPENIBNAL, IIBNAL, 0)) return -1; for (index = 0;;index++) { @@ -832,13 +851,19 @@ jt_ptl_print_connections (int argc, char **argv) int jt_ptl_connect(int argc, char **argv) { +#ifndef HAVE_CONNECT + /* no connect() support */ + return -1; +#else /* HAVE_CONNECT */ struct portals_cfg pcfg; struct sockaddr_in srvaddr; + struct sockaddr_in locaddr; __u32 ipaddr; char *flag; int fd, rc; int type = SOCKNAL_CONN_ANY; - int port; + int port, rport; + int o; if (argc < 3) { fprintf(stderr, "usage: %s ip port 
[type]\n", argv[0]); @@ -893,20 +918,48 @@ int jt_ptl_connect(int argc, char **argv) return (-1); } + memset(&locaddr, 0, sizeof(locaddr)); + locaddr.sin_family = AF_INET; + locaddr.sin_addr.s_addr = INADDR_ANY; + memset(&srvaddr, 0, sizeof(srvaddr)); srvaddr.sin_family = AF_INET; srvaddr.sin_port = htons(port); srvaddr.sin_addr.s_addr = htonl(ipaddr); - fd = socket(PF_INET, SOCK_STREAM, 0); - if ( fd < 0 ) { - fprintf(stderr, "socket() failed: %s\n", strerror(errno)); - return -1; + + for (rport = IPPORT_RESERVED - 1; rport > IPPORT_RESERVED / 2; --rport) { + fd = socket(PF_INET, SOCK_STREAM, 0); + if ( fd < 0 ) { + fprintf(stderr, "socket() failed: %s\n", strerror(errno)); + return -1; + } + + o = 1; + rc = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, + &o, sizeof(o)); + + locaddr.sin_port = htons(rport); + rc = bind(fd, (struct sockaddr *)&locaddr, sizeof(locaddr)); + if (rc == 0 || errno == EACCES) { + rc = connect(fd, (struct sockaddr *)&srvaddr, sizeof(srvaddr)); + if (rc == 0) { + break; + } else if (errno != EADDRINUSE) { + fprintf(stderr, "Error connecting to host: %s\n", strerror(errno)); + close(fd); + return -1; + } + } else if (errno != EADDRINUSE) { + fprintf(stderr, "Error binding to port %d: %d: %s\n", port, errno, strerror(errno)); + close(fd); + return -1; + } } - rc = connect(fd, (struct sockaddr *)&srvaddr, sizeof(srvaddr)); - if ( rc == -1 ) { - fprintf(stderr, "connect() failed: %s\n", strerror(errno)); + if (rport == IPPORT_RESERVED / 2) { + fprintf(stderr, + "Warning: all privileged ports are in use.\n"); return -1; } @@ -937,6 +990,7 @@ int jt_ptl_connect(int argc, char **argv) fprintf(stderr, "close failed: %d\n", rc); return 0; +#endif /* HAVE_CONNECT */ } int jt_ptl_disconnect(int argc, char **argv) @@ -951,7 +1005,7 @@ int jt_ptl_disconnect(int argc, char **argv) return 0; } - if (!g_nal_is_compatible (NULL, SOCKNAL, OPENIBNAL, 0)) + if (!g_nal_is_compatible (NULL, SOCKNAL, OPENIBNAL, IIBNAL, 0)) return 0; if (argc >= 2 && @@ -1491,11 +1545,11 @@ lwt_snapshot(cycles_t *now, int *ncpu, int *totalsize, } /* crappy overloads */ - if (data.ioc_nid != sizeof(lwt_event_t) || - data.ioc_nid2 != offsetof(lwt_event_t, lwte_where)) { + if (data.ioc_nid2 != sizeof(lwt_event_t) || + data.ioc_nid3 != offsetof(lwt_event_t, lwte_where)) { fprintf(stderr,"kernel/user LWT event mismatch %d(%d),%d(%d)\n", - (int)data.ioc_nid, sizeof(lwt_event_t), - (int)data.ioc_nid2, + (int)data.ioc_nid2, sizeof(lwt_event_t), + (int)data.ioc_nid3, (int)offsetof(lwt_event_t, lwte_where)); return (-1); } @@ -1573,12 +1627,21 @@ lwt_put_string(char *ustr) static int lwt_print(FILE *f, cycles_t t0, cycles_t tlast, double mhz, int cpu, lwt_event_t *e) { +#ifndef __WORDSIZE +# error "__WORDSIZE not defined" +#elif __WORDSIZE == 32 +# define XFMT "%#010lx" +#elif __WORDSIZE== 64 +# define XFMT "%#018lx" +#else +# error "Unexpected __WORDSIZE" +#endif char *where = lwt_get_string(e->lwte_where); if (where == NULL) return (-1); - fprintf(f, "%#010lx %#010lx %#010lx %#010lx: %#010lx %1d %10.6f %10.2f %s\n", + fprintf(f, XFMT" "XFMT" "XFMT" "XFMT": "XFMT" %2d %10.6f %10.2f %s\n", e->lwte_p1, e->lwte_p2, e->lwte_p3, e->lwte_p4, (long)e->lwte_task, cpu, (e->lwte_when - t0) / (mhz * 1000000.0), (t0 == e->lwte_when) ? 
0.0 : (e->lwte_when - tlast) / mhz, @@ -1587,6 +1650,7 @@ lwt_print(FILE *f, cycles_t t0, cycles_t tlast, double mhz, int cpu, lwt_event_t lwt_put_string(where); return (0); +#undef XFMT } double diff --git a/lustre/utils/lconf b/lustre/utils/lconf index 828db61..6319775 100755 --- a/lustre/utils/lconf +++ b/lustre/utils/lconf @@ -23,7 +23,7 @@ # lconf is the main driver script for starting and stopping # lustre filesystem services. # -# Based in part on the XML obdctl modifications done by Brian Behlendorf +# Based in part on the XML obdctl modifications done by Brian Behlendorf import sys, getopt, types import string, os, stat, popen2, socket, time, random, fcntl, select @@ -61,10 +61,10 @@ MAX_LOOP_DEVICES = 256 PORTALS_DIR = 'portals' # Needed to call lconf --record -CONFIG_FILE = "" +CONFIG_FILE = "" # Please keep these in sync with the values in portals/kp30.h -ptldebug_names = { +ptldebug_names = { "trace" : (1 << 0), "inode" : (1 << 1), "super" : (1 << 2), @@ -88,8 +88,8 @@ ptldebug_names = { "rpctrace" : (1 << 20), "vfstrace" : (1 << 21), "reada" : (1 << 22), - "config" : (1 << 23), - "mmap" : (1 << 24), + "mmap" : (1 << 23), + "config" : (1 << 24), } subsystem_names = { @@ -115,8 +115,11 @@ subsystem_names = { "gmnal" : (1 << 19), "ptlrouter" : (1 << 20), "cobd" : (1 << 21), - "openibnal" : (1 << 22), - "cmobd" : (1 << 23), + "ibnal" : (1 << 22), + "sm" : (1 << 23), + "asobd" : (1 << 24), + "lmv" : (1 << 25), + "cmobd" : (1 << 26), } @@ -126,7 +129,7 @@ def cleanup_error(rc): if not first_cleanup_error: first_cleanup_error = rc -# ============================================================ +# ============================================================ # debugging and error funcs def fixme(msg = "this feature"): @@ -243,7 +246,7 @@ class DaemonHandler: return pid except IOError: return 0 - + def clean_pidfile(self): """ Remove a stale pidfile """ log("removing stale pidfile:", self.pidfile()) @@ -251,7 +254,7 @@ class DaemonHandler: os.unlink(self.pidfile()) except OSError, e: log(self.pidfile(), e) - + class AcceptorHandler(DaemonHandler): def __init__(self, port, net_type): DaemonHandler.__init__(self, "acceptor") @@ -262,8 +265,8 @@ class AcceptorHandler(DaemonHandler): return "/var/run/%s-%d.pid" % (self.command, self.port) def command_line(self): - return string.join(map(str,(self.flags, self.port))) - + return string.join(map(str,(self.flags, self.port))) + acceptors = {} # start the acceptors @@ -283,14 +286,14 @@ def run_one_acceptor(port): if not daemon.running(): daemon.start() else: - panic("run_one_acceptor: No acceptor defined for port:", port) - + panic("run_one_acceptor: No acceptor defined for port:", port) + def stop_acceptor(port): if acceptors.has_key(port): daemon = acceptors[port] if daemon.running(): daemon.stop() - + # ============================================================ # handle lctl interface @@ -315,7 +318,7 @@ class LCTLInterface: def use_save_file(self, file): self.save_file = file - + def record(self, dev_name, logname): log("Recording log", logname, "on", dev_name) self.record_device = dev_name @@ -347,7 +350,7 @@ class LCTLInterface: device $%s record %s %s""" % (self.record_device, self.record_log, cmds) - + debug("+", cmd_line, cmds) if config.noexec: return (0, []) @@ -399,7 +402,7 @@ class LCTLInterface: raise CommandError(self.lctl, out, rc) return rc, out - + def clear_log(self, dev, log): """ clear an existing log """ cmds = """ @@ -409,6 +412,13 @@ class LCTLInterface: quit """ % (dev, log) self.run(cmds) + def root_squash(self, 
name, uid, nid): + cmds = """ + device $%s + root_squash %s %s + quit""" % (name, uid, nid) + self.run(cmds) + def network(self, net, nid): """ set mynid """ cmds = """ @@ -417,11 +427,22 @@ class LCTLInterface: quit """ % (net, nid) self.run(cmds) - def root_squash(self, name, uid, nid): + # add an interface + def add_interface(self, net, ip, netmask = ""): + """ add an interface """ cmds = """ - device $%s - root_squash %s %s - quit""" % (name, uid, nid) + network %s + add_interface %s %s + quit """ % (net, ip, netmask) + self.run(cmds) + + # delete an interface + def del_interface(self, net, ip): + """ delete an interface """ + cmds = """ + network %s + del_interface %s + quit """ % (net, ip) self.run(cmds) # create a new connection @@ -429,26 +450,28 @@ class LCTLInterface: cmds = "\n add_uuid %s %s %s" %(uuid, nid, net_type) self.run(cmds) - def add_peer(self, net_type, nid, hostaddr, port): - if net_type in ('tcp',) and not config.lctl_dump: + def add_peer(self, net_type, nid, hostaddr, port): + if net_type in ('tcp',) and not config.lctl_dump: cmds = """ network %s add_peer %s %s %d quit""" % (net_type, nid, hostaddr, port ) self.run(cmds) - elif net_type in ('openib',) and not config.lctl_dump: + elif net_type in ('openib','iib',) and not config.lctl_dump: cmds = """ network %s add_peer %s quit""" % (net_type, - nid) - self.run(cmds) - + nid ) + self.run(cmds) + def connect(self, srv): self.add_uuid(srv.net_type, srv.nid_uuid, srv.nid) - if srv.net_type in ('tcp','openib',) and not config.lctl_dump: - self.add_peer(srv.net_type, srv.nid, srv.hostaddr, srv.port) + if srv.net_type in ('tcp','openib','iib',) and not config.lctl_dump: + if srv.hostaddr[0]: + hostaddr = string.split(srv.hostaddr[0], '/')[0] + self.add_peer(srv.net_type, srv.nid, hostaddr, srv.port) # Recover a device def recover(self, dev_name, new_conn): @@ -456,7 +479,7 @@ class LCTLInterface: device $%s recover %s""" %(dev_name, new_conn) self.run(cmds) - + # add a route to a range def add_route(self, net, gw, lo, hi): cmds = """ @@ -469,7 +492,7 @@ class LCTLInterface: except CommandError, e: log ("ignore: ") e.dump() - + def del_route(self, net, gw, lo, hi): cmds = """ ignore_errors @@ -502,6 +525,7 @@ class LCTLInterface: quit """ % (net, gw, tgt) self.run(cmds) + def del_peer(self, net_type, nid, hostaddr): if net_type in ('tcp',) and not config.lctl_dump: cmds = """ @@ -511,7 +535,7 @@ class LCTLInterface: quit""" % (net_type, nid, hostaddr) self.run(cmds) - elif net_type in ('openib',) and not config.lctl_dump: + elif net_type in ('openib','iib',) and not config.lctl_dump: cmds = """ ignore_errors network %s @@ -519,12 +543,14 @@ class LCTLInterface: quit""" % (net_type, nid) self.run(cmds) - + # disconnect one connection def disconnect(self, srv): self.del_uuid(srv.nid_uuid) - if srv.net_type in ('tcp','openib',) and not config.lctl_dump: - self.del_peer(srv.net_type, srv.nid, srv.hostaddr) + if srv.net_type in ('tcp','openib','iib',) and not config.lctl_dump: + if srv.hostaddr[0]: + hostaddr = string.split(srv.hostaddr[0], '/')[0] + self.del_peer(srv.net_type, srv.nid, hostaddr) def del_uuid(self, uuid): cmds = """ @@ -554,7 +580,7 @@ class LCTLInterface: setup %s quit""" % (name, setup) self.run(cmds) - + def add_conn(self, name, conn_uuid): cmds = """ cfg_device %s @@ -571,7 +597,7 @@ class LCTLInterface: except CommandError, e: self.cleanup(name, uuid, 0) raise e - + # cleanup a device def cleanup(self, name, uuid, force, failover = 0): @@ -591,8 +617,8 @@ class LCTLInterface: cmds = """ attach lov %s %s 
lov_setup %s %d %d %d %s %s - quit""" % (name, uuid, desc_uuid, stripe_cnt, stripe_sz, stripe_off, - pattern, devlist) + quit""" % (name, uuid, desc_uuid, stripe_cnt, stripe_sz, stripe_off, + pattern, devlist) self.run(cmds) # add an OBD to a LOV @@ -741,7 +767,7 @@ def find_module(src_dir, dev_dir, modname): modbase = src_dir +'/'+ dev_dir +'/'+ modname for modext in '.ko', '.o': module = modbase + modext - try: + try: if os.access(module, os.R_OK): return module except OSError: @@ -770,8 +796,6 @@ def jdev(opts): i=i+1 return '' - - # build fs according to type # fixme: dangerous def mkfs(dev, devsize, fstype, jsize, isize, mkfsoptions, isblock=1): @@ -789,7 +813,7 @@ def mkfs(dev, devsize, fstype, jsize, isize, mkfsoptions, isblock=1): # ext3 journal size is in megabytes # but don't set jsize if mkfsoptions indicates a separate journal device if jsize == 0 and jdev(mkfsoptions) == '': - if devsize == 0: + if devsize == 0: if not is_block(dev): ret, out = runcmd("ls -l %s" %dev) devsize = int(string.split(out[0])[4]) / 1024 @@ -801,7 +825,7 @@ def mkfs(dev, devsize, fstype, jsize, isize, mkfsoptions, isblock=1): else: # sfdisk -s will fail for too large block device, # then, read the size of partition from /proc/partitions - + # get the realpath of the device # it may be the real device, such as /dev/hda7 # or the hardlink created via mknod for a device @@ -819,14 +843,14 @@ def mkfs(dev, devsize, fstype, jsize, isize, mkfsoptions, isblock=1): real_dev = os.path.join(os.path.dirname(real_dev), dev_link) if link_count > 19: panic("Entountered too many symbolic links resolving block device:", dev) - + # get the major and minor number of the realpath via ls - # it seems python(os.stat) does not return + # it seems python(os.stat) does not return # the st_rdev member of the stat structure ret, out = runcmd("ls -l %s" %real_dev) major = string.split(string.split(out[0])[4], ",")[0] minor = string.split(out[0])[5] - + # get the devsize from /proc/partitions with the major and minor number ret, out = runcmd("cat /proc/partitions") for line in out: @@ -838,7 +862,7 @@ def mkfs(dev, devsize, fstype, jsize, isize, mkfsoptions, isblock=1): if devsize > 1024 * 1024: jsize = ((devsize / 102400) * 4) if jsize > 400: - jsize = 400 + jsize = 400 if jsize: jopt = "-J size=%d" %(jsize,) if isize: iopt = "-I %d" %(isize,) mkfs = 'mkfs.ext2 -j -b 4096 ' @@ -848,11 +872,10 @@ def mkfs(dev, devsize, fstype, jsize, isize, mkfsoptions, isblock=1): jmkfs = 'mkfs.ext2 -b 4096 -O journal_dev ' if config.force: jmkfs = jmkfs + '-F ' - jmkfs = jmkfs + jdev(mkfsoptions) + jmkfs = jmkfs + jdev(mkfsoptions) (ret, out) = run (jmkfs) if ret: panic("Unable format journal device:", jdev(mkfsoptions), string.join(out)) - elif fstype == 'reiserfs': # reiserfs journal size is in blocks if jsize: jopt = "--journal_size %d" %(jsize,) @@ -954,7 +977,7 @@ def clean_loop(file): # determine if dev is formatted as a filesystem def need_format(fstype, dev): - # FIXME don't know how to implement this + # FIXME don't know how to implement this return 0 # initialize a block device if needed @@ -973,7 +996,7 @@ def block_dev(dev, size, fstype, reformat, autoformat, journal_size, # panic("device:", dev, # "not prepared, and autoformat is not set.\n", # "Rerun with --reformat option to format ALL filesystems") - + return dev def if2addr(iface): @@ -1011,11 +1034,11 @@ def sys_get_local_nid(net_type, wildcard, cluster_id): else: local = sys_get_local_address(net_type, wildcard, cluster_id) return local - + def 
sys_get_local_address(net_type, wildcard, cluster_id): """Return the local address for the network type.""" local = "" - if net_type in ('tcp','openib',): + if net_type in ('tcp','openib','iib',): if ':' in wildcard: iface, star = string.split(wildcard, ':') local = if2addr(iface) @@ -1039,7 +1062,7 @@ def sys_get_local_address(net_type, wildcard, cluster_id): elan_id = a[1] break try: - nid = my_int(cluster_id) + my_int(elan_id) + nid = my_int(cluster_id) + my_int(elan_id) local = "%d" % (nid) except ValueError, e: local = elan_id @@ -1116,7 +1139,7 @@ def fs_is_mounted(path): except IOError, e: log(e) return 0 - + class kmod: """Manage kernel modules""" @@ -1140,7 +1163,7 @@ class kmod: continue log ('loading module:', mod, 'srcdir', src_dir, 'devdir', dev_dir) if src_dir: - module = find_module(src_dir, dev_dir, mod) + module = find_module(src_dir, dev_dir, mod) if not module: panic('module not found:', mod) (rc, out) = run('/sbin/insmod', module) @@ -1182,7 +1205,7 @@ class Module: self._server = None self._connected = 0 self.kmod = kmod(config.lustre, config.portals) - + def info(self, *args): msg = string.join(map(str,args)) print self.module_name + ":", self.name, self.uuid, msg @@ -1196,7 +1219,7 @@ class Module: log(self.module_name, "cleanup failed: ", self.name) e.dump() cleanup_error(e.rc) - + def add_portals_module(self, dev_dir, modname): """Append a module to list of modules to load.""" self.kmod.add_portals_module(dev_dir, modname) @@ -1208,7 +1231,7 @@ class Module: def load_module(self): """Load all the modules in the list in the order they appear.""" self.kmod.load_module() - + def cleanup_module(self): """Unload the modules in the list in reverse order.""" if self.safe_to_clean(): @@ -1216,10 +1239,10 @@ class Module: def safe_to_clean(self): return 1 - + def safe_to_clean_modules(self): return self.safe_to_clean() - + class Network(Module): def __init__(self,db): Module.__init__(self, 'NETWORK', db) @@ -1239,12 +1262,14 @@ class Network(Module): self.nid_uuid = self.nid_to_uuid(self.nid) - self.hostaddr = self.db.get_val('hostaddr', self.nid) - if '*' in self.hostaddr: - self.hostaddr = sys_get_local_address(self.net_type, self.hostaddr, self.cluster_id) - if not self.hostaddr: - panic("unable to set hostaddr for", self.net_type, self.hostaddr, self.cluster_id) - debug("hostaddr:", self.hostaddr) + self.hostaddr = self.db.get_hostaddr() + if len(self.hostaddr) == 0: + self.hostaddr.append(self.nid) + if '*' in self.hostaddr[0]: + self.hostaddr[0] = sys_get_local_address(self.net_type, self.hostaddr[0], self.cluster_id) + if not self.hostaddr[0]: + panic("unable to set hostaddr for", self.net_type, self.hostaddr[0], self.cluster_id) + debug("hostaddr:", self.hostaddr[0]) self.add_portals_module("libcfs", 'libcfs') self.add_portals_module("portals", 'portals') @@ -1258,6 +1283,8 @@ class Network(Module): self.add_portals_module("knals/gmnal", 'kgmnal') if self.net_type == 'openib': self.add_portals_module("knals/openibnal", 'kopenibnal') + if self.net_type == 'iib': + self.add_portals_module("knals/iibnal", 'kiibnal') def nid_to_uuid(self, nid): return "NID_%s_UUID" %(nid,) @@ -1270,6 +1297,13 @@ class Network(Module): lctl.network(self.net_type, self.nid) if self.net_type == 'tcp': sys_tweak_socknal() + for hostaddr in self.db.get_hostaddr(): + ip = string.split(hostaddr, '/')[0] + if len(string.split(hostaddr, '/')) == 2: + netmask = string.split(hostaddr, '/')[1] + else: + netmask = "" + lctl.add_interface(self.net_type, ip, netmask) if self.net_type == 'elan': 
sys_optimize_elan() if self.port and node_is_router(): @@ -1312,6 +1346,10 @@ class Network(Module): stop_acceptor(self.port) if node_is_router(): self.disconnect_peer_gateways() + if self.net_type == 'tcp': + for hostaddr in self.db.get_hostaddr(): + ip = string.split(hostaddr, '/')[0] + lctl.del_interface(self.net_type, ip) def correct_level(self, level, op=None): return level @@ -1322,10 +1360,9 @@ class RouteTable(Module): def server_for_route(self, net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi): - # only setup connections for tcp and openib NALs - srvdb = None - - if not net_type in ('tcp','openib'): + # only setup connections for tcp, openib, and iib NALs + srvdb = None + if not net_type in ('tcp','openib','iib',): return None # connect to target if route is to single node and this node is the gw @@ -1345,7 +1382,7 @@ class RouteTable(Module): return None return Network(srvdb) - + def prepare(self): if not config.record and is_network_prepared(): return @@ -1412,7 +1449,7 @@ class LOV(Module): self.devlist = self.db.get_lov_tgts('lov_tgt') self.stripe_cnt = self.db.get_val_int('stripecount', len(self.devlist)) self.osclist = [] - self.obdlist = [] + self.obdlist = [] self.desc_uuid = self.uuid self.uuid = generate_client_uuid(self.name) self.fs_name = fs_name @@ -1506,7 +1543,7 @@ class LMV(Module): self.mdclist.append(mdc) else: panic('mdc not found:', mds_uuid) - + def prepare(self): if is_prepared(self.name): return @@ -1600,7 +1637,7 @@ class MDSDEV(Module): if not self.lmv: panic("No LMV initialized and not lovconfig_uuid found") - + lovconfig_uuid = self.lmv.get_first_ref('lovconfig') lovconfig = self.lmv.lookup(lovconfig_uuid) lov_uuid = lovconfig.get_first_ref('lov') @@ -1624,8 +1661,7 @@ class MDSDEV(Module): stripe_count = lov.stripe_cnt else: stripe_count = len(lov.devlist) - - if stripe_count > 77: + if stripe_count > 77: self.inode_size = 4096 elif stripe_count > 35: self.inode_size = 2048 @@ -1655,14 +1691,14 @@ class MDSDEV(Module): if self.fstype == 'smfs': self.add_lustre_module('smfs', 'smfs') - + if self.fstype == 'ldiskfs': self.add_lustre_module('ldiskfs', 'ldiskfs') if self.fstype: self.add_lustre_module('lvfs', 'fsfilt_%s' % (self.fstype)) - - # if fstype is smfs, then we should also take care about backing + + # if fstype is smfs, then we should also take care about backing # store fs. 
if self.fstype == 'smfs': self.add_lustre_module('lvfs', 'fsfilt_%s' % (self.backfstype)) @@ -1695,12 +1731,12 @@ class MDSDEV(Module): blkdev = block_dev(self.devpath, self.size, self.fstype, 0, self.format, self.journal_size, self.inode_size, self.mkfsoptions, self.backfstype, self.backdevpath) - + if not is_prepared('MDT'): lctl.newdev("mdt", 'MDT', 'MDT_UUID', setup ="") - try: + try: mountfsoptions = def_mount_options(self.fstype, 'mds') - + if config.mountfsoptions: if mountfsoptions: mountfsoptions = mountfsoptions + ',' + config.mountfsoptions @@ -1714,28 +1750,28 @@ class MDSDEV(Module): mountfsoptions = mountfsoptions + ',' + self.mountfsoptions else: mountfsoptions = self.mountfsoptions - + if self.fstype == 'smfs': realdev = self.fstype - + if mountfsoptions: - mountfsoptions = "%s,type=%s,dev=%s" % (mountfsoptions, - self.backfstype, + mountfsoptions = "%s,type=%s,dev=%s" % (mountfsoptions, + self.backfstype, blkdev) else: - mountfsoptions = "type=%s,dev=%s" % (self.backfstype, + mountfsoptions = "type=%s,dev=%s" % (self.backfstype, blkdev) else: realdev = blkdev - + print 'MDS mount options: ' + mountfsoptions - + if not self.master_mds: - self.master_mds = 'dumb' + self.master_mds = 'dumb' if not self.cachetype: self.cachetype = 'dumb' lctl.newdev("mds", self.name, self.uuid, - setup ="%s %s %s %s %s %s" %(realdev, self.fstype, + setup ="%s %s %s %s %s %s" %(realdev, self.fstype, self.name, mountfsoptions, self.master_mds, self.cachetype)) @@ -1796,28 +1832,28 @@ class MDSDEV(Module): if self.fstype == 'smfs': realdev = self.fstype - + if mountfsoptions: - mountfsoptions = "%s,type=%s,dev=%s" % (mountfsoptions, - self.backfstype, + mountfsoptions = "%s,type=%s,dev=%s" % (mountfsoptions, + self.backfstype, blkdev) else: - mountfsoptions = "type=%s,dev=%s" % (self.backfstype, + mountfsoptions = "type=%s,dev=%s" % (self.backfstype, blkdev) else: realdev = blkdev print 'MDS mount options: ' + mountfsoptions - # As mount options are passed by 4th param to config tool, we need + # As mount options are passed by 4th param to config tool, we need # to pass something in 3rd param. But we do not want this 3rd param # be counted as a profile name for reading log on MDS setup, thus, - # we pass there some predefined sign like 'dumb', which will be + # we pass there some predefined sign like 'dumb', which will be # checked in MDS code and skipped. Probably there is more nice way # like pass empty string and check it in config tool and pass null # as 4th param. lctl.newdev("mds", self.name, self.uuid, - setup ="%s %s %s %s" %(realdev, self.fstype, + setup ="%s %s %s %s" %(realdev, self.fstype, 'dumb', mountfsoptions)) do_cleanup = 1 @@ -1829,14 +1865,14 @@ class MDSDEV(Module): # this is ugly, should be organized nice later. 
target_uuid = self.db.get_first_ref('target') mds = self.db.lookup(target_uuid) - + lovconfig_uuid = mds.get_first_ref('lovconfig') if lovconfig_uuid: lovconfig = mds.lookup(lovconfig_uuid) obd_uuid = lovconfig.get_first_ref('lov') else: obd_uuid = fs.get_first_ref('obd') - + client_uuid = generate_client_uuid(self.name) client = VOSC(self.db.lookup(obd_uuid), client_uuid, self.name, self.name) @@ -1947,7 +1983,7 @@ class MDSDEV(Module): print "cleanup failed: ", self.name e.dump() cleanup_error(e.rc) - + if self.fstype == 'smfs': clean_loop(self.backdevpath) else: @@ -1990,7 +2026,7 @@ class OSD(Module): self.active = 0 if self.active and config.group and config.group != ost.get_val('group'): self.active = 0 - + self.target_dev_uuid = self.uuid self.uuid = target_uuid # modules @@ -2039,7 +2075,7 @@ class OSD(Module): self.backdevpath) mountfsoptions = def_mount_options(self.fstype, 'ost') - + if config.mountfsoptions: if mountfsoptions: mountfsoptions = mountfsoptions + ',' + config.mountfsoptions @@ -2053,25 +2089,25 @@ class OSD(Module): mountfsoptions = mountfsoptions + ',' + self.mountfsoptions else: mountfsoptions = self.mountfsoptions - + if self.fstype == 'smfs': realdev = self.fstype - + if mountfsoptions: - mountfsoptions = "%s,type=%s,dev=%s" % (mountfsoptions, - self.backfstype, + mountfsoptions = "%s,type=%s,dev=%s" % (mountfsoptions, + self.backfstype, blkdev) else: - mountfsoptions = "type=%s,dev=%s" % (self.backfstype, + mountfsoptions = "type=%s,dev=%s" % (self.backfstype, blkdev) else: realdev = blkdev - + print 'OSD mount options: ' + mountfsoptions - + lctl.newdev(self.osdtype, self.name, self.uuid, setup ="%s %s %s %s" %(realdev, self.fstype, - self.failover_ost, + self.failover_ost, mountfsoptions)) if not is_prepared('OSS'): lctl.newdev("ost", 'OSS', 'OSS_UUID', setup ="") @@ -2127,7 +2163,7 @@ class Client(Module): self.db = tgtdb self.active = 1 self.backup_targets = [] - + self.tgt_dev_uuid = get_active_target(tgtdb) if not self.tgt_dev_uuid: panic("No target device found for target(1):", self.target_name) @@ -2274,15 +2310,15 @@ class VLOV(Module): if name_override != None: self.name = "lov_%s" % name_override self.add_lustre_module('lov', 'lov') - self.stripe_sz = 65536 - self.stripe_off = 0 + self.stripe_sz = 65536 + self.stripe_off = 0 self.pattern = 0 - self.stripe_cnt = 1 + self.stripe_cnt = 1 self.desc_uuid = self.uuid self.uuid = generate_client_uuid(self.name) self.fs_name = fs_name self.osc = get_osc(db, self.uuid, fs_name) - if not self.osc: + if not self.osc: panic('osc not found:', self.uuid) if config_only: self.config_only = 1 @@ -2299,7 +2335,7 @@ class VLOV(Module): self.stripe_sz, self.stripe_off, self.pattern) target_uuid = self.osc.target_uuid try: - self.osc.active = 1 + self.osc.active = 1 self.osc.prepare(ignore_connect_failure=0) except CommandError, e: print "Error preparing OSC %s\n" % osc.uuid @@ -2332,7 +2368,7 @@ class VLOV(Module): class CMOBD(Module): def __init__(self,db): Module.__init__(self, 'CMOBD', db) - self.name = self.db.getName(); + self.name = self.db.getName(); self.uuid = generate_client_uuid(self.name) self.master_uuid = self.db.get_first_ref('masterobd') self.cache_uuid = self.db.get_first_ref('cacheobd') @@ -2345,12 +2381,12 @@ class CMOBD(Module): panic('cache obd not found:', self.cache_uuid) if master_obd.get_class() == 'ost': - self.client_uuid = generate_client_uuid(self.name) - self.master= VLOV(master_obd, self.client_uuid, self.name, + self.client_uuid = generate_client_uuid(self.name) + self.master= 
VLOV(master_obd, self.client_uuid, self.name, "%s_master" % (self.name)) self.master_uuid = self.master.get_uuid() else: - self.master = get_mdc(db, self.name, self.master_uuid) + self.master = get_mdc(db, self.name, self.master_uuid) # need to check /proc/mounts and /etc/mtab before # formatting anything. # FIXME: check if device is already formatted. @@ -2375,14 +2411,14 @@ class CMOBD(Module): def cleanup_module(self): Module.cleanup_module(self) self.master.cleanup_module() - + def correct_level(self, level, op=None): return level class COBD(Module): def __init__(self, db, uuid, name, type, name_override = None): Module.__init__(self, 'COBD', db) - self.name = self.db.getName(); + self.name = self.db.getName(); self.uuid = generate_client_uuid(self.name) self.real_uuid = self.db.get_first_ref('realobd') self.cache_uuid = self.db.get_first_ref('cacheobd') @@ -2394,13 +2430,13 @@ class COBD(Module): if not cache_obd: panic('cache obd not found:', self.cache_uuid) if type == 'obd': - self.real = LOV(real_obd, self.real_uuid, name, + self.real = LOV(real_obd, self.real_uuid, name, "%s_real" % (self.name)); - self.cache = LOV(cache_obd, self.cache_uuid, name, + self.cache = LOV(cache_obd, self.cache_uuid, name, "%s_cache" % (self.name)); else: - self.real = get_mdc(db, name, self.real_uuid) - self.cache = get_mdc(db, name, self.cache_uuid) + self.real = get_mdc(db, name, self.real_uuid) + self.cache = get_mdc(db, name, self.cache_uuid) # need to check /proc/mounts and /etc/mtab before # formatting anything. # FIXME: check if device is already formatted. @@ -2565,18 +2601,18 @@ class Mountpoint(Module): ost = self.db.lookup(self.obd_uuid) if not ost: panic("no ost: ", self.obd_uuid) - + mds = self.db.lookup(self.mds_uuid) if not mds: panic("no mds: ", self.mds_uuid) - + self.add_lustre_module('mdc', 'mdc') self.add_lustre_module('lmv', 'lmv') self.add_lustre_module('llite', 'llite') - + self.vosc = VOSC(ost, client_uuid, self.name) self.vmdc = VMDC(mds, client_uuid, self.name) - + def prepare(self): if not config.record and fs_is_mounted(self.path): log(self.path, "already mounted.") @@ -2601,16 +2637,16 @@ class Mountpoint(Module): self.clientoptions = ',' + self.clientoptions # Linux kernel will deal with async and not pass it to ll_fill_super, # so replace it with Lustre async - self.clientoptions = string.replace(self.clientoptions, "async", + self.clientoptions = string.replace(self.clientoptions, "async", "lasync") cmd = "mount -t lustre_lite -o osc=%s,mdc=%s%s %s %s" % \ - (self.vosc.get_name(), vmdc_name, self.clientoptions, + (self.vosc.get_name(), vmdc_name, self.clientoptions, config.config, self.path) run("mkdir", self.path) ret, val = run(cmd) if ret: - self.vmdc.cleanup() + self.vmdc.cleanup() self.vosc.cleanup() panic("mount failed:", self.path, ":", string.join(val)) @@ -2664,7 +2700,7 @@ def get_ost_net(self, osd_uuid): return srv_list -# the order of iniitailization is based on level. +# the order of iniitailization is based on level. 
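The comment that closes the hunk above ("the order of initialization is based on level") describes the contract implemented by getServiceLevel() in the next hunk: every service type maps to a numeric level (e.g. lmv=45, cmobd=50, mountpoint/echoclient=70), setup walks the levels upward and cleanup walks them downward, bounded by --minlevel and --maxlevel. A small sketch of that ordering follows, assuming only what the hunks show; order_by_level() and the sample list are hypothetical, not lconf code.

# Illustrative sketch: bring services up in ascending level order and
# tear them down in descending order.
def order_by_level(services, cleanup=0):
    # services: list of (level, service_name) pairs
    pairs = list(services)
    pairs.sort()
    if cleanup:
        pairs.reverse()
    return pairs

svcs = [(70, 'mountpoint'), (45, 'lmv'), (50, 'cmobd'), (10, 'network')]
print(order_by_level(svcs))             # setup: network first, mountpoint last
print(order_by_level(svcs, cleanup=1))  # cleanup: mountpoint first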
def getServiceLevel(self): type = self.get_class() ret=0; @@ -2681,7 +2717,7 @@ def getServiceLevel(self): elif type in ('lmv',): ret = 45 elif type in ('cmobd',): - ret = 50 + ret = 50 elif type in ('mountpoint', 'echoclient'): ret = 70 else: @@ -2728,6 +2764,7 @@ def get_mdc(db, fs_name, mds_uuid): ############################################################ # routing ("rooting") + # list of (nettype, cluster_id, nid) local_clusters = [] @@ -2741,7 +2778,7 @@ def find_local_clusters(node_db): if srv.port > 0: if acceptors.has_key(srv.port): panic("duplicate port:", srv.port) - acceptors[srv.port] = AcceptorHandler(srv.port, srv.net_type) + acceptors[srv.port] = AcceptorHandler(srv.port, srv.net_type) # This node is a gateway. is_router = 0 @@ -2817,7 +2854,7 @@ def find_route(srv_list): if (r[3] <= to and to <= r[4]) and cluster_id == r[2]: result.append((srv, r)) return result - + def get_active_target(db): target_uuid = db.getUUID() target_name = db.getName() @@ -2833,7 +2870,7 @@ def get_server_by_nid_uuid(db, nid_uuid): net = Network(n) if net.nid_uuid == nid_uuid: return net - + ############################################################ # lconf level logic @@ -2870,7 +2907,7 @@ def newService(db): # # Prepare the system to run lustre using a particular profile -# in a the configuration. +# in a the configuration. # * load & the modules # * setup networking for the current node # * make sure partitions are in place and prepared @@ -2881,7 +2918,7 @@ def for_each_profile(db, prof_list, operation): prof_db = db.lookup(prof_uuid) if not prof_db: panic("profile:", prof_uuid, "not found.") - services = getServices(prof_db) + services = getServices(prof_db) operation(services) def magic_get_osc(db, rec, lov): @@ -3077,7 +3114,7 @@ def doUnloadModules(services): n.cleanup_module() # -# Load profile for +# Load profile for def doHost(lustreDB, hosts): global is_router, local_node_name node_db = None @@ -3095,7 +3132,7 @@ def doHost(lustreDB, hosts): timeout = node_db.get_val_int('timeout', 0) ptldebug = node_db.get_val('ptldebug', '') subsystem = node_db.get_val('subsystem', '') - + find_local_clusters(node_db) if not is_router: find_local_routes(lustreDB) @@ -3202,7 +3239,7 @@ def setupModulePath(cmd, portals_dir = PORTALS_DIR): base = os.path.dirname(cmd) if development_mode(): if not config.lustre: - debug('using objdir module paths') + debug('using objdir module paths') config.lustre = (os.path.join(base, "..")) # normalize the portals dir, using command line arg if set if config.portals: @@ -3212,7 +3249,7 @@ def setupModulePath(cmd, portals_dir = PORTALS_DIR): debug('config.portals', config.portals) elif config.lustre and config.portals: # production mode - # if --lustre and --portals, normalize portals + # if --lustre and --portals, normalize portals # can ignore POTRALS_DIR here, since it is probly useless here config.portals = os.path.join(config.lustre, config.portals) debug('config.portals B', config.portals) @@ -3320,8 +3357,8 @@ def sys_set_netmem_max(path, max): fp = open(path, 'w') fp.write('%d\n' %(max)) fp.close() - - + + def sys_make_devices(): if not os.access('/dev/portals', os.R_OK): run('mknod /dev/portals c 10 240') @@ -3335,7 +3372,7 @@ def add_to_path(new_dir): if new_dir in syspath: return os.environ['PATH'] = os.environ['PATH'] + ':' + new_dir - + def default_debug_path(): path = '/tmp/lustre-log' if os.path.isdir('/r'): @@ -3418,7 +3455,7 @@ lconf_options = [ PARAM), ('minlevel', "Minimum level of services to configure/cleanup", INTPARAM, 0), - ('maxlevel', 
"""Maximum level of services to configure/cleanup + ('maxlevel', """Maximum level of services to configure/cleanup Levels are aproximatly like: 10 - netwrk 20 - device, ldlm @@ -3449,14 +3486,14 @@ lconf_options = [ ('inactive', """The name of an inactive service, to be ignored during mounting (currently OST-only). Can be repeated.""", PARAMLIST), - ] + ] def main(): global lctl, config, toplustreDB, CONFIG_FILE # in the upcall this is set to SIG_IGN signal.signal(signal.SIGCHLD, signal.SIG_DFL) - + cl = Lustre.Options("lconf", "config.xml", lconf_options) try: config, args = cl.parse(sys.argv[1:]) @@ -3479,7 +3516,7 @@ def main(): random.seed(seed) sanitise_path() - + init_select(config.select) if len(args) > 0: diff --git a/lustre/utils/lmc b/lustre/utils/lmc index 3fea4e2..e8e5f10 100755 --- a/lustre/utils/lmc +++ b/lustre/utils/lmc @@ -73,9 +73,9 @@ Object creation command summary: --add net --node node_name --nid nid - --cluster_id - --nettype tcp|elan|gm|openib - --hostaddr addr + --cluster_id + --nettype tcp|elan|gm|openib|iib + --hostaddr ip[/netmask] --port port --tcpbuf size --irq_affinity 0|1 @@ -176,6 +176,7 @@ Object creation command summary: """ PARAM = Lustre.Options.PARAM +PARAMLIST = Lustre.Options.PARAMLIST lmc_options = [ # lmc input/output options ('reference', "Print short reference for commands."), @@ -200,11 +201,11 @@ lmc_options = [ ('ptldebug', "Set the portals debug level", PARAM), ('subsystem', "Specify which Lustre subsystems have debug output recorded in the log", PARAM), - # network - ('nettype', "Specify the network type. This can be tcp/elan/gm/openib.", PARAM), + # network + ('nettype', "Specify the network type. This can be tcp/elan/gm/openib/iib.", PARAM), ('nid', "Give the network ID, e.g ElanID/IP Address as used by portals.", PARAM), ('port', "Optional argument to specify the TCP port number.", PARAM, DEFAULT_PORT), - ('hostaddr', "", PARAM,""), + ('hostaddr', "Optional argument to specify the host address.", PARAMLIST), ('cluster_id', "Specify the cluster ID", PARAM, "0"), # routes @@ -392,8 +393,8 @@ class GenConfig: network.setAttribute("nettype", net); self.addElement(network, "nid", nid) self.addElement(network, "clusterid", cluster_id) - if hostaddr: - self.addElement(network, "hostaddr", hostaddr) + for host in hostaddr: + self.addElement(network, "hostaddr", host) if port: self.addElement(network, "port", "%d" %(port)) @@ -922,7 +923,7 @@ def add_net(gen, lustre, options): if net_type in ('tcp',): port = get_option_int(options, 'port') - elif net_type in ('elan', 'gm', 'openib'): + elif net_type in ('elan', 'gm', 'openib','iib'): port = 0 else: print "Unknown net_type: ", net_type diff --git a/lustre/utils/obd.c b/lustre/utils/obd.c index 2cf74f9a..765793b 100644 --- a/lustre/utils/obd.c +++ b/lustre/utils/obd.c @@ -2308,6 +2308,7 @@ int jt_llog_cancel(int argc, char **argv) return rc; } + int jt_llog_check(int argc, char **argv) { struct obd_ioctl_data data; @@ -2375,6 +2376,7 @@ int jt_llog_remove(int argc, char **argv) return rc; } + int jt_obd_reint_sync(int argc, char **argv) { struct obd_ioctl_data data; @@ -2410,6 +2412,7 @@ int jt_obd_cache_on(int argc, char **argv) return rc; } + int jt_obd_cache_off(int argc, char **argv) { struct obd_ioctl_data data; @@ -2425,8 +2428,13 @@ int jt_obd_cache_off(int argc, char **argv) rc); return rc; } + int jt_obd_snap_add(int argc, char **argv) { +#if 1 + return -1; +#else +# error "FIX the missing #defines before committing" struct obd_ioctl_data data; int rc = 0; @@ -2452,7 +2460,9 @@ int 
jt_obd_snap_add(int argc, char **argv)
         if (rc)
                 fprintf(stderr, "OBD_IOC_SNAP_ADD failed: rc=%d\n", rc);
         return rc;
+#endif
 }
+
 static void signal_server(int sig)
 {
         if (sig == SIGINT) {
-- 
1.8.3.1
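The lmc hunks above turn --hostaddr from a single PARAM into a PARAMLIST and make GenConfig emit one <hostaddr> element per address (now documented as ip[/netmask]), so a node can advertise several interfaces. Here is a self-contained sketch of the resulting XML, using xml.dom.minidom as lmc's GenConfig does; add_network() and the sample addresses are hypothetical, not part of the patch.

import xml.dom.minidom as minidom

def add_network(doc, parent, nettype, nid, hostaddrs):
    # One <network> element with a nid plus one <hostaddr> child per address,
    # mirroring the "for host in hostaddr" loop added to GenConfig above.
    net = doc.createElement("network")
    net.setAttribute("nettype", nettype)
    for tag, val in [("nid", nid)] + [("hostaddr", h) for h in hostaddrs]:
        el = doc.createElement(tag)
        el.appendChild(doc.createTextNode(val))
        net.appendChild(el)
    parent.appendChild(net)
    return net

doc = minidom.Document()
root = doc.createElement("lustre")
doc.appendChild(root)
add_network(doc, root, "tcp", "192.168.0.10", ["10.0.0.1/24", "10.0.1.1/24"])
print(doc.toprettyxml(indent="  "))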